dfa.rs source code [crates/aho-corasick-0.7.20/src/dfa.rs]

1	use std::mem::size_of;
2
3	use crate::ahocorasick::MatchKind;
4	use crate::automaton::Automaton;
5	use crate::classes::ByteClasses;
6	use crate::error::Result;
7	use crate::nfa::{PatternID, PatternLength, NFA};
8	use crate::prefilter::{Prefilter, PrefilterObj, PrefilterState};
9	use crate::state_id::{dead_id, fail_id, premultiply_overflow_error, StateID};
10	use crate::Match;
11
12	#[derive(Clone, Debug)]
13	pub enum DFA<S> {
14	Standard(Standard<S>),
15	ByteClass(ByteClass<S>),
16	Premultiplied(Premultiplied<S>),
17	PremultipliedByteClass(PremultipliedByteClass<S>),
18	}
19
20	impl<S: StateID> DFA<S> {
21	fn repr(&self) -> &Repr<S> {
22	match *self {
23	DFA::Standard(ref dfa) => dfa.repr(),
24	DFA::ByteClass(ref dfa) => dfa.repr(),
25	DFA::Premultiplied(ref dfa) => dfa.repr(),
26	DFA::PremultipliedByteClass(ref dfa) => dfa.repr(),
27	}
28	}
29
30	pub fn match_kind(&self) -> &MatchKind {
31	&self.repr().match_kind
32	}
33
34	pub fn heap_bytes(&self) -> usize {
35	self.repr().heap_bytes
36	}
37
38	pub fn max_pattern_len(&self) -> usize {
39	self.repr().max_pattern_len
40	}
41
42	pub fn pattern_count(&self) -> usize {
43	self.repr().pattern_count
44	}
45
46	pub fn prefilter(&self) -> Option<&dyn Prefilter> {
47	self.repr().prefilter.as_ref().map(\|p\| p.as_ref())
48	}
49
50	pub fn start_state(&self) -> S {
51	self.repr().start_id
52	}
53
54	#[inline(always)]
55	pub fn overlapping_find_at(
56	&self,
57	prestate: &mut PrefilterState,
58	haystack: &[u8],
59	at: usize,
60	state_id: &mut S,
61	match_index: &mut usize,
62	) -> Option<Match> {
63	match *self {
64	DFA::Standard(ref dfa) => dfa.overlapping_find_at(
65	prestate,
66	haystack,
67	at,
68	state_id,
69	match_index,
70	),
71	DFA::ByteClass(ref dfa) => dfa.overlapping_find_at(
72	prestate,
73	haystack,
74	at,
75	state_id,
76	match_index,
77	),
78	DFA::Premultiplied(ref dfa) => dfa.overlapping_find_at(
79	prestate,
80	haystack,
81	at,
82	state_id,
83	match_index,
84	),
85	DFA::PremultipliedByteClass(ref dfa) => dfa.overlapping_find_at(
86	prestate,
87	haystack,
88	at,
89	state_id,
90	match_index,
91	),
92	}
93	}
94
95	#[inline(always)]
96	pub fn earliest_find_at(
97	&self,
98	prestate: &mut PrefilterState,
99	haystack: &[u8],
100	at: usize,
101	state_id: &mut S,
102	) -> Option<Match> {
103	match *self {
104	DFA::Standard(ref dfa) => {
105	dfa.earliest_find_at(prestate, haystack, at, state_id)
106	}
107	DFA::ByteClass(ref dfa) => {
108	dfa.earliest_find_at(prestate, haystack, at, state_id)
109	}
110	DFA::Premultiplied(ref dfa) => {
111	dfa.earliest_find_at(prestate, haystack, at, state_id)
112	}
113	DFA::PremultipliedByteClass(ref dfa) => {
114	dfa.earliest_find_at(prestate, haystack, at, state_id)
115	}
116	}
117	}
118
119	#[inline(always)]
120	pub fn find_at_no_state(
121	&self,
122	prestate: &mut PrefilterState,
123	haystack: &[u8],
124	at: usize,
125	) -> Option<Match> {
126	match *self {
127	DFA::Standard(ref dfa) => {
128	dfa.find_at_no_state(prestate, haystack, at)
129	}
130	DFA::ByteClass(ref dfa) => {
131	dfa.find_at_no_state(prestate, haystack, at)
132	}
133	DFA::Premultiplied(ref dfa) => {
134	dfa.find_at_no_state(prestate, haystack, at)
135	}
136	DFA::PremultipliedByteClass(ref dfa) => {
137	dfa.find_at_no_state(prestate, haystack, at)
138	}
139	}
140	}
141	}
142
143	#[derive(Clone, Debug)]
144	pub struct Standard<S>(Repr<S>);
145
146	impl<S: StateID> Standard<S> {
147	fn repr(&self) -> &Repr<S> {
148	&self.0
149	}
150	}
151
152	impl<S: StateID> Automaton for Standard<S> {
153	type ID = S;
154
155	fn match_kind(&self) -> &MatchKind {
156	&self.repr().match_kind
157	}
158
159	fn anchored(&self) -> bool {
160	self.repr().anchored
161	}
162
163	fn prefilter(&self) -> Option<&dyn Prefilter> {
164	self.repr().prefilter.as_ref().map(\|p\| p.as_ref())
165	}
166
167	fn start_state(&self) -> S {
168	self.repr().start_id
169	}
170
171	fn is_valid(&self, id: S) -> bool {
172	id.to_usize() < self.repr().state_count
173	}
174
175	fn is_match_state(&self, id: S) -> bool {
176	self.repr().is_match_state(id)
177	}
178
179	fn is_match_or_dead_state(&self, id: S) -> bool {
180	self.repr().is_match_or_dead_state(id)
181	}
182
183	fn get_match(
184	&self,
185	id: S,
186	match_index: usize,
187	end: usize,
188	) -> Option<Match> {
189	self.repr().get_match(id, match_index, end)
190	}
191
192	fn match_count(&self, id: S) -> usize {
193	self.repr().match_count(id)
194	}
195
196	fn next_state(&self, current: S, input: u8) -> S {
197	let o = current.to_usize() * `256` + input as usize;
198	self.repr().trans[o]
199	}
200	}
201
202	#[derive(Clone, Debug)]
203	pub struct ByteClass<S>(Repr<S>);
204
205	impl<S: StateID> ByteClass<S> {
206	fn repr(&self) -> &Repr<S> {
207	&self.0
208	}
209	}
210
211	impl<S: StateID> Automaton for ByteClass<S> {
212	type ID = S;
213
214	fn match_kind(&self) -> &MatchKind {
215	&self.repr().match_kind
216	}
217
218	fn anchored(&self) -> bool {
219	self.repr().anchored
220	}
221
222	fn prefilter(&self) -> Option<&dyn Prefilter> {
223	self.repr().prefilter.as_ref().map(\|p\| p.as_ref())
224	}
225
226	fn start_state(&self) -> S {
227	self.repr().start_id
228	}
229
230	fn is_valid(&self, id: S) -> bool {
231	id.to_usize() < self.repr().state_count
232	}
233
234	fn is_match_state(&self, id: S) -> bool {
235	self.repr().is_match_state(id)
236	}
237
238	fn is_match_or_dead_state(&self, id: S) -> bool {
239	self.repr().is_match_or_dead_state(id)
240	}
241
242	fn get_match(
243	&self,
244	id: S,
245	match_index: usize,
246	end: usize,
247	) -> Option<Match> {
248	self.repr().get_match(id, match_index, end)
249	}
250
251	fn match_count(&self, id: S) -> usize {
252	self.repr().match_count(id)
253	}
254
255	fn next_state(&self, current: S, input: u8) -> S {
256	let alphabet_len = self.repr().byte_classes.alphabet_len();
257	let input = self.repr().byte_classes.get(input);
258	let o = current.to_usize() * alphabet_len + input as usize;
259	self.repr().trans[o]
260	}
261	}
262
263	#[derive(Clone, Debug)]
264	pub struct Premultiplied<S>(Repr<S>);
265
266	impl<S: StateID> Premultiplied<S> {
267	fn repr(&self) -> &Repr<S> {
268	&self.0
269	}
270	}
271
272	impl<S: StateID> Automaton for Premultiplied<S> {
273	type ID = S;
274
275	fn match_kind(&self) -> &MatchKind {
276	&self.repr().match_kind
277	}
278
279	fn anchored(&self) -> bool {
280	self.repr().anchored
281	}
282
283	fn prefilter(&self) -> Option<&dyn Prefilter> {
284	self.repr().prefilter.as_ref().map(\|p\| p.as_ref())
285	}
286
287	fn start_state(&self) -> S {
288	self.repr().start_id
289	}
290
291	fn is_valid(&self, id: S) -> bool {
292	(id.to_usize() / `256`) < self.repr().state_count
293	}
294
295	fn is_match_state(&self, id: S) -> bool {
296	self.repr().is_match_state(id)
297	}
298
299	fn is_match_or_dead_state(&self, id: S) -> bool {
300	self.repr().is_match_or_dead_state(id)
301	}
302
303	fn get_match(
304	&self,
305	id: S,
306	match_index: usize,
307	end: usize,
308	) -> Option<Match> {
309	if id > self.repr().max_match {
310	return None;
311	}
312	self.repr()
313	.matches
314	.get(id.to_usize() / `256`)
315	.and_then(\|m\| m.get(match_index))
316	.map(\|&(id, len)\| Match { pattern: id, len, end })
317	}
318
319	fn match_count(&self, id: S) -> usize {
320	let o = id.to_usize() / `256`;
321	self.repr().matches[o].len()
322	}
323
324	fn next_state(&self, current: S, input: u8) -> S {
325	let o = current.to_usize() + input as usize;
326	self.repr().trans[o]
327	}
328	}
329
330	#[derive(Clone, Debug)]
331	pub struct PremultipliedByteClass<S>(Repr<S>);
332
333	impl<S: StateID> PremultipliedByteClass<S> {
334	fn repr(&self) -> &Repr<S> {
335	&self.0
336	}
337	}
338
339	impl<S: StateID> Automaton for PremultipliedByteClass<S> {
340	type ID = S;
341
342	fn match_kind(&self) -> &MatchKind {
343	&self.repr().match_kind
344	}
345
346	fn anchored(&self) -> bool {
347	self.repr().anchored
348	}
349
350	fn prefilter(&self) -> Option<&dyn Prefilter> {
351	self.repr().prefilter.as_ref().map(\|p\| p.as_ref())
352	}
353
354	fn start_state(&self) -> S {
355	self.repr().start_id
356	}
357
358	fn is_valid(&self, id: S) -> bool {
359	(id.to_usize() / self.repr().alphabet_len()) < self.repr().state_count
360	}
361
362	fn is_match_state(&self, id: S) -> bool {
363	self.repr().is_match_state(id)
364	}
365
366	fn is_match_or_dead_state(&self, id: S) -> bool {
367	self.repr().is_match_or_dead_state(id)
368	}
369
370	fn get_match(
371	&self,
372	id: S,
373	match_index: usize,
374	end: usize,
375	) -> Option<Match> {
376	if id > self.repr().max_match {
377	return None;
378	}
379	self.repr()
380	.matches
381	.get(id.to_usize() / self.repr().alphabet_len())
382	.and_then(\|m\| m.get(match_index))
383	.map(\|&(id, len)\| Match { pattern: id, len, end })
384	}
385
386	fn match_count(&self, id: S) -> usize {
387	let o = id.to_usize() / self.repr().alphabet_len();
388	self.repr().matches[o].len()
389	}
390
391	fn next_state(&self, current: S, input: u8) -> S {
392	let input = self.repr().byte_classes.get(input);
393	let o = current.to_usize() + input as usize;
394	self.repr().trans[o]
395	}
396	}
397
398	#[derive(Clone, Debug)]
399	pub struct Repr<S> {
400	match_kind: MatchKind,
401	anchored: bool,
402	premultiplied: bool,
403	start_id: S,
404	/// The length, in bytes, of the longest pattern in this automaton. This
405	/// information is useful for keeping correct buffer sizes when searching
406	/// on streams.
407	max_pattern_len: usize,
408	/// The total number of patterns added to this automaton. This includes
409	/// patterns that may never match.
410	pattern_count: usize,
411	state_count: usize,
412	max_match: S,
413	/// The number of bytes of heap used by this NFA's transition table.
414	heap_bytes: usize,
415	/// A prefilter for quickly detecting candidate matchs, if pertinent.
416	prefilter: Option<PrefilterObj>,
417	byte_classes: ByteClasses,
418	trans: Vec<S>,
419	matches: Vec<Vec<(PatternID, PatternLength)>>,
420	}
421
422	impl<S: StateID> Repr<S> {
423	/// Returns the total alphabet size for this DFA.
424	///
425	/// If byte classes are enabled, then this corresponds to the number of
426	/// equivalence classes. If they are disabled, then this is always 256.
427	fn alphabet_len(&self) -> usize {
428	self.byte_classes.alphabet_len()
429	}
430
431	/// Returns true only if the given state is a match state.
432	fn is_match_state(&self, id: S) -> bool {
433	id <= self.max_match && id > dead_id()
434	}
435
436	/// Returns true only if the given state is either a dead state or a match
437	/// state.
438	fn is_match_or_dead_state(&self, id: S) -> bool {
439	id <= self.max_match
440	}
441
442	/// Get the ith match for the given state, where the end position of a
443	/// match was found at `end`.
444	///
445	/// # Panics
446	///
447	/// The caller must ensure that the given state identifier is valid,
448	/// otherwise this may panic. The `match_index` need not be valid. That is,
449	/// if the given state has no matches then this returns `None`.
450	fn get_match(
451	&self,
452	id: S,
453	match_index: usize,
454	end: usize,
455	) -> Option<Match> {
456	if id > self.max_match {
457	return None;
458	}
459	self.matches
460	.get(id.to_usize())
461	.and_then(\|m\| m.get(match_index))
462	.map(\|&(id, len)\| Match { pattern: id, len, end })
463	}
464
465	/// Return the total number of matches for the given state.
466	///
467	/// # Panics
468	///
469	/// The caller must ensure that the given identifier is valid, or else
470	/// this panics.
471	fn match_count(&self, id: S) -> usize {
472	self.matches[id.to_usize()].len()
473	}
474
475	/// Get the next state given `from` as the current state and `byte` as the
476	/// current input byte.
477	fn next_state(&self, from: S, byte: u8) -> S {
478	let alphabet_len = self.alphabet_len();
479	let byte = self.byte_classes.get(byte);
480	self.trans[from.to_usize() * alphabet_len + byte as usize]
481	}
482
483	/// Set the `byte` transition for the `from` state to point to `to`.
484	fn set_next_state(&mut self, from: S, byte: u8, to: S) {
485	let alphabet_len = self.alphabet_len();
486	let byte = self.byte_classes.get(byte);
487	self.trans[from.to_usize() * alphabet_len + byte as usize] = to;
488	}
489
490	/// Swap the given states in place.
491	fn swap_states(&mut self, id1: S, id2: S) {
492	assert!(!self.premultiplied, "can't swap states in premultiplied DFA");
493
494	let o1 = id1.to_usize() * self.alphabet_len();
495	let o2 = id2.to_usize() * self.alphabet_len();
496	for b in `0`..self.alphabet_len() {
497	self.trans.swap(o1 + b, o2 + b);
498	}
499	self.matches.swap(id1.to_usize(), id2.to_usize());
500	}
501
502	/// This routine shuffles all match states in this DFA to the beginning
503	/// of the DFA such that every non-match state appears after every match
504	/// state. (With one exception: the special fail and dead states remain as
505	/// the first two states.)
506	///
507	/// The purpose of doing this shuffling is to avoid an extra conditional
508	/// in the search loop, and in particular, detecting whether a state is a
509	/// match or not does not need to access any memory.
510	///
511	/// This updates `self.max_match` to point to the last matching state as
512	/// well as `self.start` if the starting state was moved.
513	fn shuffle_match_states(&mut self) {
514	assert!(
515	!self.premultiplied,
516	"cannot shuffle match states of premultiplied DFA"
517	);
518
519	if self.state_count <= `1` {
520	return;
521	}
522
523	let mut first_non_match = self.start_id.to_usize();
524	while first_non_match < self.state_count
525	&& self.matches[first_non_match].len() > `0`
526	{
527	first_non_match += `1`;
528	}
529
530	let mut swaps: Vec<S> = vec![fail_id(); self.state_count];
531	let mut cur = self.state_count - `1`;
532	while cur > first_non_match {
533	if self.matches[cur].len() > `0` {
534	self.swap_states(
535	S::from_usize(cur),
536	S::from_usize(first_non_match),
537	);
538	swaps[cur] = S::from_usize(first_non_match);
539	swaps[first_non_match] = S::from_usize(cur);
540
541	first_non_match += `1`;
542	while first_non_match < cur
543	&& self.matches[first_non_match].len() > `0`
544	{
545	first_non_match += `1`;
546	}
547	}
548	cur -= `1`;
549	}
550	for id in (`0`..self.state_count).map(S::from_usize) {
551	let alphabet_len = self.alphabet_len();
552	let offset = id.to_usize() * alphabet_len;
553	for next in &mut self.trans[offset..offset + alphabet_len] {
554	if swaps[next.to_usize()] != fail_id() {
555	*next = swaps[next.to_usize()];
556	}
557	}
558	}
559	if swaps[self.start_id.to_usize()] != fail_id() {
560	self.start_id = swaps[self.start_id.to_usize()];
561	}
562	self.max_match = S::from_usize(first_non_match - `1`);
563	}
564
565	fn premultiply(&mut self) -> Result<()> {
566	if self.premultiplied \|\| self.state_count <= `1` {
567	return Ok(());
568	}
569
570	let alpha_len = self.alphabet_len();
571	premultiply_overflow_error(
572	S::from_usize(self.state_count - `1`),
573	alpha_len,
574	)?;
575
576	for id in (`2`..self.state_count).map(S::from_usize) {
577	let offset = id.to_usize() * alpha_len;
578	for next in &mut self.trans[offset..offset + alpha_len] {
579	if *next == dead_id() {
580	continue;
581	}
582	next = S::from_usize(next.to_usize() alpha_len);
583	}
584	}
585	self.premultiplied = `true`;
586	self.start_id = S::from_usize(self.start_id.to_usize() * alpha_len);
587	self.max_match = S::from_usize(self.max_match.to_usize() * alpha_len);
588	Ok(())
589	}
590
591	/// Computes the total amount of heap used by this NFA in bytes.
592	fn calculate_size(&mut self) {
593	let mut size = (self.trans.len() * size_of::<S>())
594	+ (self.matches.len()
595	* size_of::<Vec<(PatternID, PatternLength)>>());
596	for state_matches in &self.matches {
597	size +=
598	state_matches.len() * size_of::<(PatternID, PatternLength)>();
599	}
600	size += self.prefilter.as_ref().map_or(`0`, \|p\| p.as_ref().heap_bytes());
601	self.heap_bytes = size;
602	}
603	}
604
605	/// A builder for configuring the determinization of an NFA into a DFA.
606	#[derive(Clone, Debug)]
607	pub struct Builder {
608	premultiply: bool,
609	byte_classes: bool,
610	}
611
612	impl Builder {
613	/// Create a new builder for a DFA.
614	pub fn new() -> Builder {
615	Builder { premultiply: `true`, byte_classes: `true` }
616	}
617
618	/// Build a DFA from the given NFA.
619	///
620	/// This returns an error if the state identifiers exceed their
621	/// representation size. This can only happen when state ids are
622	/// premultiplied (which is enabled by default).
623	pub fn build<S: StateID>(&self, nfa: &NFA<S>) -> Result<DFA<S>> {
624	let byte_classes = if self.byte_classes {
625	nfa.byte_classes().clone()
626	} else {
627	ByteClasses::singletons()
628	};
629	let alphabet_len = byte_classes.alphabet_len();
630	let trans = vec![fail_id(); alphabet_len * nfa.state_len()];
631	let matches = vec![vec![]; nfa.state_len()];
632	let mut repr = Repr {
633	match_kind: nfa.match_kind().clone(),
634	anchored: nfa.anchored(),
635	premultiplied: `false`,
636	start_id: nfa.start_state(),
637	max_pattern_len: nfa.max_pattern_len(),
638	pattern_count: nfa.pattern_count(),
639	state_count: nfa.state_len(),
640	max_match: fail_id(),
641	heap_bytes: `0`,
642	prefilter: nfa.prefilter_obj().map(\|p\| p.clone()),
643	byte_classes: byte_classes.clone(),
644	trans,
645	matches,
646	};
647	for id in (`0`..nfa.state_len()).map(S::from_usize) {
648	repr.matches[id.to_usize()].extend_from_slice(nfa.matches(id));
649
650	let fail = nfa.failure_transition(id);
651	nfa.iter_all_transitions(&byte_classes, id, \|b, mut next\| {
652	if next == fail_id() {
653	next = nfa_next_state_memoized(nfa, &repr, id, fail, b);
654	}
655	repr.set_next_state(id, b, next);
656	});
657	}
658	repr.shuffle_match_states();
659	repr.calculate_size();
660	if self.premultiply {
661	repr.premultiply()?;
662	if byte_classes.is_singleton() {
663	Ok(DFA::Premultiplied(Premultiplied(repr)))
664	} else {
665	Ok(DFA::PremultipliedByteClass(PremultipliedByteClass(repr)))
666	}
667	} else {
668	if byte_classes.is_singleton() {
669	Ok(DFA::Standard(Standard(repr)))
670	} else {
671	Ok(DFA::ByteClass(ByteClass(repr)))
672	}
673	}
674	}
675
676	/// Whether to use byte classes or in the DFA.
677	pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
678	self.byte_classes = yes;
679	self
680	}
681
682	/// Whether to premultiply state identifier in the DFA.
683	pub fn premultiply(&mut self, yes: bool) -> &mut Builder {
684	self.premultiply = yes;
685	self
686	}
687	}
688
689	/// This returns the next NFA transition (including resolving failure
690	/// transitions), except once it sees a state id less than the id of the DFA
691	/// state that is currently being populated, then we no longer need to follow
692	/// failure transitions and can instead query the pre-computed state id from
693	/// the DFA itself.
694	///
695	/// In general, this should only be called when a failure transition is seen.
696	fn nfa_next_state_memoized<S: StateID>(
697	nfa: &NFA<S>,
698	dfa: &Repr<S>,
699	populating: S,
700	mut current: S,
701	input: u8,
702	) -> S {
703	loop {
704	if current < populating {
705	return dfa.next_state(from:current, byte:input);
706	}
707	let next: S = nfa.next_state(current, input);
708	if next != fail_id() {
709	return next;
710	}
711	current = nfa.failure_transition(id:current);
712	}
713	}
714