nfa.rs source code [crates/aho-corasick-0.7.20/src/nfa.rs]

1	use std::cmp;
2	use std::collections::{BTreeSet, VecDeque};
3	use std::fmt;
4	use std::mem::size_of;
5	use std::ops::{Index, IndexMut};
6
7	use crate::ahocorasick::MatchKind;
8	use crate::automaton::Automaton;
9	use crate::classes::{ByteClassBuilder, ByteClasses};
10	use crate::error::Result;
11	use crate::prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj};
12	use crate::state_id::{dead_id, fail_id, usize_to_state_id, StateID};
13	use crate::Match;
14
15	/// The identifier for a pattern, which is simply the position of the pattern
16	/// in the sequence of patterns given by the caller.
17	pub type PatternID = usize;
18
19	/// The length of a pattern, in bytes.
20	pub type PatternLength = usize;
21
22	/// An Aho-Corasick automaton, represented as an NFA.
23	///
24	/// This is the classical formulation of Aho-Corasick, which involves building
25	/// up a prefix trie of a given set of patterns, and then wiring up failure
26	/// transitions between states in order to guarantee linear time matching. The
27	/// standard formulation is, technically, an NFA because of these failure
28	/// transitions. That is, one can see them as enabling the automaton to be in
29	/// multiple states at once. Indeed, during search, it is possible to check
30	/// the transitions on multiple states for a single input byte.
31	///
32	/// This particular implementation not only supports the standard style of
33	/// matching, but also provides a mode for choosing leftmost-first or
34	/// leftmost-longest match semantics. When a leftmost mode is chosen, some
35	/// failure transitions that would otherwise be added are elided. See
36	/// the documentation of `MatchKind` for more details and examples on how the
37	/// match semantics may differ.
38	///
39	/// If one wants a DFA, then it is necessary to first build an NFA and convert
40	/// it into a DFA. Note, however, that because we've constrained ourselves to
41	/// matching literal patterns, this does not need to use subset construction
42	/// for determinization. Instead, the DFA has at most a number of states
43	/// equivalent to the number of NFA states. The only real difference between
44	/// them is that all failure transitions are followed and pre-computed. This
45	/// uses much more memory, but also executes searches more quickly.
46	#[derive(Clone)]
47	pub struct NFA<S> {
48	/// The match semantics built into this NFA.
49	match_kind: MatchKind,
50	/// The start state id as an index into `states`.
51	start_id: S,
52	/// The length, in bytes, of the longest pattern in this automaton. This
53	/// information is useful for keeping correct buffer sizes when searching
54	/// on streams.
55	max_pattern_len: usize,
56	/// The total number of patterns added to this automaton, including
57	/// patterns that may never be matched.
58	pattern_count: usize,
59	/// The number of bytes of heap used by this NFA's transition table.
60	heap_bytes: usize,
61	/// A prefilter for quickly skipping to candidate matches, if pertinent.
62	prefilter: Option<PrefilterObj>,
63	/// Whether this automaton anchors all matches to the start of input.
64	anchored: bool,
65	/// A set of equivalence classes in terms of bytes. We compute this while
66	/// building the NFA, but don't use it in the NFA's states. Instead, we
67	/// use this for building the DFA. We store it on the NFA since it's easy
68	/// to compute while visiting the patterns.
69	byte_classes: ByteClasses,
70	/// A set of states. Each state defines its own transitions, a fail
71	/// transition and a set of indices corresponding to matches.
72	///
73	/// The first state is always the fail state, which is used only as a
74	/// sentinel. Namely, in the final NFA, no transition into the fail state
75	/// exists. (Well, they do, but they aren't followed. Instead, the state's
76	/// failure transition is followed.)
77	///
78	/// The second state (index 1) is always the dead state. Dead states are
79	/// in every automaton, but only used when leftmost-{first,longest} match
80	/// semantics are enabled. Specifically, they instruct search to stop
81	/// at specific points in order to report the correct match location. In
82	/// the standard Aho-Corasick construction, there are no transitions to
83	/// the dead state.
84	///
85	/// The third state (index 2) is generally intended to be the starting or
86	/// "root" state.
87	states: Vec<State<S>>,
88	}
89
90	impl<S: StateID> NFA<S> {
91	/// Returns the equivalence classes of bytes found while constructing
92	/// this NFA.
93	///
94	/// Note that the NFA doesn't actually make use of these equivalence
95	/// classes. Instead, these are useful for building the DFA when desired.
96	pub fn byte_classes(&self) -> &ByteClasses {
97	&self.byte_classes
98	}
99
100	/// Returns a prefilter, if one exists.
101	pub fn prefilter_obj(&self) -> Option<&PrefilterObj> {
102	self.prefilter.as_ref()
103	}
104
105	/// Returns the total number of heap bytes used by this NFA's transition
106	/// table.
107	pub fn heap_bytes(&self) -> usize {
108	self.heap_bytes
109	+ self.prefilter.as_ref().map_or(`0`, \|p\| p.as_ref().heap_bytes())
110	}
111
112	/// Return the length of the longest pattern in this automaton.
113	pub fn max_pattern_len(&self) -> usize {
114	self.max_pattern_len
115	}
116
117	/// Return the total number of patterns added to this automaton.
118	pub fn pattern_count(&self) -> usize {
119	self.pattern_count
120	}
121
122	/// Returns the total number of states in this NFA.
123	pub fn state_len(&self) -> usize {
124	self.states.len()
125	}
126
127	/// Returns the matches for the given state.
128	pub fn matches(&self, id: S) -> &[(PatternID, PatternLength)] {
129	&self.states[id.to_usize()].matches
130	}
131
132	/// Returns an iterator over all transitions in the given state according
133	/// to the given equivalence classes, including transitions to `fail_id()`.
134	/// The number of transitions returned is always equivalent to the number
135	/// of equivalence classes.
136	pub fn iter_all_transitions<F: FnMut(u8, S)>(
137	&self,
138	byte_classes: &ByteClasses,
139	id: S,
140	f: F,
141	) {
142	self.states[id.to_usize()].trans.iter_all(byte_classes, f);
143	}
144
145	/// Returns the failure transition for the given state.
146	pub fn failure_transition(&self, id: S) -> S {
147	self.states[id.to_usize()].fail
148	}
149
150	/// Returns the next state for the given state and input byte.
151	///
152	/// Note that this does not follow failure transitions. As such, the id
153	/// returned may be `fail_id`.
154	pub fn next_state(&self, current: S, input: u8) -> S {
155	self.states[current.to_usize()].next_state(input)
156	}
157
158	fn state(&self, id: S) -> &State<S> {
159	&self.states[id.to_usize()]
160	}
161
162	fn state_mut(&mut self, id: S) -> &mut State<S> {
163	&mut self.states[id.to_usize()]
164	}
165
166	fn start(&self) -> &State<S> {
167	self.state(self.start_id)
168	}
169
170	fn start_mut(&mut self) -> &mut State<S> {
171	let id = self.start_id;
172	self.state_mut(id)
173	}
174
175	fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S> {
176	IterTransitionsMut::new(self, id)
177	}
178
179	fn copy_matches(&mut self, src: S, dst: S) {
180	let (src, dst) =
181	get_two_mut(&mut self.states, src.to_usize(), dst.to_usize());
182	dst.matches.extend_from_slice(&src.matches);
183	}
184
185	fn copy_empty_matches(&mut self, dst: S) {
186	let start_id = self.start_id;
187	self.copy_matches(start_id, dst);
188	}
189
190	fn add_dense_state(&mut self, depth: usize) -> Result<S> {
191	let trans = Transitions::Dense(Dense::new());
192	let id = usize_to_state_id(self.states.len())?;
193	self.states.push(State {
194	trans,
195	// Anchored automatons do not have any failure transitions.
196	fail: if self.anchored { dead_id() } else { self.start_id },
197	depth,
198	matches: vec![],
199	});
200	Ok(id)
201	}
202
203	fn add_sparse_state(&mut self, depth: usize) -> Result<S> {
204	let trans = Transitions::Sparse(vec![]);
205	let id = usize_to_state_id(self.states.len())?;
206	self.states.push(State {
207	trans,
208	// Anchored automatons do not have any failure transitions.
209	fail: if self.anchored { dead_id() } else { self.start_id },
210	depth,
211	matches: vec![],
212	});
213	Ok(id)
214	}
215	}
216
217	impl<S: StateID> Automaton for NFA<S> {
218	type ID = S;
219
220	fn match_kind(&self) -> &MatchKind {
221	&self.match_kind
222	}
223
224	fn anchored(&self) -> bool {
225	self.anchored
226	}
227
228	fn prefilter(&self) -> Option<&dyn Prefilter> {
229	self.prefilter.as_ref().map(\|p\| p.as_ref())
230	}
231
232	fn start_state(&self) -> S {
233	self.start_id
234	}
235
236	fn is_valid(&self, id: S) -> bool {
237	id.to_usize() < self.states.len()
238	}
239
240	fn is_match_state(&self, id: S) -> bool {
241	self.states[id.to_usize()].is_match()
242	}
243
244	fn get_match(
245	&self,
246	id: S,
247	match_index: usize,
248	end: usize,
249	) -> Option<Match> {
250	let state = match self.states.get(id.to_usize()) {
251	None => return None,
252	Some(state) => state,
253	};
254	state.matches.get(match_index).map(\|&(id, len)\| Match {
255	pattern: id,
256	len,
257	end,
258	})
259	}
260
261	fn match_count(&self, id: S) -> usize {
262	self.states[id.to_usize()].matches.len()
263	}
264
265	fn next_state(&self, mut current: S, input: u8) -> S {
266	// This terminates since:
267	//
268	// 1. `State.fail` never points to fail_id().
269	// 2. All `State.fail` values point to a state closer to `start`.
270	// 3. The start state has no transitions to fail_id().
271	loop {
272	let state = &self.states[current.to_usize()];
273	let next = state.next_state(input);
274	if next != fail_id() {
275	return next;
276	}
277	current = state.fail;
278	}
279	}
280	}
281
282	/// A representation of an NFA state for an Aho-Corasick automaton.
283	///
284	/// It contains the transitions to the next state, a failure transition for
285	/// cases where there exists no other transition for the current input byte,
286	/// the matches implied by visiting this state (if any) and the depth of this
287	/// state. The depth of a state is simply the distance from it to the start
288	/// state in the automaton, where the depth of the start state is 0.
289	#[derive(Clone, Debug)]
290	pub struct State<S> {
291	trans: Transitions<S>,
292	fail: S,
293	matches: Vec<(PatternID, PatternLength)>,
294	// TODO: Strictly speaking, this isn't needed for searching. It's only
295	// used when building an NFA that supports leftmost match semantics. We
296	// could drop this from the state and dynamically build a map only when
297	// computing failure transitions, but it's not clear which is better.
298	// Benchmark this.
299	depth: usize,
300	}
301
302	impl<S: StateID> State<S> {
303	fn heap_bytes(&self) -> usize {
304	self.trans.heap_bytes()
305	+ (self.matches.len() * size_of::<(PatternID, PatternLength)>())
306	}
307
308	fn add_match(&mut self, i: PatternID, len: PatternLength) {
309	self.matches.push((i, len));
310	}
311
312	fn is_match(&self) -> bool {
313	!self.matches.is_empty()
314	}
315
316	fn next_state(&self, input: u8) -> S {
317	self.trans.next_state(input)
318	}
319
320	fn set_next_state(&mut self, input: u8, next: S) {
321	self.trans.set_next_state(input, next);
322	}
323	}
324
325	/// Represents the transitions for a single dense state.
326	///
327	/// The primary purpose here is to encapsulate index access. Namely, since a
328	/// dense representation always contains 256 elements, all values of `u8` are
329	/// valid indices.
330	#[derive(Clone, Debug)]
331	struct Dense<S>(Vec<S>);
332
333	impl<S> Dense<S>
334	where
335	S: StateID,
336	{
337	fn new() -> Self {
338	Dense(vec![fail_id(); `256`])
339	}
340
341	#[inline]
342	fn len(&self) -> usize {
343	self.0.len()
344	}
345	}
346
347	impl<S> Index<u8> for Dense<S> {
348	type Output = S;
349
350	#[inline]
351	fn index(&self, i: u8) -> &S {
352	// SAFETY: This is safe because all dense transitions have
353	// exactly 256 elements, so all u8 values are valid indices.
354	&self.0[i as usize]
355	}
356	}
357
358	impl<S> IndexMut<u8> for Dense<S> {
359	#[inline]
360	fn index_mut(&mut self, i: u8) -> &mut S {
361	// SAFETY: This is safe because all dense transitions have
362	// exactly 256 elements, so all u8 values are valid indices.
363	&mut self.0[i as usize]
364	}
365	}
366
367	/// A representation of transitions in an NFA.
368	///
369	/// Transitions have either a sparse representation, which is slower for
370	/// lookups but uses less memory, or a dense representation, which is faster
371	/// for lookups but uses more memory. In the sparse representation, the absence
372	/// of a state implies a transition to `fail_id()`. Transitions to `dead_id()`
373	/// are still explicitly represented.
374	///
375	/// For the NFA, by default, we use a dense representation for transitions for
376	/// states close to the start state because it's likely these are the states
377	/// that will be most frequently visited.
378	#[derive(Clone, Debug)]
379	enum Transitions<S> {
380	Sparse(Vec<(u8, S)>),
381	Dense(Dense<S>),
382	}
383
384	impl<S: StateID> Transitions<S> {
385	fn heap_bytes(&self) -> usize {
386	match *self {
387	Transitions::Sparse(ref sparse) => {
388	sparse.len() * size_of::<(u8, S)>()
389	}
390	Transitions::Dense(ref dense) => dense.len() * size_of::<S>(),
391	}
392	}
393
394	fn next_state(&self, input: u8) -> S {
395	match *self {
396	Transitions::Sparse(ref sparse) => {
397	for &(b, id) in sparse {
398	if b == input {
399	return id;
400	}
401	}
402	fail_id()
403	}
404	Transitions::Dense(ref dense) => dense[input],
405	}
406	}
407
408	fn set_next_state(&mut self, input: u8, next: S) {
409	match *self {
410	Transitions::Sparse(ref mut sparse) => {
411	match sparse.binary_search_by_key(&input, \|&(b, _)\| b) {
412	Ok(i) => sparse[i] = (input, next),
413	Err(i) => sparse.insert(i, (input, next)),
414	}
415	}
416	Transitions::Dense(ref mut dense) => {
417	dense[input] = next;
418	}
419	}
420	}
421
422	/// Iterate over transitions in this state while skipping over transitions
423	/// to `fail_id()`.
424	fn iter<F: FnMut(u8, S)>(&self, mut f: F) {
425	match *self {
426	Transitions::Sparse(ref sparse) => {
427	for &(b, id) in sparse {
428	f(b, id);
429	}
430	}
431	Transitions::Dense(ref dense) => {
432	for b in AllBytesIter::new() {
433	let id = dense[b];
434	if id != fail_id() {
435	f(b, id);
436	}
437	}
438	}
439	}
440	}
441
442	/// Iterate over all transitions in this state according to the given
443	/// equivalence classes, including transitions to `fail_id()`.
444	fn iter_all<F: FnMut(u8, S)>(&self, classes: &ByteClasses, mut f: F) {
445	if classes.is_singleton() {
446	match *self {
447	Transitions::Sparse(ref sparse) => {
448	sparse_iter(sparse, f);
449	}
450	Transitions::Dense(ref dense) => {
451	for b in AllBytesIter::new() {
452	f(b, dense[b]);
453	}
454	}
455	}
456	} else {
457	// In this case, we only want to yield a single byte for each
458	// equivalence class.
459	match *self {
460	Transitions::Sparse(ref sparse) => {
461	let mut last_class = None;
462	sparse_iter(sparse, \|b, next\| {
463	let class = classes.get(b);
464	if last_class != Some(class) {
465	last_class = Some(class);
466	f(b, next);
467	}
468	})
469	}
470	Transitions::Dense(ref dense) => {
471	for b in classes.representatives() {
472	f(b, dense[b]);
473	}
474	}
475	}
476	}
477	}
478	}
479
480	/// Iterator over transitions in a state, skipping transitions to `fail_id()`.
481	///
482	/// This abstracts over the representation of NFA transitions, which may be
483	/// either in a sparse or dense representation.
484	///
485	/// This somewhat idiosyncratically borrows the NFA mutably, so that when one
486	/// is iterating over transitions, the caller can still mutate the NFA. This
487	/// is useful when creating failure transitions.
488	#[derive(Debug)]
489	struct IterTransitionsMut<'a, S: StateID> {
490	nfa: &'a mut NFA<S>,
491	state_id: S,
492	cur: usize,
493	}
494
495	impl<'a, S: StateID> IterTransitionsMut<'a, S> {
496	fn new(nfa: &'a mut NFA<S>, state_id: S) -> IterTransitionsMut<'a, S> {
497	IterTransitionsMut { nfa, state_id, cur: `0` }
498	}
499
500	fn nfa(&mut self) -> &mut NFA<S> {
501	self.nfa
502	}
503	}
504
505	impl<'a, S: StateID> Iterator for IterTransitionsMut<'a, S> {
506	type Item = (u8, S);
507
508	fn next(&mut self) -> Option<(u8, S)> {
509	match self.nfa.states[self.state_id.to_usize()].trans {
510	Transitions::Sparse(ref sparse) => {
511	if self.cur >= sparse.len() {
512	return None;
513	}
514	let i = self.cur;
515	self.cur += `1`;
516	Some(sparse[i])
517	}
518	Transitions::Dense(ref dense) => {
519	while self.cur < dense.len() {
520	// There are always exactly 255 transitions in dense repr.
521	debug_assert!(self.cur < `256`);
522
523	let b = self.cur as u8;
524	let id = dense[b];
525	self.cur += `1`;
526	if id != fail_id() {
527	return Some((b, id));
528	}
529	}
530	None
531	}
532	}
533	}
534	}
535
536	/// A simple builder for configuring the NFA construction of Aho-Corasick.
537	#[derive(Clone, Debug)]
538	pub struct Builder {
539	dense_depth: usize,
540	match_kind: MatchKind,
541	prefilter: bool,
542	anchored: bool,
543	ascii_case_insensitive: bool,
544	}
545
546	impl Default for Builder {
547	fn default() -> Builder {
548	Builder {
549	dense_depth: `2`,
550	match_kind: MatchKind::default(),
551	prefilter: `true`,
552	anchored: `false`,
553	ascii_case_insensitive: `false`,
554	}
555	}
556	}
557
558	impl Builder {
559	pub fn new() -> Builder {
560	Builder::default()
561	}
562
563	pub fn build<I, P, S: StateID>(&self, patterns: I) -> Result<NFA<S>>
564	where
565	I: IntoIterator<Item = P>,
566	P: AsRef<[u8]>,
567	{
568	Compiler::new(self)?.compile(patterns)
569	}
570
571	pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
572	self.match_kind = kind;
573	self
574	}
575
576	pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
577	self.dense_depth = depth;
578	self
579	}
580
581	pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
582	self.prefilter = yes;
583	self
584	}
585
586	pub fn anchored(&mut self, yes: bool) -> &mut Builder {
587	self.anchored = yes;
588	self
589	}
590
591	pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
592	self.ascii_case_insensitive = yes;
593	self
594	}
595	}
596
597	/// A compiler uses a builder configuration and builds up the NFA formulation
598	/// of an Aho-Corasick automaton. This roughly corresponds to the standard
599	/// formulation described in textbooks.
600	#[derive(Debug)]
601	struct Compiler<'a, S: StateID> {
602	builder: &'a Builder,
603	prefilter: prefilter::Builder,
604	nfa: NFA<S>,
605	byte_classes: ByteClassBuilder,
606	}
607
608	impl<'a, S: StateID> Compiler<'a, S> {
609	fn new(builder: &'a Builder) -> Result<Compiler<'a, S>> {
610	Ok(Compiler {
611	builder,
612	prefilter: prefilter::Builder::new(builder.match_kind)
613	.ascii_case_insensitive(builder.ascii_case_insensitive),
614	nfa: NFA {
615	match_kind: builder.match_kind,
616	start_id: usize_to_state_id(`2`)?,
617	max_pattern_len: `0`,
618	pattern_count: `0`,
619	heap_bytes: `0`,
620	prefilter: None,
621	anchored: builder.anchored,
622	byte_classes: ByteClasses::singletons(),
623	states: vec![],
624	},
625	byte_classes: ByteClassBuilder::new(),
626	})
627	}
628
629	fn compile<I, P>(mut self, patterns: I) -> Result<NFA<S>>
630	where
631	I: IntoIterator<Item = P>,
632	P: AsRef<[u8]>,
633	{
634	self.add_state(`0`)?; // the fail state, which is never entered
635	self.add_state(`0`)?; // the dead state, only used for leftmost
636	self.add_state(`0`)?; // the start state
637	self.build_trie(patterns)?;
638	self.add_start_state_loop();
639	self.add_dead_state_loop();
640	if !self.builder.anchored {
641	self.fill_failure_transitions();
642	}
643	self.close_start_state_loop();
644	self.nfa.byte_classes = self.byte_classes.build();
645	if !self.builder.anchored {
646	self.nfa.prefilter = self.prefilter.build();
647	}
648	self.calculate_size();
649	Ok(self.nfa)
650	}
651
652	/// This sets up the initial prefix trie that makes up the Aho-Corasick
653	/// automaton. Effectively, it creates the basic structure of the
654	/// automaton, where every pattern given has a path from the start state to
655	/// the end of the pattern.
656	fn build_trie<I, P>(&mut self, patterns: I) -> Result<()>
657	where
658	I: IntoIterator<Item = P>,
659	P: AsRef<[u8]>,
660	{
661	'PATTERNS: for (pati, pat) in patterns.into_iter().enumerate() {
662	let pat = pat.as_ref();
663	self.nfa.max_pattern_len =
664	cmp::max(self.nfa.max_pattern_len, pat.len());
665	self.nfa.pattern_count += `1`;
666
667	let mut prev = self.nfa.start_id;
668	let mut saw_match = `false`;
669	for (depth, &b) in pat.iter().enumerate() {
670	// When leftmost-first match semantics are requested, we
671	// specifically stop adding patterns when a previously added
672	// pattern is a prefix of it. We avoid adding it because
673	// leftmost-first semantics imply that the pattern can never
674	// match. This is not just an optimization to save space! It
675	// is necessary for correctness. In fact, this is the only
676	// difference in the automaton between the implementations for
677	// leftmost-first and leftmost-longest.
678	saw_match = saw_match \|\| self.nfa.state(prev).is_match();
679	if self.builder.match_kind.is_leftmost_first() && saw_match {
680	// Skip to the next pattern immediately. This avoids
681	// incorrectly adding a match after this loop terminates.
682	continue 'PATTERNS;
683	}
684
685	// Add this byte to our equivalence classes. We don't use these
686	// for NFA construction. These are instead used only if we're
687	// building a DFA. They would technically be useful for the
688	// NFA, but it would require a second pass over the patterns.
689	self.byte_classes.set_range(b, b);
690	if self.builder.ascii_case_insensitive {
691	let b = opposite_ascii_case(b);
692	self.byte_classes.set_range(b, b);
693	}
694
695	// If the transition from prev using the current byte already
696	// exists, then just move through it. Otherwise, add a new
697	// state. We track the depth here so that we can determine
698	// how to represent transitions. States near the start state
699	// use a dense representation that uses more memory but is
700	// faster. Other states use a sparse representation that uses
701	// less memory but is slower.
702	let next = self.nfa.state(prev).next_state(b);
703	if next != fail_id() {
704	prev = next;
705	} else {
706	let next = self.add_state(depth + `1`)?;
707	self.nfa.state_mut(prev).set_next_state(b, next);
708	if self.builder.ascii_case_insensitive {
709	let b = opposite_ascii_case(b);
710	self.nfa.state_mut(prev).set_next_state(b, next);
711	}
712	prev = next;
713	}
714	}
715	// Once the pattern has been added, log the match in the final
716	// state that it reached.
717	self.nfa.state_mut(prev).add_match(pati, pat.len());
718	// ... and hand it to the prefilter builder, if applicable.
719	if self.builder.prefilter {
720	self.prefilter.add(pat);
721	}
722	}
723	Ok(())
724	}
725
726	/// This routine creates failure transitions according to the standard
727	/// textbook formulation of the Aho-Corasick algorithm, with a couple small
728	/// tweaks to support "leftmost" semantics.
729	///
730	/// Building failure transitions is the most interesting part of building
731	/// the Aho-Corasick automaton, because they are what allow searches to
732	/// be performed in linear time. Specifically, a failure transition is
733	/// a single transition associated with each state that points back to
734	/// the longest proper suffix of the pattern being searched. The failure
735	/// transition is followed whenever there exists no transition on the
736	/// current state for the current input byte. If there is no other proper
737	/// suffix, then the failure transition points back to the starting state.
738	///
739	/// For example, let's say we built an Aho-Corasick automaton with the
740	/// following patterns: 'abcd' and 'cef'. The trie looks like this:
741	///
742	/// ```ignore
743	/// a - S1 - b - S2 - c - S3 - d - S4*
744	/// /
745	/// S0 - c - S5 - e - S6 - f - S7*
746	/// ```
747	///
748	/// At this point, it should be fairly straight-forward to see how this
749	/// trie can be used in a simplistic way. At any given position in the
750	/// text we're searching (called the "subject" string), all we need to do
751	/// is follow the transitions in the trie by consuming one transition for
752	/// each byte in the subject string. If we reach a match state, then we can
753	/// report that location as a match.
754	///
755	/// The trick comes when searching a subject string like 'abcef'. We'll
756	/// initially follow the transition from S0 to S1 and wind up in S3 after
757	/// observng the 'c' byte. At this point, the next byte is 'e' but state
758	/// S3 has no transition for 'e', so the search fails. We then would need
759	/// to restart the search at the next position in 'abcef', which
760	/// corresponds to 'b'. The match would fail, but the next search starting
761	/// at 'c' would finally succeed. The problem with this approach is that
762	/// we wind up searching the subject string potentially many times. In
763	/// effect, this makes the algorithm have worst case `O(n m)` complexity,*
764	/// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
765	/// like to achieve a `O(n + m)` worst case complexity.
766	///
767	/// This is where failure transitions come in. Instead of dying at S3 in
768	/// the first search, the automaton can instruct the search to move to
769	/// another part of the automaton that corresponds to a suffix of what
770	/// we've seen so far. Recall that we've seen 'abc' in the subject string,
771	/// and the automaton does indeed have a non-empty suffix, 'c', that could
772	/// potentially lead to another match. Thus, the actual Aho-Corasick
773	/// automaton for our patterns in this case looks like this:
774	///
775	/// ```ignore
776	/// a - S1 - b - S2 - c - S3 - d - S4*
777	/// / /
778	/// / ----------------
779	/// / /
780	/// S0 - c - S5 - e - S6 - f - S7*
781	/// ```
782	///
783	/// That is, we have a failure transition from S3 to S5, which is followed
784	/// exactly in cases when we are in state S3 but see any byte other than
785	/// 'd' (that is, we've "failed" to find a match in this portion of our
786	/// trie). We know we can transition back to S5 because we've already seen
787	/// a 'c' byte, so we don't need to re-scan it. We can then pick back up
788	/// with the search starting at S5 and complete our match.
789	///
790	/// Adding failure transitions to a trie is fairly simple, but subtle. The
791	/// key issue is that you might have multiple failure transition that you
792	/// need to follow. For example, look at the trie for the patterns
793	/// 'abcd', 'b', 'bcd' and 'cd':
794	///
795	/// ```ignore
796	/// - a - S1 - b - S2* - c - S3 - d - S4*
797	/// / / /
798	/// / ------- -------
799	/// / / /
800	/// S0 --- b - S5* - c - S6 - d - S7*
801	/// \ /
802	/// \ --------
803	/// \ /
804	/// - c - S8 - d - S9*
805	/// ```
806	///
807	/// The failure transitions for this trie are defined from S2 to S5,
808	/// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
809	/// corresponds to a match, since its failure transition to S5 is itself
810	/// a match state.
811	///
812	/// Perhaps simplest way to think about adding these failure transitions
813	/// is recursively. That is, if you know the failure transitions for every
814	/// possible previous state that could be visited (e.g., when computing the
815	/// failure transition for S3, you already know the failure transitions
816	/// for S0, S1 and S2), then you can simply follow the failure transition
817	/// of the previous state and check whether the incoming transition is
818	/// defined after following the failure transition.
819	///
820	/// For example, when determining the failure state for S3, by our
821	/// assumptions, we already know that there is a failure transition from
822	/// S2 (the previous state) to S5. So we follow that transition and check
823	/// whether the transition connecting S2 to S3 is defined. Indeed, it is,
824	/// as there is a transition from S5 to S6 for the byte 'c'. If no such
825	/// transition existed, we could keep following the failure transitions
826	/// until we reach the start state, which is the failure transition for
827	/// every state that has no corresponding proper suffix.
828	///
829	/// We don't actually use recursion to implement this, but instead, use a
830	/// breadth first search of the automaton. Our base case is the start
831	/// state, whose failure transition is just a transition to itself.
832	///
833	/// When building a leftmost automaton, we proceed as above, but only
834	/// include a subset of failure transitions. Namely, we omit any failure
835	/// transitions that appear after a match state in the trie. This is
836	/// because failure transitions always point back to a proper suffix of
837	/// what has been seen so far. Thus, following a failure transition after
838	/// a match implies looking for a match that starts after the one that has
839	/// already been seen, which is of course therefore not the leftmost match.
840	///
841	/// N.B. I came up with this algorithm on my own, and after scouring all of
842	/// the other AC implementations I know of (Perl, Snort, many on GitHub).
843	/// I couldn't find any that implement leftmost semantics like this.
844	/// Perl of course needs leftmost-first semantics, but they implement it
845	/// with a seeming hack at search* time instead of encoding it into the*
846	/// automaton. There are also a couple Java libraries that support leftmost
847	/// longest semantics, but they do it by building a queue of matches at
848	/// search time, which is even worse than what Perl is doing. ---AG
849	fn fill_failure_transitions(&mut self) {
850	let kind = self.match_kind();
851	// Initialize the queue for breadth first search with all transitions
852	// out of the start state. We handle the start state specially because
853	// we only want to follow non-self transitions. If we followed self
854	// transitions, then this would never terminate.
855	let mut queue = VecDeque::new();
856	let mut seen = self.queued_set();
857	let mut it = self.nfa.iter_transitions_mut(self.nfa.start_id);
858	while let Some((_, next)) = it.next() {
859	// Skip anything we've seen before and any self-transitions on the
860	// start state.
861	if next == it.nfa().start_id \|\| seen.contains(next) {
862	continue;
863	}
864	queue.push_back(next);
865	seen.insert(next);
866	// Under leftmost semantics, if a state immediately following
867	// the start state is a match state, then we never want to
868	// follow its failure transition since the failure transition
869	// necessarily leads back to the start state, which we never
870	// want to do for leftmost matching after a match has been
871	// found.
872	//
873	// We apply the same logic to non-start states below as well.
874	if kind.is_leftmost() && it.nfa().state(next).is_match() {
875	it.nfa().state_mut(next).fail = dead_id();
876	}
877	}
878	while let Some(id) = queue.pop_front() {
879	let mut it = self.nfa.iter_transitions_mut(id);
880	while let Some((b, next)) = it.next() {
881	if seen.contains(next) {
882	// The only way to visit a duplicate state in a transition
883	// list is when ASCII case insensitivity is enabled. In
884	// this case, we want to skip it since it's redundant work.
885	// But it would also end up duplicating matches, which
886	// results in reporting duplicate matches in some cases.
887	// See the 'acasei010' regression test.
888	continue;
889	}
890	queue.push_back(next);
891	seen.insert(next);
892
893	// As above for start states, under leftmost semantics, once
894	// we see a match all subsequent states should have no failure
895	// transitions because failure transitions always imply looking
896	// for a match that is a suffix of what has been seen so far
897	// (where "seen so far" corresponds to the string formed by
898	// following the transitions from the start state to the
899	// current state). Under leftmost semantics, we specifically do
900	// not want to allow this to happen because we always want to
901	// report the match found at the leftmost position.
902	//
903	// The difference between leftmost-first and leftmost-longest
904	// occurs previously while we build the trie. For
905	// leftmost-first, we simply omit any entries that would
906	// otherwise require passing through a match state.
907	//
908	// Note that for correctness, the failure transition has to be
909	// set to the dead state for ALL states following a match, not
910	// just the match state itself. However, by setting the failure
911	// transition to the dead state on all match states, the dead
912	// state will automatically propagate to all subsequent states
913	// via the failure state computation below.
914	if kind.is_leftmost() && it.nfa().state(next).is_match() {
915	it.nfa().state_mut(next).fail = dead_id();
916	continue;
917	}
918	let mut fail = it.nfa().state(id).fail;
919	while it.nfa().state(fail).next_state(b) == fail_id() {
920	fail = it.nfa().state(fail).fail;
921	}
922	fail = it.nfa().state(fail).next_state(b);
923	it.nfa().state_mut(next).fail = fail;
924	it.nfa().copy_matches(fail, next);
925	}
926	// If the start state is a match state, then this automaton can
927	// match the empty string. This implies all states are match states
928	// since every position matches the empty string, so copy the
929	// matches from the start state to every state. Strictly speaking,
930	// this is only necessary for overlapping matches since each
931	// non-empty non-start match state needs to report empty matches
932	// in addition to its own. For the non-overlapping case, such
933	// states only report the first match, which is never empty since
934	// it isn't a start state.
935	if !kind.is_leftmost() {
936	it.nfa().copy_empty_matches(id);
937	}
938	}
939	}
940
941	/// Returns a set that tracked queued states.
942	///
943	/// This is only necessary when ASCII case insensitivity is enabled, since
944	/// it is the only way to visit the same state twice. Otherwise, this
945	/// returns an inert set that nevers adds anything and always reports
946	/// `false` for every member test.
947	fn queued_set(&self) -> QueuedSet<S> {
948	if self.builder.ascii_case_insensitive {
949	QueuedSet::active()
950	} else {
951	QueuedSet::inert()
952	}
953	}
954
955	/// Set the failure transitions on the start state to loop back to the
956	/// start state. This effectively permits the Aho-Corasick automaton to
957	/// match at any position. This is also required for finding the next
958	/// state to terminate, namely, finding the next state should never return
959	/// a fail_id.
960	///
961	/// This must be done after building the initial trie, since trie
962	/// construction depends on transitions to `fail_id` to determine whether a
963	/// state already exists or not.
964	fn add_start_state_loop(&mut self) {
965	let start_id = self.nfa.start_id;
966	let start = self.nfa.start_mut();
967	for b in AllBytesIter::new() {
968	if start.next_state(b) == fail_id() {
969	start.set_next_state(b, start_id);
970	}
971	}
972	}
973
974	/// Remove the start state loop by rewriting any transitions on the start
975	/// state back to the start state with transitions to the dead state.
976	///
977	/// The loop is only closed when two conditions are met: the start state
978	/// is a match state and the match kind is leftmost-first or
979	/// leftmost-longest. (Alternatively, if this is an anchored automaton,
980	/// then the start state is always closed, regardless of aforementioned
981	/// conditions.)
982	///
983	/// The reason for this is that under leftmost semantics, a start state
984	/// that is also a match implies that we should never restart the search
985	/// process. We allow normal transitions out of the start state, but if
986	/// none exist, we transition to the dead state, which signals that
987	/// searching should stop.
988	fn close_start_state_loop(&mut self) {
989	if self.builder.anchored
990	\|\| (self.match_kind().is_leftmost() && self.nfa.start().is_match())
991	{
992	let start_id = self.nfa.start_id;
993	let start = self.nfa.start_mut();
994	for b in AllBytesIter::new() {
995	if start.next_state(b) == start_id {
996	start.set_next_state(b, dead_id());
997	}
998	}
999	}
1000	}
1001
1002	/// Sets all transitions on the dead state to point back to the dead state.
1003	/// Normally, missing transitions map back to the failure state, but the
1004	/// point of the dead state is to act as a sink that can never be escaped.
1005	fn add_dead_state_loop(&mut self) {
1006	let dead = self.nfa.state_mut(dead_id());
1007	for b in AllBytesIter::new() {
1008	dead.set_next_state(b, dead_id());
1009	}
1010	}
1011
1012	/// Computes the total amount of heap used by this NFA in bytes.
1013	fn calculate_size(&mut self) {
1014	let mut size = `0`;
1015	for state in &self.nfa.states {
1016	size += size_of::<State<S>>() + state.heap_bytes();
1017	}
1018	self.nfa.heap_bytes = size;
1019	}
1020
1021	/// Add a new state to the underlying NFA with the given depth. The depth
1022	/// is used to determine how to represent the transitions.
1023	///
1024	/// If adding the new state would overflow the chosen state ID
1025	/// representation, then this returns an error.
1026	fn add_state(&mut self, depth: usize) -> Result<S> {
1027	if depth < self.builder.dense_depth {
1028	self.nfa.add_dense_state(depth)
1029	} else {
1030	self.nfa.add_sparse_state(depth)
1031	}
1032	}
1033
1034	/// Returns the match kind configured on the underlying builder.
1035	fn match_kind(&self) -> MatchKind {
1036	self.builder.match_kind
1037	}
1038	}
1039
1040	/// A set of state identifiers used to avoid revisiting the same state multiple
1041	/// times when filling in failure transitions.
1042	///
1043	/// This set has an "inert" and an "active" mode. When inert, the set never
1044	/// stores anything and always returns `false` for every member test. This is
1045	/// useful to avoid the performance and memory overhead of maintaining this
1046	/// set when it is not needed.
1047	#[derive(Debug)]
1048	struct QueuedSet<S> {
1049	set: Option<BTreeSet<S>>,
1050	}
1051
1052	impl<S: StateID> QueuedSet<S> {
1053	/// Return an inert set that returns `false` for every state ID membership
1054	/// test.
1055	fn inert() -> QueuedSet<S> {
1056	QueuedSet { set: None }
1057	}
1058
1059	/// Return an active set that tracks state ID membership.
1060	fn active() -> QueuedSet<S> {
1061	QueuedSet { set: Some(BTreeSet::new()) }
1062	}
1063
1064	/// Inserts the given state ID into this set. (If the set is inert, then
1065	/// this is a no-op.)
1066	fn insert(&mut self, state_id: S) {
1067	if let Some(ref mut set) = self.set {
1068	set.insert(state_id);
1069	}
1070	}
1071
1072	/// Returns true if and only if the given state ID is in this set. If the
1073	/// set is inert, this always returns false.
1074	fn contains(&self, state_id: S) -> bool {
1075	match self.set {
1076	None => `false`,
1077	Some(ref set) => set.contains(&state_id),
1078	}
1079	}
1080	}
1081
1082	/// An iterator over every byte value.
1083	///
1084	/// We use this instead of (0..256).map(\|b\| b as u8) because this optimizes
1085	/// better in debug builds.
1086	///
1087	/// We also use this instead of 0..=255 because we're targeting Rust 1.24 and
1088	/// inclusive range syntax was stabilized in Rust 1.26. We can get rid of this
1089	/// once our MSRV is Rust 1.26 or newer.
1090	#[derive(Debug)]
1091	struct AllBytesIter(u16);
1092
1093	impl AllBytesIter {
1094	fn new() -> AllBytesIter {
1095	AllBytesIter(`0`)
1096	}
1097	}
1098
1099	impl Iterator for AllBytesIter {
1100	type Item = u8;
1101
1102	fn next(&mut self) -> Option<Self::Item> {
1103	if self.0 >= `256` {
1104	None
1105	} else {
1106	let b: u8 = self.0 as u8;
1107	self.0 += `1`;
1108	Some(b)
1109	}
1110	}
1111	}
1112
1113	impl<S: StateID> fmt::Debug for NFA<S> {
1114	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1115	writeln!(f, "NFA(")?;
1116	writeln!(f, "match_kind: {:?}", self.match_kind)?;
1117	writeln!(f, "prefilter: {:?}", self.prefilter)?;
1118	writeln!(f, "{}", "-".repeat(`79`))?;
1119	for (id, s) in self.states.iter().enumerate() {
1120	let mut trans = vec![];
1121	s.trans.iter(\|byte, next\| {
1122	// The start state has a bunch of uninteresting transitions
1123	// back into itself. It's questionable to hide them since they
1124	// are critical to understanding the automaton, but they are
1125	// very noisy without better formatting for contiugous ranges
1126	// to the same state.
1127	if id == self.start_id.to_usize() && next == self.start_id {
1128	return;
1129	}
1130	// Similarly, the dead state has a bunch of uninteresting
1131	// transitions too.
1132	if id == dead_id() {
1133	return;
1134	}
1135	trans.push(format!("{} => {}", escape(byte), next.to_usize()));
1136	});
1137	writeln!(f, "{:`04`}: {}", id, trans.join(", "))?;
1138
1139	let matches: Vec<String> = s
1140	.matches
1141	.iter()
1142	.map(\|&(pattern_id, _)\| pattern_id.to_string())
1143	.collect();
1144	writeln!(f, " matches: {}", matches.join(", "))?;
1145	writeln!(f, " fail: {}", s.fail.to_usize())?;
1146	writeln!(f, " depth: {}", s.depth)?;
1147	}
1148	writeln!(f, "{}", "-".repeat(`79`))?;
1149	writeln!(f, ")")?;
1150	Ok(())
1151	}
1152	}
1153
1154	/// Iterate over all possible byte transitions given a sparse set.
1155	fn sparse_iter<S: StateID, F: FnMut(u8, S)>(trans: &[(u8, S)], mut f: F) {
1156	let mut byte: u16 = `0u16`;
1157	for &(b: u8, id: S) in trans {
1158	while byte < (b as u16) {
1159	f(byte as u8, fail_id());
1160	byte += `1`;
1161	}
1162	f(b, id);
1163	byte += `1`;
1164	}
1165	for b: u16 in byte..`256` {
1166	f(b as u8, fail_id());
1167	}
1168	}
1169
1170	/// Safely return two mutable borrows to two different locations in the given
1171	/// slice.
1172	///
1173	/// This panics if i == j.
1174	fn get_two_mut<T>(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) {
1175	assert!(i != j, "{} must not be equal to {}", i, j);
1176	if i < j {
1177	let (before: &mut [T], after: &mut [T]) = xs.split_at_mut(mid:j);
1178	(&mut before[i], &mut after[`0`])
1179	} else {
1180	let (before: &mut [T], after: &mut [T]) = xs.split_at_mut(mid:i);
1181	(&mut after[`0`], &mut before[j])
1182	}
1183	}
1184
1185	/// Return the given byte as its escaped string form.
1186	fn escape(b: u8) -> String {
1187	use std::ascii;
1188
1189	String::from_utf8(vec:ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
1190	}
1191
1192	#[cfg(test)]
1193	mod tests {
1194	use super::*;
1195
1196	#[test]
1197	fn scratch() {
1198	let nfa: NFA<usize> = Builder::new()
1199	.dense_depth(`0`)
1200	// .match_kind(MatchKind::LeftmostShortest)
1201	// .match_kind(MatchKind::LeftmostLongest)
1202	.match_kind(MatchKind::LeftmostFirst)
1203	// .build(&["abcd", "ce", "b"])
1204	// .build(&["ab", "bc"])
1205	// .build(&["b", "bcd", "ce"])
1206	// .build(&["abc", "bx"])
1207	// .build(&["abc", "bd", "ab"])
1208	// .build(&["abcdefghi", "hz", "abcdefgh"])
1209	// .build(&["abcd", "bce", "b"])
1210	.build(&["abcdefg", "bcde", "bcdef"])
1211	.unwrap();
1212	println!("{:?}", nfa);
1213	}
1214	}
1215