1 | /*! |
2 | Provides a noncontiguous NFA implementation of Aho-Corasick. |
3 | |
4 | This is a low-level API that generally only needs to be used in niche |
5 | circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) |
6 | instead of a noncontiguous NFA directly. Using an `NFA` directly is typically |
7 | only necessary when one needs access to the [`Automaton`] trait implementation. |
8 | */ |
9 | |
10 | use alloc::{ |
11 | collections::{BTreeSet, VecDeque}, |
12 | vec, |
13 | vec::Vec, |
14 | }; |
15 | |
16 | use crate::{ |
17 | automaton::Automaton, |
18 | util::{ |
19 | alphabet::{ByteClassSet, ByteClasses}, |
20 | error::{BuildError, MatchError}, |
21 | prefilter::{self, opposite_ascii_case, Prefilter}, |
22 | primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, |
23 | remapper::Remapper, |
24 | search::{Anchored, MatchKind}, |
25 | special::Special, |
26 | }, |
27 | }; |
28 | |
/// A noncontiguous NFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using an `NFA` directly is typically only necessary
/// when one needs access to the [`Automaton`] trait implementation.
///
/// This NFA represents the "core" implementation of Aho-Corasick in this
/// crate. Namely, constructing this NFA involving building a trie and then
/// filling in the failure transitions between states, similar to what is
/// described in any standard textbook description of Aho-Corasick.
///
/// In order to minimize heap usage and to avoid additional construction costs,
/// this implementation represents the transitions of all states as distinct
/// sparse memory allocations. This is where it gets its name from. That is,
/// this NFA has no contiguous memory allocation for its transition table. Each
/// state gets its own allocation.
///
/// While the sparse representation keeps memory usage to somewhat reasonable
/// levels, it is still quite large and also results in somewhat mediocre
/// search performance. For this reason, it is almost always a good idea to
/// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is
/// marginally slower to build, but has higher throughput and can sometimes use
/// an order of magnitude less memory. The main reason to use a noncontiguous
/// NFA is when you need the fastest possible construction time, or when a
/// contiguous NFA does not have the desired capacity. (The total number of NFA
/// states it can have is fewer than a noncontiguous NFA.)
///
/// # Example
///
/// This example shows how to build an `NFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     nfa::noncontiguous::NFA,
///     Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let nfa = NFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     nfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive (Clone)]
pub struct NFA {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State>,
    /// The length, in bytes, of each pattern in this NFA. This slice is
    /// indexed by `PatternID`.
    ///
    /// The number of entries in this vector corresponds to the total number of
    /// patterns in this automaton.
    pattern_lens: Vec<SmallIndex>,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<Prefilter>,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// The length, in bytes, of the shortest pattern in this automaton. This
    /// information is useful for detecting whether an automaton matches the
    /// empty string or not.
    min_pattern_len: usize,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The information required to deduce which states are "special" in this
    /// NFA.
    ///
    /// Since the DEAD and FAIL states are always the first two states and
    /// there are only ever two start states (which follow all of the match
    /// states), it follows that we can determine whether a state is a fail,
    /// dead, match or start with just a few comparisons on the ID itself:
    ///
    ///    is_dead(sid): sid == NFA::DEAD
    ///    is_fail(sid): sid == NFA::FAIL
    ///   is_match(sid): NFA::FAIL < sid && sid <= max_match_id
    ///   is_start(sid): sid == start_unanchored_id || sid == start_anchored_id
    ///
    /// Note that this only applies to the NFA after it has been constructed.
    /// During construction, the start states are the first ones added and the
    /// match states are inter-leaved with non-match states. Once all of the
    /// states have been added, the states are shuffled such that the above
    /// predicates hold.
    special: Special,
    /// The number of bytes of heap used by this sparse NFA. (This does not
    /// include the prefilter's heap usage; that is accounted for separately
    /// in `Automaton::memory_usage`.)
    memory_usage: usize,
}
146 | |
147 | impl NFA { |
148 | /// Create a new Aho-Corasick noncontiguous NFA using the default |
149 | /// configuration. |
150 | /// |
151 | /// Use a [`Builder`] if you want to change the configuration. |
152 | pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError> |
153 | where |
154 | I: IntoIterator<Item = P>, |
155 | P: AsRef<[u8]>, |
156 | { |
157 | NFA::builder().build(patterns) |
158 | } |
159 | |
160 | /// A convenience method for returning a new Aho-Corasick noncontiguous NFA |
161 | /// builder. |
162 | /// |
163 | /// This usually permits one to just import the `NFA` type. |
164 | pub fn builder() -> Builder { |
165 | Builder::new() |
166 | } |
167 | } |
168 | |
169 | impl NFA { |
170 | /// The DEAD state is a sentinel state like the FAIL state. The DEAD state |
171 | /// instructs any search to stop and return any currently recorded match, |
172 | /// or no match otherwise. Generally speaking, it is impossible for an |
173 | /// unanchored standard search to enter a DEAD state. But an anchored |
174 | /// search can, and so to can a leftmost search. |
175 | /// |
176 | /// We put DEAD before FAIL so that DEAD is always 0. We repeat this |
177 | /// decision across the other Aho-Corasicm automata, so that DEAD |
178 | /// states there are always 0 too. It's not that we need all of the |
179 | /// implementations to agree, but rather, the contiguous NFA and the DFA |
180 | /// use a sort of "premultiplied" state identifier where the only state |
181 | /// whose ID is always known and constant is the first state. Subsequent |
182 | /// state IDs depend on how much space has already been used in the |
183 | /// transition table. |
184 | pub(crate) const DEAD: StateID = StateID::new_unchecked(0); |
185 | /// The FAIL state mostly just corresponds to the ID of any transition on a |
186 | /// state that isn't explicitly defined. When one transitions into the FAIL |
187 | /// state, one must follow the previous state's failure transition before |
188 | /// doing the next state lookup. In this way, FAIL is more of a sentinel |
189 | /// than a state that one actually transitions into. In particular, it is |
190 | /// never exposed in the `Automaton` interface. |
191 | pub(crate) const FAIL: StateID = StateID::new_unchecked(1); |
192 | |
193 | /// Returns the equivalence classes of bytes found while constructing |
194 | /// this NFA. |
195 | /// |
196 | /// Note that the NFA doesn't actually make use of these equivalence |
197 | /// classes. Instead, these are useful for building the DFA when desired. |
198 | pub(crate) fn byte_classes(&self) -> &ByteClasses { |
199 | &self.byte_classes |
200 | } |
201 | |
202 | /// Returns a slice containing the length of each pattern in this searcher. |
203 | /// It is indexed by `PatternID` and has length `NFA::patterns_len`. |
204 | /// |
205 | /// This is exposed for convenience when building a contiguous NFA. But it |
206 | /// can be reconstructed from the `Automaton` API if necessary. |
207 | pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] { |
208 | &self.pattern_lens |
209 | } |
210 | |
211 | /// Returns a slice of all states in this non-contiguous NFA. |
212 | pub(crate) fn states(&self) -> &[State] { |
213 | &self.states |
214 | } |
215 | |
216 | /// Returns the underlying "special" state information for this NFA. |
217 | pub(crate) fn special(&self) -> &Special { |
218 | &self.special |
219 | } |
220 | |
221 | /// Swaps the states at `id1` and `id2`. |
222 | /// |
223 | /// This does not update the transitions of any state to account for the |
224 | /// state swap. |
225 | pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) { |
226 | self.states.swap(id1.as_usize(), id2.as_usize()); |
227 | } |
228 | |
229 | /// Re-maps all state IDs in this NFA according to the `map` function |
230 | /// given. |
231 | pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) { |
232 | for state in self.states.iter_mut() { |
233 | state.fail = map(state.fail); |
234 | for (_, ref mut sid) in state.trans.iter_mut() { |
235 | *sid = map(*sid); |
236 | } |
237 | } |
238 | } |
239 | } |
240 | |
241 | // SAFETY: 'start_state' always returns a valid state ID, 'next_state' always |
242 | // returns a valid state ID given a valid state ID. We otherwise claim that |
243 | // all other methods are correct as well. |
244 | unsafe impl Automaton for NFA { |
245 | #[inline (always)] |
246 | fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { |
247 | match anchored { |
248 | Anchored::No => Ok(self.special.start_unanchored_id), |
249 | Anchored::Yes => Ok(self.special.start_anchored_id), |
250 | } |
251 | } |
252 | |
253 | #[inline (always)] |
254 | fn next_state( |
255 | &self, |
256 | anchored: Anchored, |
257 | mut sid: StateID, |
258 | byte: u8, |
259 | ) -> StateID { |
260 | // This terminates since: |
261 | // |
262 | // 1. state.fail never points to the FAIL state. |
263 | // 2. All state.fail values point to a state closer to the start state. |
264 | // 3. The start state has no transitions to the FAIL state. |
265 | loop { |
266 | let state = &self.states[sid]; |
267 | let next = state.next_state(byte); |
268 | if next != NFA::FAIL { |
269 | return next; |
270 | } |
271 | // For an anchored search, we never follow failure transitions |
272 | // because failure transitions lead us down a path to matching |
273 | // a *proper* suffix of the path we were on. Thus, it can only |
274 | // produce matches that appear after the beginning of the search. |
275 | if anchored.is_anchored() { |
276 | return NFA::DEAD; |
277 | } |
278 | sid = state.fail; |
279 | } |
280 | } |
281 | |
282 | #[inline (always)] |
283 | fn is_special(&self, sid: StateID) -> bool { |
284 | sid <= self.special.max_special_id |
285 | } |
286 | |
287 | #[inline (always)] |
288 | fn is_dead(&self, sid: StateID) -> bool { |
289 | sid == NFA::DEAD |
290 | } |
291 | |
292 | #[inline (always)] |
293 | fn is_match(&self, sid: StateID) -> bool { |
294 | // N.B. This returns true when sid==NFA::FAIL but that's okay because |
295 | // NFA::FAIL is not actually a valid state ID from the perspective of |
296 | // the Automaton trait. Namely, it is never returned by 'start_state' |
297 | // or by 'next_state'. So we don't need to care about it here. |
298 | !self.is_dead(sid) && sid <= self.special.max_match_id |
299 | } |
300 | |
301 | #[inline (always)] |
302 | fn is_start(&self, sid: StateID) -> bool { |
303 | sid == self.special.start_unanchored_id |
304 | || sid == self.special.start_anchored_id |
305 | } |
306 | |
307 | #[inline (always)] |
308 | fn match_kind(&self) -> MatchKind { |
309 | self.match_kind |
310 | } |
311 | |
312 | #[inline (always)] |
313 | fn patterns_len(&self) -> usize { |
314 | self.pattern_lens.len() |
315 | } |
316 | |
317 | #[inline (always)] |
318 | fn pattern_len(&self, pid: PatternID) -> usize { |
319 | self.pattern_lens[pid].as_usize() |
320 | } |
321 | |
322 | #[inline (always)] |
323 | fn min_pattern_len(&self) -> usize { |
324 | self.min_pattern_len |
325 | } |
326 | |
327 | #[inline (always)] |
328 | fn max_pattern_len(&self) -> usize { |
329 | self.max_pattern_len |
330 | } |
331 | |
332 | #[inline (always)] |
333 | fn match_len(&self, sid: StateID) -> usize { |
334 | self.states[sid].matches.len() |
335 | } |
336 | |
337 | #[inline (always)] |
338 | fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { |
339 | self.states[sid].matches[index] |
340 | } |
341 | |
342 | #[inline (always)] |
343 | fn memory_usage(&self) -> usize { |
344 | self.memory_usage |
345 | + self.prefilter.as_ref().map_or(0, |p| p.memory_usage()) |
346 | } |
347 | |
348 | #[inline (always)] |
349 | fn prefilter(&self) -> Option<&Prefilter> { |
350 | self.prefilter.as_ref() |
351 | } |
352 | } |
353 | |
/// A representation of a sparse NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte
/// and the matches implied by visiting this state (if any).
#[derive (Clone)]
pub(crate) struct State {
    /// The set of defined transitions for this state sorted by `u8`. In an
    /// unanchored search, if a byte is not in this set of transitions, then
    /// it should transition to `fail`. In an anchored search, it should
    /// transition to the special DEAD state.
    ///
    /// The sorted order is an invariant relied upon by `set_next_state`
    /// (which uses binary search) and by `next_state`'s dense fast path.
    pub(crate) trans: Vec<(u8, StateID)>,
    /// The patterns that match once this state is entered. Note that order
    /// is important in the leftmost case. For example, if one adds 'foo' and
    /// 'foo' (duplicate patterns are not disallowed), then in a leftmost-first
    /// search, only the first 'foo' will ever match.
    pub(crate) matches: Vec<PatternID>,
    /// The state that should be transitioned to if the current byte in the
    /// haystack does not have a corresponding transition defined in this
    /// state.
    pub(crate) fail: StateID,
    /// The depth of this state. Specifically, this is the distance from this
    /// state to the starting state. (For the special sentinel states DEAD and
    /// FAIL, their depth is always 0.) The depth of a starting state is 0.
    ///
    /// Note that depth is currently not used in this non-contiguous NFA. It
    /// may in the future, but it is used in the contiguous NFA. Namely, it
    /// permits an optimization where states near the starting state have their
    /// transitions stored in a dense fashion, but all other states have their
    /// transitions stored in a sparse fashion. (This non-contiguous NFA uses
    /// a sparse representation for all states unconditionally.) In any case,
    /// this is really the only convenient place to compute and store this
    /// information, which we need when building the contiguous NFA.
    pub(crate) depth: SmallIndex,
}
389 | |
390 | impl State { |
391 | /// Return the heap memory used by this state. Note that if `State` is |
392 | /// itself on the heap, then callers need to call this in addition to |
393 | /// `size_of::<State>()` to get the full heap memory used. |
394 | fn memory_usage(&self) -> usize { |
395 | use core::mem::size_of; |
396 | |
397 | (self.trans.len() * size_of::<(u8, StateID)>()) |
398 | + (self.matches.len() * size_of::<PatternID>()) |
399 | } |
400 | |
401 | /// Return true if and only if this state is a match state. |
402 | fn is_match(&self) -> bool { |
403 | !self.matches.is_empty() |
404 | } |
405 | |
406 | /// Return the next state by following the transition for the given byte. |
407 | /// If no transition for the given byte is defined, then the FAIL state ID |
408 | /// is returned. |
409 | #[inline (always)] |
410 | fn next_state(&self, byte: u8) -> StateID { |
411 | // This is a special case that targets the unanchored starting state. |
412 | // By construction, the unanchored starting state is actually a dense |
413 | // state, because every possible transition is defined on it. Any |
414 | // transitions that weren't added as part of initial trie construction |
415 | // get explicitly added as a self-transition back to itself. Thus, we |
416 | // can treat it as if it were dense and do a constant time lookup. |
417 | // |
418 | // This has *massive* benefit when executing searches because the |
419 | // unanchored starting state is by far the hottest state and is |
420 | // frequently visited. Moreover, the 'for' loop below that works |
421 | // decently on an actually sparse state is disastrous on a state that |
422 | // is nearly or completely dense. |
423 | // |
424 | // This optimization also works in general, including for non-starting |
425 | // states that happen to have every transition defined. Namely, it |
426 | // is impossible for 'self.trans' to have duplicate transitions (by |
427 | // construction) and transitions are always in sorted ascending order. |
428 | // So if a state has 256 transitions, it is, by construction, dense and |
429 | // amenable to constant time indexing. |
430 | if self.trans.len() == 256 { |
431 | self.trans[usize::from(byte)].1 |
432 | } else { |
433 | for &(b, id) in self.trans.iter() { |
434 | if b == byte { |
435 | return id; |
436 | } |
437 | } |
438 | NFA::FAIL |
439 | } |
440 | } |
441 | |
442 | /// Set the transition for the given byte to the state ID given. |
443 | /// |
444 | /// Note that one should not set transitions to the FAIL state. It is not |
445 | /// technically incorrect, but it wastes space. If a transition is not |
446 | /// defined, then it is automatically assumed to lead to the FAIL state. |
447 | fn set_next_state(&mut self, byte: u8, next: StateID) { |
448 | match self.trans.binary_search_by_key(&byte, |&(b, _)| b) { |
449 | Ok(i) => self.trans[i] = (byte, next), |
450 | Err(i) => self.trans.insert(i, (byte, next)), |
451 | } |
452 | } |
453 | } |
454 | |
455 | impl core::fmt::Debug for State { |
456 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
457 | use crate::{automaton::sparse_transitions, util::debug::DebugByte}; |
458 | |
459 | let it: impl Iterator = sparse_transitions(self.trans.iter().copied()).enumerate(); |
460 | for (i: usize, (start: u8, end: u8, sid: StateID)) in it { |
461 | if i > 0 { |
462 | write!(f, ", " )?; |
463 | } |
464 | if start == end { |
465 | write!(f, " {:?} => {:?}" , DebugByte(start), sid.as_usize())?; |
466 | } else { |
467 | write!( |
468 | f, |
469 | " {:?}- {:?} => {:?}" , |
470 | DebugByte(start), |
471 | DebugByte(end), |
472 | sid.as_usize() |
473 | )?; |
474 | } |
475 | } |
476 | Ok(()) |
477 | } |
478 | } |
479 | |
/// A builder for configuring an Aho-Corasick noncontiguous NFA.
///
/// This builder has a subset of the options available to a
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive (Clone, Debug)]
pub struct Builder {
    /// The match semantics to build into the NFA.
    match_kind: MatchKind,
    /// Whether to attempt building a prefilter from the patterns.
    prefilter: bool,
    /// Whether to add ASCII case-insensitive transitions while building.
    ascii_case_insensitive: bool,
}
491 | |
492 | impl Default for Builder { |
493 | fn default() -> Builder { |
494 | Builder { |
495 | match_kind: MatchKind::default(), |
496 | prefilter: true, |
497 | ascii_case_insensitive: false, |
498 | } |
499 | } |
500 | } |
501 | |
502 | impl Builder { |
503 | /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA. |
504 | pub fn new() -> Builder { |
505 | Builder::default() |
506 | } |
507 | |
508 | /// Build an Aho-Corasick noncontiguous NFA from the given iterator of |
509 | /// patterns. |
510 | /// |
511 | /// A builder may be reused to create more NFAs. |
512 | pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError> |
513 | where |
514 | I: IntoIterator<Item = P>, |
515 | P: AsRef<[u8]>, |
516 | { |
517 | debug!("building non-contiguous NFA" ); |
518 | let nfa = Compiler::new(self)?.compile(patterns)?; |
519 | debug!( |
520 | "non-contiguous NFA built, <states: {:?}, size: {:?}>" , |
521 | nfa.states.len(), |
522 | nfa.memory_usage() |
523 | ); |
524 | Ok(nfa) |
525 | } |
526 | |
527 | /// Set the desired match semantics. |
528 | /// |
529 | /// See |
530 | /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) |
531 | /// for more documentation and examples. |
532 | pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { |
533 | self.match_kind = kind; |
534 | self |
535 | } |
536 | |
537 | /// Enable ASCII-aware case insensitive matching. |
538 | /// |
539 | /// See |
540 | /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) |
541 | /// for more documentation and examples. |
542 | pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { |
543 | self.ascii_case_insensitive = yes; |
544 | self |
545 | } |
546 | |
547 | /// Enable heuristic prefilter optimizations. |
548 | /// |
549 | /// See |
550 | /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) |
551 | /// for more documentation and examples. |
552 | pub fn prefilter(&mut self, yes: bool) -> &mut Builder { |
553 | self.prefilter = yes; |
554 | self |
555 | } |
556 | } |
557 | |
/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks, with some tweaks to support leftmost
/// searching.
#[derive (Debug)]
struct Compiler<'a> {
    /// The configuration to compile with (match kind, case folding, whether
    /// to build a prefilter).
    builder: &'a Builder,
    /// Accumulates pattern literals for building an optional prefilter.
    prefilter: prefilter::Builder,
    /// The NFA under construction.
    nfa: NFA,
    /// The set of byte equivalence classes observed across all patterns.
    byteset: ByteClassSet,
}
569 | |
570 | impl<'a> Compiler<'a> { |
571 | fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> { |
572 | let prefilter = prefilter::Builder::new(builder.match_kind) |
573 | .ascii_case_insensitive(builder.ascii_case_insensitive); |
574 | Ok(Compiler { |
575 | builder, |
576 | prefilter, |
577 | nfa: NFA { |
578 | match_kind: builder.match_kind, |
579 | states: vec![], |
580 | pattern_lens: vec![], |
581 | prefilter: None, |
582 | byte_classes: ByteClasses::singletons(), |
583 | min_pattern_len: usize::MAX, |
584 | max_pattern_len: 0, |
585 | special: Special::zero(), |
586 | memory_usage: 0, |
587 | }, |
588 | byteset: ByteClassSet::empty(), |
589 | }) |
590 | } |
591 | |
    /// Runs the full construction pipeline over the given patterns and
    /// returns the finished NFA.
    ///
    /// The sequencing below matters: sentinel and start states are created
    /// first (so their IDs are fixed), the trie is built before failure
    /// transitions can be computed, and state shuffling happens after all
    /// states exist.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        // the dead state, only used for leftmost and fixed to id==0
        self.add_state(0)?;
        // the fail state, which is never entered and fixed to id==1
        self.add_state(0)?;
        // unanchored start state, initially fixed to id==2 but later shuffled
        // to appear after all non-start match states.
        self.nfa.special.start_unanchored_id = self.add_state(0)?;
        // anchored start state, initially fixed to id==3 but later shuffled
        // to appear after unanchored start state.
        self.nfa.special.start_anchored_id = self.add_state(0)?;
        // Initialize the unanchored starting state in order to make it dense,
        // and thus make transition lookups on this state faster.
        self.init_unanchored_start_state();
        // Build the base trie from the given patterns.
        self.build_trie(patterns)?;
        // Add transitions (and maybe matches) to the anchored starting state.
        // The anchored starting state is used for anchored searches. The only
        // mechanical difference between it and the unanchored start state is
        // that missing transitions map to the DEAD state instead of the FAIL
        // state.
        self.set_anchored_start_state();
        // Rewrite transitions to the FAIL state on the unanchored start state
        // as self-transitions. This keeps the start state active at all times.
        self.add_unanchored_start_state_loop();
        // Set all transitions on the DEAD state to point to itself. This way,
        // the DEAD state can never be escaped. It MUST be used as a sentinel
        // in any correct search.
        self.add_dead_state_loop();
        // The meat of the Aho-Corasick algorithm: compute and write failure
        // transitions. i.e., the state to move to when a transition isn't
        // defined in the current state. These are epsilon transitions and thus
        // make this formulation an NFA.
        self.fill_failure_transitions();
        // Handle a special case under leftmost semantics when at least one
        // of the patterns is the empty string.
        self.close_start_state_loop_for_leftmost();
        // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
        // NON-MATCH, ... This permits us to very quickly query the type of
        // the state we're currently in during a search.
        self.shuffle();
        // Turn our set of bytes into equivalent classes. This NFA
        // implementation doesn't use byte classes directly, but any
        // Aho-Corasick searcher built from this one might.
        self.nfa.byte_classes = self.byteset.byte_classes();
        self.nfa.prefilter = self.prefilter.build();
        self.calculate_memory_usage();
        // Store the maximum ID of all *relevant* special states. Start states
        // are only relevant when we have a prefilter, otherwise, there is zero
        // reason to care about whether a state is a start state or not during
        // a search. Indeed, without a prefilter, we are careful to explicitly
        // NOT care about start states, otherwise the search can ping pong
        // between the unrolled loop and the handling of special-status states
        // and destroy perf.
        self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
            // Why the anchored starting state? Because we always put it
            // after the unanchored starting state and it is therefore the
            // maximum. Why put unanchored followed by anchored? No particular
            // reason, but that's how the states are logically organized in the
            // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
            self.nfa.special.start_anchored_id
        } else {
            self.nfa.special.max_match_id
        };
        Ok(self.nfa)
    }
662 | |
    /// This sets up the initial prefix trie that makes up the Aho-Corasick
    /// automaton. Effectively, it creates the basic structure of the
    /// automaton, where every pattern given has a path from the start state to
    /// the end of the pattern.
    ///
    /// # Errors
    ///
    /// This returns an error if the number of patterns overflows `PatternID`
    /// or if any single pattern's length overflows `SmallIndex`.
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
            let pid = PatternID::new(i).map_err(|e| {
                BuildError::pattern_id_overflow(
                    PatternID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
            let pat = pat.as_ref();
            let patlen = SmallIndex::new(pat.len())
                .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
            self.nfa.min_pattern_len =
                core::cmp::min(self.nfa.min_pattern_len, pat.len());
            self.nfa.max_pattern_len =
                core::cmp::max(self.nfa.max_pattern_len, pat.len());
            assert_eq!(
                i,
                self.nfa.pattern_lens.len(),
                "expected number of patterns to match pattern ID"
            );
            self.nfa.pattern_lens.push(patlen);
            // We add the pattern to the prefilter here because the pattern
            // ID in the prefilter is determined with respect to the patterns
            // added to the prefilter. That is, it isn't the ID we have here,
            // but the one determined by its own accounting of patterns.
            // To ensure they line up, we add every pattern we see to the
            // prefilter, even if some patterns ultimately are impossible to
            // match (in leftmost-first semantics specifically).
            //
            // Another way of doing this would be to expose an API in the
            // prefilter to permit setting your own pattern IDs. Or to just use
            // our own map and go between them. But this case is sufficiently
            // rare that we don't bother and just make sure they're in sync.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }

            // Walk the trie from the (unanchored) root, extending it with
            // new states as needed for this pattern's bytes.
            let mut prev = self.nfa.special.start_unanchored_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.states[prev].is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. We don't use these
                // for NFA construction. These are instead used only if we're
                // building a DFA. They would technically be useful for the
                // NFA, but it would require a second pass over the patterns.
                self.byteset.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byteset.set_range(b, b);
                }

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.states[prev].next_state(b);
                if next != NFA::FAIL {
                    prev = next;
                } else {
                    let next = self.add_state(depth)?;
                    self.nfa.states[prev].set_next_state(b, next);
                    if self.builder.ascii_case_insensitive {
                        let b = opposite_ascii_case(b);
                        self.nfa.states[prev].set_next_state(b, next);
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.states[prev].matches.push(pid);
        }
        Ok(())
    }
762 | |
763 | /// This routine creates failure transitions according to the standard |
764 | /// textbook formulation of the Aho-Corasick algorithm, with a couple small |
765 | /// tweaks to support "leftmost" semantics. |
766 | /// |
767 | /// Building failure transitions is the most interesting part of building |
768 | /// the Aho-Corasick automaton, because they are what allow searches to |
769 | /// be performed in linear time. Specifically, a failure transition is |
770 | /// a single transition associated with each state that points back to |
771 | /// the longest proper suffix of the pattern being searched. The failure |
772 | /// transition is followed whenever there exists no transition on the |
773 | /// current state for the current input byte. If there is no other proper |
774 | /// suffix, then the failure transition points back to the starting state. |
775 | /// |
776 | /// For example, let's say we built an Aho-Corasick automaton with the |
777 | /// following patterns: 'abcd' and 'cef'. The trie looks like this: |
778 | /// |
779 | /// ```ignore |
780 | /// a - S1 - b - S2 - c - S3 - d - S4* |
781 | /// / |
782 | /// S0 - c - S5 - e - S6 - f - S7* |
783 | /// ``` |
784 | /// |
785 | /// At this point, it should be fairly straight-forward to see how this |
786 | /// trie can be used in a simplistic way. At any given position in the |
787 | /// text we're searching (called the "subject" string), all we need to do |
788 | /// is follow the transitions in the trie by consuming one transition for |
789 | /// each byte in the subject string. If we reach a match state, then we can |
790 | /// report that location as a match. |
791 | /// |
792 | /// The trick comes when searching a subject string like 'abcef'. We'll |
793 | /// initially follow the transition from S0 to S1 and wind up in S3 after |
/// observing the 'c' byte. At this point, the next byte is 'e' but state
795 | /// S3 has no transition for 'e', so the search fails. We then would need |
796 | /// to restart the search at the next position in 'abcef', which |
797 | /// corresponds to 'b'. The match would fail, but the next search starting |
798 | /// at 'c' would finally succeed. The problem with this approach is that |
799 | /// we wind up searching the subject string potentially many times. In |
800 | /// effect, this makes the algorithm have worst case `O(n * m)` complexity, |
801 | /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead |
802 | /// like to achieve a `O(n + m)` worst case complexity. |
803 | /// |
804 | /// This is where failure transitions come in. Instead of dying at S3 in |
805 | /// the first search, the automaton can instruct the search to move to |
806 | /// another part of the automaton that corresponds to a suffix of what |
807 | /// we've seen so far. Recall that we've seen 'abc' in the subject string, |
808 | /// and the automaton does indeed have a non-empty suffix, 'c', that could |
809 | /// potentially lead to another match. Thus, the actual Aho-Corasick |
810 | /// automaton for our patterns in this case looks like this: |
811 | /// |
812 | /// ```ignore |
813 | /// a - S1 - b - S2 - c - S3 - d - S4* |
814 | /// / / |
815 | /// / ---------------- |
816 | /// / / |
817 | /// S0 - c - S5 - e - S6 - f - S7* |
818 | /// ``` |
819 | /// |
820 | /// That is, we have a failure transition from S3 to S5, which is followed |
821 | /// exactly in cases when we are in state S3 but see any byte other than |
822 | /// 'd' (that is, we've "failed" to find a match in this portion of our |
823 | /// trie). We know we can transition back to S5 because we've already seen |
824 | /// a 'c' byte, so we don't need to re-scan it. We can then pick back up |
825 | /// with the search starting at S5 and complete our match. |
826 | /// |
827 | /// Adding failure transitions to a trie is fairly simple, but subtle. The |
/// key issue is that you might have multiple failure transitions that you
829 | /// need to follow. For example, look at the trie for the patterns |
830 | /// 'abcd', 'b', 'bcd' and 'cd': |
831 | /// |
832 | /// ```ignore |
833 | /// - a - S1 - b - S2* - c - S3 - d - S4* |
834 | /// / / / |
835 | /// / ------- ------- |
836 | /// / / / |
837 | /// S0 --- b - S5* - c - S6 - d - S7* |
838 | /// \ / |
839 | /// \ -------- |
840 | /// \ / |
841 | /// - c - S8 - d - S9* |
842 | /// ``` |
843 | /// |
844 | /// The failure transitions for this trie are defined from S2 to S5, |
845 | /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it |
846 | /// corresponds to a match, since its failure transition to S5 is itself |
847 | /// a match state. |
848 | /// |
/// Perhaps the simplest way to think about adding these failure transitions
850 | /// is recursively. That is, if you know the failure transitions for every |
851 | /// possible previous state that could be visited (e.g., when computing the |
852 | /// failure transition for S3, you already know the failure transitions |
853 | /// for S0, S1 and S2), then you can simply follow the failure transition |
854 | /// of the previous state and check whether the incoming transition is |
855 | /// defined after following the failure transition. |
856 | /// |
857 | /// For example, when determining the failure state for S3, by our |
858 | /// assumptions, we already know that there is a failure transition from |
859 | /// S2 (the previous state) to S5. So we follow that transition and check |
860 | /// whether the transition connecting S2 to S3 is defined. Indeed, it is, |
861 | /// as there is a transition from S5 to S6 for the byte 'c'. If no such |
862 | /// transition existed, we could keep following the failure transitions |
863 | /// until we reach the start state, which is the failure transition for |
864 | /// every state that has no corresponding proper suffix. |
865 | /// |
866 | /// We don't actually use recursion to implement this, but instead, use a |
867 | /// breadth first search of the automaton. Our base case is the start |
868 | /// state, whose failure transition is just a transition to itself. |
869 | /// |
870 | /// When building a leftmost automaton, we proceed as above, but only |
871 | /// include a subset of failure transitions. Namely, we omit any failure |
872 | /// transitions that appear after a match state in the trie. This is |
873 | /// because failure transitions always point back to a proper suffix of |
874 | /// what has been seen so far. Thus, following a failure transition after |
875 | /// a match implies looking for a match that starts after the one that has |
876 | /// already been seen, which is of course therefore not the leftmost match. |
877 | /// |
878 | /// N.B. I came up with this algorithm on my own, and after scouring all of |
879 | /// the other AC implementations I know of (Perl, Snort, many on GitHub). |
880 | /// I couldn't find any that implement leftmost semantics like this. |
881 | /// Perl of course needs leftmost-first semantics, but they implement it |
882 | /// with a seeming hack at *search* time instead of encoding it into the |
883 | /// automaton. There are also a couple Java libraries that support leftmost |
884 | /// longest semantics, but they do it by building a queue of matches at |
885 | /// search time, which is even worse than what Perl is doing. ---AG |
fn fill_failure_transitions(&mut self) {
    let is_leftmost = self.builder.match_kind.is_leftmost();
    let start_uid = self.nfa.special.start_unanchored_id;
    // Initialize the queue for breadth first search with all transitions
    // out of the start state. We handle the start state specially because
    // we only want to follow non-self transitions. If we followed self
    // transitions, then this would never terminate.
    let mut queue = VecDeque::new();
    let mut seen = self.queued_set();
    // Iterate by index rather than holding a borrow of 'trans', since the
    // loop body mutates other parts of 'self.nfa.states'.
    for i in 0..self.nfa.states[start_uid].trans.len() {
        let (_, next) = self.nfa.states[start_uid].trans[i];
        // Skip anything we've seen before and any self-transitions on the
        // start state.
        if next == start_uid || seen.contains(next) {
            continue;
        }
        queue.push_back(next);
        seen.insert(next);
        // Under leftmost semantics, if a state immediately following
        // the start state is a match state, then we never want to
        // follow its failure transition since the failure transition
        // necessarily leads back to the start state, which we never
        // want to do for leftmost matching after a match has been
        // found.
        //
        // We apply the same logic to non-start states below as well.
        if is_leftmost && self.nfa.states[next].is_match() {
            self.nfa.states[next].fail = NFA::DEAD;
        }
    }
    while let Some(id) = queue.pop_front() {
        for i in 0..self.nfa.states[id].trans.len() {
            let (b, next) = self.nfa.states[id].trans[i];
            if seen.contains(next) {
                // The only way to visit a duplicate state in a transition
                // list is when ASCII case insensitivity is enabled. In
                // this case, we want to skip it since it's redundant work.
                // But it would also end up duplicating matches, which
                // results in reporting duplicate matches in some cases.
                // See the 'acasei010' regression test.
                continue;
            }
            queue.push_back(next);
            seen.insert(next);

            // As above for start states, under leftmost semantics, once
            // we see a match all subsequent states should have no failure
            // transitions because failure transitions always imply looking
            // for a match that is a suffix of what has been seen so far
            // (where "seen so far" corresponds to the string formed by
            // following the transitions from the start state to the
            // current state). Under leftmost semantics, we specifically do
            // not want to allow this to happen because we always want to
            // report the match found at the leftmost position.
            //
            // The difference between leftmost-first and leftmost-longest
            // occurs previously while we build the trie. For
            // leftmost-first, we simply omit any entries that would
            // otherwise require passing through a match state.
            //
            // Note that for correctness, the failure transition has to be
            // set to the dead state for ALL states following a match, not
            // just the match state itself. However, by setting the failure
            // transition to the dead state on all match states, the dead
            // state will automatically propagate to all subsequent states
            // via the failure state computation below.
            if is_leftmost && self.nfa.states[next].is_match() {
                self.nfa.states[next].fail = NFA::DEAD;
                continue;
            }
            // Standard BFS failure computation: walk up the parent's
            // ('id') failure chain until we find a state that defines a
            // transition on 'b'. This bottoms out because the unanchored
            // start state has a transition for every byte by this point
            // (see 'add_unanchored_start_state_loop', whose docs note that
            // next-state lookups must never return the fail id).
            let mut fail = self.nfa.states[id].fail;
            while self.nfa.states[fail].next_state(b) == NFA::FAIL {
                fail = self.nfa.states[fail].fail;
            }
            fail = self.nfa.states[fail].next_state(b);
            self.nfa.states[next].fail = fail;
            // Matches reachable via 'next's failure transition must also
            // be reported at 'next' itself.
            self.copy_matches(fail, next);
        }
        // If the start state is a match state, then this automaton can
        // match the empty string. This implies all states are match states
        // since every position matches the empty string, so copy the
        // matches from the start state to every state. Strictly speaking,
        // this is only necessary for overlapping matches since each
        // non-empty non-start match state needs to report empty matches
        // in addition to its own. For the non-overlapping case, such
        // states only report the first match, which is never empty since
        // it isn't a start state.
        if !is_leftmost {
            self.copy_matches(self.nfa.special.start_unanchored_id, id);
        }
    }
}
978 | |
979 | /// Shuffle the states so that they appear in this sequence: |
980 | /// |
981 | /// DEAD, FAIL, MATCH..., START, START, NON-MATCH... |
982 | /// |
983 | /// The idea here is that if we know how special states are laid out in our |
984 | /// transition table, then we can determine what "kind" of state we're in |
985 | /// just by comparing our current state ID with a particular value. In this |
986 | /// way, we avoid doing extra memory lookups. |
987 | /// |
988 | /// Before shuffling begins, our states look something like this: |
989 | /// |
990 | /// DEAD, FAIL, START, START, (MATCH | NON-MATCH)... |
991 | /// |
992 | /// So all we need to do is move all of the MATCH states so that they |
993 | /// all appear before any NON-MATCH state, like so: |
994 | /// |
995 | /// DEAD, FAIL, START, START, MATCH... NON-MATCH... |
996 | /// |
997 | /// Then it's just a simple matter of swapping the two START states with |
998 | /// the last two MATCH states. |
999 | /// |
1000 | /// (This is the same technique used for fully compiled DFAs in |
1001 | /// regex-automata.) |
fn shuffle(&mut self) {
    let old_start_uid = self.nfa.special.start_unanchored_id;
    let old_start_aid = self.nfa.special.start_anchored_id;
    assert!(old_start_uid < old_start_aid);
    assert_eq!(
        3,
        old_start_aid.as_usize(),
        "anchored start state should be at index 3"
    );
    // We implement shuffling by a sequence of pairwise swaps of states.
    // Since we have a number of things referencing states via their
    // IDs and swapping them changes their IDs, we need to record every
    // swap we make so that we can remap IDs. The remapper handles this
    // book-keeping for us.
    let mut remapper = Remapper::new(&self.nfa, 0);
    // The way we proceed here is by moving all match states so that
    // they directly follow the start states. So it will go: DEAD, FAIL,
    // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ...
    //
    // To do that, we proceed forward through all states after
    // START-ANCHORED and swap match states so that they appear before all
    // non-match states.
    //
    // Indices 0-3 hold DEAD, FAIL and the two start states (as checked by
    // the assertion above), so index 4 is the first slot a match state
    // can be moved into.
    let mut next_avail = StateID::from(4u8);
    for i in next_avail.as_usize()..self.nfa.states.len() {
        let sid = StateID::new(i).unwrap();
        if !self.nfa.states[sid].is_match() {
            continue;
        }
        remapper.swap(&mut self.nfa, sid, next_avail);
        // The key invariant here is that only non-match states exist
        // between 'next_avail' and 'sid' (with them being potentially
        // equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed
        // to land on the leftmost non-match state. (Unless 'next_avail'
        // and 'sid' are equivalent, in which case, a swap will occur but
        // it is a no-op.)
        next_avail = StateID::new(next_avail.one_more()).unwrap();
    }
    // Now we'd like to move the start states to immediately following the
    // match states. (The start states may themselves be match states, but
    // we'll handle that later.) We arrange the states this way so that we
    // don't necessarily need to check whether a state is a start state or
    // not before checking whether a state is a match state. For example,
    // we'd like to be able to write this as our state machine loop:
    //
    // sid = start()
    // for byte in haystack:
    //     sid = next(sid, byte)
    //     if sid <= nfa.max_start_id:
    //         if sid <= nfa.max_dead_id:
    //             # search complete
    //         elif sid <= nfa.max_match_id:
    //             # found match
    //
    // The important context here is that we might not want to look for
    // start states at all. Namely, if a searcher doesn't have a prefilter,
    // then there is no reason to care about whether we're in a start state
    // or not. And indeed, if we did check for it, this very hot loop would
    // ping pong between the special state handling and the main state
    // transition logic. This in turn stalls the CPU by killing branch
    // prediction.
    //
    // So essentially, we really want to be able to "forget" that start
    // states even exist and this is why we put them at the end.
    //
    // At this point 'next_avail' is one past the last match state, so the
    // two slots just below it become the new start state positions and
    // the slot below those becomes the highest-numbered match state.
    let new_start_aid =
        StateID::new(next_avail.as_usize().checked_sub(1).unwrap())
            .unwrap();
    remapper.swap(&mut self.nfa, old_start_aid, new_start_aid);
    let new_start_uid =
        StateID::new(next_avail.as_usize().checked_sub(2).unwrap())
            .unwrap();
    remapper.swap(&mut self.nfa, old_start_uid, new_start_uid);
    let new_max_match_id =
        StateID::new(next_avail.as_usize().checked_sub(3).unwrap())
            .unwrap();
    self.nfa.special.max_match_id = new_max_match_id;
    self.nfa.special.start_unanchored_id = new_start_uid;
    self.nfa.special.start_anchored_id = new_start_aid;
    // If one start state is a match state, then they both are.
    if self.nfa.states[self.nfa.special.start_anchored_id].is_match() {
        self.nfa.special.max_match_id = self.nfa.special.start_anchored_id;
    }
    // Apply the accumulated swaps to every state ID stored in the NFA.
    remapper.remap(&mut self.nfa);
}
1085 | |
1086 | /// Returns a set that tracked queued states. |
1087 | /// |
1088 | /// This is only necessary when ASCII case insensitivity is enabled, since |
1089 | /// it is the only way to visit the same state twice. Otherwise, this |
1090 | /// returns an inert set that nevers adds anything and always reports |
1091 | /// `false` for every member test. |
1092 | fn queued_set(&self) -> QueuedSet { |
1093 | if self.builder.ascii_case_insensitive { |
1094 | QueuedSet::active() |
1095 | } else { |
1096 | QueuedSet::inert() |
1097 | } |
1098 | } |
1099 | |
1100 | /// Initializes the unanchored start state by making it dense. This is |
1101 | /// achieved by explicitly setting every transition to the FAIL state. |
1102 | /// This isn't necessary for correctness, since any missing transition is |
1103 | /// automatically assumed to be mapped to the FAIL state. We do this to |
1104 | /// make the unanchored starting state dense, and thus in turn make |
1105 | /// transition lookups on it faster. (Which is worth doing because it's |
1106 | /// the most active state.) |
1107 | fn init_unanchored_start_state(&mut self) { |
1108 | let start_uid = self.nfa.special.start_unanchored_id; |
1109 | for byte in 0..=255 { |
1110 | self.nfa.states[start_uid].set_next_state(byte, NFA::FAIL); |
1111 | } |
1112 | } |
1113 | |
1114 | /// Setup the anchored start state by copying all of the transitions and |
1115 | /// matches from the unanchored starting state with one change: the failure |
1116 | /// transition is changed to the DEAD state, so that for any undefined |
1117 | /// transitions, the search will stop. |
1118 | fn set_anchored_start_state(&mut self) { |
1119 | let start_uid = self.nfa.special.start_unanchored_id; |
1120 | let start_aid = self.nfa.special.start_anchored_id; |
1121 | self.nfa.states[start_aid].trans = |
1122 | self.nfa.states[start_uid].trans.clone(); |
1123 | self.copy_matches(start_uid, start_aid); |
1124 | // This is the main difference between the unanchored and anchored |
1125 | // starting states. If a lookup on an anchored starting state fails, |
1126 | // then the search should stop. |
1127 | // |
1128 | // N.B. This assumes that the loop on the unanchored starting state |
1129 | // hasn't been created yet. |
1130 | self.nfa.states[start_aid].fail = NFA::DEAD; |
1131 | } |
1132 | |
1133 | /// Set the failure transitions on the start state to loop back to the |
1134 | /// start state. This effectively permits the Aho-Corasick automaton to |
1135 | /// match at any position. This is also required for finding the next |
1136 | /// state to terminate, namely, finding the next state should never return |
1137 | /// a fail_id. |
1138 | /// |
1139 | /// This must be done after building the initial trie, since trie |
1140 | /// construction depends on transitions to `fail_id` to determine whether a |
1141 | /// state already exists or not. |
1142 | fn add_unanchored_start_state_loop(&mut self) { |
1143 | let start_uid = self.nfa.special.start_unanchored_id; |
1144 | let start = &mut self.nfa.states[start_uid]; |
1145 | for b in 0..=255 { |
1146 | if start.next_state(b) == NFA::FAIL { |
1147 | start.set_next_state(b, start_uid); |
1148 | } |
1149 | } |
1150 | } |
1151 | |
1152 | /// Remove the start state loop by rewriting any transitions on the start |
1153 | /// state back to the start state with transitions to the dead state. |
1154 | /// |
1155 | /// The loop is only closed when two conditions are met: the start state |
1156 | /// is a match state and the match kind is leftmost-first or |
1157 | /// leftmost-longest. |
1158 | /// |
1159 | /// The reason for this is that under leftmost semantics, a start state |
1160 | /// that is also a match implies that we should never restart the search |
1161 | /// process. We allow normal transitions out of the start state, but if |
1162 | /// none exist, we transition to the dead state, which signals that |
1163 | /// searching should stop. |
1164 | fn close_start_state_loop_for_leftmost(&mut self) { |
1165 | let start_uid = self.nfa.special.start_unanchored_id; |
1166 | let start = &mut self.nfa.states[start_uid]; |
1167 | if self.builder.match_kind.is_leftmost() && start.is_match() { |
1168 | for b in 0..=255 { |
1169 | if start.next_state(b) == start_uid { |
1170 | start.set_next_state(b, NFA::DEAD); |
1171 | } |
1172 | } |
1173 | } |
1174 | } |
1175 | |
1176 | /// Sets all transitions on the dead state to point back to the dead state. |
1177 | /// Normally, missing transitions map back to the failure state, but the |
1178 | /// point of the dead state is to act as a sink that can never be escaped. |
1179 | fn add_dead_state_loop(&mut self) { |
1180 | let dead = &mut self.nfa.states[NFA::DEAD]; |
1181 | for b in 0..=255 { |
1182 | dead.set_next_state(b, NFA::DEAD); |
1183 | } |
1184 | } |
1185 | |
1186 | /// Copy matches from the `src` state to the `dst` state. This is useful |
1187 | /// when a match state can be reached via a failure transition. In which |
1188 | /// case, you'll want to copy the matches (if any) from the state reached |
1189 | /// by the failure transition to the original state you were at. |
1190 | fn copy_matches(&mut self, src: StateID, dst: StateID) { |
1191 | let (src, dst) = |
1192 | get_two_mut(&mut self.nfa.states, src.as_usize(), dst.as_usize()); |
1193 | dst.matches.extend_from_slice(&src.matches); |
1194 | } |
1195 | |
1196 | /// Allocate and add a fresh state to the underlying NFA and return its |
1197 | /// ID (guaranteed to be one more than the ID of the previously allocated |
1198 | /// state). If the ID would overflow `StateID`, then this returns an error. |
1199 | fn add_state(&mut self, depth: usize) -> Result<StateID, BuildError> { |
1200 | // This is OK because we error when building the trie if we see a |
1201 | // pattern whose length cannot fit into a 'SmallIndex', and the longest |
1202 | // possible depth corresponds to the length of the longest pattern. |
1203 | let depth = SmallIndex::new(depth) |
1204 | .expect("patterns longer than SmallIndex::MAX are not allowed" ); |
1205 | let id = StateID::new(self.nfa.states.len()).map_err(|e| { |
1206 | BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted()) |
1207 | })?; |
1208 | self.nfa.states.push(State { |
1209 | trans: vec![], |
1210 | matches: vec![], |
1211 | fail: self.nfa.special.start_unanchored_id, |
1212 | depth, |
1213 | }); |
1214 | Ok(id) |
1215 | } |
1216 | |
1217 | /// Computes the total amount of heap used by this NFA in bytes. |
1218 | fn calculate_memory_usage(&mut self) { |
1219 | use core::mem::size_of; |
1220 | |
1221 | for state in self.nfa.states.iter() { |
1222 | self.nfa.memory_usage += size_of::<State>() + state.memory_usage(); |
1223 | } |
1224 | } |
1225 | } |
1226 | |
/// A set of state identifiers used to avoid revisiting the same state multiple
/// times when filling in failure transitions.
///
/// This set has an "inert" and an "active" mode. When inert, the set never
/// stores anything and always returns `false` for every member test. This is
/// useful to avoid the performance and memory overhead of maintaining this
/// set when it is not needed.
#[derive(Debug)]
struct QueuedSet {
    /// `None` corresponds to the inert mode: inserts are dropped and
    /// membership tests always answer `false`. `Some` holds the actual
    /// set of queued state IDs for the active mode.
    set: Option<BTreeSet<StateID>>,
}
1238 | |
1239 | impl QueuedSet { |
1240 | /// Return an inert set that returns `false` for every state ID membership |
1241 | /// test. |
1242 | fn inert() -> QueuedSet { |
1243 | QueuedSet { set: None } |
1244 | } |
1245 | |
1246 | /// Return an active set that tracks state ID membership. |
1247 | fn active() -> QueuedSet { |
1248 | QueuedSet { set: Some(BTreeSet::new()) } |
1249 | } |
1250 | |
1251 | /// Inserts the given state ID into this set. (If the set is inert, then |
1252 | /// this is a no-op.) |
1253 | fn insert(&mut self, state_id: StateID) { |
1254 | if let Some(ref mut set) = self.set { |
1255 | set.insert(state_id); |
1256 | } |
1257 | } |
1258 | |
1259 | /// Returns true if and only if the given state ID is in this set. If the |
1260 | /// set is inert, this always returns false. |
1261 | fn contains(&self, state_id: StateID) -> bool { |
1262 | match self.set { |
1263 | None => false, |
1264 | Some(ref set) => set.contains(&state_id), |
1265 | } |
1266 | } |
1267 | } |
1268 | |
impl core::fmt::Debug for NFA {
    // Renders the full automaton: one line per state (indicator, state
    // ID, failure-transition ID, transitions), the pattern IDs matched at
    // each match state, and then a summary of automaton-wide properties.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        use crate::automaton::fmt_state_indicator;

        writeln!(f, "noncontiguous::NFA(" )?;
        for (sid, state) in self.states.iter().with_state_ids() {
            // The FAIL state doesn't actually have space for a state allocated
            // for it, so we have to treat it as a special case.
            if sid == NFA::FAIL {
                writeln!(f, "F {:06}:" , sid.as_usize())?;
                continue;
            }
            fmt_state_indicator(f, self, sid)?;
            // State ID followed by its failure-transition target ID.
            write!(
                f,
                " {:06}( {:06}): " ,
                sid.as_usize(),
                state.fail.as_usize()
            )?;
            state.fmt(f)?;
            write!(f, " \n" )?;
            if self.is_match(sid) {
                // Comma-separated list of the pattern IDs this state matches.
                write!(f, " matches: " )?;
                for (i, pid) in state.matches.iter().enumerate() {
                    if i > 0 {
                        write!(f, ", " )?;
                    }
                    write!(f, " {}" , pid.as_usize())?;
                }
                write!(f, " \n" )?;
            }
        }
        // Automaton-wide summary, printed after the per-state listing.
        writeln!(f, "match kind: {:?}" , self.match_kind)?;
        writeln!(f, "prefilter: {:?}" , self.prefilter.is_some())?;
        writeln!(f, "state length: {:?}" , self.states.len())?;
        writeln!(f, "pattern length: {:?}" , self.patterns_len())?;
        writeln!(f, "shortest pattern length: {:?}" , self.min_pattern_len)?;
        writeln!(f, "longest pattern length: {:?}" , self.max_pattern_len)?;
        writeln!(f, "memory usage: {:?}" , self.memory_usage())?;
        writeln!(f, ")" )?;
        Ok(())
    }
}
1312 | |
1313 | /// Safely return two mutable borrows to two different locations in the given |
1314 | /// slice. |
1315 | /// |
1316 | /// This panics if i == j. |
// Fix: the previous text contained IDE inlay-hint residue baked into the
// source — pattern "type ascriptions" (`before: &mut [T]`) and a named
// argument (`mid:j`) — neither of which is valid Rust, so the function
// could not compile. Restored to plain tuple destructuring and a normal
// positional call to `split_at_mut`.
fn get_two_mut<T>(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) {
    assert!(i != j, "{} must not be equal to {}", i, j);
    if i < j {
        // Split just before 'j' so the two halves contain 'i' and 'j'
        // respectively; disjoint halves permit two mutable borrows.
        let (before, after) = xs.split_at_mut(j);
        (&mut before[i], &mut after[0])
    } else {
        let (before, after) = xs.split_at_mut(i);
        (&mut after[0], &mut before[j])
    }
}
1327 | |