1 | /*! |
2 | Provides a noncontiguous NFA implementation of Aho-Corasick. |
3 | |
4 | This is a low-level API that generally only needs to be used in niche |
5 | circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) |
6 | instead of a noncontiguous NFA directly. Using an `NFA` directly is typically |
7 | only necessary when one needs access to the [`Automaton`] trait implementation. |
8 | */ |
9 | |
10 | use alloc::{ |
11 | collections::{BTreeSet, VecDeque}, |
12 | vec, |
13 | vec::Vec, |
14 | }; |
15 | |
16 | use crate::{ |
17 | automaton::Automaton, |
18 | util::{ |
19 | alphabet::{ByteClassSet, ByteClasses}, |
20 | error::{BuildError, MatchError}, |
21 | prefilter::{self, opposite_ascii_case, Prefilter}, |
22 | primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, |
23 | remapper::Remapper, |
24 | search::{Anchored, MatchKind}, |
25 | special::Special, |
26 | }, |
27 | }; |
28 | |
/// A noncontiguous NFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using an `NFA` directly is typically only necessary
/// when one needs access to the [`Automaton`] trait implementation.
///
/// This NFA represents the "core" implementation of Aho-Corasick in this
/// crate. Namely, constructing this NFA involves building a trie and then
/// filling in the failure transitions between states, similar to what is
/// described in any standard textbook description of Aho-Corasick.
///
/// In order to minimize heap usage and to avoid additional construction costs,
/// this implementation represents the transitions of all states as distinct
/// sparse memory allocations. This is where it gets its name from. That is,
/// this NFA has no contiguous memory allocation for its transition table. Each
/// state gets its own allocation.
///
/// While the sparse representation keeps memory usage to somewhat reasonable
/// levels, it is still quite large and also results in somewhat mediocre
/// search performance. For this reason, it is almost always a good idea to
/// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is
/// marginally slower to build, but has higher throughput and can sometimes use
/// an order of magnitude less memory. The main reason to use a noncontiguous
/// NFA is when you need the fastest possible construction time, or when a
/// contiguous NFA does not have the desired capacity. (The total number of NFA
/// states it can have is fewer than a noncontiguous NFA.)
///
/// # Example
///
/// This example shows how to build an `NFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     nfa::noncontiguous::NFA,
///     Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let nfa = NFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     nfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive(Clone)]
pub struct NFA {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State>,
    /// Transitions stored in a sparse representation via a linked list.
    ///
    /// Each transition contains three pieces of information: the byte it
    /// is defined for, the state it transitions to and a link to the next
    /// transition in the same state (or `StateID::ZERO` if it is the last
    /// transition).
    ///
    /// The first transition for each state is determined by `State::sparse`.
    ///
    /// Note that this contains a complete set of all transitions in this NFA,
    /// including states that have a dense representation for transitions.
    /// (Adding dense transitions for a state doesn't remove its sparse
    /// transitions, since deleting transitions from this particular sparse
    /// representation would be fairly expensive.)
    sparse: Vec<Transition>,
    /// Transitions stored in a dense representation.
    ///
    /// A state has a row in this table if and only if `State::dense` is
    /// not equal to `StateID::ZERO`. When not zero, there are precisely
    /// `NFA::byte_classes::alphabet_len()` entries beginning at `State::dense`
    /// in this table.
    ///
    /// Generally a very small minority of states have a dense representation
    /// since it uses so much memory.
    dense: Vec<StateID>,
    /// Matches stored in linked list for each state.
    ///
    /// Like sparse transitions, each match has a link to the next match in the
    /// state.
    ///
    /// The first match for each state is determined by `State::matches`.
    matches: Vec<Match>,
    /// The length, in bytes, of each pattern in this NFA. This slice is
    /// indexed by `PatternID`.
    ///
    /// The number of entries in this vector corresponds to the total number of
    /// patterns in this automaton.
    pattern_lens: Vec<SmallIndex>,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<Prefilter>,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// The length, in bytes, of the shortest pattern in this automaton. This
    /// information is useful for detecting whether an automaton matches the
    /// empty string or not.
    min_pattern_len: usize,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The information required to deduce which states are "special" in this
    /// NFA.
    ///
    /// Since the DEAD and FAIL states are always the first two states and
    /// there are only ever two start states (which follow all of the match
    /// states), it follows that we can determine whether a state is a fail,
    /// dead, match or start with just a few comparisons on the ID itself:
    ///
    ///    is_dead(sid): sid == NFA::DEAD
    ///    is_fail(sid): sid == NFA::FAIL
    ///   is_match(sid): NFA::FAIL < sid && sid <= max_match_id
    ///   is_start(sid): sid == start_unanchored_id || sid == start_anchored_id
    ///
    /// Note that this only applies to the NFA after it has been constructed.
    /// During construction, the start states are the first ones added and the
    /// match states are inter-leaved with non-match states. Once all of the
    /// states have been added, the states are shuffled such that the above
    /// predicates hold.
    special: Special,
}
176 | |
177 | impl NFA { |
178 | /// Create a new Aho-Corasick noncontiguous NFA using the default |
179 | /// configuration. |
180 | /// |
181 | /// Use a [`Builder`] if you want to change the configuration. |
182 | pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError> |
183 | where |
184 | I: IntoIterator<Item = P>, |
185 | P: AsRef<[u8]>, |
186 | { |
187 | NFA::builder().build(patterns) |
188 | } |
189 | |
190 | /// A convenience method for returning a new Aho-Corasick noncontiguous NFA |
191 | /// builder. |
192 | /// |
193 | /// This usually permits one to just import the `NFA` type. |
194 | pub fn builder() -> Builder { |
195 | Builder::new() |
196 | } |
197 | } |
198 | |
impl NFA {
    /// The DEAD state is a sentinel state like the FAIL state. The DEAD state
    /// instructs any search to stop and return any currently recorded match,
    /// or no match otherwise. Generally speaking, it is impossible for an
    /// unanchored standard search to enter a DEAD state. But an anchored
    /// search can, and so too can a leftmost search.
    ///
    /// We put DEAD before FAIL so that DEAD is always 0. We repeat this
    /// decision across the other Aho-Corasick automata, so that DEAD
    /// states there are always 0 too. It's not that we need all of the
    /// implementations to agree, but rather, the contiguous NFA and the DFA
    /// use a sort of "premultiplied" state identifier where the only state
    /// whose ID is always known and constant is the first state. Subsequent
    /// state IDs depend on how much space has already been used in the
    /// transition table.
    pub(crate) const DEAD: StateID = StateID::new_unchecked(0);
    /// The FAIL state mostly just corresponds to the ID of any transition on a
    /// state that isn't explicitly defined. When one transitions into the FAIL
    /// state, one must follow the previous state's failure transition before
    /// doing the next state lookup. In this way, FAIL is more of a sentinel
    /// than a state that one actually transitions into. In particular, it is
    /// never exposed in the `Automaton` interface.
    pub(crate) const FAIL: StateID = StateID::new_unchecked(1);

    /// Returns the equivalence classes of bytes found while constructing
    /// this NFA.
    ///
    /// Note that the NFA doesn't actually make use of these equivalence
    /// classes. Instead, these are useful for building the DFA when desired.
    pub(crate) fn byte_classes(&self) -> &ByteClasses {
        &self.byte_classes
    }

    /// Returns a slice containing the length of each pattern in this searcher.
    /// It is indexed by `PatternID` and has length `NFA::patterns_len`.
    ///
    /// This is exposed for convenience when building a contiguous NFA. But it
    /// can be reconstructed from the `Automaton` API if necessary.
    pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] {
        &self.pattern_lens
    }

    /// Returns a slice of all states in this non-contiguous NFA.
    pub(crate) fn states(&self) -> &[State] {
        &self.states
    }

    /// Returns the underlying "special" state information for this NFA.
    pub(crate) fn special(&self) -> &Special {
        &self.special
    }

    /// Swaps the states at `id1` and `id2`.
    ///
    /// This does not update the transitions of any state to account for the
    /// state swap. Callers are expected to fix up transitions themselves
    /// (e.g., via `remap`).
    pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
        self.states.swap(id1.as_usize(), id2.as_usize());
    }

    /// Re-maps all state IDs in this NFA according to the `map` function
    /// given.
    ///
    /// For every state, this rewrites its failure transition, every entry in
    /// its sparse transition chain and, if the state has been densified, its
    /// entire dense row.
    pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
        let alphabet_len = self.byte_classes.alphabet_len();
        for state in self.states.iter_mut() {
            state.fail = map(state.fail);
            // Walk this state's sparse linked list, remapping each
            // transition's target. The links themselves are indices into
            // `self.sparse` (not state IDs), so they are left untouched.
            let mut link = state.sparse;
            while link != StateID::ZERO {
                let t = &mut self.sparse[link];
                t.next = map(t.next);
                link = t.link;
            }
            if state.dense != StateID::ZERO {
                let start = state.dense.as_usize();
                for next in self.dense[start..][..alphabet_len].iter_mut() {
                    *next = map(*next);
                }
            }
        }
    }

    /// Iterate over all of the transitions for the given state ID.
    pub(crate) fn iter_trans(
        &self,
        sid: StateID,
    ) -> impl Iterator<Item = Transition> + '_ {
        let mut link = self.states[sid].sparse;
        core::iter::from_fn(move || {
            if link == StateID::ZERO {
                return None;
            }
            let t = self.sparse[link];
            link = t.link;
            Some(t)
        })
    }

    /// Iterate over all of the matches for the given state ID.
    pub(crate) fn iter_matches(
        &self,
        sid: StateID,
    ) -> impl Iterator<Item = PatternID> + '_ {
        let mut link = self.states[sid].matches;
        core::iter::from_fn(move || {
            if link == StateID::ZERO {
                return None;
            }
            let m = self.matches[link];
            link = m.link;
            Some(m.pid)
        })
    }

    /// Return the link following the one given. If the one given is the last
    /// link for the given state, then return `None`.
    ///
    /// If no previous link is given, then this returns the first link in the
    /// state, if one exists.
    ///
    /// This is useful for manually iterating over the transitions in a single
    /// state without borrowing the NFA. This permits mutating other parts of
    /// the NFA during iteration. Namely, one can access the transition pointed
    /// to by the link via `self.sparse[link]`.
    fn next_link(
        &self,
        sid: StateID,
        prev: Option<StateID>,
    ) -> Option<StateID> {
        let link =
            prev.map_or(self.states[sid].sparse, |p| self.sparse[p].link);
        if link == StateID::ZERO {
            None
        } else {
            Some(link)
        }
    }

    /// Follow the transition for the given byte in the given state. If no such
    /// transition exists, then the FAIL state ID is returned.
    #[inline(always)]
    fn follow_transition(&self, sid: StateID, byte: u8) -> StateID {
        let s = &self.states[sid];
        // This is a special case that targets starting states and states
        // near a start state. Namely, after the initial trie is constructed,
        // we look for states close to the start state to convert to a dense
        // representation for their transitions. This winds up using a lot more
        // memory per state in exchange for faster transition lookups. But
        // since we only do this for a small number of states (by default), the
        // memory usage is usually minimal.
        //
        // This has *massive* benefit when executing searches because the
        // unanchored starting state is by far the hottest state and is
        // frequently visited. Moreover, the 'for' loop below that works
        // decently on an actually sparse state is disastrous on a state that
        // is nearly or completely dense.
        if s.dense == StateID::ZERO {
            self.follow_transition_sparse(sid, byte)
        } else {
            let class = usize::from(self.byte_classes.get(byte));
            self.dense[s.dense.as_usize() + class]
        }
    }

    /// Like `follow_transition`, but always uses the sparse representation.
    #[inline(always)]
    fn follow_transition_sparse(&self, sid: StateID, byte: u8) -> StateID {
        // The sparse chain is kept sorted in ascending byte order (see
        // 'add_transition'), so we can stop as soon as we see a transition
        // byte greater than the one we're looking for.
        for t in self.iter_trans(sid) {
            if byte <= t.byte {
                if byte == t.byte {
                    return t.next;
                }
                break;
            }
        }
        NFA::FAIL
    }

    /// Set the transition for the given byte to the state ID given.
    ///
    /// Note that one should not set transitions to the FAIL state. It is not
    /// technically incorrect, but it wastes space. If a transition is not
    /// defined, then it is automatically assumed to lead to the FAIL state.
    ///
    /// The sparse transition chain for each state is maintained in ascending
    /// byte order, which is what permits the early bail-out in
    /// `follow_transition_sparse`.
    fn add_transition(
        &mut self,
        prev: StateID,
        byte: u8,
        next: StateID,
    ) -> Result<(), BuildError> {
        // If this state has already been densified, keep its dense row in
        // sync. (The sparse chain below is updated as well, since it remains
        // the complete record of transitions.)
        if self.states[prev].dense != StateID::ZERO {
            let dense = self.states[prev].dense;
            let class = usize::from(self.byte_classes.get(byte));
            self.dense[dense.as_usize() + class] = next;
        }

        let head = self.states[prev].sparse;
        if head == StateID::ZERO || byte < self.sparse[head].byte {
            // Empty chain, or the new byte sorts before the current head:
            // prepend a fresh transition.
            let new_link = self.alloc_transition()?;
            self.sparse[new_link] = Transition { byte, next, link: head };
            self.states[prev].sparse = new_link;
            return Ok(());
        } else if byte == self.sparse[head].byte {
            // The head already covers this byte; just overwrite its target.
            self.sparse[head].next = next;
            return Ok(());
        }

        // We handled the only cases where the beginning of the transition
        // chain needs to change. At this point, we now know that there is
        // at least one entry in the transition chain and the byte for that
        // transition is less than the byte for the transition we're adding.
        let (mut link_prev, mut link_next) = (head, self.sparse[head].link);
        while link_next != StateID::ZERO && byte > self.sparse[link_next].byte
        {
            link_prev = link_next;
            link_next = self.sparse[link_next].link;
        }
        if link_next == StateID::ZERO || byte < self.sparse[link_next].byte {
            // Splice a new transition between link_prev and link_next to
            // keep the chain sorted.
            let link = self.alloc_transition()?;
            self.sparse[link] = Transition { byte, next, link: link_next };
            self.sparse[link_prev].link = link;
        } else {
            assert_eq!(byte, self.sparse[link_next].byte);
            self.sparse[link_next].next = next;
        }
        Ok(())
    }

    /// This sets every possible transition (all 256 of them) for the given
    /// state to the same `next` value.
    ///
    /// This is useful for efficiently initializing start/dead states.
    ///
    /// # Panics
    ///
    /// This requires that the state has no transitions added to it already.
    /// If it has any transitions, then this panics. It will also panic if
    /// the state has been densified prior to calling this.
    fn init_full_state(
        &mut self,
        prev: StateID,
        next: StateID,
    ) -> Result<(), BuildError> {
        assert_eq!(
            StateID::ZERO,
            self.states[prev].dense,
            "state must not be dense yet"
        );
        assert_eq!(
            StateID::ZERO,
            self.states[prev].sparse,
            "state must have zero transitions"
        );
        // Append one transition per byte value, in ascending order, chaining
        // each new link onto the previous one. This preserves the sorted
        // invariant that 'add_transition' relies on.
        let mut prev_link = StateID::ZERO;
        for byte in 0..=255 {
            let new_link = self.alloc_transition()?;
            self.sparse[new_link] =
                Transition { byte, next, link: StateID::ZERO };
            if prev_link == StateID::ZERO {
                self.states[prev].sparse = new_link;
            } else {
                self.sparse[prev_link].link = new_link;
            }
            prev_link = new_link;
        }
        Ok(())
    }

    /// Add a match for the given pattern ID to the state for the given ID.
    ///
    /// The new match is appended to the end of the state's match linked list.
    fn add_match(
        &mut self,
        sid: StateID,
        pid: PatternID,
    ) -> Result<(), BuildError> {
        // NOTE(review): when 'head' is ZERO this reads 'self.matches[0]',
        // which appears to rely on index 0 being a sentinel entry whose link
        // is 'StateID::ZERO' — presumably established at construction time;
        // confirm against the builder code.
        let head = self.states[sid].matches;
        let mut link = head;
        while self.matches[link].link != StateID::ZERO {
            link = self.matches[link].link;
        }
        let new_match_link = self.alloc_match()?;
        self.matches[new_match_link].pid = pid;
        if link == StateID::ZERO {
            self.states[sid].matches = new_match_link;
        } else {
            self.matches[link].link = new_match_link;
        }
        Ok(())
    }

    /// Copy matches from the `src` state to the `dst` state. This is useful
    /// when a match state can be reached via a failure transition. In which
    /// case, you'll want to copy the matches (if any) from the state reached
    /// by the failure transition to the original state you were at.
    fn copy_matches(
        &mut self,
        src: StateID,
        dst: StateID,
    ) -> Result<(), BuildError> {
        // Find the tail of dst's match list so we can append to it. (As in
        // 'add_match', a ZERO head reads the sentinel at index 0.)
        let head_dst = self.states[dst].matches;
        let mut link_dst = head_dst;
        while self.matches[link_dst].link != StateID::ZERO {
            link_dst = self.matches[link_dst].link;
        }
        let mut link_src = self.states[src].matches;
        while link_src != StateID::ZERO {
            // Allocation is inlined (rather than calling 'alloc_match') so
            // the pid can be pushed in the same step.
            let new_match_link =
                StateID::new(self.matches.len()).map_err(|e| {
                    BuildError::state_id_overflow(
                        StateID::MAX.as_u64(),
                        e.attempted(),
                    )
                })?;
            self.matches.push(Match {
                pid: self.matches[link_src].pid,
                link: StateID::ZERO,
            });
            if link_dst == StateID::ZERO {
                self.states[dst].matches = new_match_link;
            } else {
                self.matches[link_dst].link = new_match_link;
            }

            link_dst = new_match_link;
            link_src = self.matches[link_src].link;
        }
        Ok(())
    }

    /// Create a new entry in `NFA::sparse`, if there's room, and return that
    /// entry's ID. If there's no room, then an error is returned.
    fn alloc_transition(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.sparse.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.sparse.push(Transition::default());
        Ok(id)
    }

    /// Create a new entry in `NFA::matches`, if there's room, and return that
    /// entry's ID. If there's no room, then an error is returned.
    fn alloc_match(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.matches.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.matches.push(Match::default());
        Ok(id)
    }

    /// Create a new set of `N` transitions in this NFA's dense transition
    /// table. The ID return corresponds to the index at which the `N`
    /// transitions begin. So `id+0` is the first transition and `id+(N-1)` is
    /// the last.
    ///
    /// `N` is determined via `NFA::byte_classes::alphabet_len`.
    fn alloc_dense_state(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.dense.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        // We use FAIL because it's the correct default. If a state doesn't
        // have a transition defined for every possible byte value, then the
        // transition function should return NFA::FAIL.
        self.dense.extend(
            core::iter::repeat(NFA::FAIL)
                .take(self.byte_classes.alphabet_len()),
        );
        Ok(id)
    }

    /// Allocate and add a fresh state to the underlying NFA and return its
    /// ID (guaranteed to be one more than the ID of the previously allocated
    /// state). If the ID would overflow `StateID`, then this returns an error.
    fn alloc_state(&mut self, depth: usize) -> Result<StateID, BuildError> {
        // This is OK because we error when building the trie if we see a
        // pattern whose length cannot fit into a 'SmallIndex', and the longest
        // possible depth corresponds to the length of the longest pattern.
        let depth = SmallIndex::new(depth)
            .expect("patterns longer than SmallIndex::MAX are not allowed");
        let id = StateID::new(self.states.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.states.push(State {
            sparse: StateID::ZERO,
            dense: StateID::ZERO,
            matches: StateID::ZERO,
            fail: self.special.start_unanchored_id,
            depth,
        });
        Ok(id)
    }
}
587 | |
588 | // SAFETY: 'start_state' always returns a valid state ID, 'next_state' always |
589 | // returns a valid state ID given a valid state ID. We otherwise claim that |
590 | // all other methods are correct as well. |
591 | unsafe impl Automaton for NFA { |
592 | #[inline (always)] |
593 | fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { |
594 | match anchored { |
595 | Anchored::No => Ok(self.special.start_unanchored_id), |
596 | Anchored::Yes => Ok(self.special.start_anchored_id), |
597 | } |
598 | } |
599 | |
600 | #[inline (always)] |
601 | fn next_state( |
602 | &self, |
603 | anchored: Anchored, |
604 | mut sid: StateID, |
605 | byte: u8, |
606 | ) -> StateID { |
607 | // This terminates since: |
608 | // |
609 | // 1. state.fail never points to the FAIL state. |
610 | // 2. All state.fail values point to a state closer to the start state. |
611 | // 3. The start state has no transitions to the FAIL state. |
612 | loop { |
613 | let next = self.follow_transition(sid, byte); |
614 | if next != NFA::FAIL { |
615 | return next; |
616 | } |
617 | // For an anchored search, we never follow failure transitions |
618 | // because failure transitions lead us down a path to matching |
619 | // a *proper* suffix of the path we were on. Thus, it can only |
620 | // produce matches that appear after the beginning of the search. |
621 | if anchored.is_anchored() { |
622 | return NFA::DEAD; |
623 | } |
624 | sid = self.states[sid].fail(); |
625 | } |
626 | } |
627 | |
628 | #[inline (always)] |
629 | fn is_special(&self, sid: StateID) -> bool { |
630 | sid <= self.special.max_special_id |
631 | } |
632 | |
633 | #[inline (always)] |
634 | fn is_dead(&self, sid: StateID) -> bool { |
635 | sid == NFA::DEAD |
636 | } |
637 | |
638 | #[inline (always)] |
639 | fn is_match(&self, sid: StateID) -> bool { |
640 | // N.B. This returns true when sid==NFA::FAIL but that's okay because |
641 | // NFA::FAIL is not actually a valid state ID from the perspective of |
642 | // the Automaton trait. Namely, it is never returned by 'start_state' |
643 | // or by 'next_state'. So we don't need to care about it here. |
644 | !self.is_dead(sid) && sid <= self.special.max_match_id |
645 | } |
646 | |
647 | #[inline (always)] |
648 | fn is_start(&self, sid: StateID) -> bool { |
649 | sid == self.special.start_unanchored_id |
650 | || sid == self.special.start_anchored_id |
651 | } |
652 | |
653 | #[inline (always)] |
654 | fn match_kind(&self) -> MatchKind { |
655 | self.match_kind |
656 | } |
657 | |
658 | #[inline (always)] |
659 | fn patterns_len(&self) -> usize { |
660 | self.pattern_lens.len() |
661 | } |
662 | |
663 | #[inline (always)] |
664 | fn pattern_len(&self, pid: PatternID) -> usize { |
665 | self.pattern_lens[pid].as_usize() |
666 | } |
667 | |
668 | #[inline (always)] |
669 | fn min_pattern_len(&self) -> usize { |
670 | self.min_pattern_len |
671 | } |
672 | |
673 | #[inline (always)] |
674 | fn max_pattern_len(&self) -> usize { |
675 | self.max_pattern_len |
676 | } |
677 | |
678 | #[inline (always)] |
679 | fn match_len(&self, sid: StateID) -> usize { |
680 | self.iter_matches(sid).count() |
681 | } |
682 | |
683 | #[inline (always)] |
684 | fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { |
685 | self.iter_matches(sid).nth(index).unwrap() |
686 | } |
687 | |
688 | #[inline (always)] |
689 | fn memory_usage(&self) -> usize { |
690 | self.states.len() * core::mem::size_of::<State>() |
691 | + self.sparse.len() * core::mem::size_of::<Transition>() |
692 | + self.matches.len() * core::mem::size_of::<Match>() |
693 | + self.dense.len() * StateID::SIZE |
694 | + self.pattern_lens.len() * SmallIndex::SIZE |
695 | + self.prefilter.as_ref().map_or(0, |p| p.memory_usage()) |
696 | } |
697 | |
698 | #[inline (always)] |
699 | fn prefilter(&self) -> Option<&Prefilter> { |
700 | self.prefilter.as_ref() |
701 | } |
702 | } |
703 | |
/// A representation of a sparse NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte
/// and the matches implied by visiting this state (if any).
#[derive(Clone, Debug)]
pub(crate) struct State {
    /// A pointer to `NFA::sparse` corresponding to the head of a linked list
    /// containing all of the transitions for this state.
    ///
    /// This is `StateID::ZERO` if and only if this state has zero transitions.
    sparse: StateID,
    /// A pointer to a row of `N` transitions in `NFA::dense`. These
    /// transitions correspond precisely to what is obtained by traversing
    /// `sparse`, but permits constant time lookup.
    ///
    /// When this is zero (which is true for most states in the default
    /// configuration), then this state has no dense representation.
    ///
    /// Note that `N` is equal to `NFA::byte_classes::alphabet_len()`. This is
    /// typically much less than 256 (the maximum value).
    dense: StateID,
    /// A pointer to `NFA::matches` corresponding to the head of a linked list
    /// containing all of the matches for this state.
    ///
    /// This is `StateID::ZERO` if and only if this state is not a match state.
    matches: StateID,
    /// The state that should be transitioned to if the current byte in the
    /// haystack does not have a corresponding transition defined in this
    /// state.
    fail: StateID,
    /// The depth of this state. Specifically, this is the distance from this
    /// state to the starting state. (For the special sentinel states DEAD and
    /// FAIL, their depth is always 0.) The depth of a starting state is 0.
    ///
    /// Note that depth is currently not used in this non-contiguous NFA. It
    /// may in the future, but it is used in the contiguous NFA. Namely, it
    /// permits an optimization where states near the starting state have their
    /// transitions stored in a dense fashion, but all other states have their
    /// transitions stored in a sparse fashion. (This non-contiguous NFA uses
    /// a sparse representation for all states unconditionally.) In any case,
    /// this is really the only convenient place to compute and store this
    /// information, which we need when building the contiguous NFA.
    depth: SmallIndex,
}
749 | |
750 | impl State { |
751 | /// Return true if and only if this state is a match state. |
752 | pub(crate) fn is_match(&self) -> bool { |
753 | self.matches != StateID::ZERO |
754 | } |
755 | |
756 | /// Returns the failure transition for this state. |
757 | pub(crate) fn fail(&self) -> StateID { |
758 | self.fail |
759 | } |
760 | |
761 | /// Returns the depth of this state. That is, the number of transitions |
762 | /// this state is from the start state of the NFA. |
763 | pub(crate) fn depth(&self) -> SmallIndex { |
764 | self.depth |
765 | } |
766 | } |
767 | |
768 | /// A single transition in a non-contiguous NFA. |
769 | #[derive (Clone, Copy, Default)] |
770 | #[repr (packed)] |
771 | pub(crate) struct Transition { |
772 | byte: u8, |
773 | next: StateID, |
774 | link: StateID, |
775 | } |
776 | |
777 | impl Transition { |
778 | /// Return the byte for which this transition is defined. |
779 | pub(crate) fn byte(&self) -> u8 { |
780 | self.byte |
781 | } |
782 | |
783 | /// Return the ID of the state that this transition points to. |
784 | pub(crate) fn next(&self) -> StateID { |
785 | self.next |
786 | } |
787 | |
788 | /// Return the ID of the next transition. |
789 | fn link(&self) -> StateID { |
790 | self.link |
791 | } |
792 | } |
793 | |
794 | impl core::fmt::Debug for Transition { |
795 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
796 | write!( |
797 | f, |
798 | "Transition(byte: {:X?}, next: {:?}, link: {:?})" , |
799 | self.byte, |
800 | self.next().as_usize(), |
801 | self.link().as_usize() |
802 | ) |
803 | } |
804 | } |
805 | |
806 | /// A single match in a non-contiguous NFA. |
807 | #[derive (Clone, Copy, Default)] |
808 | struct Match { |
809 | pid: PatternID, |
810 | link: StateID, |
811 | } |
812 | |
813 | impl Match { |
814 | /// Return the pattern ID for this match. |
815 | pub(crate) fn pattern(&self) -> PatternID { |
816 | self.pid |
817 | } |
818 | |
819 | /// Return the ID of the next match. |
820 | fn link(&self) -> StateID { |
821 | self.link |
822 | } |
823 | } |
824 | |
825 | impl core::fmt::Debug for Match { |
826 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
827 | write!( |
828 | f, |
829 | "Match(pid: {:?}, link: {:?})" , |
830 | self.pattern().as_usize(), |
831 | self.link().as_usize() |
832 | ) |
833 | } |
834 | } |
835 | |
/// A builder for configuring an Aho-Corasick noncontiguous NFA.
///
/// This builder has a subset of the options available to a
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive (Clone, Debug)]
pub struct Builder {
    /// The match semantics to compile the NFA with. See
    /// [`Builder::match_kind`].
    match_kind: MatchKind,
    /// Whether to attempt building a prefilter from the patterns. See
    /// [`Builder::prefilter`].
    prefilter: bool,
    /// Whether to match with ASCII-aware case insensitivity. See
    /// [`Builder::ascii_case_insensitive`].
    ascii_case_insensitive: bool,
    /// The limit on how many states use a dense representation for their
    /// transitions. See [`Builder::dense_depth`].
    dense_depth: usize,
}
848 | |
849 | impl Default for Builder { |
850 | fn default() -> Builder { |
851 | Builder { |
852 | match_kind: MatchKind::default(), |
853 | prefilter: true, |
854 | ascii_case_insensitive: false, |
855 | dense_depth: 3, |
856 | } |
857 | } |
858 | } |
859 | |
860 | impl Builder { |
861 | /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA. |
862 | pub fn new() -> Builder { |
863 | Builder::default() |
864 | } |
865 | |
866 | /// Build an Aho-Corasick noncontiguous NFA from the given iterator of |
867 | /// patterns. |
868 | /// |
869 | /// A builder may be reused to create more NFAs. |
870 | pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError> |
871 | where |
872 | I: IntoIterator<Item = P>, |
873 | P: AsRef<[u8]>, |
874 | { |
875 | debug!("building non-contiguous NFA" ); |
876 | let nfa = Compiler::new(self)?.compile(patterns)?; |
877 | debug!( |
878 | "non-contiguous NFA built, <states: {:?}, size: {:?}>" , |
879 | nfa.states.len(), |
880 | nfa.memory_usage() |
881 | ); |
882 | Ok(nfa) |
883 | } |
884 | |
885 | /// Set the desired match semantics. |
886 | /// |
887 | /// See |
888 | /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) |
889 | /// for more documentation and examples. |
890 | pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { |
891 | self.match_kind = kind; |
892 | self |
893 | } |
894 | |
895 | /// Enable ASCII-aware case insensitive matching. |
896 | /// |
897 | /// See |
898 | /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) |
899 | /// for more documentation and examples. |
900 | pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { |
901 | self.ascii_case_insensitive = yes; |
902 | self |
903 | } |
904 | |
905 | /// Set the limit on how many states use a dense representation for their |
906 | /// transitions. Other states will generally use a sparse representation. |
907 | /// |
908 | /// See |
909 | /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth) |
910 | /// for more documentation and examples. |
911 | pub fn dense_depth(&mut self, depth: usize) -> &mut Builder { |
912 | self.dense_depth = depth; |
913 | self |
914 | } |
915 | |
916 | /// Enable heuristic prefilter optimizations. |
917 | /// |
918 | /// See |
919 | /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) |
920 | /// for more documentation and examples. |
921 | pub fn prefilter(&mut self, yes: bool) -> &mut Builder { |
922 | self.prefilter = yes; |
923 | self |
924 | } |
925 | } |
926 | |
/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks, with some tweaks to support leftmost
/// searching.
#[derive (Debug)]
struct Compiler<'a> {
    /// The configuration to compile with.
    builder: &'a Builder,
    /// A builder for the prefilter that is (optionally) attached to the
    /// finished NFA.
    prefilter: prefilter::Builder,
    /// The NFA under construction.
    nfa: NFA,
    /// The set of byte ranges seen in the patterns, used to compute the
    /// NFA's byte equivalence classes.
    byteset: ByteClassSet,
}
938 | |
939 | impl<'a> Compiler<'a> { |
940 | fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> { |
941 | let prefilter = prefilter::Builder::new(builder.match_kind) |
942 | .ascii_case_insensitive(builder.ascii_case_insensitive); |
943 | Ok(Compiler { |
944 | builder, |
945 | prefilter, |
946 | nfa: NFA { |
947 | match_kind: builder.match_kind, |
948 | states: vec![], |
949 | sparse: vec![], |
950 | dense: vec![], |
951 | matches: vec![], |
952 | pattern_lens: vec![], |
953 | prefilter: None, |
954 | byte_classes: ByteClasses::singletons(), |
955 | min_pattern_len: usize::MAX, |
956 | max_pattern_len: 0, |
957 | special: Special::zero(), |
958 | }, |
959 | byteset: ByteClassSet::empty(), |
960 | }) |
961 | } |
962 | |
    /// Runs the entire compilation pipeline on the given patterns and
    /// returns the finished NFA.
    ///
    /// This allocates the sentinel and start states, builds the trie,
    /// computes byte classes, densifies some states, fills in failure
    /// transitions and finally shuffles states into their canonical order.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        // Add dummy transition/match links, so that no valid link will point
        // to another link at index 0.
        self.nfa.sparse.push(Transition::default());
        self.nfa.matches.push(Match::default());
        // Add a dummy dense transition so that no states can have dense==0
        // represent a valid pointer to dense transitions. This permits
        // dense==0 to be a sentinel indicating "no dense transitions."
        self.nfa.dense.push(NFA::DEAD);
        // the dead state, only used for leftmost and fixed to id==0
        self.nfa.alloc_state(0)?;
        // the fail state, which is never entered and fixed to id==1
        self.nfa.alloc_state(0)?;
        // unanchored start state, initially fixed to id==2 but later shuffled
        // to appear after all non-start match states.
        self.nfa.special.start_unanchored_id = self.nfa.alloc_state(0)?;
        // anchored start state, initially fixed to id==3 but later shuffled
        // to appear after unanchored start state.
        self.nfa.special.start_anchored_id = self.nfa.alloc_state(0)?;
        // Initialize the unanchored starting state in order to make it dense,
        // and thus make transition lookups on this state faster.
        self.init_unanchored_start_state()?;
        // Set all transitions on the DEAD state to point to itself. This way,
        // the DEAD state can never be escaped. It MUST be used as a sentinel
        // in any correct search.
        self.add_dead_state_loop()?;
        // Build the base trie from the given patterns.
        self.build_trie(patterns)?;
        self.nfa.states.shrink_to_fit();
        // Turn our set of bytes into equivalent classes. This NFA
        // implementation uses byte classes only for states that use a dense
        // representation of transitions. (And that's why this comes before
        // `self.densify()`, as the byte classes need to be set first.)
        self.nfa.byte_classes = self.byteset.byte_classes();
        // Add transitions (and maybe matches) to the anchored starting state.
        // The anchored starting state is used for anchored searches. The only
        // mechanical difference between it and the unanchored start state is
        // that missing transitions map to the DEAD state instead of the FAIL
        // state.
        self.set_anchored_start_state()?;
        // Rewrite transitions to the FAIL state on the unanchored start state
        // as self-transitions. This keeps the start state active at all times.
        self.add_unanchored_start_state_loop();
        // Make some (possibly zero) states use a dense representation for
        // transitions. It's important to do this right after the states
        // and non-failure transitions are solidified. That way, subsequent
        // accesses (particularly `fill_failure_transitions`) will benefit from
        // the faster transition lookup in densified states.
        self.densify()?;
        // The meat of the Aho-Corasick algorithm: compute and write failure
        // transitions. i.e., the state to move to when a transition isn't
        // defined in the current state. These are epsilon transitions and thus
        // make this formulation an NFA.
        self.fill_failure_transitions()?;
        // Handle a special case under leftmost semantics when at least one
        // of the patterns is the empty string.
        self.close_start_state_loop_for_leftmost();
        // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
        // NON-MATCH, ... This permits us to very quickly query the type of
        // the state we're currently in during a search.
        self.shuffle();
        self.nfa.prefilter = self.prefilter.build();
        // Store the maximum ID of all *relevant* special states. Start states
        // are only relevant when we have a prefilter, otherwise, there is zero
        // reason to care about whether a state is a start state or not during
        // a search. Indeed, without a prefilter, we are careful to explicitly
        // NOT care about start states, otherwise the search can ping pong
        // between the unrolled loop and the handling of special-status states
        // and destroy perf.
        self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
            // Why the anchored starting state? Because we always put it
            // after the unanchored starting state and it is therefore the
            // maximum. Why put unanchored followed by anchored? No particular
            // reason, but that's how the states are logically organized in the
            // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
            self.nfa.special.start_anchored_id
        } else {
            self.nfa.special.max_match_id
        };
        // Release any excess capacity held by the NFA's internal vectors,
        // since no further states/transitions/matches will be added.
        self.nfa.sparse.shrink_to_fit();
        self.nfa.dense.shrink_to_fit();
        self.nfa.matches.shrink_to_fit();
        self.nfa.pattern_lens.shrink_to_fit();
        Ok(self.nfa)
    }
1052 | |
    /// This sets up the initial prefix trie that makes up the Aho-Corasick
    /// automaton. Effectively, it creates the basic structure of the
    /// automaton, where every pattern given has a path from the start state to
    /// the end of the pattern.
    ///
    /// Along the way, this also records each pattern's length, updates the
    /// NFA's min/max pattern lengths, feeds every pattern to the prefilter
    /// builder (when enabled) and accumulates byte equivalence classes.
    ///
    /// This returns an error if a pattern ID would overflow or if a single
    /// pattern is too long.
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
            // A pattern's ID is just its position in the given order.
            let pid = PatternID::new(i).map_err(|e| {
                BuildError::pattern_id_overflow(
                    PatternID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
            let pat = pat.as_ref();
            // Pattern lengths are stored as 'SmallIndex', so patterns whose
            // length doesn't fit are rejected here.
            let patlen = SmallIndex::new(pat.len())
                .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
            self.nfa.min_pattern_len =
                core::cmp::min(self.nfa.min_pattern_len, pat.len());
            self.nfa.max_pattern_len =
                core::cmp::max(self.nfa.max_pattern_len, pat.len());
            assert_eq!(
                i,
                self.nfa.pattern_lens.len(),
                "expected number of patterns to match pattern ID"
            );
            self.nfa.pattern_lens.push(patlen);
            // We add the pattern to the prefilter here because the pattern
            // ID in the prefilter is determined with respect to the patterns
            // added to the prefilter. That is, it isn't the ID we have here,
            // but the one determined by its own accounting of patterns.
            // To ensure they line up, we add every pattern we see to the
            // prefilter, even if some patterns ultimately are impossible to
            // match (in leftmost-first semantics specifically).
            //
            // Another way of doing this would be to expose an API in the
            // prefilter to permit setting your own pattern IDs. Or to just use
            // our own map and go between them. But this case is sufficiently
            // rare that we don't bother and just make sure they're in sync.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }

            let mut prev = self.nfa.special.start_unanchored_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.states[prev].is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. These don't
                // get used while building the trie, but other Aho-Corasick
                // implementations may use them.
                self.byteset.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byteset.set_range(b, b);
                }

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.follow_transition(prev, b);
                if next != NFA::FAIL {
                    prev = next;
                } else {
                    let next = self.nfa.alloc_state(depth)?;
                    self.nfa.add_transition(prev, b, next)?;
                    if self.builder.ascii_case_insensitive {
                        let b = opposite_ascii_case(b);
                        self.nfa.add_transition(prev, b, next)?;
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.add_match(prev, pid)?;
        }
        Ok(())
    }
1151 | |
    /// This routine creates failure transitions according to the standard
    /// textbook formulation of the Aho-Corasick algorithm, with a couple small
    /// tweaks to support "leftmost" semantics.
    ///
    /// Building failure transitions is the most interesting part of building
    /// the Aho-Corasick automaton, because they are what allow searches to
    /// be performed in linear time. Specifically, a failure transition is
    /// a single transition associated with each state that points back to
    /// the longest proper suffix of the pattern being searched. The failure
    /// transition is followed whenever there exists no transition on the
    /// current state for the current input byte. If there is no other proper
    /// suffix, then the failure transition points back to the starting state.
    ///
    /// For example, let's say we built an Aho-Corasick automaton with the
    /// following patterns: 'abcd' and 'cef'. The trie looks like this:
    ///
    /// ```ignore
    ///          a - S1 - b - S2 - c - S3 - d - S4*
    ///         /
    ///     S0 - c - S5 - e - S6 - f - S7*
    /// ```
    ///
    /// At this point, it should be fairly straight-forward to see how this
    /// trie can be used in a simplistic way. At any given position in the
    /// text we're searching (called the "subject" string), all we need to do
    /// is follow the transitions in the trie by consuming one transition for
    /// each byte in the subject string. If we reach a match state, then we can
    /// report that location as a match.
    ///
    /// The trick comes when searching a subject string like 'abcef'. We'll
    /// initially follow the transition from S0 to S1 and wind up in S3 after
    /// observing the 'c' byte. At this point, the next byte is 'e' but state
    /// S3 has no transition for 'e', so the search fails. We then would need
    /// to restart the search at the next position in 'abcef', which
    /// corresponds to 'b'. The match would fail, but the next search starting
    /// at 'c' would finally succeed. The problem with this approach is that
    /// we wind up searching the subject string potentially many times. In
    /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
    /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
    /// like to achieve a `O(n + m)` worst case complexity.
    ///
    /// This is where failure transitions come in. Instead of dying at S3 in
    /// the first search, the automaton can instruct the search to move to
    /// another part of the automaton that corresponds to a suffix of what
    /// we've seen so far. Recall that we've seen 'abc' in the subject string,
    /// and the automaton does indeed have a non-empty suffix, 'c', that could
    /// potentially lead to another match. Thus, the actual Aho-Corasick
    /// automaton for our patterns in this case looks like this:
    ///
    /// ```ignore
    ///          a - S1 - b - S2 - c - S3 - d - S4*
    ///         /                      /
    ///        /       ----------------
    ///       /       /
    ///     S0 - c - S5 - e - S6 - f - S7*
    /// ```
    ///
    /// That is, we have a failure transition from S3 to S5, which is followed
    /// exactly in cases when we are in state S3 but see any byte other than
    /// 'd' (that is, we've "failed" to find a match in this portion of our
    /// trie). We know we can transition back to S5 because we've already seen
    /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
    /// with the search starting at S5 and complete our match.
    ///
    /// Adding failure transitions to a trie is fairly simple, but subtle. The
    /// key issue is that you might have multiple failure transitions that you
    /// need to follow. For example, look at the trie for the patterns
    /// 'abcd', 'b', 'bcd' and 'cd':
    ///
    /// ```ignore
    ///          - a - S1 - b - S2* - c - S3 - d - S4*
    ///         /               /         /
    ///        /         -------   -------
    ///       /         /         /
    ///     S0 --- b - S5* - c - S6 - d - S7*
    ///       \                  /
    ///        \         --------
    ///         \       /
    ///          - c - S8 - d - S9*
    /// ```
    ///
    /// The failure transitions for this trie are defined from S2 to S5,
    /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
    /// corresponds to a match, since its failure transition to S5 is itself
    /// a match state.
    ///
    /// Perhaps the simplest way to think about adding these failure
    /// transitions is recursively. That is, if you know the failure
    /// transitions for every possible previous state that could be visited
    /// (e.g., when computing the failure transition for S3, you already know
    /// the failure transitions for S0, S1 and S2), then you can simply follow
    /// the failure transition of the previous state and check whether the
    /// incoming transition is defined after following the failure transition.
    ///
    /// For example, when determining the failure state for S3, by our
    /// assumptions, we already know that there is a failure transition from
    /// S2 (the previous state) to S5. So we follow that transition and check
    /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
    /// as there is a transition from S5 to S6 for the byte 'c'. If no such
    /// transition existed, we could keep following the failure transitions
    /// until we reach the start state, which is the failure transition for
    /// every state that has no corresponding proper suffix.
    ///
    /// We don't actually use recursion to implement this, but instead, use a
    /// breadth first search of the automaton. Our base case is the start
    /// state, whose failure transition is just a transition to itself.
    ///
    /// When building a leftmost automaton, we proceed as above, but only
    /// include a subset of failure transitions. Namely, we omit any failure
    /// transitions that appear after a match state in the trie. This is
    /// because failure transitions always point back to a proper suffix of
    /// what has been seen so far. Thus, following a failure transition after
    /// a match implies looking for a match that starts after the one that has
    /// already been seen, which is of course therefore not the leftmost match.
    ///
    /// N.B. I came up with this algorithm on my own, and after scouring all of
    /// the other AC implementations I know of (Perl, Snort, many on GitHub).
    /// I couldn't find any that implement leftmost semantics like this.
    /// Perl of course needs leftmost-first semantics, but they implement it
    /// with a seeming hack at *search* time instead of encoding it into the
    /// automaton. There are also a couple Java libraries that support leftmost
    /// longest semantics, but they do it by building a queue of matches at
    /// search time, which is even worse than what Perl is doing. ---AG
    fn fill_failure_transitions(&mut self) -> Result<(), BuildError> {
        let is_leftmost = self.builder.match_kind.is_leftmost();
        let start_uid = self.nfa.special.start_unanchored_id;
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        let mut prev_link = None;
        while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
            prev_link = Some(link);
            let t = self.nfa.sparse[link];

            // Skip anything we've seen before and any self-transitions on the
            // start state.
            if start_uid == t.next() || seen.contains(t.next) {
                continue;
            }
            queue.push_back(t.next);
            seen.insert(t.next);
            // Under leftmost semantics, if a state immediately following
            // the start state is a match state, then we never want to
            // follow its failure transition since the failure transition
            // necessarily leads back to the start state, which we never
            // want to do for leftmost matching after a match has been
            // found.
            //
            // We apply the same logic to non-start states below as well.
            if is_leftmost && self.nfa.states[t.next].is_match() {
                self.nfa.states[t.next].fail = NFA::DEAD;
            }
        }
        while let Some(id) = queue.pop_front() {
            let mut prev_link = None;
            while let Some(link) = self.nfa.next_link(id, prev_link) {
                prev_link = Some(link);
                let t = self.nfa.sparse[link];

                if seen.contains(t.next) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(t.next);
                seen.insert(t.next);

                // As above for start states, under leftmost semantics, once
                // we see a match all subsequent states should have no failure
                // transitions because failure transitions always imply looking
                // for a match that is a suffix of what has been seen so far
                // (where "seen so far" corresponds to the string formed by
                // following the transitions from the start state to the
                // current state). Under leftmost semantics, we specifically do
                // not want to allow this to happen because we always want to
                // report the match found at the leftmost position.
                //
                // The difference between leftmost-first and leftmost-longest
                // occurs previously while we build the trie. For
                // leftmost-first, we simply omit any entries that would
                // otherwise require passing through a match state.
                //
                // Note that for correctness, the failure transition has to be
                // set to the dead state for ALL states following a match, not
                // just the match state itself. However, by setting the failure
                // transition to the dead state on all match states, the dead
                // state will automatically propagate to all subsequent states
                // via the failure state computation below.
                if is_leftmost && self.nfa.states[t.next].is_match() {
                    self.nfa.states[t.next].fail = NFA::DEAD;
                    continue;
                }
                // The standard BFS step: follow the parent's failure chain
                // until a state with a transition on this byte is found.
                let mut fail = self.nfa.states[id].fail;
                while self.nfa.follow_transition(fail, t.byte) == NFA::FAIL {
                    fail = self.nfa.states[fail].fail;
                }
                fail = self.nfa.follow_transition(fail, t.byte);
                self.nfa.states[t.next].fail = fail;
                self.nfa.copy_matches(fail, t.next)?;
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            if !is_leftmost {
                self.nfa
                    .copy_matches(self.nfa.special.start_unanchored_id, id)?;
            }
        }
        Ok(())
    }
1375 | |
    /// Shuffle the states so that they appear in this sequence:
    ///
    ///   DEAD, FAIL, MATCH..., START, START, NON-MATCH...
    ///
    /// The idea here is that if we know how special states are laid out in our
    /// transition table, then we can determine what "kind" of state we're in
    /// just by comparing our current state ID with a particular value. In this
    /// way, we avoid doing extra memory lookups.
    ///
    /// Before shuffling begins, our states look something like this:
    ///
    ///   DEAD, FAIL, START, START, (MATCH | NON-MATCH)...
    ///
    /// So all we need to do is move all of the MATCH states so that they
    /// all appear before any NON-MATCH state, like so:
    ///
    ///   DEAD, FAIL, START, START, MATCH... NON-MATCH...
    ///
    /// Then it's just a simple matter of swapping the two START states with
    /// the last two MATCH states.
    ///
    /// Besides moving states, this also updates the special state IDs
    /// recorded on the NFA (the two start state IDs and `max_match_id`) to
    /// reflect the new layout.
    ///
    /// (This is the same technique used for fully compiled DFAs in
    /// regex-automata.)
    fn shuffle(&mut self) {
        let old_start_uid = self.nfa.special.start_unanchored_id;
        let old_start_aid = self.nfa.special.start_anchored_id;
        assert!(old_start_uid < old_start_aid);
        assert_eq!(
            3,
            old_start_aid.as_usize(),
            "anchored start state should be at index 3"
        );
        // We implement shuffling by a sequence of pairwise swaps of states.
        // Since we have a number of things referencing states via their
        // IDs and swapping them changes their IDs, we need to record every
        // swap we make so that we can remap IDs. The remapper handles this
        // book-keeping for us.
        let mut remapper = Remapper::new(&self.nfa, 0);
        // The way we proceed here is by moving all match states so that
        // they directly follow the start states. So it will go: DEAD, FAIL,
        // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ...
        //
        // To do that, we proceed forward through all states after
        // START-ANCHORED and swap match states so that they appear before all
        // non-match states.
        let mut next_avail = StateID::from(4u8);
        for i in next_avail.as_usize()..self.nfa.states.len() {
            let sid = StateID::new(i).unwrap();
            if !self.nfa.states[sid].is_match() {
                continue;
            }
            remapper.swap(&mut self.nfa, sid, next_avail);
            // The key invariant here is that only non-match states exist
            // between 'next_avail' and 'sid' (with them being potentially
            // equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed
            // to land on the leftmost non-match state. (Unless 'next_avail'
            // and 'sid' are equivalent, in which case, a swap will occur but
            // it is a no-op.)
            next_avail = StateID::new(next_avail.one_more()).unwrap();
        }
        // Now we'd like to move the start states to immediately following the
        // match states. (The start states may themselves be match states, but
        // we'll handle that later.) We arrange the states this way so that we
        // don't necessarily need to check whether a state is a start state or
        // not before checking whether a state is a match state. For example,
        // we'd like to be able to write this as our state machine loop:
        //
        //   sid = start()
        //   for byte in haystack:
        //       sid = next(sid, byte)
        //       if sid <= nfa.max_start_id:
        //           if sid <= nfa.max_dead_id:
        //               # search complete
        //           elif sid <= nfa.max_match_id:
        //               # found match
        //
        // The important context here is that we might not want to look for
        // start states at all. Namely, if a searcher doesn't have a prefilter,
        // then there is no reason to care about whether we're in a start state
        // or not. And indeed, if we did check for it, this very hot loop would
        // ping pong between the special state handling and the main state
        // transition logic. This in turn stalls the CPU by killing branch
        // prediction.
        //
        // So essentially, we really want to be able to "forget" that start
        // states even exist and this is why we put them at the end.
        let new_start_aid =
            StateID::new(next_avail.as_usize().checked_sub(1).unwrap())
                .unwrap();
        remapper.swap(&mut self.nfa, old_start_aid, new_start_aid);
        let new_start_uid =
            StateID::new(next_avail.as_usize().checked_sub(2).unwrap())
                .unwrap();
        remapper.swap(&mut self.nfa, old_start_uid, new_start_uid);
        // After moving both start states, the last "true" match state (if
        // any) sits just before the unanchored start state.
        let new_max_match_id =
            StateID::new(next_avail.as_usize().checked_sub(3).unwrap())
                .unwrap();
        self.nfa.special.max_match_id = new_max_match_id;
        self.nfa.special.start_unanchored_id = new_start_uid;
        self.nfa.special.start_anchored_id = new_start_aid;
        // If one start state is a match state, then they both are.
        if self.nfa.states[self.nfa.special.start_anchored_id].is_match() {
            self.nfa.special.max_match_id = self.nfa.special.start_anchored_id;
        }
        remapper.remap(&mut self.nfa);
    }
1482 | |
1483 | /// Attempts to convert the transition representation of a subset of states |
1484 | /// in this NFA from sparse to dense. This can greatly improve search |
1485 | /// performance since states with a higher number of transitions tend to |
1486 | /// correlate with very active states. |
1487 | /// |
1488 | /// We generally only densify states that are close to the start state. |
1489 | /// These tend to be the most active states and thus benefit from a dense |
1490 | /// representation more than other states. |
1491 | /// |
1492 | /// This tends to best balance between memory usage and performance. In |
1493 | /// particular, the *vast majority* of all states in a typical Aho-Corasick |
1494 | /// automaton have only 1 transition and are usually farther from the start |
1495 | /// state and thus don't get densified. |
1496 | /// |
1497 | /// Note that this doesn't remove the sparse representation of transitions |
1498 | /// for states that are densified. It could be done, but actually removing |
1499 | /// entries from `NFA::sparse` is likely more expensive than it's worth. |
1500 | fn densify(&mut self) -> Result<(), BuildError> { |
1501 | for i in 0..self.nfa.states.len() { |
1502 | let sid = StateID::new(i).unwrap(); |
1503 | // Don't bother densifying states that are only used as sentinels. |
1504 | if sid == NFA::DEAD || sid == NFA::FAIL { |
1505 | continue; |
1506 | } |
1507 | // Only densify states that are "close enough" to the start state. |
1508 | if self.nfa.states[sid].depth.as_usize() |
1509 | >= self.builder.dense_depth |
1510 | { |
1511 | continue; |
1512 | } |
1513 | let dense = self.nfa.alloc_dense_state()?; |
1514 | let mut prev_link = None; |
1515 | while let Some(link) = self.nfa.next_link(sid, prev_link) { |
1516 | prev_link = Some(link); |
1517 | let t = self.nfa.sparse[link]; |
1518 | |
1519 | let class = usize::from(self.nfa.byte_classes.get(t.byte)); |
1520 | let index = dense.as_usize() + class; |
1521 | self.nfa.dense[index] = t.next; |
1522 | } |
1523 | self.nfa.states[sid].dense = dense; |
1524 | } |
1525 | Ok(()) |
1526 | } |
1527 | |
1528 | /// Returns a set that tracked queued states. |
1529 | /// |
1530 | /// This is only necessary when ASCII case insensitivity is enabled, since |
1531 | /// it is the only way to visit the same state twice. Otherwise, this |
1532 | /// returns an inert set that nevers adds anything and always reports |
1533 | /// `false` for every member test. |
1534 | fn queued_set(&self) -> QueuedSet { |
1535 | if self.builder.ascii_case_insensitive { |
1536 | QueuedSet::active() |
1537 | } else { |
1538 | QueuedSet::inert() |
1539 | } |
1540 | } |
1541 | |
1542 | /// Initializes the unanchored start state by making it dense. This is |
1543 | /// achieved by explicitly setting every transition to the FAIL state. |
1544 | /// This isn't necessary for correctness, since any missing transition is |
1545 | /// automatically assumed to be mapped to the FAIL state. We do this to |
1546 | /// make the unanchored starting state dense, and thus in turn make |
1547 | /// transition lookups on it faster. (Which is worth doing because it's |
1548 | /// the most active state.) |
1549 | fn init_unanchored_start_state(&mut self) -> Result<(), BuildError> { |
1550 | let start_uid = self.nfa.special.start_unanchored_id; |
1551 | let start_aid = self.nfa.special.start_anchored_id; |
1552 | self.nfa.init_full_state(start_uid, NFA::FAIL)?; |
1553 | self.nfa.init_full_state(start_aid, NFA::FAIL)?; |
1554 | Ok(()) |
1555 | } |
1556 | |
1557 | /// Setup the anchored start state by copying all of the transitions and |
1558 | /// matches from the unanchored starting state with one change: the failure |
1559 | /// transition is changed to the DEAD state, so that for any undefined |
1560 | /// transitions, the search will stop. |
1561 | fn set_anchored_start_state(&mut self) -> Result<(), BuildError> { |
1562 | let start_uid = self.nfa.special.start_unanchored_id; |
1563 | let start_aid = self.nfa.special.start_anchored_id; |
1564 | let (mut uprev_link, mut aprev_link) = (None, None); |
1565 | loop { |
1566 | let unext = self.nfa.next_link(start_uid, uprev_link); |
1567 | let anext = self.nfa.next_link(start_aid, aprev_link); |
1568 | let (ulink, alink) = match (unext, anext) { |
1569 | (Some(ulink), Some(alink)) => (ulink, alink), |
1570 | (None, None) => break, |
1571 | _ => unreachable!(), |
1572 | }; |
1573 | uprev_link = Some(ulink); |
1574 | aprev_link = Some(alink); |
1575 | self.nfa.sparse[alink].next = self.nfa.sparse[ulink].next; |
1576 | } |
1577 | self.nfa.copy_matches(start_uid, start_aid)?; |
1578 | // This is the main difference between the unanchored and anchored |
1579 | // starting states. If a lookup on an anchored starting state fails, |
1580 | // then the search should stop. |
1581 | // |
1582 | // N.B. This assumes that the loop on the unanchored starting state |
1583 | // hasn't been created yet. |
1584 | self.nfa.states[start_aid].fail = NFA::DEAD; |
1585 | Ok(()) |
1586 | } |
1587 | |
1588 | /// Set the failure transitions on the start state to loop back to the |
1589 | /// start state. This effectively permits the Aho-Corasick automaton to |
1590 | /// match at any position. This is also required for finding the next |
1591 | /// state to terminate, namely, finding the next state should never return |
1592 | /// a fail_id. |
1593 | /// |
1594 | /// This must be done after building the initial trie, since trie |
1595 | /// construction depends on transitions to `fail_id` to determine whether a |
1596 | /// state already exists or not. |
1597 | fn add_unanchored_start_state_loop(&mut self) { |
1598 | let start_uid = self.nfa.special.start_unanchored_id; |
1599 | let mut prev_link = None; |
1600 | while let Some(link) = self.nfa.next_link(start_uid, prev_link) { |
1601 | prev_link = Some(link); |
1602 | if self.nfa.sparse[link].next() == NFA::FAIL { |
1603 | self.nfa.sparse[link].next = start_uid; |
1604 | } |
1605 | } |
1606 | } |
1607 | |
1608 | /// Remove the start state loop by rewriting any transitions on the start |
1609 | /// state back to the start state with transitions to the dead state. |
1610 | /// |
1611 | /// The loop is only closed when two conditions are met: the start state |
1612 | /// is a match state and the match kind is leftmost-first or |
1613 | /// leftmost-longest. |
1614 | /// |
1615 | /// The reason for this is that under leftmost semantics, a start state |
1616 | /// that is also a match implies that we should never restart the search |
1617 | /// process. We allow normal transitions out of the start state, but if |
1618 | /// none exist, we transition to the dead state, which signals that |
1619 | /// searching should stop. |
1620 | fn close_start_state_loop_for_leftmost(&mut self) { |
1621 | let start_uid = self.nfa.special.start_unanchored_id; |
1622 | let start = &mut self.nfa.states[start_uid]; |
1623 | let dense = start.dense; |
1624 | if self.builder.match_kind.is_leftmost() && start.is_match() { |
1625 | let mut prev_link = None; |
1626 | while let Some(link) = self.nfa.next_link(start_uid, prev_link) { |
1627 | prev_link = Some(link); |
1628 | if self.nfa.sparse[link].next() == start_uid { |
1629 | self.nfa.sparse[link].next = NFA::DEAD; |
1630 | if dense != StateID::ZERO { |
1631 | let b = self.nfa.sparse[link].byte; |
1632 | let class = usize::from(self.nfa.byte_classes.get(b)); |
1633 | self.nfa.dense[dense.as_usize() + class] = NFA::DEAD; |
1634 | } |
1635 | } |
1636 | } |
1637 | } |
1638 | } |
1639 | |
1640 | /// Sets all transitions on the dead state to point back to the dead state. |
1641 | /// Normally, missing transitions map back to the failure state, but the |
1642 | /// point of the dead state is to act as a sink that can never be escaped. |
1643 | fn add_dead_state_loop(&mut self) -> Result<(), BuildError> { |
1644 | self.nfa.init_full_state(NFA::DEAD, NFA::DEAD)?; |
1645 | Ok(()) |
1646 | } |
1647 | } |
1648 | |
1649 | /// A set of state identifiers used to avoid revisiting the same state multiple |
1650 | /// times when filling in failure transitions. |
1651 | /// |
1652 | /// This set has an "inert" and an "active" mode. When inert, the set never |
1653 | /// stores anything and always returns `false` for every member test. This is |
1654 | /// useful to avoid the performance and memory overhead of maintaining this |
1655 | /// set when it is not needed. |
1656 | #[derive (Debug)] |
1657 | struct QueuedSet { |
1658 | set: Option<BTreeSet<StateID>>, |
1659 | } |
1660 | |
1661 | impl QueuedSet { |
1662 | /// Return an inert set that returns `false` for every state ID membership |
1663 | /// test. |
1664 | fn inert() -> QueuedSet { |
1665 | QueuedSet { set: None } |
1666 | } |
1667 | |
1668 | /// Return an active set that tracks state ID membership. |
1669 | fn active() -> QueuedSet { |
1670 | QueuedSet { set: Some(BTreeSet::new()) } |
1671 | } |
1672 | |
1673 | /// Inserts the given state ID into this set. (If the set is inert, then |
1674 | /// this is a no-op.) |
1675 | fn insert(&mut self, state_id: StateID) { |
1676 | if let Some(ref mut set) = self.set { |
1677 | set.insert(state_id); |
1678 | } |
1679 | } |
1680 | |
1681 | /// Returns true if and only if the given state ID is in this set. If the |
1682 | /// set is inert, this always returns false. |
1683 | fn contains(&self, state_id: StateID) -> bool { |
1684 | match self.set { |
1685 | None => false, |
1686 | Some(ref set) => set.contains(&state_id), |
1687 | } |
1688 | } |
1689 | } |
1690 | |
// Renders the NFA in a human-readable multi-line form: one line per state
// (with its failure transition and byte-range transitions), followed by a
// summary of the automaton's configuration and size. The exact output
// format is load-bearing for debugging, so the write calls below should
// not be reformatted casually.
impl core::fmt::Debug for NFA {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        use crate::{
            automaton::{fmt_state_indicator, sparse_transitions},
            util::debug::DebugByte,
        };

        writeln!(f, "noncontiguous::NFA(" )?;
        for (sid, state) in self.states.iter().with_state_ids() {
            // The FAIL state doesn't actually have space for a state allocated
            // for it, so we have to treat it as a special case.
            if sid == NFA::FAIL {
                writeln!(f, "F {:06}:" , sid.as_usize())?;
                continue;
            }
            fmt_state_indicator(f, self, sid)?;
            // State ID followed by its failure transition's ID.
            write!(
                f,
                " {:06}( {:06}): " ,
                sid.as_usize(),
                state.fail.as_usize()
            )?;

            // 'sparse_transitions' coalesces consecutive bytes that map to
            // the same next state, so each item is a (start, end, next)
            // range rather than a single-byte transition.
            let it = sparse_transitions(
                self.iter_trans(sid).map(|t| (t.byte, t.next)),
            )
            .enumerate();
            for (i, (start, end, sid)) in it {
                if i > 0 {
                    write!(f, ", " )?;
                }
                if start == end {
                    // A range of length one prints as a single byte.
                    write!(
                        f,
                        " {:?} => {:?}" ,
                        DebugByte(start),
                        sid.as_usize()
                    )?;
                } else {
                    write!(
                        f,
                        " {:?}- {:?} => {:?}" ,
                        DebugByte(start),
                        DebugByte(end),
                        sid.as_usize()
                    )?;
                }
            }

            write!(f, " \n" )?;
            // For match states, also list the IDs of the patterns that
            // match when this state is entered.
            if self.is_match(sid) {
                write!(f, " matches: " )?;
                for (i, pid) in self.iter_matches(sid).enumerate() {
                    if i > 0 {
                        write!(f, ", " )?;
                    }
                    write!(f, " {}" , pid.as_usize())?;
                }
                write!(f, " \n" )?;
            }
        }
        // Trailing summary of the automaton's configuration and size.
        writeln!(f, "match kind: {:?}" , self.match_kind)?;
        writeln!(f, "prefilter: {:?}" , self.prefilter.is_some())?;
        writeln!(f, "state length: {:?}" , self.states.len())?;
        writeln!(f, "pattern length: {:?}" , self.patterns_len())?;
        writeln!(f, "shortest pattern length: {:?}" , self.min_pattern_len)?;
        writeln!(f, "longest pattern length: {:?}" , self.max_pattern_len)?;
        writeln!(f, "memory usage: {:?}" , self.memory_usage())?;
        writeln!(f, ")" )?;
        Ok(())
    }
}
1763 | |