1/*!
2Provides a noncontiguous NFA implementation of Aho-Corasick.
3
4This is a low-level API that generally only needs to be used in niche
5circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
6instead of a noncontiguous NFA directly. Using an `NFA` directly is typically
7only necessary when one needs access to the [`Automaton`] trait implementation.
8*/
9
10use alloc::{
11 collections::{BTreeSet, VecDeque},
12 vec,
13 vec::Vec,
14};
15
16use crate::{
17 automaton::Automaton,
18 util::{
19 alphabet::{ByteClassSet, ByteClasses},
20 error::{BuildError, MatchError},
21 prefilter::{self, opposite_ascii_case, Prefilter},
22 primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
23 remapper::Remapper,
24 search::{Anchored, MatchKind},
25 special::Special,
26 },
27};
28
/// A noncontiguous NFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using an `NFA` directly is typically only necessary
/// when one needs access to the [`Automaton`] trait implementation.
///
/// This NFA represents the "core" implementation of Aho-Corasick in this
/// crate. Namely, constructing this NFA involves building a trie and then
/// filling in the failure transitions between states, similar to what is
/// described in any standard textbook description of Aho-Corasick.
///
/// In order to minimize heap usage and to avoid additional construction costs,
/// this implementation represents the transitions of all states as distinct
/// sparse memory allocations. This is where it gets its name from. That is,
/// this NFA has no contiguous memory allocation for its transition table. Each
/// state gets its own allocation.
///
/// While the sparse representation keeps memory usage to somewhat reasonable
/// levels, it is still quite large and also results in somewhat mediocre
/// search performance. For this reason, it is almost always a good idea to
/// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is
/// marginally slower to build, but has higher throughput and can sometimes use
/// an order of magnitude less memory. The main reason to use a noncontiguous
/// NFA is when you need the fastest possible construction time, or when a
/// contiguous NFA does not have the desired capacity. (The total number of NFA
/// states it can have is fewer than a noncontiguous NFA.)
///
/// # Example
///
/// This example shows how to build an `NFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     nfa::noncontiguous::NFA,
///     Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let nfa = NFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     nfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive(Clone)]
pub struct NFA {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State>,
    /// Transitions stored in a sparse representation via a linked list.
    ///
    /// Each transition contains three pieces of information: the byte it
    /// is defined for, the state it transitions to and a link to the next
    /// transition in the same state (or `StateID::ZERO` if it is the last
    /// transition).
    ///
    /// The first transition for each state is determined by `State::sparse`.
    /// Each state's transition list is kept sorted in ascending byte order
    /// (see `NFA::add_transition`).
    ///
    /// Note that this contains a complete set of all transitions in this NFA,
    /// including states that have a dense representation for transitions.
    /// (Adding dense transitions for a state doesn't remove its sparse
    /// transitions, since deleting transitions from this particular sparse
    /// representation would be fairly expensive.)
    sparse: Vec<Transition>,
    /// Transitions stored in a dense representation.
    ///
    /// A state has a row in this table if and only if `State::dense` is
    /// not equal to `StateID::ZERO`. When not zero, there are precisely
    /// `NFA::byte_classes::alphabet_len()` entries beginning at `State::dense`
    /// in this table.
    ///
    /// Generally a very small minority of states have a dense representation
    /// since it uses so much memory.
    dense: Vec<StateID>,
    /// Matches stored in linked list for each state.
    ///
    /// Like sparse transitions, each match has a link to the next match in the
    /// state.
    ///
    /// The first match for each state is determined by `State::matches`.
    matches: Vec<Match>,
    /// The length, in bytes, of each pattern in this NFA. This slice is
    /// indexed by `PatternID`.
    ///
    /// The number of entries in this vector corresponds to the total number of
    /// patterns in this automaton.
    pattern_lens: Vec<SmallIndex>,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<Prefilter>,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// The length, in bytes, of the shortest pattern in this automaton. This
    /// information is useful for detecting whether an automaton matches the
    /// empty string or not.
    min_pattern_len: usize,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The information required to deduce which states are "special" in this
    /// NFA.
    ///
    /// Since the DEAD and FAIL states are always the first two states and
    /// there are only ever two start states (which follow all of the match
    /// states), it follows that we can determine whether a state is a fail,
    /// dead, match or start with just a few comparisons on the ID itself:
    ///
    ///    is_dead(sid): sid == NFA::DEAD
    ///    is_fail(sid): sid == NFA::FAIL
    ///   is_match(sid): NFA::FAIL < sid && sid <= max_match_id
    ///   is_start(sid): sid == start_unanchored_id || sid == start_anchored_id
    ///
    /// Note that this only applies to the NFA after it has been constructed.
    /// During construction, the start states are the first ones added and the
    /// match states are inter-leaved with non-match states. Once all of the
    /// states have been added, the states are shuffled such that the above
    /// predicates hold.
    special: Special,
}
176
177impl NFA {
178 /// Create a new Aho-Corasick noncontiguous NFA using the default
179 /// configuration.
180 ///
181 /// Use a [`Builder`] if you want to change the configuration.
182 pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError>
183 where
184 I: IntoIterator<Item = P>,
185 P: AsRef<[u8]>,
186 {
187 NFA::builder().build(patterns)
188 }
189
190 /// A convenience method for returning a new Aho-Corasick noncontiguous NFA
191 /// builder.
192 ///
193 /// This usually permits one to just import the `NFA` type.
194 pub fn builder() -> Builder {
195 Builder::new()
196 }
197}
198
impl NFA {
    /// The DEAD state is a sentinel state like the FAIL state. The DEAD state
    /// instructs any search to stop and return any currently recorded match,
    /// or no match otherwise. Generally speaking, it is impossible for an
    /// unanchored standard search to enter a DEAD state. But an anchored
    /// search can, and so too can a leftmost search.
    ///
    /// We put DEAD before FAIL so that DEAD is always 0. We repeat this
    /// decision across the other Aho-Corasick automata, so that DEAD
    /// states there are always 0 too. It's not that we need all of the
    /// implementations to agree, but rather, the contiguous NFA and the DFA
    /// use a sort of "premultiplied" state identifier where the only state
    /// whose ID is always known and constant is the first state. Subsequent
    /// state IDs depend on how much space has already been used in the
    /// transition table.
    pub(crate) const DEAD: StateID = StateID::new_unchecked(0);
    /// The FAIL state mostly just corresponds to the ID of any transition on a
    /// state that isn't explicitly defined. When one transitions into the FAIL
    /// state, one must follow the previous state's failure transition before
    /// doing the next state lookup. In this way, FAIL is more of a sentinel
    /// than a state that one actually transitions into. In particular, it is
    /// never exposed in the `Automaton` interface.
    pub(crate) const FAIL: StateID = StateID::new_unchecked(1);

    /// Returns the equivalence classes of bytes found while constructing
    /// this NFA.
    ///
    /// Note that the NFA doesn't actually make use of these equivalence
    /// classes. Instead, these are useful for building the DFA when desired.
    pub(crate) fn byte_classes(&self) -> &ByteClasses {
        &self.byte_classes
    }

    /// Returns a slice containing the length of each pattern in this searcher.
    /// It is indexed by `PatternID` and has length `NFA::patterns_len`.
    ///
    /// This is exposed for convenience when building a contiguous NFA. But it
    /// can be reconstructed from the `Automaton` API if necessary.
    pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] {
        &self.pattern_lens
    }

    /// Returns a slice of all states in this non-contiguous NFA.
    pub(crate) fn states(&self) -> &[State] {
        &self.states
    }

    /// Returns the underlying "special" state information for this NFA.
    pub(crate) fn special(&self) -> &Special {
        &self.special
    }

    /// Swaps the states at `id1` and `id2`.
    ///
    /// This does not update the transitions of any state to account for the
    /// state swap.
    pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
        self.states.swap(id1.as_usize(), id2.as_usize());
    }

    /// Re-maps all state IDs in this NFA according to the `map` function
    /// given.
    pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
        let alphabet_len = self.byte_classes.alphabet_len();
        for state in self.states.iter_mut() {
            state.fail = map(state.fail);
            // Remap the target of every sparse transition by walking this
            // state's linked list. (The links themselves are not remapped,
            // only the states that transitions point to.)
            let mut link = state.sparse;
            while link != StateID::ZERO {
                let t = &mut self.sparse[link];
                t.next = map(t.next);
                link = t.link;
            }
            // If this state also has a dense row, remap its entries too.
            if state.dense != StateID::ZERO {
                let start = state.dense.as_usize();
                for next in self.dense[start..][..alphabet_len].iter_mut() {
                    *next = map(*next);
                }
            }
        }
    }

    /// Iterate over all of the transitions for the given state ID.
    pub(crate) fn iter_trans(
        &self,
        sid: StateID,
    ) -> impl Iterator<Item = Transition> + '_ {
        let mut link = self.states[sid].sparse;
        core::iter::from_fn(move || {
            if link == StateID::ZERO {
                return None;
            }
            let t = self.sparse[link];
            link = t.link;
            Some(t)
        })
    }

    /// Iterate over all of the matches for the given state ID.
    pub(crate) fn iter_matches(
        &self,
        sid: StateID,
    ) -> impl Iterator<Item = PatternID> + '_ {
        let mut link = self.states[sid].matches;
        core::iter::from_fn(move || {
            if link == StateID::ZERO {
                return None;
            }
            let m = self.matches[link];
            link = m.link;
            Some(m.pid)
        })
    }

    /// Return the link following the one given. If the one given is the last
    /// link for the given state, then return `None`.
    ///
    /// If no previous link is given, then this returns the first link in the
    /// state, if one exists.
    ///
    /// This is useful for manually iterating over the transitions in a single
    /// state without borrowing the NFA. This permits mutating other parts of
    /// the NFA during iteration. Namely, one can access the transition pointed
    /// to by the link via `self.sparse[link]`.
    fn next_link(
        &self,
        sid: StateID,
        prev: Option<StateID>,
    ) -> Option<StateID> {
        let link =
            prev.map_or(self.states[sid].sparse, |p| self.sparse[p].link);
        if link == StateID::ZERO {
            None
        } else {
            Some(link)
        }
    }

    /// Follow the transition for the given byte in the given state. If no such
    /// transition exists, then the FAIL state ID is returned.
    #[inline(always)]
    fn follow_transition(&self, sid: StateID, byte: u8) -> StateID {
        let s = &self.states[sid];
        // This is a special case that targets starting states and states
        // near a start state. Namely, after the initial trie is constructed,
        // we look for states close to the start state to convert to a dense
        // representation for their transitions. This winds up using a lot more
        // memory per state in exchange for faster transition lookups. But
        // since we only do this for a small number of states (by default), the
        // memory usage is usually minimal.
        //
        // This has *massive* benefit when executing searches because the
        // unanchored starting state is by far the hottest state and is
        // frequently visited. Moreover, the 'for' loop below that works
        // decently on an actually sparse state is disastrous on a state that
        // is nearly or completely dense.
        if s.dense == StateID::ZERO {
            self.follow_transition_sparse(sid, byte)
        } else {
            let class = usize::from(self.byte_classes.get(byte));
            self.dense[s.dense.as_usize() + class]
        }
    }

    /// Like `follow_transition`, but always uses the sparse representation.
    #[inline(always)]
    fn follow_transition_sparse(&self, sid: StateID, byte: u8) -> StateID {
        // Transitions are stored in ascending byte order (maintained by
        // `NFA::add_transition`), so we can stop scanning as soon as we see
        // a transition byte bigger than the one we're looking for.
        for t in self.iter_trans(sid) {
            if byte <= t.byte {
                if byte == t.byte {
                    return t.next;
                }
                break;
            }
        }
        NFA::FAIL
    }

    /// Set the transition for the given byte to the state ID given.
    ///
    /// Note that one should not set transitions to the FAIL state. It is not
    /// technically incorrect, but it wastes space. If a transition is not
    /// defined, then it is automatically assumed to lead to the FAIL state.
    fn add_transition(
        &mut self,
        prev: StateID,
        byte: u8,
        next: StateID,
    ) -> Result<(), BuildError> {
        // If the state has been densified, keep its dense row in sync too.
        if self.states[prev].dense != StateID::ZERO {
            let dense = self.states[prev].dense;
            let class = usize::from(self.byte_classes.get(byte));
            self.dense[dense.as_usize() + class] = next;
        }

        let head = self.states[prev].sparse;
        if head == StateID::ZERO || byte < self.sparse[head].byte {
            let new_link = self.alloc_transition()?;
            self.sparse[new_link] = Transition { byte, next, link: head };
            self.states[prev].sparse = new_link;
            return Ok(());
        } else if byte == self.sparse[head].byte {
            self.sparse[head].next = next;
            return Ok(());
        }

        // We handled the only cases where the beginning of the transition
        // chain needs to change. At this point, we now know that there is
        // at least one entry in the transition chain and the byte for that
        // transition is less than the byte for the transition we're adding.
        let (mut link_prev, mut link_next) = (head, self.sparse[head].link);
        while link_next != StateID::ZERO && byte > self.sparse[link_next].byte
        {
            link_prev = link_next;
            link_next = self.sparse[link_next].link;
        }
        if link_next == StateID::ZERO || byte < self.sparse[link_next].byte {
            let link = self.alloc_transition()?;
            self.sparse[link] = Transition { byte, next, link: link_next };
            self.sparse[link_prev].link = link;
        } else {
            assert_eq!(byte, self.sparse[link_next].byte);
            self.sparse[link_next].next = next;
        }
        Ok(())
    }

    /// This sets every possible transition (all 256 of them) for the given
    /// state to the same `next` value.
    ///
    /// This is useful for efficiently initializing start/dead states.
    ///
    /// # Panics
    ///
    /// This requires that the state has no transitions added to it already.
    /// If it has any transitions, then this panics. It will also panic if
    /// the state has been densified prior to calling this.
    fn init_full_state(
        &mut self,
        prev: StateID,
        next: StateID,
    ) -> Result<(), BuildError> {
        assert_eq!(
            StateID::ZERO,
            self.states[prev].dense,
            "state must not be dense yet"
        );
        assert_eq!(
            StateID::ZERO,
            self.states[prev].sparse,
            "state must have zero transitions"
        );
        let mut prev_link = StateID::ZERO;
        // Bytes are visited in ascending order, which preserves the sorted
        // invariant of the sparse transition list.
        for byte in 0..=255 {
            let new_link = self.alloc_transition()?;
            self.sparse[new_link] =
                Transition { byte, next, link: StateID::ZERO };
            if prev_link == StateID::ZERO {
                self.states[prev].sparse = new_link;
            } else {
                self.sparse[prev_link].link = new_link;
            }
            prev_link = new_link;
        }
        Ok(())
    }

    /// Add a match for the given pattern ID to the state for the given ID.
    fn add_match(
        &mut self,
        sid: StateID,
        pid: PatternID,
    ) -> Result<(), BuildError> {
        let head = self.states[sid].matches;
        let mut link = head;
        // NOTE(review): when `head` is ZERO (no matches yet), this reads
        // `self.matches[0]`; this assumes a sentinel entry with a ZERO link
        // exists at index 0 — confirm in the compiler's setup code.
        while self.matches[link].link != StateID::ZERO {
            link = self.matches[link].link;
        }
        let new_match_link = self.alloc_match()?;
        self.matches[new_match_link].pid = pid;
        if link == StateID::ZERO {
            self.states[sid].matches = new_match_link;
        } else {
            self.matches[link].link = new_match_link;
        }
        Ok(())
    }

    /// Copy matches from the `src` state to the `dst` state. This is useful
    /// when a match state can be reached via a failure transition. In which
    /// case, you'll want to copy the matches (if any) from the state reached
    /// by the failure transition to the original state you were at.
    fn copy_matches(
        &mut self,
        src: StateID,
        dst: StateID,
    ) -> Result<(), BuildError> {
        // Find the tail of `dst`'s match list so new matches are appended.
        let head_dst = self.states[dst].matches;
        let mut link_dst = head_dst;
        while self.matches[link_dst].link != StateID::ZERO {
            link_dst = self.matches[link_dst].link;
        }
        let mut link_src = self.states[src].matches;
        while link_src != StateID::ZERO {
            // This is `alloc_match` inlined, since calling it here would
            // conflict with the borrows of `self.matches` below.
            let new_match_link =
                StateID::new(self.matches.len()).map_err(|e| {
                    BuildError::state_id_overflow(
                        StateID::MAX.as_u64(),
                        e.attempted(),
                    )
                })?;
            self.matches.push(Match {
                pid: self.matches[link_src].pid,
                link: StateID::ZERO,
            });
            if link_dst == StateID::ZERO {
                self.states[dst].matches = new_match_link;
            } else {
                self.matches[link_dst].link = new_match_link;
            }

            link_dst = new_match_link;
            link_src = self.matches[link_src].link;
        }
        Ok(())
    }

    /// Create a new entry in `NFA::trans`, if there's room, and return that
    /// entry's ID. If there's no room, then an error is returned.
    fn alloc_transition(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.sparse.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.sparse.push(Transition::default());
        Ok(id)
    }

    /// Create a new entry in `NFA::matches`, if there's room, and return that
    /// entry's ID. If there's no room, then an error is returned.
    fn alloc_match(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.matches.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.matches.push(Match::default());
        Ok(id)
    }

    /// Create a new set of `N` transitions in this NFA's dense transition
    /// table. The ID returned corresponds to the index at which the `N`
    /// transitions begin. So `id+0` is the first transition and `id+(N-1)` is
    /// the last.
    ///
    /// `N` is determined via `NFA::byte_classes::alphabet_len`.
    fn alloc_dense_state(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.dense.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        // We use FAIL because it's the correct default. If a state doesn't
        // have a transition defined for every possible byte value, then the
        // transition function should return NFA::FAIL.
        self.dense.extend(
            core::iter::repeat(NFA::FAIL)
                .take(self.byte_classes.alphabet_len()),
        );
        Ok(id)
    }

    /// Allocate and add a fresh state to the underlying NFA and return its
    /// ID (guaranteed to be one more than the ID of the previously allocated
    /// state). If the ID would overflow `StateID`, then this returns an error.
    fn alloc_state(&mut self, depth: usize) -> Result<StateID, BuildError> {
        // This is OK because we error when building the trie if we see a
        // pattern whose length cannot fit into a 'SmallIndex', and the longest
        // possible depth corresponds to the length of the longest pattern.
        let depth = SmallIndex::new(depth)
            .expect("patterns longer than SmallIndex::MAX are not allowed");
        let id = StateID::new(self.states.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.states.push(State {
            sparse: StateID::ZERO,
            dense: StateID::ZERO,
            matches: StateID::ZERO,
            fail: self.special.start_unanchored_id,
            depth,
        });
        Ok(id)
    }
}
587
// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
// returns a valid state ID given a valid state ID. We otherwise claim that
// all other methods are correct as well.
unsafe impl Automaton for NFA {
    #[inline(always)]
    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
        // Both start states always exist, so this never returns an error.
        match anchored {
            Anchored::No => Ok(self.special.start_unanchored_id),
            Anchored::Yes => Ok(self.special.start_anchored_id),
        }
    }

    #[inline(always)]
    fn next_state(
        &self,
        anchored: Anchored,
        mut sid: StateID,
        byte: u8,
    ) -> StateID {
        // This terminates since:
        //
        // 1. state.fail never points to the FAIL state.
        // 2. All state.fail values point to a state closer to the start state.
        // 3. The start state has no transitions to the FAIL state.
        loop {
            let next = self.follow_transition(sid, byte);
            if next != NFA::FAIL {
                return next;
            }
            // For an anchored search, we never follow failure transitions
            // because failure transitions lead us down a path to matching
            // a *proper* suffix of the path we were on. Thus, it can only
            // produce matches that appear after the beginning of the search.
            if anchored.is_anchored() {
                return NFA::DEAD;
            }
            sid = self.states[sid].fail();
        }
    }

    #[inline(always)]
    fn is_special(&self, sid: StateID) -> bool {
        sid <= self.special.max_special_id
    }

    #[inline(always)]
    fn is_dead(&self, sid: StateID) -> bool {
        sid == NFA::DEAD
    }

    #[inline(always)]
    fn is_match(&self, sid: StateID) -> bool {
        // N.B. This returns true when sid==NFA::FAIL but that's okay because
        // NFA::FAIL is not actually a valid state ID from the perspective of
        // the Automaton trait. Namely, it is never returned by 'start_state'
        // or by 'next_state'. So we don't need to care about it here.
        !self.is_dead(sid) && sid <= self.special.max_match_id
    }

    #[inline(always)]
    fn is_start(&self, sid: StateID) -> bool {
        sid == self.special.start_unanchored_id
            || sid == self.special.start_anchored_id
    }

    #[inline(always)]
    fn match_kind(&self) -> MatchKind {
        self.match_kind
    }

    #[inline(always)]
    fn patterns_len(&self) -> usize {
        self.pattern_lens.len()
    }

    #[inline(always)]
    fn pattern_len(&self, pid: PatternID) -> usize {
        self.pattern_lens[pid].as_usize()
    }

    #[inline(always)]
    fn min_pattern_len(&self) -> usize {
        self.min_pattern_len
    }

    #[inline(always)]
    fn max_pattern_len(&self) -> usize {
        self.max_pattern_len
    }

    #[inline(always)]
    fn match_len(&self, sid: StateID) -> usize {
        // This walks the state's match linked list, so it takes time
        // proportional to the number of matches in the state.
        self.iter_matches(sid).count()
    }

    #[inline(always)]
    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
        // Panics (via unwrap) if 'index >= self.match_len(sid)'.
        self.iter_matches(sid).nth(index).unwrap()
    }

    #[inline(always)]
    fn memory_usage(&self) -> usize {
        // Heap usage only: sums the sizes of all of the NFA's vectors plus
        // the prefilter's own heap usage (if one exists).
        self.states.len() * core::mem::size_of::<State>()
            + self.sparse.len() * core::mem::size_of::<Transition>()
            + self.matches.len() * core::mem::size_of::<Match>()
            + self.dense.len() * StateID::SIZE
            + self.pattern_lens.len() * SmallIndex::SIZE
            + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
    }

    #[inline(always)]
    fn prefilter(&self) -> Option<&Prefilter> {
        self.prefilter.as_ref()
    }
}
703
/// A representation of a sparse NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte
/// and the matches implied by visiting this state (if any).
#[derive(Clone, Debug)]
pub(crate) struct State {
    /// A pointer to `NFA::trans` corresponding to the head of a linked list
    /// containing all of the transitions for this state. The list is kept
    /// sorted in ascending byte order (see `NFA::add_transition`).
    ///
    /// This is `StateID::ZERO` if and only if this state has zero transitions.
    sparse: StateID,
    /// A pointer to a row of `N` transitions in `NFA::dense`. These
    /// transitions correspond precisely to what is obtained by traversing
    /// `sparse`, but permits constant time lookup.
    ///
    /// When this is zero (which is true for most states in the default
    /// configuration), then this state has no dense representation.
    ///
    /// Note that `N` is equal to `NFA::byte_classes::alphabet_len()`. This is
    /// typically much less than 256 (the maximum value).
    dense: StateID,
    /// A pointer to `NFA::matches` corresponding to the head of a linked list
    /// containing all of the matches for this state.
    ///
    /// This is `StateID::ZERO` if and only if this state is not a match state.
    matches: StateID,
    /// The state that should be transitioned to if the current byte in the
    /// haystack does not have a corresponding transition defined in this
    /// state.
    fail: StateID,
    /// The depth of this state. Specifically, this is the distance from this
    /// state to the starting state. (For the special sentinel states DEAD and
    /// FAIL, their depth is always 0.) The depth of a starting state is 0.
    ///
    /// Note that depth is currently not used in this non-contiguous NFA. It
    /// may in the future, but it is used in the contiguous NFA. Namely, it
    /// permits an optimization where states near the starting state have their
    /// transitions stored in a dense fashion, but all other states have their
    /// transitions stored in a sparse fashion. (This non-contiguous NFA uses
    /// a sparse representation for all states unconditionally.) In any case,
    /// this is really the only convenient place to compute and store this
    /// information, which we need when building the contiguous NFA.
    depth: SmallIndex,
}
749
impl State {
    /// Return true if and only if this state is a match state.
    pub(crate) fn is_match(&self) -> bool {
        // A state is a match state exactly when its match list is non-empty.
        self.matches != StateID::ZERO
    }

    /// Returns the failure transition for this state. This is the state that
    /// a search falls back to when no transition is defined for the current
    /// haystack byte.
    pub(crate) fn fail(&self) -> StateID {
        self.fail
    }

    /// Returns the depth of this state. That is, the number of transitions
    /// this state is from the start state of the NFA.
    pub(crate) fn depth(&self) -> SmallIndex {
        self.depth
    }
}
767
/// A single transition in a non-contiguous NFA.
#[derive(Clone, Copy, Default)]
#[repr(packed)]
pub(crate) struct Transition {
    /// The byte this transition is defined for.
    byte: u8,
    /// The state this transition points to.
    next: StateID,
    /// A pointer to the next transition in the same state's linked list, or
    /// `StateID::ZERO` if this is the last transition in the state.
    link: StateID,
}
776
impl Transition {
    /// Return the byte for which this transition is defined.
    pub(crate) fn byte(&self) -> u8 {
        self.byte
    }

    /// Return the ID of the state that this transition points to.
    pub(crate) fn next(&self) -> StateID {
        self.next
    }

    /// Return the ID of the next transition in the same state, or
    /// `StateID::ZERO` if this is the last transition.
    fn link(&self) -> StateID {
        self.link
    }
}
793
794impl core::fmt::Debug for Transition {
795 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
796 write!(
797 f,
798 "Transition(byte: {:X?}, next: {:?}, link: {:?})",
799 self.byte,
800 self.next().as_usize(),
801 self.link().as_usize()
802 )
803 }
804}
805
/// A single match in a non-contiguous NFA.
#[derive(Clone, Copy, Default)]
struct Match {
    /// The ID of the pattern that matches.
    pid: PatternID,
    /// A pointer to the next match in the same state's linked list, or
    /// `StateID::ZERO` if this is the last match in the state.
    link: StateID,
}
812
impl Match {
    /// Return the pattern ID for this match.
    pub(crate) fn pattern(&self) -> PatternID {
        self.pid
    }

    /// Return the ID of the next match in the same state, or `StateID::ZERO`
    /// if this is the last match.
    fn link(&self) -> StateID {
        self.link
    }
}
824
825impl core::fmt::Debug for Match {
826 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
827 write!(
828 f,
829 "Match(pid: {:?}, link: {:?})",
830 self.pattern().as_usize(),
831 self.link().as_usize()
832 )
833 }
834}
835
/// A builder for configuring an Aho-Corasick noncontiguous NFA.
///
/// This builder has a subset of the options available to a
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive(Clone, Debug)]
pub struct Builder {
    /// The match semantics to build the NFA with. See `Builder::match_kind`.
    match_kind: MatchKind,
    /// Whether to enable heuristic prefilter optimizations. See
    /// `Builder::prefilter`.
    prefilter: bool,
    /// Whether to do ASCII-aware case insensitive matching. See
    /// `Builder::ascii_case_insensitive`.
    ascii_case_insensitive: bool,
    /// The depth up to which states get a dense representation for their
    /// transitions. See `Builder::dense_depth`.
    dense_depth: usize,
}
848
849impl Default for Builder {
850 fn default() -> Builder {
851 Builder {
852 match_kind: MatchKind::default(),
853 prefilter: true,
854 ascii_case_insensitive: false,
855 dense_depth: 3,
856 }
857 }
858}
859
impl Builder {
    /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA.
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Build an Aho-Corasick noncontiguous NFA from the given iterator of
    /// patterns.
    ///
    /// A builder may be reused to create more NFAs.
    ///
    /// # Errors
    ///
    /// This returns an error when the NFA could not be constructed, e.g.,
    /// when the state identifier space is exhausted. See [`BuildError`].
    pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        debug!("building non-contiguous NFA");
        let nfa = Compiler::new(self)?.compile(patterns)?;
        debug!(
            "non-contiguous NFA built, <states: {:?}, size: {:?}>",
            nfa.states.len(),
            nfa.memory_usage()
        );
        Ok(nfa)
    }

    /// Set the desired match semantics.
    ///
    /// See
    /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
    /// for more documentation and examples.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
        self.match_kind = kind;
        self
    }

    /// Enable ASCII-aware case insensitive matching.
    ///
    /// See
    /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
    /// for more documentation and examples.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.ascii_case_insensitive = yes;
        self
    }

    /// Set the limit on how many states use a dense representation for their
    /// transitions. Other states will generally use a sparse representation.
    ///
    /// See
    /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth)
    /// for more documentation and examples.
    pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
        self.dense_depth = depth;
        self
    }

    /// Enable heuristic prefilter optimizations.
    ///
    /// See
    /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
    /// for more documentation and examples.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
        self.prefilter = yes;
        self
    }
}
926
/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks, with some tweaks to support leftmost
/// searching.
#[derive(Debug)]
struct Compiler<'a> {
    // The builder whose configuration drives every compilation decision.
    builder: &'a Builder,
    // Accumulates the patterns so that an (optional) prefilter can be built
    // at the end of compilation.
    prefilter: prefilter::Builder,
    // The NFA under construction.
    nfa: NFA,
    // The set of byte ranges seen in the patterns, used to compute the
    // equivalence classes stored in `nfa.byte_classes`.
    byteset: ByteClassSet,
}
938
939impl<'a> Compiler<'a> {
940 fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> {
941 let prefilter = prefilter::Builder::new(builder.match_kind)
942 .ascii_case_insensitive(builder.ascii_case_insensitive);
943 Ok(Compiler {
944 builder,
945 prefilter,
946 nfa: NFA {
947 match_kind: builder.match_kind,
948 states: vec![],
949 sparse: vec![],
950 dense: vec![],
951 matches: vec![],
952 pattern_lens: vec![],
953 prefilter: None,
954 byte_classes: ByteClasses::singletons(),
955 min_pattern_len: usize::MAX,
956 max_pattern_len: 0,
957 special: Special::zero(),
958 },
959 byteset: ByteClassSet::empty(),
960 })
961 }
962
    /// Compile the given patterns into a single noncontiguous NFA.
    ///
    /// This is the top-level construction pipeline: allocate the sentinel
    /// and start states, build the trie, compute byte classes, densify hot
    /// states, fill in failure transitions and finally shuffle states into
    /// their canonical order. The steps below are order sensitive; see the
    /// comments on each step for why.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        // Add dummy transition/match links, so that no valid link will point
        // to another link at index 0.
        self.nfa.sparse.push(Transition::default());
        self.nfa.matches.push(Match::default());
        // Add a dummy dense transition so that no states can have dense==0
        // represent a valid pointer to dense transitions. This permits
        // dense==0 to be a sentinel indicating "no dense transitions."
        self.nfa.dense.push(NFA::DEAD);
        // the dead state, only used for leftmost and fixed to id==0
        self.nfa.alloc_state(0)?;
        // the fail state, which is never entered and fixed to id==1
        self.nfa.alloc_state(0)?;
        // unanchored start state, initially fixed to id==2 but later shuffled
        // to appear after all non-start match states.
        self.nfa.special.start_unanchored_id = self.nfa.alloc_state(0)?;
        // anchored start state, initially fixed to id==3 but later shuffled
        // to appear after unanchored start state.
        self.nfa.special.start_anchored_id = self.nfa.alloc_state(0)?;
        // Initialize the unanchored starting state in order to make it dense,
        // and thus make transition lookups on this state faster.
        self.init_unanchored_start_state()?;
        // Set all transitions on the DEAD state to point to itself. This way,
        // the DEAD state can never be escaped. It MUST be used as a sentinel
        // in any correct search.
        self.add_dead_state_loop()?;
        // Build the base trie from the given patterns.
        self.build_trie(patterns)?;
        self.nfa.states.shrink_to_fit();
        // Turn our set of bytes into equivalent classes. This NFA
        // implementation uses byte classes only for states that use a dense
        // representation of transitions. (And that's why this comes before
        // `self.densify()`, as the byte classes need to be set first.)
        self.nfa.byte_classes = self.byteset.byte_classes();
        // Add transitions (and maybe matches) to the anchored starting state.
        // The anchored starting state is used for anchored searches. The only
        // mechanical difference between it and the unanchored start state is
        // that missing transitions map to the DEAD state instead of the FAIL
        // state.
        self.set_anchored_start_state()?;
        // Rewrite transitions to the FAIL state on the unanchored start state
        // as self-transitions. This keeps the start state active at all times.
        self.add_unanchored_start_state_loop();
        // Make some (possibly zero) states use a dense representation for
        // transitions. It's important to do this right after the states
        // and non-failure transitions are solidified. That way, subsequent
        // accesses (particularly `fill_failure_transitions`) will benefit from
        // the faster transition lookup in densified states.
        self.densify()?;
        // The meat of the Aho-Corasick algorithm: compute and write failure
        // transitions. i.e., the state to move to when a transition isn't
        // defined in the current state. These are epsilon transitions and thus
        // make this formulation an NFA.
        self.fill_failure_transitions()?;
        // Handle a special case under leftmost semantics when at least one
        // of the patterns is the empty string.
        self.close_start_state_loop_for_leftmost();
        // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
        // NON-MATCH, ... This permits us to very quickly query the type of
        // the state we're currently in during a search.
        self.shuffle();
        self.nfa.prefilter = self.prefilter.build();
        // Store the maximum ID of all *relevant* special states. Start states
        // are only relevant when we have a prefilter, otherwise, there is zero
        // reason to care about whether a state is a start state or not during
        // a search. Indeed, without a prefilter, we are careful to explicitly
        // NOT care about start states, otherwise the search can ping pong
        // between the unrolled loop and the handling of special-status states
        // and destroy perf.
        self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
            // Why the anchored starting state? Because we always put it
            // after the unanchored starting state and it is therefore the
            // maximum. Why put unanchored followed by anchored? No particular
            // reason, but that's how the states are logically organized in the
            // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
            self.nfa.special.start_anchored_id
        } else {
            self.nfa.special.max_match_id
        };
        self.nfa.sparse.shrink_to_fit();
        self.nfa.dense.shrink_to_fit();
        self.nfa.matches.shrink_to_fit();
        self.nfa.pattern_lens.shrink_to_fit();
        Ok(self.nfa)
    }
1052
    /// This sets up the initial prefix trie that makes up the Aho-Corasick
    /// automaton. Effectively, it creates the basic structure of the
    /// automaton, where every pattern given has a path from the start state to
    /// the end of the pattern.
    ///
    /// This also records each pattern's length, updates the NFA's
    /// min/max pattern lengths and feeds every pattern to the prefilter
    /// builder (when prefilters are enabled).
    ///
    /// This returns an error if the number of patterns exceeds the maximum
    /// `PatternID`, if any single pattern is too long for a `SmallIndex`, or
    /// if allocating a new state fails.
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
            let pid = PatternID::new(i).map_err(|e| {
                BuildError::pattern_id_overflow(
                    PatternID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
            let pat = pat.as_ref();
            let patlen = SmallIndex::new(pat.len())
                .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
            self.nfa.min_pattern_len =
                core::cmp::min(self.nfa.min_pattern_len, pat.len());
            self.nfa.max_pattern_len =
                core::cmp::max(self.nfa.max_pattern_len, pat.len());
            assert_eq!(
                i,
                self.nfa.pattern_lens.len(),
                "expected number of patterns to match pattern ID"
            );
            self.nfa.pattern_lens.push(patlen);
            // We add the pattern to the prefilter here because the pattern
            // ID in the prefilter is determined with respect to the patterns
            // added to the prefilter. That is, it isn't the ID we have here,
            // but the one determined by its own accounting of patterns.
            // To ensure they line up, we add every pattern we see to the
            // prefilter, even if some patterns ultimately are impossible to
            // match (in leftmost-first semantics specifically).
            //
            // Another way of doing this would be to expose an API in the
            // prefilter to permit setting your own pattern IDs. Or to just use
            // our own map and go between them. But this case is sufficiently
            // rare that we don't bother and just make sure they're in sync.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }

            let mut prev = self.nfa.special.start_unanchored_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.states[prev].is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. These don't
                // get used while building the trie, but other Aho-Corasick
                // implementations may use them.
                self.byteset.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byteset.set_range(b, b);
                }

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.follow_transition(prev, b);
                if next != NFA::FAIL {
                    prev = next;
                } else {
                    let next = self.nfa.alloc_state(depth)?;
                    self.nfa.add_transition(prev, b, next)?;
                    if self.builder.ascii_case_insensitive {
                        let b = opposite_ascii_case(b);
                        self.nfa.add_transition(prev, b, next)?;
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.add_match(prev, pid)?;
        }
        Ok(())
    }
1151
    /// This routine creates failure transitions according to the standard
    /// textbook formulation of the Aho-Corasick algorithm, with a couple small
    /// tweaks to support "leftmost" semantics.
    ///
    /// Building failure transitions is the most interesting part of building
    /// the Aho-Corasick automaton, because they are what allow searches to
    /// be performed in linear time. Specifically, a failure transition is
    /// a single transition associated with each state that points back to
    /// the longest proper suffix of the pattern being searched. The failure
    /// transition is followed whenever there exists no transition on the
    /// current state for the current input byte. If there is no other proper
    /// suffix, then the failure transition points back to the starting state.
    ///
    /// For example, let's say we built an Aho-Corasick automaton with the
    /// following patterns: 'abcd' and 'cef'. The trie looks like this:
    ///
    /// ```ignore
    ///          a - S1 - b - S2 - c - S3 - d - S4*
    ///         /
    ///     S0 - c - S5 - e - S6 - f - S7*
    /// ```
    ///
    /// At this point, it should be fairly straight-forward to see how this
    /// trie can be used in a simplistic way. At any given position in the
    /// text we're searching (called the "subject" string), all we need to do
    /// is follow the transitions in the trie by consuming one transition for
    /// each byte in the subject string. If we reach a match state, then we can
    /// report that location as a match.
    ///
    /// The trick comes when searching a subject string like 'abcef'. We'll
    /// initially follow the transition from S0 to S1 and wind up in S3 after
    /// observing the 'c' byte. At this point, the next byte is 'e' but state
    /// S3 has no transition for 'e', so the search fails. We then would need
    /// to restart the search at the next position in 'abcef', which
    /// corresponds to 'b'. The match would fail, but the next search starting
    /// at 'c' would finally succeed. The problem with this approach is that
    /// we wind up searching the subject string potentially many times. In
    /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
    /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
    /// like to achieve a `O(n + m)` worst case complexity.
    ///
    /// This is where failure transitions come in. Instead of dying at S3 in
    /// the first search, the automaton can instruct the search to move to
    /// another part of the automaton that corresponds to a suffix of what
    /// we've seen so far. Recall that we've seen 'abc' in the subject string,
    /// and the automaton does indeed have a non-empty suffix, 'c', that could
    /// potentially lead to another match. Thus, the actual Aho-Corasick
    /// automaton for our patterns in this case looks like this:
    ///
    /// ```ignore
    ///          a - S1 - b - S2 - c - S3 - d - S4*
    ///         /                  /
    ///        /    ----------------
    ///       /    /
    ///     S0 - c - S5 - e - S6 - f - S7*
    /// ```
    ///
    /// That is, we have a failure transition from S3 to S5, which is followed
    /// exactly in cases when we are in state S3 but see any byte other than
    /// 'd' (that is, we've "failed" to find a match in this portion of our
    /// trie). We know we can transition back to S5 because we've already seen
    /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
    /// with the search starting at S5 and complete our match.
    ///
    /// Adding failure transitions to a trie is fairly simple, but subtle. The
    /// key issue is that you might have multiple failure transitions that you
    /// need to follow. For example, look at the trie for the patterns
    /// 'abcd', 'b', 'bcd' and 'cd':
    ///
    /// ```ignore
    ///        - a - S1 - b - S2* - c - S3 - d - S4*
    ///       /          /         /
    ///      /  ---------   -------
    ///     /  /           /
    ///     S0 --- b - S5* - c - S6 - d - S7*
    ///      \                   /
    ///       \      ------------
    ///        \    /
    ///         - c - S8 - d - S9*
    /// ```
    ///
    /// The failure transitions for this trie are defined from S2 to S5,
    /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
    /// corresponds to a match, since its failure transition to S5 is itself
    /// a match state.
    ///
    /// Perhaps the simplest way to think about adding these failure
    /// transitions is recursively. That is, if you know the failure
    /// transitions for every possible previous state that could be visited
    /// (e.g., when computing the failure transition for S3, you already know
    /// the failure transitions for S0, S1 and S2), then you can simply follow
    /// the failure transition of the previous state and check whether the
    /// incoming transition is defined after following the failure transition.
    ///
    /// For example, when determining the failure state for S3, by our
    /// assumptions, we already know that there is a failure transition from
    /// S2 (the previous state) to S5. So we follow that transition and check
    /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
    /// as there is a transition from S5 to S6 for the byte 'c'. If no such
    /// transition existed, we could keep following the failure transitions
    /// until we reach the start state, which is the failure transition for
    /// every state that has no corresponding proper suffix.
    ///
    /// We don't actually use recursion to implement this, but instead, use a
    /// breadth first search of the automaton. Our base case is the start
    /// state, whose failure transition is just a transition to itself.
    ///
    /// When building a leftmost automaton, we proceed as above, but only
    /// include a subset of failure transitions. Namely, we omit any failure
    /// transitions that appear after a match state in the trie. This is
    /// because failure transitions always point back to a proper suffix of
    /// what has been seen so far. Thus, following a failure transition after
    /// a match implies looking for a match that starts after the one that has
    /// already been seen, which is of course therefore not the leftmost match.
    ///
    /// N.B. I came up with this algorithm on my own, and after scouring all of
    /// the other AC implementations I know of (Perl, Snort, many on GitHub).
    /// I couldn't find any that implement leftmost semantics like this.
    /// Perl of course needs leftmost-first semantics, but they implement it
    /// with a seeming hack at *search* time instead of encoding it into the
    /// automaton. There are also a couple Java libraries that support leftmost
    /// longest semantics, but they do it by building a queue of matches at
    /// search time, which is even worse than what Perl is doing. ---AG
    fn fill_failure_transitions(&mut self) -> Result<(), BuildError> {
        let is_leftmost = self.builder.match_kind.is_leftmost();
        let start_uid = self.nfa.special.start_unanchored_id;
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        let mut prev_link = None;
        while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
            prev_link = Some(link);
            let t = self.nfa.sparse[link];

            // Skip anything we've seen before and any self-transitions on the
            // start state.
            if start_uid == t.next() || seen.contains(t.next) {
                continue;
            }
            queue.push_back(t.next);
            seen.insert(t.next);
            // Under leftmost semantics, if a state immediately following
            // the start state is a match state, then we never want to
            // follow its failure transition since the failure transition
            // necessarily leads back to the start state, which we never
            // want to do for leftmost matching after a match has been
            // found.
            //
            // We apply the same logic to non-start states below as well.
            if is_leftmost && self.nfa.states[t.next].is_match() {
                self.nfa.states[t.next].fail = NFA::DEAD;
            }
        }
        while let Some(id) = queue.pop_front() {
            let mut prev_link = None;
            while let Some(link) = self.nfa.next_link(id, prev_link) {
                prev_link = Some(link);
                let t = self.nfa.sparse[link];

                if seen.contains(t.next) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(t.next);
                seen.insert(t.next);

                // As above for start states, under leftmost semantics, once
                // we see a match all subsequent states should have no failure
                // transitions because failure transitions always imply looking
                // for a match that is a suffix of what has been seen so far
                // (where "seen so far" corresponds to the string formed by
                // following the transitions from the start state to the
                // current state). Under leftmost semantics, we specifically do
                // not want to allow this to happen because we always want to
                // report the match found at the leftmost position.
                //
                // The difference between leftmost-first and leftmost-longest
                // occurs previously while we build the trie. For
                // leftmost-first, we simply omit any entries that would
                // otherwise require passing through a match state.
                //
                // Note that for correctness, the failure transition has to be
                // set to the dead state for ALL states following a match, not
                // just the match state itself. However, by setting the failure
                // transition to the dead state on all match states, the dead
                // state will automatically propagate to all subsequent states
                // via the failure state computation below.
                if is_leftmost && self.nfa.states[t.next].is_match() {
                    self.nfa.states[t.next].fail = NFA::DEAD;
                    continue;
                }
                let mut fail = self.nfa.states[id].fail;
                while self.nfa.follow_transition(fail, t.byte) == NFA::FAIL {
                    fail = self.nfa.states[fail].fail;
                }
                fail = self.nfa.follow_transition(fail, t.byte);
                self.nfa.states[t.next].fail = fail;
                self.nfa.copy_matches(fail, t.next)?;
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            if !is_leftmost {
                self.nfa
                    .copy_matches(self.nfa.special.start_unanchored_id, id)?;
            }
        }
        Ok(())
    }
1375
    /// Shuffle the states so that they appear in this sequence:
    ///
    /// DEAD, FAIL, MATCH..., START, START, NON-MATCH...
    ///
    /// The idea here is that if we know how special states are laid out in our
    /// transition table, then we can determine what "kind" of state we're in
    /// just by comparing our current state ID with a particular value. In this
    /// way, we avoid doing extra memory lookups.
    ///
    /// Before shuffling begins, our states look something like this:
    ///
    /// DEAD, FAIL, START, START, (MATCH | NON-MATCH)...
    ///
    /// So all we need to do is move all of the MATCH states so that they
    /// all appear before any NON-MATCH state, like so:
    ///
    /// DEAD, FAIL, START, START, MATCH... NON-MATCH...
    ///
    /// Then it's just a simple matter of swapping the two START states with
    /// the last two MATCH states.
    ///
    /// (This is the same technique used for fully compiled DFAs in
    /// regex-automata.)
    fn shuffle(&mut self) {
        let old_start_uid = self.nfa.special.start_unanchored_id;
        let old_start_aid = self.nfa.special.start_anchored_id;
        assert!(old_start_uid < old_start_aid);
        assert_eq!(
            3,
            old_start_aid.as_usize(),
            "anchored start state should be at index 3"
        );
        // We implement shuffling by a sequence of pairwise swaps of states.
        // Since we have a number of things referencing states via their
        // IDs and swapping them changes their IDs, we need to record every
        // swap we make so that we can remap IDs. The remapper handles this
        // book-keeping for us.
        let mut remapper = Remapper::new(&self.nfa, 0);
        // The way we proceed here is by moving all match states so that
        // they directly follow the start states. So it will go: DEAD, FAIL,
        // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ...
        //
        // To do that, we proceed forward through all states after
        // START-ANCHORED and swap match states so that they appear before all
        // non-match states.
        let mut next_avail = StateID::from(4u8);
        for i in next_avail.as_usize()..self.nfa.states.len() {
            let sid = StateID::new(i).unwrap();
            if !self.nfa.states[sid].is_match() {
                continue;
            }
            remapper.swap(&mut self.nfa, sid, next_avail);
            // The key invariant here is that only non-match states exist
            // between 'next_avail' and 'sid' (with them being potentially
            // equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed
            // to land on the leftmost non-match state. (Unless 'next_avail'
            // and 'sid' are equivalent, in which case, a swap will occur but
            // it is a no-op.)
            next_avail = StateID::new(next_avail.one_more()).unwrap();
        }
        // Now we'd like to move the start states to immediately following the
        // match states. (The start states may themselves be match states, but
        // we'll handle that later.) We arrange the states this way so that we
        // don't necessarily need to check whether a state is a start state or
        // not before checking whether a state is a match state. For example,
        // we'd like to be able to write this as our state machine loop:
        //
        //   sid = start()
        //   for byte in haystack:
        //     sid = next(sid, byte)
        //     if sid <= nfa.max_start_id:
        //       if sid <= nfa.max_dead_id:
        //         # search complete
        //       elif sid <= nfa.max_match_id:
        //         # found match
        //
        // The important context here is that we might not want to look for
        // start states at all. Namely, if a searcher doesn't have a prefilter,
        // then there is no reason to care about whether we're in a start state
        // or not. And indeed, if we did check for it, this very hot loop would
        // ping pong between the special state handling and the main state
        // transition logic. This in turn stalls the CPU by killing branch
        // prediction.
        //
        // So essentially, we really want to be able to "forget" that start
        // states even exist and this is why we put them at the end.
        //
        // At this point, 'next_avail' points one past the end of the packed
        // match states, so the last two match slots become the new homes of
        // the two start states (the displaced match states land in the old
        // start slots at indices 2 and 3).
        let new_start_aid =
            StateID::new(next_avail.as_usize().checked_sub(1).unwrap())
                .unwrap();
        remapper.swap(&mut self.nfa, old_start_aid, new_start_aid);
        let new_start_uid =
            StateID::new(next_avail.as_usize().checked_sub(2).unwrap())
                .unwrap();
        remapper.swap(&mut self.nfa, old_start_uid, new_start_uid);
        let new_max_match_id =
            StateID::new(next_avail.as_usize().checked_sub(3).unwrap())
                .unwrap();
        self.nfa.special.max_match_id = new_max_match_id;
        self.nfa.special.start_unanchored_id = new_start_uid;
        self.nfa.special.start_anchored_id = new_start_aid;
        // If one start state is a match state, then they both are.
        if self.nfa.states[self.nfa.special.start_anchored_id].is_match() {
            self.nfa.special.max_match_id = self.nfa.special.start_anchored_id;
        }
        remapper.remap(&mut self.nfa);
    }
1482
1483 /// Attempts to convert the transition representation of a subset of states
1484 /// in this NFA from sparse to dense. This can greatly improve search
1485 /// performance since states with a higher number of transitions tend to
1486 /// correlate with very active states.
1487 ///
1488 /// We generally only densify states that are close to the start state.
1489 /// These tend to be the most active states and thus benefit from a dense
1490 /// representation more than other states.
1491 ///
1492 /// This tends to best balance between memory usage and performance. In
1493 /// particular, the *vast majority* of all states in a typical Aho-Corasick
1494 /// automaton have only 1 transition and are usually farther from the start
1495 /// state and thus don't get densified.
1496 ///
1497 /// Note that this doesn't remove the sparse representation of transitions
1498 /// for states that are densified. It could be done, but actually removing
1499 /// entries from `NFA::sparse` is likely more expensive than it's worth.
1500 fn densify(&mut self) -> Result<(), BuildError> {
1501 for i in 0..self.nfa.states.len() {
1502 let sid = StateID::new(i).unwrap();
1503 // Don't bother densifying states that are only used as sentinels.
1504 if sid == NFA::DEAD || sid == NFA::FAIL {
1505 continue;
1506 }
1507 // Only densify states that are "close enough" to the start state.
1508 if self.nfa.states[sid].depth.as_usize()
1509 >= self.builder.dense_depth
1510 {
1511 continue;
1512 }
1513 let dense = self.nfa.alloc_dense_state()?;
1514 let mut prev_link = None;
1515 while let Some(link) = self.nfa.next_link(sid, prev_link) {
1516 prev_link = Some(link);
1517 let t = self.nfa.sparse[link];
1518
1519 let class = usize::from(self.nfa.byte_classes.get(t.byte));
1520 let index = dense.as_usize() + class;
1521 self.nfa.dense[index] = t.next;
1522 }
1523 self.nfa.states[sid].dense = dense;
1524 }
1525 Ok(())
1526 }
1527
1528 /// Returns a set that tracked queued states.
1529 ///
1530 /// This is only necessary when ASCII case insensitivity is enabled, since
1531 /// it is the only way to visit the same state twice. Otherwise, this
1532 /// returns an inert set that nevers adds anything and always reports
1533 /// `false` for every member test.
1534 fn queued_set(&self) -> QueuedSet {
1535 if self.builder.ascii_case_insensitive {
1536 QueuedSet::active()
1537 } else {
1538 QueuedSet::inert()
1539 }
1540 }
1541
1542 /// Initializes the unanchored start state by making it dense. This is
1543 /// achieved by explicitly setting every transition to the FAIL state.
1544 /// This isn't necessary for correctness, since any missing transition is
1545 /// automatically assumed to be mapped to the FAIL state. We do this to
1546 /// make the unanchored starting state dense, and thus in turn make
1547 /// transition lookups on it faster. (Which is worth doing because it's
1548 /// the most active state.)
1549 fn init_unanchored_start_state(&mut self) -> Result<(), BuildError> {
1550 let start_uid = self.nfa.special.start_unanchored_id;
1551 let start_aid = self.nfa.special.start_anchored_id;
1552 self.nfa.init_full_state(start_uid, NFA::FAIL)?;
1553 self.nfa.init_full_state(start_aid, NFA::FAIL)?;
1554 Ok(())
1555 }
1556
    /// Setup the anchored start state by copying all of the transitions and
    /// matches from the unanchored starting state with one change: the failure
    /// transition is changed to the DEAD state, so that for any undefined
    /// transitions, the search will stop.
    fn set_anchored_start_state(&mut self) -> Result<(), BuildError> {
        let start_uid = self.nfa.special.start_unanchored_id;
        let start_aid = self.nfa.special.start_anchored_id;
        // Walk both transition lists in lock-step, copying each unanchored
        // transition target onto the corresponding anchored transition.
        let (mut uprev_link, mut aprev_link) = (None, None);
        loop {
            let unext = self.nfa.next_link(start_uid, uprev_link);
            let anext = self.nfa.next_link(start_aid, aprev_link);
            let (ulink, alink) = match (unext, anext) {
                (Some(ulink), Some(alink)) => (ulink, alink),
                (None, None) => break,
                // Both start states were initialized with a full set of
                // transitions (see 'init_unanchored_start_state'), so their
                // transition lists always have identical lengths.
                _ => unreachable!(),
            };
            uprev_link = Some(ulink);
            aprev_link = Some(alink);
            self.nfa.sparse[alink].next = self.nfa.sparse[ulink].next;
        }
        self.nfa.copy_matches(start_uid, start_aid)?;
        // This is the main difference between the unanchored and anchored
        // starting states. If a lookup on an anchored starting state fails,
        // then the search should stop.
        //
        // N.B. This assumes that the loop on the unanchored starting state
        // hasn't been created yet.
        self.nfa.states[start_aid].fail = NFA::DEAD;
        Ok(())
    }
1587
1588 /// Set the failure transitions on the start state to loop back to the
1589 /// start state. This effectively permits the Aho-Corasick automaton to
1590 /// match at any position. This is also required for finding the next
1591 /// state to terminate, namely, finding the next state should never return
1592 /// a fail_id.
1593 ///
1594 /// This must be done after building the initial trie, since trie
1595 /// construction depends on transitions to `fail_id` to determine whether a
1596 /// state already exists or not.
1597 fn add_unanchored_start_state_loop(&mut self) {
1598 let start_uid = self.nfa.special.start_unanchored_id;
1599 let mut prev_link = None;
1600 while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
1601 prev_link = Some(link);
1602 if self.nfa.sparse[link].next() == NFA::FAIL {
1603 self.nfa.sparse[link].next = start_uid;
1604 }
1605 }
1606 }
1607
1608 /// Remove the start state loop by rewriting any transitions on the start
1609 /// state back to the start state with transitions to the dead state.
1610 ///
1611 /// The loop is only closed when two conditions are met: the start state
1612 /// is a match state and the match kind is leftmost-first or
1613 /// leftmost-longest.
1614 ///
1615 /// The reason for this is that under leftmost semantics, a start state
1616 /// that is also a match implies that we should never restart the search
1617 /// process. We allow normal transitions out of the start state, but if
1618 /// none exist, we transition to the dead state, which signals that
1619 /// searching should stop.
1620 fn close_start_state_loop_for_leftmost(&mut self) {
1621 let start_uid = self.nfa.special.start_unanchored_id;
1622 let start = &mut self.nfa.states[start_uid];
1623 let dense = start.dense;
1624 if self.builder.match_kind.is_leftmost() && start.is_match() {
1625 let mut prev_link = None;
1626 while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
1627 prev_link = Some(link);
1628 if self.nfa.sparse[link].next() == start_uid {
1629 self.nfa.sparse[link].next = NFA::DEAD;
1630 if dense != StateID::ZERO {
1631 let b = self.nfa.sparse[link].byte;
1632 let class = usize::from(self.nfa.byte_classes.get(b));
1633 self.nfa.dense[dense.as_usize() + class] = NFA::DEAD;
1634 }
1635 }
1636 }
1637 }
1638 }
1639
1640 /// Sets all transitions on the dead state to point back to the dead state.
1641 /// Normally, missing transitions map back to the failure state, but the
1642 /// point of the dead state is to act as a sink that can never be escaped.
1643 fn add_dead_state_loop(&mut self) -> Result<(), BuildError> {
1644 self.nfa.init_full_state(NFA::DEAD, NFA::DEAD)?;
1645 Ok(())
1646 }
1647}
1648
/// A set of state identifiers used to avoid revisiting the same state multiple
/// times when filling in failure transitions.
///
/// This set has an "inert" and an "active" mode. When inert, the set never
/// stores anything and always returns `false` for every member test. This is
/// useful to avoid the performance and memory overhead of maintaining this
/// set when it is not needed.
#[derive(Debug)]
struct QueuedSet {
    // `None` means the set is inert: inserts are no-ops and membership tests
    // always report `false`. `Some` means it actively tracks membership.
    set: Option<BTreeSet<StateID>>,
}
1660
1661impl QueuedSet {
1662 /// Return an inert set that returns `false` for every state ID membership
1663 /// test.
1664 fn inert() -> QueuedSet {
1665 QueuedSet { set: None }
1666 }
1667
1668 /// Return an active set that tracks state ID membership.
1669 fn active() -> QueuedSet {
1670 QueuedSet { set: Some(BTreeSet::new()) }
1671 }
1672
1673 /// Inserts the given state ID into this set. (If the set is inert, then
1674 /// this is a no-op.)
1675 fn insert(&mut self, state_id: StateID) {
1676 if let Some(ref mut set) = self.set {
1677 set.insert(state_id);
1678 }
1679 }
1680
1681 /// Returns true if and only if the given state ID is in this set. If the
1682 /// set is inert, this always returns false.
1683 fn contains(&self, state_id: StateID) -> bool {
1684 match self.set {
1685 None => false,
1686 Some(ref set) => set.contains(&state_id),
1687 }
1688 }
1689}
1690
1691impl core::fmt::Debug for NFA {
1692 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1693 use crate::{
1694 automaton::{fmt_state_indicator, sparse_transitions},
1695 util::debug::DebugByte,
1696 };
1697
1698 writeln!(f, "noncontiguous::NFA(")?;
1699 for (sid, state) in self.states.iter().with_state_ids() {
1700 // The FAIL state doesn't actually have space for a state allocated
1701 // for it, so we have to treat it as a special case.
1702 if sid == NFA::FAIL {
1703 writeln!(f, "F {:06}:", sid.as_usize())?;
1704 continue;
1705 }
1706 fmt_state_indicator(f, self, sid)?;
1707 write!(
1708 f,
1709 "{:06}({:06}): ",
1710 sid.as_usize(),
1711 state.fail.as_usize()
1712 )?;
1713
1714 let it = sparse_transitions(
1715 self.iter_trans(sid).map(|t| (t.byte, t.next)),
1716 )
1717 .enumerate();
1718 for (i, (start, end, sid)) in it {
1719 if i > 0 {
1720 write!(f, ", ")?;
1721 }
1722 if start == end {
1723 write!(
1724 f,
1725 "{:?} => {:?}",
1726 DebugByte(start),
1727 sid.as_usize()
1728 )?;
1729 } else {
1730 write!(
1731 f,
1732 "{:?}-{:?} => {:?}",
1733 DebugByte(start),
1734 DebugByte(end),
1735 sid.as_usize()
1736 )?;
1737 }
1738 }
1739
1740 write!(f, "\n")?;
1741 if self.is_match(sid) {
1742 write!(f, " matches: ")?;
1743 for (i, pid) in self.iter_matches(sid).enumerate() {
1744 if i > 0 {
1745 write!(f, ", ")?;
1746 }
1747 write!(f, "{}", pid.as_usize())?;
1748 }
1749 write!(f, "\n")?;
1750 }
1751 }
1752 writeln!(f, "match kind: {:?}", self.match_kind)?;
1753 writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
1754 writeln!(f, "state length: {:?}", self.states.len())?;
1755 writeln!(f, "pattern length: {:?}", self.patterns_len())?;
1756 writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
1757 writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
1758 writeln!(f, "memory usage: {:?}", self.memory_usage())?;
1759 writeln!(f, ")")?;
1760 Ok(())
1761 }
1762}
1763