1/*!
2Provides a noncontiguous NFA implementation of Aho-Corasick.
3
4This is a low-level API that generally only needs to be used in niche
5circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
6instead of a noncontiguous NFA directly. Using an `NFA` directly is typically
7only necessary when one needs access to the [`Automaton`] trait implementation.
8*/
9
10use alloc::{
11 collections::{BTreeSet, VecDeque},
12 vec,
13 vec::Vec,
14};
15
16use crate::{
17 automaton::Automaton,
18 util::{
19 alphabet::{ByteClassSet, ByteClasses},
20 error::{BuildError, MatchError},
21 prefilter::{self, opposite_ascii_case, Prefilter},
22 primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
23 remapper::Remapper,
24 search::{Anchored, MatchKind},
25 special::Special,
26 },
27};
28
/// A noncontiguous NFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using an `NFA` directly is typically only necessary
/// when one needs access to the [`Automaton`] trait implementation.
///
/// This NFA represents the "core" implementation of Aho-Corasick in this
/// crate. Namely, constructing this NFA involves building a trie and then
/// filling in the failure transitions between states, similar to what is
/// described in any standard textbook description of Aho-Corasick.
///
/// In order to minimize heap usage and to avoid additional construction costs,
/// this implementation represents the transitions of all states as distinct
/// sparse memory allocations. This is where it gets its name from. That is,
/// this NFA has no contiguous memory allocation for its transition table. Each
/// state gets its own allocation.
///
/// While the sparse representation keeps memory usage to somewhat reasonable
/// levels, it is still quite large and also results in somewhat mediocre
/// search performance. For this reason, it is almost always a good idea to
/// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is
/// marginally slower to build, but has higher throughput and can sometimes use
/// an order of magnitude less memory. The main reason to use a noncontiguous
/// NFA is when you need the fastest possible construction time, or when a
/// contiguous NFA does not have the desired capacity. (The total number of NFA
/// states it can have is fewer than a noncontiguous NFA.)
///
/// # Example
///
/// This example shows how to build an `NFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     nfa::noncontiguous::NFA,
///     Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let nfa = NFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     nfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive(Clone)]
pub struct NFA {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State>,
    /// The length, in bytes, of each pattern in this NFA. This slice is
    /// indexed by `PatternID`.
    ///
    /// The number of entries in this vector corresponds to the total number of
    /// patterns in this automaton.
    pattern_lens: Vec<SmallIndex>,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<Prefilter>,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// The length, in bytes, of the shortest pattern in this automaton. This
    /// information is useful for detecting whether an automaton matches the
    /// empty string or not.
    min_pattern_len: usize,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The information required to deduce which states are "special" in this
    /// NFA.
    ///
    /// Since the DEAD and FAIL states are always the first two states and
    /// there are only ever two start states (which follow all of the match
    /// states), it follows that we can determine whether a state is a fail,
    /// dead, match or start with just a few comparisons on the ID itself:
    ///
    ///    is_dead(sid): sid == NFA::DEAD
    ///    is_fail(sid): sid == NFA::FAIL
    ///   is_match(sid): NFA::FAIL < sid && sid <= max_match_id
    ///   is_start(sid): sid == start_unanchored_id || sid == start_anchored_id
    ///
    /// Note that this only applies to the NFA after it has been constructed.
    /// During construction, the start states are the first ones added and the
    /// match states are inter-leaved with non-match states. Once all of the
    /// states have been added, the states are shuffled such that the above
    /// predicates hold.
    special: Special,
    /// The number of bytes of heap used by this sparse NFA.
    memory_usage: usize,
}
146
147impl NFA {
148 /// Create a new Aho-Corasick noncontiguous NFA using the default
149 /// configuration.
150 ///
151 /// Use a [`Builder`] if you want to change the configuration.
152 pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError>
153 where
154 I: IntoIterator<Item = P>,
155 P: AsRef<[u8]>,
156 {
157 NFA::builder().build(patterns)
158 }
159
160 /// A convenience method for returning a new Aho-Corasick noncontiguous NFA
161 /// builder.
162 ///
163 /// This usually permits one to just import the `NFA` type.
164 pub fn builder() -> Builder {
165 Builder::new()
166 }
167}
168
impl NFA {
    /// The DEAD state is a sentinel state like the FAIL state. The DEAD state
    /// instructs any search to stop and return any currently recorded match,
    /// or no match otherwise. Generally speaking, it is impossible for an
    /// unanchored standard search to enter a DEAD state. But an anchored
    /// search can, and so too can a leftmost search.
    ///
    /// We put DEAD before FAIL so that DEAD is always 0. We repeat this
    /// decision across the other Aho-Corasick automata, so that DEAD
    /// states there are always 0 too. It's not that we need all of the
    /// implementations to agree, but rather, the contiguous NFA and the DFA
    /// use a sort of "premultiplied" state identifier where the only state
    /// whose ID is always known and constant is the first state. Subsequent
    /// state IDs depend on how much space has already been used in the
    /// transition table.
    pub(crate) const DEAD: StateID = StateID::new_unchecked(0);
    /// The FAIL state mostly just corresponds to the ID of any transition on a
    /// state that isn't explicitly defined. When one transitions into the FAIL
    /// state, one must follow the previous state's failure transition before
    /// doing the next state lookup. In this way, FAIL is more of a sentinel
    /// than a state that one actually transitions into. In particular, it is
    /// never exposed in the `Automaton` interface.
    pub(crate) const FAIL: StateID = StateID::new_unchecked(1);

    /// Returns the equivalence classes of bytes found while constructing
    /// this NFA.
    ///
    /// Note that the NFA doesn't actually make use of these equivalence
    /// classes. Instead, these are useful for building the DFA when desired.
    pub(crate) fn byte_classes(&self) -> &ByteClasses {
        &self.byte_classes
    }

    /// Returns a slice containing the length of each pattern in this searcher.
    /// It is indexed by `PatternID` and has length `NFA::patterns_len`.
    ///
    /// This is exposed for convenience when building a contiguous NFA. But it
    /// can be reconstructed from the `Automaton` API if necessary.
    pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] {
        &self.pattern_lens
    }

    /// Returns a slice of all states in this non-contiguous NFA.
    pub(crate) fn states(&self) -> &[State] {
        &self.states
    }

    /// Returns the underlying "special" state information for this NFA.
    pub(crate) fn special(&self) -> &Special {
        &self.special
    }

    /// Swaps the states at `id1` and `id2`.
    ///
    /// This does not update the transitions of any state to account for the
    /// state swap. Callers are expected to fix up all transitions afterward,
    /// e.g., via `remap`.
    pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
        self.states.swap(id1.as_usize(), id2.as_usize());
    }

    /// Re-maps all state IDs in this NFA according to the `map` function
    /// given. Both failure transitions and byte transitions are rewritten.
    pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
        for state in self.states.iter_mut() {
            state.fail = map(state.fail);
            for (_, ref mut sid) in state.trans.iter_mut() {
                *sid = map(*sid);
            }
        }
    }
}
240
// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
// returns a valid state ID given a valid state ID. We otherwise claim that
// all other methods are correct as well.
unsafe impl Automaton for NFA {
    #[inline(always)]
    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
        match anchored {
            Anchored::No => Ok(self.special.start_unanchored_id),
            Anchored::Yes => Ok(self.special.start_anchored_id),
        }
    }

    #[inline(always)]
    fn next_state(
        &self,
        anchored: Anchored,
        mut sid: StateID,
        byte: u8,
    ) -> StateID {
        // This terminates since:
        //
        // 1. state.fail never points to the FAIL state.
        // 2. All state.fail values point to a state closer to the start state.
        // 3. The start state has no transitions to the FAIL state.
        loop {
            let state = &self.states[sid];
            let next = state.next_state(byte);
            if next != NFA::FAIL {
                return next;
            }
            // For an anchored search, we never follow failure transitions
            // because failure transitions lead us down a path to matching
            // a *proper* suffix of the path we were on. Thus, it can only
            // produce matches that appear after the beginning of the search.
            if anchored.is_anchored() {
                return NFA::DEAD;
            }
            sid = state.fail;
        }
    }

    #[inline(always)]
    fn is_special(&self, sid: StateID) -> bool {
        // Construction shuffles all special states (dead, fail, match and
        // possibly start) to the low end of the state ID space, so a single
        // comparison suffices here.
        sid <= self.special.max_special_id
    }

    #[inline(always)]
    fn is_dead(&self, sid: StateID) -> bool {
        sid == NFA::DEAD
    }

    #[inline(always)]
    fn is_match(&self, sid: StateID) -> bool {
        // N.B. This returns true when sid==NFA::FAIL but that's okay because
        // NFA::FAIL is not actually a valid state ID from the perspective of
        // the Automaton trait. Namely, it is never returned by 'start_state'
        // or by 'next_state'. So we don't need to care about it here.
        !self.is_dead(sid) && sid <= self.special.max_match_id
    }

    #[inline(always)]
    fn is_start(&self, sid: StateID) -> bool {
        sid == self.special.start_unanchored_id
            || sid == self.special.start_anchored_id
    }

    #[inline(always)]
    fn match_kind(&self) -> MatchKind {
        self.match_kind
    }

    #[inline(always)]
    fn patterns_len(&self) -> usize {
        self.pattern_lens.len()
    }

    #[inline(always)]
    fn pattern_len(&self, pid: PatternID) -> usize {
        self.pattern_lens[pid].as_usize()
    }

    #[inline(always)]
    fn min_pattern_len(&self) -> usize {
        self.min_pattern_len
    }

    #[inline(always)]
    fn max_pattern_len(&self) -> usize {
        self.max_pattern_len
    }

    #[inline(always)]
    fn match_len(&self, sid: StateID) -> usize {
        self.states[sid].matches.len()
    }

    #[inline(always)]
    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
        self.states[sid].matches[index]
    }

    #[inline(always)]
    fn memory_usage(&self) -> usize {
        // The prefilter's heap usage is not folded into the precomputed
        // 'memory_usage' field, so account for it here.
        self.memory_usage
            + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
    }

    #[inline(always)]
    fn prefilter(&self) -> Option<&Prefilter> {
        self.prefilter.as_ref()
    }
}
353
/// A representation of a sparse NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte
/// and the matches implied by visiting this state (if any).
#[derive(Clone)]
pub(crate) struct State {
    /// The set of defined transitions for this state sorted by `u8`. In an
    /// unanchored search, if a byte is not in this set of transitions, then
    /// it should transition to `fail`. In an anchored search, it should
    /// transition to the special DEAD state.
    ///
    /// Since this is kept sorted and free of duplicate bytes, it is amenable
    /// to binary search for insertion and, when all 256 transitions are
    /// present, to constant time indexing by byte.
    pub(crate) trans: Vec<(u8, StateID)>,
    /// The patterns that match once this state is entered. Note that order
    /// is important in the leftmost case. For example, if one adds 'foo' and
    /// 'foo' (duplicate patterns are not disallowed), then in a leftmost-first
    /// search, only the first 'foo' will ever match.
    pub(crate) matches: Vec<PatternID>,
    /// The state that should be transitioned to if the current byte in the
    /// haystack does not have a corresponding transition defined in this
    /// state.
    pub(crate) fail: StateID,
    /// The depth of this state. Specifically, this is the distance from this
    /// state to the starting state. (For the special sentinel states DEAD and
    /// FAIL, their depth is always 0.) The depth of a starting state is 0.
    ///
    /// Note that depth is currently not used in this non-contiguous NFA. It
    /// may in the future, but it is used in the contiguous NFA. Namely, it
    /// permits an optimization where states near the starting state have their
    /// transitions stored in a dense fashion, but all other states have their
    /// transitions stored in a sparse fashion. (This non-contiguous NFA uses
    /// a sparse representation for all states unconditionally.) In any case,
    /// this is really the only convenient place to compute and store this
    /// information, which we need when building the contiguous NFA.
    pub(crate) depth: SmallIndex,
}
389
390impl State {
391 /// Return the heap memory used by this state. Note that if `State` is
392 /// itself on the heap, then callers need to call this in addition to
393 /// `size_of::<State>()` to get the full heap memory used.
394 fn memory_usage(&self) -> usize {
395 use core::mem::size_of;
396
397 (self.trans.len() * size_of::<(u8, StateID)>())
398 + (self.matches.len() * size_of::<PatternID>())
399 }
400
401 /// Return true if and only if this state is a match state.
402 fn is_match(&self) -> bool {
403 !self.matches.is_empty()
404 }
405
406 /// Return the next state by following the transition for the given byte.
407 /// If no transition for the given byte is defined, then the FAIL state ID
408 /// is returned.
409 #[inline(always)]
410 fn next_state(&self, byte: u8) -> StateID {
411 // This is a special case that targets the unanchored starting state.
412 // By construction, the unanchored starting state is actually a dense
413 // state, because every possible transition is defined on it. Any
414 // transitions that weren't added as part of initial trie construction
415 // get explicitly added as a self-transition back to itself. Thus, we
416 // can treat it as if it were dense and do a constant time lookup.
417 //
418 // This has *massive* benefit when executing searches because the
419 // unanchored starting state is by far the hottest state and is
420 // frequently visited. Moreover, the 'for' loop below that works
421 // decently on an actually sparse state is disastrous on a state that
422 // is nearly or completely dense.
423 //
424 // This optimization also works in general, including for non-starting
425 // states that happen to have every transition defined. Namely, it
426 // is impossible for 'self.trans' to have duplicate transitions (by
427 // construction) and transitions are always in sorted ascending order.
428 // So if a state has 256 transitions, it is, by construction, dense and
429 // amenable to constant time indexing.
430 if self.trans.len() == 256 {
431 self.trans[usize::from(byte)].1
432 } else {
433 for &(b, id) in self.trans.iter() {
434 if b == byte {
435 return id;
436 }
437 }
438 NFA::FAIL
439 }
440 }
441
442 /// Set the transition for the given byte to the state ID given.
443 ///
444 /// Note that one should not set transitions to the FAIL state. It is not
445 /// technically incorrect, but it wastes space. If a transition is not
446 /// defined, then it is automatically assumed to lead to the FAIL state.
447 fn set_next_state(&mut self, byte: u8, next: StateID) {
448 match self.trans.binary_search_by_key(&byte, |&(b, _)| b) {
449 Ok(i) => self.trans[i] = (byte, next),
450 Err(i) => self.trans.insert(i, (byte, next)),
451 }
452 }
453}
454
455impl core::fmt::Debug for State {
456 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
457 use crate::{automaton::sparse_transitions, util::debug::DebugByte};
458
459 let it: impl Iterator = sparse_transitions(self.trans.iter().copied()).enumerate();
460 for (i: usize, (start: u8, end: u8, sid: StateID)) in it {
461 if i > 0 {
462 write!(f, ", ")?;
463 }
464 if start == end {
465 write!(f, "{:?} => {:?}", DebugByte(start), sid.as_usize())?;
466 } else {
467 write!(
468 f,
469 "{:?}-{:?} => {:?}",
470 DebugByte(start),
471 DebugByte(end),
472 sid.as_usize()
473 )?;
474 }
475 }
476 Ok(())
477 }
478}
479
/// A builder for configuring an Aho-Corasick noncontiguous NFA.
///
/// This builder has a subset of the options available to a
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive(Clone, Debug)]
pub struct Builder {
    /// The match semantics to build into the NFA.
    match_kind: MatchKind,
    /// Whether to attempt to build a prefilter from the patterns.
    prefilter: bool,
    /// Whether to add ASCII case-insensitive transitions during construction.
    ascii_case_insensitive: bool,
}
491
492impl Default for Builder {
493 fn default() -> Builder {
494 Builder {
495 match_kind: MatchKind::default(),
496 prefilter: true,
497 ascii_case_insensitive: false,
498 }
499 }
500}
501
502impl Builder {
503 /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA.
504 pub fn new() -> Builder {
505 Builder::default()
506 }
507
508 /// Build an Aho-Corasick noncontiguous NFA from the given iterator of
509 /// patterns.
510 ///
511 /// A builder may be reused to create more NFAs.
512 pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError>
513 where
514 I: IntoIterator<Item = P>,
515 P: AsRef<[u8]>,
516 {
517 debug!("building non-contiguous NFA");
518 let nfa = Compiler::new(self)?.compile(patterns)?;
519 debug!(
520 "non-contiguous NFA built, <states: {:?}, size: {:?}>",
521 nfa.states.len(),
522 nfa.memory_usage()
523 );
524 Ok(nfa)
525 }
526
527 /// Set the desired match semantics.
528 ///
529 /// See
530 /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
531 /// for more documentation and examples.
532 pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
533 self.match_kind = kind;
534 self
535 }
536
537 /// Enable ASCII-aware case insensitive matching.
538 ///
539 /// See
540 /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
541 /// for more documentation and examples.
542 pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
543 self.ascii_case_insensitive = yes;
544 self
545 }
546
547 /// Enable heuristic prefilter optimizations.
548 ///
549 /// See
550 /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
551 /// for more documentation and examples.
552 pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
553 self.prefilter = yes;
554 self
555 }
556}
557
/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks, with some tweaks to support leftmost
/// searching.
#[derive(Debug)]
struct Compiler<'a> {
    /// The builder configuration driving this compilation.
    builder: &'a Builder,
    /// Accumulates patterns so a prefilter can (maybe) be built at the end.
    prefilter: prefilter::Builder,
    /// The NFA under construction.
    nfa: NFA,
    /// The set of bytes seen across all patterns, used to compute the byte
    /// equivalence classes stored on the finished NFA.
    byteset: ByteClassSet,
}
569
570impl<'a> Compiler<'a> {
571 fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> {
572 let prefilter = prefilter::Builder::new(builder.match_kind)
573 .ascii_case_insensitive(builder.ascii_case_insensitive);
574 Ok(Compiler {
575 builder,
576 prefilter,
577 nfa: NFA {
578 match_kind: builder.match_kind,
579 states: vec![],
580 pattern_lens: vec![],
581 prefilter: None,
582 byte_classes: ByteClasses::singletons(),
583 min_pattern_len: usize::MAX,
584 max_pattern_len: 0,
585 special: Special::zero(),
586 memory_usage: 0,
587 },
588 byteset: ByteClassSet::empty(),
589 })
590 }
591
    /// Compile the given patterns into a finished noncontiguous NFA.
    ///
    /// Construction proceeds in phases whose order matters: the sentinel and
    /// start states are added first (so their IDs are fixed), then the trie
    /// is built, then the start/dead states are fixed up, then failure
    /// transitions are computed and finally states are shuffled so "special"
    /// states can be identified with simple ID comparisons.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        // the dead state, only used for leftmost and fixed to id==0
        self.add_state(0)?;
        // the fail state, which is never entered and fixed to id==1
        self.add_state(0)?;
        // unanchored start state, initially fixed to id==2 but later shuffled
        // to appear after all non-start match states.
        self.nfa.special.start_unanchored_id = self.add_state(0)?;
        // anchored start state, initially fixed to id==3 but later shuffled
        // to appear after unanchored start state.
        self.nfa.special.start_anchored_id = self.add_state(0)?;
        // Initialize the unanchored starting state in order to make it dense,
        // and thus make transition lookups on this state faster.
        self.init_unanchored_start_state();
        // Build the base trie from the given patterns.
        self.build_trie(patterns)?;
        // Add transitions (and maybe matches) to the anchored starting state.
        // The anchored starting state is used for anchored searches. The only
        // mechanical difference between it and the unanchored start state is
        // that missing transitions map to the DEAD state instead of the FAIL
        // state.
        self.set_anchored_start_state();
        // Rewrite transitions to the FAIL state on the unanchored start state
        // as self-transitions. This keeps the start state active at all times.
        self.add_unanchored_start_state_loop();
        // Set all transitions on the DEAD state to point to itself. This way,
        // the DEAD state can never be escaped. It MUST be used as a sentinel
        // in any correct search.
        self.add_dead_state_loop();
        // The meat of the Aho-Corasick algorithm: compute and write failure
        // transitions. i.e., the state to move to when a transition isn't
        // defined in the current state. These are epsilon transitions and thus
        // make this formulation an NFA.
        self.fill_failure_transitions();
        // Handle a special case under leftmost semantics when at least one
        // of the patterns is the empty string.
        self.close_start_state_loop_for_leftmost();
        // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
        // NON-MATCH, ... This permits us to very quickly query the type of
        // the state we're currently in during a search.
        self.shuffle();
        // Turn our set of bytes into equivalent classes. This NFA
        // implementation doesn't use byte classes directly, but any
        // Aho-Corasick searcher built from this one might.
        self.nfa.byte_classes = self.byteset.byte_classes();
        self.nfa.prefilter = self.prefilter.build();
        self.calculate_memory_usage();
        // Store the maximum ID of all *relevant* special states. Start states
        // are only relevant when we have a prefilter, otherwise, there is zero
        // reason to care about whether a state is a start state or not during
        // a search. Indeed, without a prefilter, we are careful to explicitly
        // NOT care about start states, otherwise the search can ping pong
        // between the unrolled loop and the handling of special-status states
        // and destroy perf.
        self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
            // Why the anchored starting state? Because we always put it
            // after the unanchored starting state and it is therefore the
            // maximum. Why put unanchored followed by anchored? No particular
            // reason, but that's how the states are logically organized in the
            // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
            self.nfa.special.start_anchored_id
        } else {
            self.nfa.special.max_match_id
        };
        Ok(self.nfa)
    }
662
    /// This sets up the initial prefix trie that makes up the Aho-Corasick
    /// automaton. Effectively, it creates the basic structure of the
    /// automaton, where every pattern given has a path from the start state to
    /// the end of the pattern.
    ///
    /// This returns an error if the number of patterns exceeds the
    /// `PatternID` limit, or if any single pattern is too long to be indexed
    /// by a `SmallIndex`.
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
            let pid = PatternID::new(i).map_err(|e| {
                BuildError::pattern_id_overflow(
                    PatternID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
            let pat = pat.as_ref();
            let patlen = SmallIndex::new(pat.len())
                .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
            self.nfa.min_pattern_len =
                core::cmp::min(self.nfa.min_pattern_len, pat.len());
            self.nfa.max_pattern_len =
                core::cmp::max(self.nfa.max_pattern_len, pat.len());
            assert_eq!(
                i,
                self.nfa.pattern_lens.len(),
                "expected number of patterns to match pattern ID"
            );
            self.nfa.pattern_lens.push(patlen);
            // We add the pattern to the prefilter here because the pattern
            // ID in the prefilter is determined with respect to the patterns
            // added to the prefilter. That is, it isn't the ID we have here,
            // but the one determined by its own accounting of patterns.
            // To ensure they line up, we add every pattern we see to the
            // prefilter, even if some patterns ultimately are impossible to
            // match (in leftmost-first semantics specifically).
            //
            // Another way of doing this would be to expose an API in the
            // prefilter to permit setting your own pattern IDs. Or to just use
            // our own map and go between them. But this case is sufficiently
            // rare that we don't bother and just make sure they're in sync.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }

            // Walk the trie from the unanchored start state, extending it
            // with new states as needed for this pattern's bytes.
            let mut prev = self.nfa.special.start_unanchored_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.states[prev].is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. We don't use these
                // for NFA construction. These are instead used only if we're
                // building a DFA. They would technically be useful for the
                // NFA, but it would require a second pass over the patterns.
                self.byteset.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byteset.set_range(b, b);
                }

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.states[prev].next_state(b);
                if next != NFA::FAIL {
                    prev = next;
                } else {
                    let next = self.add_state(depth)?;
                    self.nfa.states[prev].set_next_state(b, next);
                    if self.builder.ascii_case_insensitive {
                        let b = opposite_ascii_case(b);
                        self.nfa.states[prev].set_next_state(b, next);
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.states[prev].matches.push(pid);
        }
        Ok(())
    }
762
763 /// This routine creates failure transitions according to the standard
764 /// textbook formulation of the Aho-Corasick algorithm, with a couple small
765 /// tweaks to support "leftmost" semantics.
766 ///
767 /// Building failure transitions is the most interesting part of building
768 /// the Aho-Corasick automaton, because they are what allow searches to
769 /// be performed in linear time. Specifically, a failure transition is
770 /// a single transition associated with each state that points back to
771 /// the longest proper suffix of the pattern being searched. The failure
772 /// transition is followed whenever there exists no transition on the
773 /// current state for the current input byte. If there is no other proper
774 /// suffix, then the failure transition points back to the starting state.
775 ///
776 /// For example, let's say we built an Aho-Corasick automaton with the
777 /// following patterns: 'abcd' and 'cef'. The trie looks like this:
778 ///
779 /// ```ignore
780 /// a - S1 - b - S2 - c - S3 - d - S4*
781 /// /
782 /// S0 - c - S5 - e - S6 - f - S7*
783 /// ```
784 ///
785 /// At this point, it should be fairly straight-forward to see how this
786 /// trie can be used in a simplistic way. At any given position in the
787 /// text we're searching (called the "subject" string), all we need to do
788 /// is follow the transitions in the trie by consuming one transition for
789 /// each byte in the subject string. If we reach a match state, then we can
790 /// report that location as a match.
791 ///
792 /// The trick comes when searching a subject string like 'abcef'. We'll
793 /// initially follow the transition from S0 to S1 and wind up in S3 after
    /// observing the 'c' byte. At this point, the next byte is 'e' but state
795 /// S3 has no transition for 'e', so the search fails. We then would need
796 /// to restart the search at the next position in 'abcef', which
797 /// corresponds to 'b'. The match would fail, but the next search starting
798 /// at 'c' would finally succeed. The problem with this approach is that
799 /// we wind up searching the subject string potentially many times. In
800 /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
801 /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
802 /// like to achieve a `O(n + m)` worst case complexity.
803 ///
804 /// This is where failure transitions come in. Instead of dying at S3 in
805 /// the first search, the automaton can instruct the search to move to
806 /// another part of the automaton that corresponds to a suffix of what
807 /// we've seen so far. Recall that we've seen 'abc' in the subject string,
808 /// and the automaton does indeed have a non-empty suffix, 'c', that could
809 /// potentially lead to another match. Thus, the actual Aho-Corasick
810 /// automaton for our patterns in this case looks like this:
811 ///
812 /// ```ignore
813 /// a - S1 - b - S2 - c - S3 - d - S4*
814 /// / /
815 /// / ----------------
816 /// / /
817 /// S0 - c - S5 - e - S6 - f - S7*
818 /// ```
819 ///
820 /// That is, we have a failure transition from S3 to S5, which is followed
821 /// exactly in cases when we are in state S3 but see any byte other than
822 /// 'd' (that is, we've "failed" to find a match in this portion of our
823 /// trie). We know we can transition back to S5 because we've already seen
824 /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
825 /// with the search starting at S5 and complete our match.
826 ///
827 /// Adding failure transitions to a trie is fairly simple, but subtle. The
    /// key issue is that you might have multiple failure transitions that you
829 /// need to follow. For example, look at the trie for the patterns
830 /// 'abcd', 'b', 'bcd' and 'cd':
831 ///
832 /// ```ignore
833 /// - a - S1 - b - S2* - c - S3 - d - S4*
834 /// / / /
835 /// / ------- -------
836 /// / / /
837 /// S0 --- b - S5* - c - S6 - d - S7*
838 /// \ /
839 /// \ --------
840 /// \ /
841 /// - c - S8 - d - S9*
842 /// ```
843 ///
844 /// The failure transitions for this trie are defined from S2 to S5,
845 /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
846 /// corresponds to a match, since its failure transition to S5 is itself
847 /// a match state.
848 ///
    /// Perhaps the simplest way to think about adding these failure transitions
850 /// is recursively. That is, if you know the failure transitions for every
851 /// possible previous state that could be visited (e.g., when computing the
852 /// failure transition for S3, you already know the failure transitions
853 /// for S0, S1 and S2), then you can simply follow the failure transition
854 /// of the previous state and check whether the incoming transition is
855 /// defined after following the failure transition.
856 ///
857 /// For example, when determining the failure state for S3, by our
858 /// assumptions, we already know that there is a failure transition from
859 /// S2 (the previous state) to S5. So we follow that transition and check
860 /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
861 /// as there is a transition from S5 to S6 for the byte 'c'. If no such
862 /// transition existed, we could keep following the failure transitions
863 /// until we reach the start state, which is the failure transition for
864 /// every state that has no corresponding proper suffix.
865 ///
866 /// We don't actually use recursion to implement this, but instead, use a
867 /// breadth first search of the automaton. Our base case is the start
868 /// state, whose failure transition is just a transition to itself.
869 ///
870 /// When building a leftmost automaton, we proceed as above, but only
871 /// include a subset of failure transitions. Namely, we omit any failure
872 /// transitions that appear after a match state in the trie. This is
873 /// because failure transitions always point back to a proper suffix of
874 /// what has been seen so far. Thus, following a failure transition after
875 /// a match implies looking for a match that starts after the one that has
876 /// already been seen, which is of course therefore not the leftmost match.
877 ///
878 /// N.B. I came up with this algorithm on my own, and after scouring all of
879 /// the other AC implementations I know of (Perl, Snort, many on GitHub).
880 /// I couldn't find any that implement leftmost semantics like this.
881 /// Perl of course needs leftmost-first semantics, but they implement it
882 /// with a seeming hack at *search* time instead of encoding it into the
883 /// automaton. There are also a couple Java libraries that support leftmost
884 /// longest semantics, but they do it by building a queue of matches at
885 /// search time, which is even worse than what Perl is doing. ---AG
    fn fill_failure_transitions(&mut self) {
        let is_leftmost = self.builder.match_kind.is_leftmost();
        let start_uid = self.nfa.special.start_unanchored_id;
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        // NOTE: We iterate by index (instead of over 'trans' directly)
        // because the loop body needs mutable access to 'self.nfa'.
        for i in 0..self.nfa.states[start_uid].trans.len() {
            let (_, next) = self.nfa.states[start_uid].trans[i];
            // Skip anything we've seen before and any self-transitions on the
            // start state.
            if next == start_uid || seen.contains(next) {
                continue;
            }
            queue.push_back(next);
            seen.insert(next);
            // Under leftmost semantics, if a state immediately following
            // the start state is a match state, then we never want to
            // follow its failure transition since the failure transition
            // necessarily leads back to the start state, which we never
            // want to do for leftmost matching after a match has been
            // found.
            //
            // We apply the same logic to non-start states below as well.
            if is_leftmost && self.nfa.states[next].is_match() {
                self.nfa.states[next].fail = NFA::DEAD;
            }
        }
        while let Some(id) = queue.pop_front() {
            for i in 0..self.nfa.states[id].trans.len() {
                let (b, next) = self.nfa.states[id].trans[i];
                if seen.contains(next) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(next);
                seen.insert(next);

                // As above for start states, under leftmost semantics, once
                // we see a match all subsequent states should have no failure
                // transitions because failure transitions always imply looking
                // for a match that is a suffix of what has been seen so far
                // (where "seen so far" corresponds to the string formed by
                // following the transitions from the start state to the
                // current state). Under leftmost semantics, we specifically do
                // not want to allow this to happen because we always want to
                // report the match found at the leftmost position.
                //
                // The difference between leftmost-first and leftmost-longest
                // occurs previously while we build the trie. For
                // leftmost-first, we simply omit any entries that would
                // otherwise require passing through a match state.
                //
                // Note that for correctness, the failure transition has to be
                // set to the dead state for ALL states following a match, not
                // just the match state itself. However, by setting the failure
                // transition to the dead state on all match states, the dead
                // state will automatically propagate to all subsequent states
                // via the failure state computation below.
                if is_leftmost && self.nfa.states[next].is_match() {
                    self.nfa.states[next].fail = NFA::DEAD;
                    continue;
                }
                // Compute 'next's failure transition: walk the parent's
                // failure chain until we reach a state with a transition
                // defined for byte 'b'. (This terminates because the
                // unanchored start state has a transition defined for every
                // byte once its self-loop has been added.)
                let mut fail = self.nfa.states[id].fail;
                while self.nfa.states[fail].next_state(b) == NFA::FAIL {
                    fail = self.nfa.states[fail].fail;
                }
                fail = self.nfa.states[fail].next_state(b);
                self.nfa.states[next].fail = fail;
                // Any match reachable from 'next' via its new failure
                // transition must also be reported at 'next' itself.
                self.copy_matches(fail, next);
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            if !is_leftmost {
                self.copy_matches(self.nfa.special.start_unanchored_id, id);
            }
        }
    }
978
979 /// Shuffle the states so that they appear in this sequence:
980 ///
981 /// DEAD, FAIL, MATCH..., START, START, NON-MATCH...
982 ///
983 /// The idea here is that if we know how special states are laid out in our
984 /// transition table, then we can determine what "kind" of state we're in
985 /// just by comparing our current state ID with a particular value. In this
986 /// way, we avoid doing extra memory lookups.
987 ///
988 /// Before shuffling begins, our states look something like this:
989 ///
990 /// DEAD, FAIL, START, START, (MATCH | NON-MATCH)...
991 ///
992 /// So all we need to do is move all of the MATCH states so that they
993 /// all appear before any NON-MATCH state, like so:
994 ///
995 /// DEAD, FAIL, START, START, MATCH... NON-MATCH...
996 ///
997 /// Then it's just a simple matter of swapping the two START states with
998 /// the last two MATCH states.
999 ///
1000 /// (This is the same technique used for fully compiled DFAs in
1001 /// regex-automata.)
    fn shuffle(&mut self) {
        let old_start_uid = self.nfa.special.start_unanchored_id;
        let old_start_aid = self.nfa.special.start_anchored_id;
        assert!(old_start_uid < old_start_aid);
        assert_eq!(
            3,
            old_start_aid.as_usize(),
            "anchored start state should be at index 3"
        );
        // We implement shuffling by a sequence of pairwise swaps of states.
        // Since we have a number of things referencing states via their
        // IDs and swapping them changes their IDs, we need to record every
        // swap we make so that we can remap IDs. The remapper handles this
        // book-keeping for us.
        let mut remapper = Remapper::new(&self.nfa, 0);
        // The way we proceed here is by moving all match states so that
        // they directly follow the start states. So it will go: DEAD, FAIL,
        // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ...
        //
        // To do that, we proceed forward through all states after
        // START-ANCHORED and swap match states so that they appear before all
        // non-match states.
        //
        // Index 4 is the first state following DEAD, FAIL and the two start
        // states (the assertions above pin the starts at indices 2 and 3).
        let mut next_avail = StateID::from(4u8);
        for i in next_avail.as_usize()..self.nfa.states.len() {
            let sid = StateID::new(i).unwrap();
            if !self.nfa.states[sid].is_match() {
                continue;
            }
            remapper.swap(&mut self.nfa, sid, next_avail);
            // The key invariant here is that only non-match states exist
            // between 'next_avail' and 'sid' (with them being potentially
            // equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed
            // to land on the leftmost non-match state. (Unless 'next_avail'
            // and 'sid' are equivalent, in which case, a swap will occur but
            // it is a no-op.)
            next_avail = StateID::new(next_avail.one_more()).unwrap();
        }
        // Now we'd like to move the start states to immediately following the
        // match states. (The start states may themselves be match states, but
        // we'll handle that later.) We arrange the states this way so that we
        // don't necessarily need to check whether a state is a start state or
        // not before checking whether a state is a match state. For example,
        // we'd like to be able to write this as our state machine loop:
        //
        //   sid = start()
        //   for byte in haystack:
        //       sid = next(sid, byte)
        //       if sid <= nfa.max_start_id:
        //           if sid <= nfa.max_dead_id:
        //               # search complete
        //           elif sid <= nfa.max_match_id:
        //               # found match
        //
        // The important context here is that we might not want to look for
        // start states at all. Namely, if a searcher doesn't have a prefilter,
        // then there is no reason to care about whether we're in a start state
        // or not. And indeed, if we did check for it, this very hot loop would
        // ping pong between the special state handling and the main state
        // transition logic. This in turn stalls the CPU by killing branch
        // prediction.
        //
        // So essentially, we really want to be able to "forget" that start
        // states even exist and this is why we put them at the end.
        //
        // At this point, 'next_avail' sits just past the region of match
        // states moved above, so the two slots before it become the new
        // start state positions.
        let new_start_aid =
            StateID::new(next_avail.as_usize().checked_sub(1).unwrap())
                .unwrap();
        remapper.swap(&mut self.nfa, old_start_aid, new_start_aid);
        let new_start_uid =
            StateID::new(next_avail.as_usize().checked_sub(2).unwrap())
                .unwrap();
        remapper.swap(&mut self.nfa, old_start_uid, new_start_uid);
        let new_max_match_id =
            StateID::new(next_avail.as_usize().checked_sub(3).unwrap())
                .unwrap();
        self.nfa.special.max_match_id = new_max_match_id;
        self.nfa.special.start_unanchored_id = new_start_uid;
        self.nfa.special.start_anchored_id = new_start_aid;
        // If one start state is a match state, then they both are.
        if self.nfa.states[self.nfa.special.start_anchored_id].is_match() {
            self.nfa.special.max_match_id = self.nfa.special.start_anchored_id;
        }
        // Rewrite every state ID recorded in the NFA to account for all of
        // the swaps performed above.
        remapper.remap(&mut self.nfa);
    }
1085
1086 /// Returns a set that tracked queued states.
1087 ///
1088 /// This is only necessary when ASCII case insensitivity is enabled, since
1089 /// it is the only way to visit the same state twice. Otherwise, this
1090 /// returns an inert set that nevers adds anything and always reports
1091 /// `false` for every member test.
1092 fn queued_set(&self) -> QueuedSet {
1093 if self.builder.ascii_case_insensitive {
1094 QueuedSet::active()
1095 } else {
1096 QueuedSet::inert()
1097 }
1098 }
1099
1100 /// Initializes the unanchored start state by making it dense. This is
1101 /// achieved by explicitly setting every transition to the FAIL state.
1102 /// This isn't necessary for correctness, since any missing transition is
1103 /// automatically assumed to be mapped to the FAIL state. We do this to
1104 /// make the unanchored starting state dense, and thus in turn make
1105 /// transition lookups on it faster. (Which is worth doing because it's
1106 /// the most active state.)
1107 fn init_unanchored_start_state(&mut self) {
1108 let start_uid = self.nfa.special.start_unanchored_id;
1109 for byte in 0..=255 {
1110 self.nfa.states[start_uid].set_next_state(byte, NFA::FAIL);
1111 }
1112 }
1113
1114 /// Setup the anchored start state by copying all of the transitions and
1115 /// matches from the unanchored starting state with one change: the failure
1116 /// transition is changed to the DEAD state, so that for any undefined
1117 /// transitions, the search will stop.
1118 fn set_anchored_start_state(&mut self) {
1119 let start_uid = self.nfa.special.start_unanchored_id;
1120 let start_aid = self.nfa.special.start_anchored_id;
1121 self.nfa.states[start_aid].trans =
1122 self.nfa.states[start_uid].trans.clone();
1123 self.copy_matches(start_uid, start_aid);
1124 // This is the main difference between the unanchored and anchored
1125 // starting states. If a lookup on an anchored starting state fails,
1126 // then the search should stop.
1127 //
1128 // N.B. This assumes that the loop on the unanchored starting state
1129 // hasn't been created yet.
1130 self.nfa.states[start_aid].fail = NFA::DEAD;
1131 }
1132
1133 /// Set the failure transitions on the start state to loop back to the
1134 /// start state. This effectively permits the Aho-Corasick automaton to
1135 /// match at any position. This is also required for finding the next
1136 /// state to terminate, namely, finding the next state should never return
1137 /// a fail_id.
1138 ///
1139 /// This must be done after building the initial trie, since trie
1140 /// construction depends on transitions to `fail_id` to determine whether a
1141 /// state already exists or not.
1142 fn add_unanchored_start_state_loop(&mut self) {
1143 let start_uid = self.nfa.special.start_unanchored_id;
1144 let start = &mut self.nfa.states[start_uid];
1145 for b in 0..=255 {
1146 if start.next_state(b) == NFA::FAIL {
1147 start.set_next_state(b, start_uid);
1148 }
1149 }
1150 }
1151
1152 /// Remove the start state loop by rewriting any transitions on the start
1153 /// state back to the start state with transitions to the dead state.
1154 ///
1155 /// The loop is only closed when two conditions are met: the start state
1156 /// is a match state and the match kind is leftmost-first or
1157 /// leftmost-longest.
1158 ///
1159 /// The reason for this is that under leftmost semantics, a start state
1160 /// that is also a match implies that we should never restart the search
1161 /// process. We allow normal transitions out of the start state, but if
1162 /// none exist, we transition to the dead state, which signals that
1163 /// searching should stop.
1164 fn close_start_state_loop_for_leftmost(&mut self) {
1165 let start_uid = self.nfa.special.start_unanchored_id;
1166 let start = &mut self.nfa.states[start_uid];
1167 if self.builder.match_kind.is_leftmost() && start.is_match() {
1168 for b in 0..=255 {
1169 if start.next_state(b) == start_uid {
1170 start.set_next_state(b, NFA::DEAD);
1171 }
1172 }
1173 }
1174 }
1175
1176 /// Sets all transitions on the dead state to point back to the dead state.
1177 /// Normally, missing transitions map back to the failure state, but the
1178 /// point of the dead state is to act as a sink that can never be escaped.
1179 fn add_dead_state_loop(&mut self) {
1180 let dead = &mut self.nfa.states[NFA::DEAD];
1181 for b in 0..=255 {
1182 dead.set_next_state(b, NFA::DEAD);
1183 }
1184 }
1185
1186 /// Copy matches from the `src` state to the `dst` state. This is useful
1187 /// when a match state can be reached via a failure transition. In which
1188 /// case, you'll want to copy the matches (if any) from the state reached
1189 /// by the failure transition to the original state you were at.
1190 fn copy_matches(&mut self, src: StateID, dst: StateID) {
1191 let (src, dst) =
1192 get_two_mut(&mut self.nfa.states, src.as_usize(), dst.as_usize());
1193 dst.matches.extend_from_slice(&src.matches);
1194 }
1195
1196 /// Allocate and add a fresh state to the underlying NFA and return its
1197 /// ID (guaranteed to be one more than the ID of the previously allocated
1198 /// state). If the ID would overflow `StateID`, then this returns an error.
1199 fn add_state(&mut self, depth: usize) -> Result<StateID, BuildError> {
1200 // This is OK because we error when building the trie if we see a
1201 // pattern whose length cannot fit into a 'SmallIndex', and the longest
1202 // possible depth corresponds to the length of the longest pattern.
1203 let depth = SmallIndex::new(depth)
1204 .expect("patterns longer than SmallIndex::MAX are not allowed");
1205 let id = StateID::new(self.nfa.states.len()).map_err(|e| {
1206 BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
1207 })?;
1208 self.nfa.states.push(State {
1209 trans: vec![],
1210 matches: vec![],
1211 fail: self.nfa.special.start_unanchored_id,
1212 depth,
1213 });
1214 Ok(id)
1215 }
1216
1217 /// Computes the total amount of heap used by this NFA in bytes.
1218 fn calculate_memory_usage(&mut self) {
1219 use core::mem::size_of;
1220
1221 for state in self.nfa.states.iter() {
1222 self.nfa.memory_usage += size_of::<State>() + state.memory_usage();
1223 }
1224 }
1225}
1226
/// A set of state identifiers used to avoid revisiting the same state multiple
/// times when filling in failure transitions.
///
/// This set has an "inert" and an "active" mode. When inert, the set never
/// stores anything and always returns `false` for every member test. This is
/// useful to avoid the performance and memory overhead of maintaining this
/// set when it is not needed.
#[derive(Debug)]
struct QueuedSet {
    // `None` means the set is inert; `Some` means it is active.
    set: Option<BTreeSet<StateID>>,
}
1238
1239impl QueuedSet {
1240 /// Return an inert set that returns `false` for every state ID membership
1241 /// test.
1242 fn inert() -> QueuedSet {
1243 QueuedSet { set: None }
1244 }
1245
1246 /// Return an active set that tracks state ID membership.
1247 fn active() -> QueuedSet {
1248 QueuedSet { set: Some(BTreeSet::new()) }
1249 }
1250
1251 /// Inserts the given state ID into this set. (If the set is inert, then
1252 /// this is a no-op.)
1253 fn insert(&mut self, state_id: StateID) {
1254 if let Some(ref mut set) = self.set {
1255 set.insert(state_id);
1256 }
1257 }
1258
1259 /// Returns true if and only if the given state ID is in this set. If the
1260 /// set is inert, this always returns false.
1261 fn contains(&self, state_id: StateID) -> bool {
1262 match self.set {
1263 None => false,
1264 Some(ref set) => set.contains(&state_id),
1265 }
1266 }
1267}
1268
1269impl core::fmt::Debug for NFA {
1270 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1271 use crate::automaton::fmt_state_indicator;
1272
1273 writeln!(f, "noncontiguous::NFA(")?;
1274 for (sid, state) in self.states.iter().with_state_ids() {
1275 // The FAIL state doesn't actually have space for a state allocated
1276 // for it, so we have to treat it as a special case.
1277 if sid == NFA::FAIL {
1278 writeln!(f, "F {:06}:", sid.as_usize())?;
1279 continue;
1280 }
1281 fmt_state_indicator(f, self, sid)?;
1282 write!(
1283 f,
1284 "{:06}({:06}): ",
1285 sid.as_usize(),
1286 state.fail.as_usize()
1287 )?;
1288 state.fmt(f)?;
1289 write!(f, "\n")?;
1290 if self.is_match(sid) {
1291 write!(f, " matches: ")?;
1292 for (i, pid) in state.matches.iter().enumerate() {
1293 if i > 0 {
1294 write!(f, ", ")?;
1295 }
1296 write!(f, "{}", pid.as_usize())?;
1297 }
1298 write!(f, "\n")?;
1299 }
1300 }
1301 writeln!(f, "match kind: {:?}", self.match_kind)?;
1302 writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
1303 writeln!(f, "state length: {:?}", self.states.len())?;
1304 writeln!(f, "pattern length: {:?}", self.patterns_len())?;
1305 writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
1306 writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
1307 writeln!(f, "memory usage: {:?}", self.memory_usage())?;
1308 writeln!(f, ")")?;
1309 Ok(())
1310 }
1311}
1312
/// Safely return two mutable borrows to two different locations in the given
/// slice.
///
/// This panics if i == j.
fn get_two_mut<T>(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) {
    assert!(i != j, "{} must not be equal to {}", i, j);
    if i < j {
        // Split at the larger index: 'before' covers ..j and 'after' covers
        // j.., so xs[i] lives in 'before' and xs[j] is the first element of
        // 'after'.
        let (before, after) = xs.split_at_mut(j);
        (&mut before[i], &mut after[0])
    } else {
        let (before, after) = xs.split_at_mut(i);
        (&mut after[0], &mut before[j])
    }
}
1327