1/*!
2Provides a contiguous NFA implementation of Aho-Corasick.
3
4This is a low-level API that generally only needs to be used in niche
5circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
6instead of a contiguous NFA directly. Using an `NFA` directly is typically only
7necessary when one needs access to the [`Automaton`] trait implementation.
8*/
9
10use alloc::{vec, vec::Vec};
11
12use crate::{
13 automaton::Automaton,
14 nfa::noncontiguous,
15 util::{
16 alphabet::ByteClasses,
17 error::{BuildError, MatchError},
18 int::{Usize, U16, U32},
19 prefilter::Prefilter,
20 primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
21 search::{Anchored, MatchKind},
22 special::Special,
23 },
24};
25
26/// A contiguous NFA implementation of Aho-Corasick.
27///
28/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
29/// this type directly. Using an `NFA` directly is typically only necessary
30/// when one needs access to the [`Automaton`] trait implementation.
31///
32/// This NFA can only be built by first constructing a [`noncontiguous::NFA`].
33/// Both [`NFA::new`] and [`Builder::build`] do this for you automatically, but
34/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
35///
36/// The main difference between a noncontiguous NFA and a contiguous NFA is
37/// that the latter represents all of its states and transitions in a single
38/// allocation, where as the former uses a separate allocation for each state.
39/// Doing this at construction time while keeping a low memory footprint isn't
40/// feasible, which is primarily why there are two different NFA types: one
41/// that does the least amount of work possible to build itself, and another
42/// that does a little extra work to compact itself and make state transitions
43/// faster by making some states use a dense representation.
44///
45/// Because a contiguous NFA uses a single allocation, there is a lot more
46/// opportunity for compression tricks to reduce the heap memory used. Indeed,
47/// it is not uncommon for a contiguous NFA to use an order of magnitude less
48/// heap memory than a noncontiguous NFA. Since building a contiguous NFA
49/// usually only takes a fraction of the time it takes to build a noncontiguous
50/// NFA, the overall build time is not much slower. Thus, in most cases, a
51/// contiguous NFA is the best choice.
52///
53/// Since a contiguous NFA uses various tricks for compression and to achieve
54/// faster state transitions, currently, its limit on the number of states
55/// is somewhat smaller than what a noncontiguous NFA can achieve. Generally
56/// speaking, you shouldn't expect to run into this limit if the number of
57/// patterns is under 1 million. It is plausible that this limit will be
58/// increased in the future. If the limit is reached, building a contiguous NFA
59/// will return an error. Often, since building a contiguous NFA is relatively
60/// cheap, it can make sense to always try it even if you aren't sure if it
61/// will fail or not. If it does, you can always fall back to a noncontiguous
62/// NFA. (Indeed, the main [`AhoCorasick`](crate::AhoCorasick) type employs a
63/// strategy similar to this at construction time.)
64///
65/// # Example
66///
67/// This example shows how to build an `NFA` directly and use it to execute
68/// [`Automaton::try_find`]:
69///
70/// ```
71/// use aho_corasick::{
72/// automaton::Automaton,
73/// nfa::contiguous::NFA,
74/// Input, Match,
75/// };
76///
77/// let patterns = &["b", "abc", "abcd"];
78/// let haystack = "abcd";
79///
80/// let nfa = NFA::new(patterns).unwrap();
81/// assert_eq!(
82/// Some(Match::must(0, 1..2)),
83/// nfa.try_find(&Input::new(haystack))?,
84/// );
85/// # Ok::<(), Box<dyn std::error::Error>>(())
86/// ```
87///
88/// It is also possible to implement your own version of `try_find`. See the
89/// [`Automaton`] documentation for an example.
90#[derive(Clone)]
91pub struct NFA {
92 /// The raw NFA representation. Each state is packed with a header
93 /// (containing the format of the state, the failure transition and, for
94 /// a sparse state, the number of transitions), its transitions and any
95 /// matching pattern IDs for match states.
96 repr: Vec<u32>,
97 /// The length of each pattern. This is used to compute the start offset
98 /// of a match.
99 pattern_lens: Vec<SmallIndex>,
100 /// The total number of states in this NFA.
101 state_len: usize,
102 /// A prefilter for accelerating searches, if one exists.
103 prefilter: Option<Prefilter>,
104 /// The match semantics built into this NFA.
105 match_kind: MatchKind,
106 /// The alphabet size, or total number of equivalence classes, for this
107 /// NFA. Dense states always have this many transitions.
108 alphabet_len: usize,
109 /// The equivalence classes for this NFA. All transitions, dense and
110 /// sparse, are defined on equivalence classes and not on the 256 distinct
111 /// byte values.
112 byte_classes: ByteClasses,
113 /// The length of the shortest pattern in this automaton.
114 min_pattern_len: usize,
115 /// The length of the longest pattern in this automaton.
116 max_pattern_len: usize,
117 /// The information required to deduce which states are "special" in this
118 /// NFA.
119 special: Special,
120}
121
122impl NFA {
123 /// Create a new Aho-Corasick contiguous NFA using the default
124 /// configuration.
125 ///
126 /// Use a [`Builder`] if you want to change the configuration.
127 pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError>
128 where
129 I: IntoIterator<Item = P>,
130 P: AsRef<[u8]>,
131 {
132 NFA::builder().build(patterns)
133 }
134
135 /// A convenience method for returning a new Aho-Corasick contiguous NFA
136 /// builder.
137 ///
138 /// This usually permits one to just import the `NFA` type.
139 pub fn builder() -> Builder {
140 Builder::new()
141 }
142}
143
144impl NFA {
145 /// A sentinel state ID indicating that a search should stop once it has
146 /// entered this state. When a search stops, it returns a match if one
147 /// has been found, otherwise no match. A contiguous NFA always has an
148 /// actual dead state at this ID.
149 const DEAD: StateID = StateID::new_unchecked(0);
150 /// Another sentinel state ID indicating that a search should move through
151 /// current state's failure transition.
152 ///
153 /// Note that unlike DEAD, this does not actually point to a valid state
154 /// in a contiguous NFA. (noncontiguous::NFA::FAIL does point to a valid
155 /// state.) Instead, this points to the position that is guaranteed to
156 /// never be a valid state ID (by making sure it points to a place in the
157 /// middle of the encoding of the DEAD state). Since we never need to
158 /// actually look at the FAIL state itself, this works out.
159 ///
160 /// By why do it this way? So that FAIL is a constant. I don't have any
161 /// concrete evidence that this materially helps matters, but it's easy to
162 /// do. The alternative would be making the FAIL ID point to the second
163 /// state, which could be made a constant but is a little trickier to do.
164 /// The easiest path is to just make the FAIL state a runtime value, but
165 /// since comparisons with FAIL occur in perf critical parts of the search,
166 /// we want it to be as tight as possible and not waste any registers.
167 ///
168 /// Very hand wavy... But the code complexity that results from this is
169 /// very mild.
170 const FAIL: StateID = StateID::new_unchecked(1);
171}
172
173// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
174// returns a valid state ID given a valid state ID. We otherwise claim that
175// all other methods are correct as well.
176unsafe impl Automaton for NFA {
177 #[inline(always)]
178 fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
179 match anchored {
180 Anchored::No => Ok(self.special.start_unanchored_id),
181 Anchored::Yes => Ok(self.special.start_anchored_id),
182 }
183 }
184
185 #[inline(always)]
186 fn next_state(
187 &self,
188 anchored: Anchored,
189 mut sid: StateID,
190 byte: u8,
191 ) -> StateID {
192 let repr = &self.repr;
193 let class = self.byte_classes.get(byte);
194 let u32tosid = StateID::from_u32_unchecked;
195 loop {
196 let o = sid.as_usize();
197 let kind = repr[o] & 0xFF;
198 // I tried to encapsulate the "next transition" logic into its own
199 // function, but it seemed to always result in sub-optimal codegen
200 // that led to real and significant slowdowns. So we just inline
201 // the logic here.
202 //
203 // I've also tried a lot of different ways to speed up this
204 // routine, and most of them have failed.
205 if kind == State::KIND_DENSE {
206 let next = u32tosid(repr[o + 2 + usize::from(class)]);
207 if next != NFA::FAIL {
208 return next;
209 }
210 } else if kind == State::KIND_ONE {
211 if class == repr[o].low_u16().high_u8() {
212 return u32tosid(repr[o + 2]);
213 }
214 } else {
215 // NOTE: I tried a SWAR technique in the loop below, but found
216 // it slower. See the 'swar' test in the tests for this module.
217 let trans_len = kind.as_usize();
218 let classes_len = u32_len(trans_len);
219 let trans_offset = o + 2 + classes_len;
220 for (i, &chunk) in
221 repr[o + 2..][..classes_len].iter().enumerate()
222 {
223 let classes = chunk.to_ne_bytes();
224 if classes[0] == class {
225 return u32tosid(repr[trans_offset + i * 4]);
226 }
227 if classes[1] == class {
228 return u32tosid(repr[trans_offset + i * 4 + 1]);
229 }
230 if classes[2] == class {
231 return u32tosid(repr[trans_offset + i * 4 + 2]);
232 }
233 if classes[3] == class {
234 return u32tosid(repr[trans_offset + i * 4 + 3]);
235 }
236 }
237 }
238 // For an anchored search, we never follow failure transitions
239 // because failure transitions lead us down a path to matching
240 // a *proper* suffix of the path we were on. Thus, it can only
241 // produce matches that appear after the beginning of the search.
242 if anchored.is_anchored() {
243 return NFA::DEAD;
244 }
245 sid = u32tosid(repr[o + 1]);
246 }
247 }
248
249 #[inline(always)]
250 fn is_special(&self, sid: StateID) -> bool {
251 sid <= self.special.max_special_id
252 }
253
254 #[inline(always)]
255 fn is_dead(&self, sid: StateID) -> bool {
256 sid == NFA::DEAD
257 }
258
259 #[inline(always)]
260 fn is_match(&self, sid: StateID) -> bool {
261 !self.is_dead(sid) && sid <= self.special.max_match_id
262 }
263
264 #[inline(always)]
265 fn is_start(&self, sid: StateID) -> bool {
266 sid == self.special.start_unanchored_id
267 || sid == self.special.start_anchored_id
268 }
269
270 #[inline(always)]
271 fn match_kind(&self) -> MatchKind {
272 self.match_kind
273 }
274
275 #[inline(always)]
276 fn patterns_len(&self) -> usize {
277 self.pattern_lens.len()
278 }
279
280 #[inline(always)]
281 fn pattern_len(&self, pid: PatternID) -> usize {
282 self.pattern_lens[pid].as_usize()
283 }
284
285 #[inline(always)]
286 fn min_pattern_len(&self) -> usize {
287 self.min_pattern_len
288 }
289
290 #[inline(always)]
291 fn max_pattern_len(&self) -> usize {
292 self.max_pattern_len
293 }
294
295 #[inline(always)]
296 fn match_len(&self, sid: StateID) -> usize {
297 State::match_len(self.alphabet_len, &self.repr[sid.as_usize()..])
298 }
299
300 #[inline(always)]
301 fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
302 State::match_pattern(
303 self.alphabet_len,
304 &self.repr[sid.as_usize()..],
305 index,
306 )
307 }
308
309 #[inline(always)]
310 fn memory_usage(&self) -> usize {
311 use core::mem::size_of;
312
313 (self.repr.len() * size_of::<u32>())
314 + (self.pattern_lens.len() * size_of::<SmallIndex>())
315 + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
316 }
317
318 #[inline(always)]
319 fn prefilter(&self) -> Option<&Prefilter> {
320 self.prefilter.as_ref()
321 }
322}
323
324impl core::fmt::Debug for NFA {
325 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
326 use crate::automaton::fmt_state_indicator;
327
328 writeln!(f, "contiguous::NFA(")?;
329 let mut sid = NFA::DEAD; // always the first state and always present
330 loop {
331 let raw = &self.repr[sid.as_usize()..];
332 if raw.is_empty() {
333 break;
334 }
335 let is_match = self.is_match(sid);
336 let state = State::read(self.alphabet_len, is_match, raw);
337 fmt_state_indicator(f, self, sid)?;
338 write!(
339 f,
340 "{:06}({:06}): ",
341 sid.as_usize(),
342 state.fail.as_usize()
343 )?;
344 state.fmt(f)?;
345 write!(f, "\n")?;
346 if self.is_match(sid) {
347 write!(f, " matches: ")?;
348 for i in 0..state.match_len {
349 let pid = State::match_pattern(self.alphabet_len, raw, i);
350 if i > 0 {
351 write!(f, ", ")?;
352 }
353 write!(f, "{}", pid.as_usize())?;
354 }
355 write!(f, "\n")?;
356 }
357 // The FAIL state doesn't actually have space for a state allocated
358 // for it, so we have to treat it as a special case. write below
359 // the DEAD state.
360 if sid == NFA::DEAD {
361 writeln!(f, "F {:06}:", NFA::FAIL.as_usize())?;
362 }
363 let len = State::len(self.alphabet_len, is_match, raw);
364 sid = StateID::new(sid.as_usize().checked_add(len).unwrap())
365 .unwrap();
366 }
367 writeln!(f, "match kind: {:?}", self.match_kind)?;
368 writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
369 writeln!(f, "state length: {:?}", self.state_len)?;
370 writeln!(f, "pattern length: {:?}", self.patterns_len())?;
371 writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
372 writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
373 writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
374 writeln!(f, "byte classes: {:?}", self.byte_classes)?;
375 writeln!(f, "memory usage: {:?}", self.memory_usage())?;
376 writeln!(f, ")")?;
377
378 Ok(())
379 }
380}
381
382/// The "in memory" representation a single dense or sparse state.
383///
384/// A `State`'s in memory representation is not ever actually materialized
385/// during a search with a contiguous NFA. Doing so would be too slow. (Indeed,
386/// the only time a `State` is actually constructed is in `Debug` impls.)
387/// Instead, a `State` exposes a number of static methods for reading certain
388/// things from the raw binary encoding of the state.
389#[derive(Clone)]
390struct State<'a> {
391 /// The state to transition to when 'class_to_next' yields a transition
392 /// to the FAIL state.
393 fail: StateID,
394 /// The number of pattern IDs in this state. For a non-match state, this is
395 /// always zero. Otherwise it is always bigger than zero.
396 match_len: usize,
397 /// The sparse or dense representation of the transitions for this state.
398 trans: StateTrans<'a>,
399}
400
401/// The underlying representation of sparse or dense transitions for a state.
402///
403/// Note that like `State`, we don't typically construct values of this type
404/// during a search since we don't always need all values and thus would
405/// represent a lot of wasteful work.
406#[derive(Clone)]
407enum StateTrans<'a> {
408 /// A sparse representation of transitions for a state, where only non-FAIL
409 /// transitions are explicitly represented.
410 Sparse {
411 classes: &'a [u32],
412 /// The transitions for this state, where each transition is packed
413 /// into a u32. The low 8 bits correspond to the byte class for the
414 /// transition, and the high 24 bits correspond to the next state ID.
415 ///
416 /// This packing is why the max state ID allowed for a contiguous
417 /// NFA is 2^24-1.
418 nexts: &'a [u32],
419 },
420 /// A "one transition" state that is never a match state.
421 ///
422 /// These are by far the most common state, so we use a specialized and
423 /// very compact representation for them.
424 One {
425 /// The element of this NFA's alphabet that this transition is
426 /// defined for.
427 class: u8,
428 /// The state this should transition to if the current symbol is
429 /// equal to 'class'.
430 next: u32,
431 },
432 /// A dense representation of transitions for a state, where all
433 /// transitions are explicitly represented, including transitions to the
434 /// FAIL state.
435 Dense {
436 /// A dense set of transitions to other states. The transitions may
437 /// point to a FAIL state, in which case, the search should try the
438 /// same transition lookup at 'fail'.
439 ///
440 /// Note that this is indexed by byte equivalence classes and not
441 /// byte values. That means 'class_to_next[byte]' is wrong and
442 /// 'class_to_next[classes.get(byte)]' is correct. The number of
443 /// transitions is always equivalent to 'classes.alphabet_len()'.
444 class_to_next: &'a [u32],
445 },
446}
447
448impl<'a> State<'a> {
449 /// The offset of where the "kind" of a state is stored. If it isn't one
450 /// of the sentinel values below, then it's a sparse state and the kind
451 /// corresponds to the number of transitions in the state.
452 const KIND: usize = 0;
453
454 /// A sentinel value indicating that the state uses a dense representation.
455 const KIND_DENSE: u32 = 0xFF;
456 /// A sentinel value indicating that the state uses a special "one
457 /// transition" encoding. In practice, non-match states with one transition
458 /// make up the overwhelming majority of all states in any given
459 /// Aho-Corasick automaton, so we can specialize them using a very compact
460 /// representation.
461 const KIND_ONE: u32 = 0xFE;
462
463 /// The maximum number of transitions to encode as a sparse state. Usually
464 /// states with a lot of transitions are either very rare, or occur near
465 /// the start state. In the latter case, they are probably dense already
466 /// anyway. In the former case, making them dense is fine because they're
467 /// rare.
468 ///
469 /// This needs to be small enough to permit each of the sentinel values for
470 /// 'KIND' above. Namely, a sparse state embeds the number of transitions
471 /// into the 'KIND'. Basically, "sparse" is a state kind too, but it's the
472 /// "else" branch.
473 ///
474 /// N.B. There isn't anything particularly magical about 127 here. I
475 /// just picked it because I figured any sparse state with this many
476 /// transitions is going to be exceptionally rare, and if it did have this
477 /// many transitions, then it would be quite slow to do a linear scan on
478 /// the transitions during a search anyway.
479 const MAX_SPARSE_TRANSITIONS: usize = 127;
480
481 /// Remap state IDs in-place.
482 ///
483 /// `state` should be the the raw binary encoding of a state. (The start
484 /// of the slice must correspond to the start of the state, but the slice
485 /// may extend past the end of the encoding of the state.)
486 fn remap(
487 alphabet_len: usize,
488 old_to_new: &[StateID],
489 state: &mut [u32],
490 ) -> Result<(), BuildError> {
491 let kind = State::kind(state);
492 if kind == State::KIND_DENSE {
493 state[1] = old_to_new[state[1].as_usize()].as_u32();
494 for next in state[2..][..alphabet_len].iter_mut() {
495 *next = old_to_new[next.as_usize()].as_u32();
496 }
497 } else if kind == State::KIND_ONE {
498 state[1] = old_to_new[state[1].as_usize()].as_u32();
499 state[2] = old_to_new[state[2].as_usize()].as_u32();
500 } else {
501 let trans_len = State::sparse_trans_len(state);
502 let classes_len = u32_len(trans_len);
503 state[1] = old_to_new[state[1].as_usize()].as_u32();
504 for next in state[2 + classes_len..][..trans_len].iter_mut() {
505 *next = old_to_new[next.as_usize()].as_u32();
506 }
507 }
508 Ok(())
509 }
510
511 /// Returns the length, in number of u32s, of this state.
512 ///
513 /// This is useful for reading states consecutively, e.g., in the Debug
514 /// impl without needing to store a separate map from state index to state
515 /// identifier.
516 ///
517 /// `state` should be the the raw binary encoding of a state. (The start
518 /// of the slice must correspond to the start of the state, but the slice
519 /// may extend past the end of the encoding of the state.)
520 fn len(alphabet_len: usize, is_match: bool, state: &[u32]) -> usize {
521 let kind_len = 1;
522 let fail_len = 1;
523 let kind = State::kind(state);
524 let (classes_len, trans_len) = if kind == State::KIND_DENSE {
525 (0, alphabet_len)
526 } else if kind == State::KIND_ONE {
527 (0, 1)
528 } else {
529 let trans_len = State::sparse_trans_len(state);
530 let classes_len = u32_len(trans_len);
531 (classes_len, trans_len)
532 };
533 let match_len = if !is_match {
534 0
535 } else if State::match_len(alphabet_len, state) == 1 {
536 // This is a special case because when there is one pattern ID for
537 // a match state, it is represented by a single u32 with its high
538 // bit set (which is impossible for a valid pattern ID).
539 1
540 } else {
541 // We add 1 to include the u32 that indicates the number of
542 // pattern IDs that follow.
543 1 + State::match_len(alphabet_len, state)
544 };
545 kind_len + fail_len + classes_len + trans_len + match_len
546 }
547
548 /// Returns the kind of this state.
549 ///
550 /// This only includes the low byte.
551 #[inline(always)]
552 fn kind(state: &[u32]) -> u32 {
553 state[State::KIND] & 0xFF
554 }
555
556 /// Get the number of sparse transitions in this state. This can never
557 /// be more than State::MAX_SPARSE_TRANSITIONS, as all states with more
558 /// transitions are encoded as dense states.
559 ///
560 /// `state` should be the the raw binary encoding of a sparse state. (The
561 /// start of the slice must correspond to the start of the state, but the
562 /// slice may extend past the end of the encoding of the state.) If this
563 /// isn't a sparse state, then the return value is unspecified.
564 ///
565 /// Do note that this is only legal to call on a sparse state. So for
566 /// example, "one transition" state is not a sparse state, so it would not
567 /// be legal to call this method on such a state.
568 #[inline(always)]
569 fn sparse_trans_len(state: &[u32]) -> usize {
570 (state[State::KIND] & 0xFF).as_usize()
571 }
572
573 /// Returns the total number of matching pattern IDs in this state. Calling
574 /// this on a state that isn't a match results in unspecified behavior.
575 /// Thus, the returned number is never 0 for all correct calls.
576 ///
577 /// `state` should be the the raw binary encoding of a state. (The start
578 /// of the slice must correspond to the start of the state, but the slice
579 /// may extend past the end of the encoding of the state.)
580 #[inline(always)]
581 fn match_len(alphabet_len: usize, state: &[u32]) -> usize {
582 // We don't need to handle KIND_ONE here because it can never be a
583 // match state.
584 let packed = if State::kind(state) == State::KIND_DENSE {
585 let start = 2 + alphabet_len;
586 state[start].as_usize()
587 } else {
588 let trans_len = State::sparse_trans_len(state);
589 let classes_len = u32_len(trans_len);
590 let start = 2 + classes_len + trans_len;
591 state[start].as_usize()
592 };
593 if packed & (1 << 31) == 0 {
594 packed
595 } else {
596 1
597 }
598 }
599
600 /// Returns the pattern ID corresponding to the given index for the state
601 /// given. The `index` provided must be less than the number of pattern IDs
602 /// in this state.
603 ///
604 /// `state` should be the the raw binary encoding of a state. (The start of
605 /// the slice must correspond to the start of the state, but the slice may
606 /// extend past the end of the encoding of the state.)
607 ///
608 /// If the given state is not a match state or if the index is out of
609 /// bounds, then this has unspecified behavior.
610 #[inline(always)]
611 fn match_pattern(
612 alphabet_len: usize,
613 state: &[u32],
614 index: usize,
615 ) -> PatternID {
616 // We don't need to handle KIND_ONE here because it can never be a
617 // match state.
618 let start = if State::kind(state) == State::KIND_DENSE {
619 2 + alphabet_len
620 } else {
621 let trans_len = State::sparse_trans_len(state);
622 let classes_len = u32_len(trans_len);
623 2 + classes_len + trans_len
624 };
625 let packed = state[start];
626 let pid = if packed & (1 << 31) == 0 {
627 state[start + 1 + index]
628 } else {
629 assert_eq!(0, index);
630 packed & !(1 << 31)
631 };
632 PatternID::from_u32_unchecked(pid)
633 }
634
635 /// Read a state's binary encoding to its in-memory representation.
636 ///
637 /// `alphabet_len` should be the total number of transitions defined for
638 /// dense states.
639 ///
640 /// `is_match` should be true if this state is a match state and false
641 /// otherwise.
642 ///
643 /// `state` should be the the raw binary encoding of a state. (The start
644 /// of the slice must correspond to the start of the state, but the slice
645 /// may extend past the end of the encoding of the state.)
646 fn read(
647 alphabet_len: usize,
648 is_match: bool,
649 state: &'a [u32],
650 ) -> State<'a> {
651 let kind = State::kind(state);
652 let match_len =
653 if !is_match { 0 } else { State::match_len(alphabet_len, state) };
654 let (trans, fail) = if kind == State::KIND_DENSE {
655 let fail = StateID::from_u32_unchecked(state[1]);
656 let class_to_next = &state[2..][..alphabet_len];
657 (StateTrans::Dense { class_to_next }, fail)
658 } else if kind == State::KIND_ONE {
659 let fail = StateID::from_u32_unchecked(state[1]);
660 let class = state[State::KIND].low_u16().high_u8();
661 let next = state[2];
662 (StateTrans::One { class, next }, fail)
663 } else {
664 let fail = StateID::from_u32_unchecked(state[1]);
665 let trans_len = State::sparse_trans_len(state);
666 let classes_len = u32_len(trans_len);
667 let classes = &state[2..][..classes_len];
668 let nexts = &state[2 + classes_len..][..trans_len];
669 (StateTrans::Sparse { classes, nexts }, fail)
670 };
671 State { fail, match_len, trans }
672 }
673
674 /// Encode the "old" state from a noncontiguous NFA to its binary
675 /// representation to the given `dst` slice. `classes` should be the byte
676 /// classes computed for the noncontiguous NFA that the given state came
677 /// from.
678 ///
679 /// This returns an error if `dst` became so big that `StateID`s can no
680 /// longer be created for new states. Otherwise, it returns the state ID of
681 /// the new state created.
682 ///
683 /// When `force_dense` is true, then the encoded state will always use a
684 /// dense format. Otherwise, the choice between dense and sparse will be
685 /// automatically chosen based on the old state.
686 fn write(
687 nnfa: &noncontiguous::NFA,
688 oldsid: StateID,
689 old: &noncontiguous::State,
690 classes: &ByteClasses,
691 dst: &mut Vec<u32>,
692 force_dense: bool,
693 ) -> Result<StateID, BuildError> {
694 let sid = StateID::new(dst.len()).map_err(|e| {
695 BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
696 })?;
697 let old_len = nnfa.iter_trans(oldsid).count();
698 // For states with a lot of transitions, we might as well just make
699 // them dense. These kinds of hot states tend to be very rare, so we're
700 // okay with it. This also gives us more sentinels in the state's
701 // 'kind', which lets us create different state kinds to save on
702 // space.
703 let kind = if force_dense || old_len > State::MAX_SPARSE_TRANSITIONS {
704 State::KIND_DENSE
705 } else if old_len == 1 && !old.is_match() {
706 State::KIND_ONE
707 } else {
708 // For a sparse state, the kind is just the number of transitions.
709 u32::try_from(old_len).unwrap()
710 };
711 if kind == State::KIND_DENSE {
712 dst.push(kind);
713 dst.push(old.fail().as_u32());
714 State::write_dense_trans(nnfa, oldsid, classes, dst)?;
715 } else if kind == State::KIND_ONE {
716 let t = nnfa.iter_trans(oldsid).next().unwrap();
717 let class = u32::from(classes.get(t.byte()));
718 dst.push(kind | (class << 8));
719 dst.push(old.fail().as_u32());
720 dst.push(t.next().as_u32());
721 } else {
722 dst.push(kind);
723 dst.push(old.fail().as_u32());
724 State::write_sparse_trans(nnfa, oldsid, classes, dst)?;
725 }
726 // Now finally write the number of matches and the matches themselves.
727 if old.is_match() {
728 let matches_len = nnfa.iter_matches(oldsid).count();
729 if matches_len == 1 {
730 let pid = nnfa.iter_matches(oldsid).next().unwrap().as_u32();
731 assert_eq!(0, pid & (1 << 31));
732 dst.push((1 << 31) | pid);
733 } else {
734 assert_eq!(0, matches_len & (1 << 31));
735 dst.push(matches_len.as_u32());
736 dst.extend(nnfa.iter_matches(oldsid).map(|pid| pid.as_u32()));
737 }
738 }
739 Ok(sid)
740 }
741
742 /// Encode the "old" state transitions from a noncontiguous NFA to its
743 /// binary sparse representation to the given `dst` slice. `classes` should
744 /// be the byte classes computed for the noncontiguous NFA that the given
745 /// state came from.
746 ///
747 /// This returns an error if `dst` became so big that `StateID`s can no
748 /// longer be created for new states.
749 fn write_sparse_trans(
750 nnfa: &noncontiguous::NFA,
751 oldsid: StateID,
752 classes: &ByteClasses,
753 dst: &mut Vec<u32>,
754 ) -> Result<(), BuildError> {
755 let (mut chunk, mut len) = ([0; 4], 0);
756 for t in nnfa.iter_trans(oldsid) {
757 chunk[len] = classes.get(t.byte());
758 len += 1;
759 if len == 4 {
760 dst.push(u32::from_ne_bytes(chunk));
761 chunk = [0; 4];
762 len = 0;
763 }
764 }
765 if len > 0 {
766 // In the case where the number of transitions isn't divisible
767 // by 4, the last u32 chunk will have some left over room. In
768 // this case, we "just" repeat the last equivalence class. By
769 // doing this, we know the leftover faux transitions will never
770 // be followed because if they were, it would have been followed
771 // prior to it in the last equivalence class. This saves us some
772 // branching in the search time state transition code.
773 let repeat = chunk[len - 1];
774 while len < 4 {
775 chunk[len] = repeat;
776 len += 1;
777 }
778 dst.push(u32::from_ne_bytes(chunk));
779 }
780 for t in nnfa.iter_trans(oldsid) {
781 dst.push(t.next().as_u32());
782 }
783 Ok(())
784 }
785
786 /// Encode the "old" state transitions from a noncontiguous NFA to its
787 /// binary dense representation to the given `dst` slice. `classes` should
788 /// be the byte classes computed for the noncontiguous NFA that the given
789 /// state came from.
790 ///
791 /// This returns an error if `dst` became so big that `StateID`s can no
792 /// longer be created for new states.
793 fn write_dense_trans(
794 nnfa: &noncontiguous::NFA,
795 oldsid: StateID,
796 classes: &ByteClasses,
797 dst: &mut Vec<u32>,
798 ) -> Result<(), BuildError> {
799 // Our byte classes let us shrink the size of our dense states to the
800 // number of equivalence classes instead of just fixing it to 256.
801 // Any non-explicitly defined transition is just a transition to the
802 // FAIL state, so we fill that in first and then overwrite them with
803 // explicitly defined transitions. (Most states probably only have one
804 // or two explicitly defined transitions.)
805 //
806 // N.B. Remember that while building the contiguous NFA, we use state
807 // IDs from the noncontiguous NFA. It isn't until we've added all
808 // states that we go back and map noncontiguous IDs to contiguous IDs.
809 let start = dst.len();
810 dst.extend(
811 core::iter::repeat(noncontiguous::NFA::FAIL.as_u32())
812 .take(classes.alphabet_len()),
813 );
814 assert!(start < dst.len(), "equivalence classes are never empty");
815 for t in nnfa.iter_trans(oldsid) {
816 dst[start + usize::from(classes.get(t.byte()))] =
817 t.next().as_u32();
818 }
819 Ok(())
820 }
821
822 /// Return an iterator over every explicitly defined transition in this
823 /// state.
824 fn transitions<'b>(&'b self) -> impl Iterator<Item = (u8, StateID)> + 'b {
825 let mut i = 0;
826 core::iter::from_fn(move || match self.trans {
827 StateTrans::Sparse { classes, nexts } => {
828 if i >= nexts.len() {
829 return None;
830 }
831 let chunk = classes[i / 4];
832 let class = chunk.to_ne_bytes()[i % 4];
833 let next = StateID::from_u32_unchecked(nexts[i]);
834 i += 1;
835 Some((class, next))
836 }
837 StateTrans::One { class, next } => {
838 if i == 0 {
839 i += 1;
840 Some((class, StateID::from_u32_unchecked(next)))
841 } else {
842 None
843 }
844 }
845 StateTrans::Dense { class_to_next } => {
846 if i >= class_to_next.len() {
847 return None;
848 }
849 let class = i.as_u8();
850 let next = StateID::from_u32_unchecked(class_to_next[i]);
851 i += 1;
852 Some((class, next))
853 }
854 })
855 }
856}
857
858impl<'a> core::fmt::Debug for State<'a> {
859 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
860 use crate::{automaton::sparse_transitions, util::debug::DebugByte};
861
862 let it = sparse_transitions(self.transitions())
863 // Writing out all FAIL transitions is quite noisy. Instead, we
864 // just require readers of the output to assume anything absent
865 // maps to the FAIL transition.
866 .filter(|&(_, _, sid)| sid != NFA::FAIL)
867 .enumerate();
868 for (i, (start, end, sid)) in it {
869 if i > 0 {
870 write!(f, ", ")?;
871 }
872 if start == end {
873 write!(f, "{:?} => {:?}", DebugByte(start), sid.as_usize())?;
874 } else {
875 write!(
876 f,
877 "{:?}-{:?} => {:?}",
878 DebugByte(start),
879 DebugByte(end),
880 sid.as_usize()
881 )?;
882 }
883 }
884 Ok(())
885 }
886}
887
888/// A builder for configuring an Aho-Corasick contiguous NFA.
889///
890/// This builder has a subset of the options available to a
891/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
892/// their behavior is identical.
893#[derive(Clone, Debug)]
894pub struct Builder {
895 noncontiguous: noncontiguous::Builder,
896 dense_depth: usize,
897 byte_classes: bool,
898}
899
900impl Default for Builder {
901 fn default() -> Builder {
902 Builder {
903 noncontiguous: noncontiguous::Builder::new(),
904 dense_depth: 2,
905 byte_classes: true,
906 }
907 }
908}
909
910impl Builder {
911 /// Create a new builder for configuring an Aho-Corasick contiguous NFA.
912 pub fn new() -> Builder {
913 Builder::default()
914 }
915
916 /// Build an Aho-Corasick contiguous NFA from the given iterator of
917 /// patterns.
918 ///
919 /// A builder may be reused to create more NFAs.
920 pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError>
921 where
922 I: IntoIterator<Item = P>,
923 P: AsRef<[u8]>,
924 {
925 let nnfa = self.noncontiguous.build(patterns)?;
926 self.build_from_noncontiguous(&nnfa)
927 }
928
929 /// Build an Aho-Corasick contiguous NFA from the given noncontiguous NFA.
930 ///
931 /// Note that when this method is used, only the `dense_depth` and
932 /// `byte_classes` settings on this builder are respected. The other
933 /// settings only apply to the initial construction of the Aho-Corasick
934 /// automaton. Since using this method requires that initial construction
935 /// has already completed, all settings impacting only initial construction
936 /// are no longer relevant.
937 pub fn build_from_noncontiguous(
938 &self,
939 nnfa: &noncontiguous::NFA,
940 ) -> Result<NFA, BuildError> {
941 debug!("building contiguous NFA");
942 let byte_classes = if self.byte_classes {
943 nnfa.byte_classes().clone()
944 } else {
945 ByteClasses::singletons()
946 };
947 let mut index_to_state_id = vec![NFA::DEAD; nnfa.states().len()];
948 let mut nfa = NFA {
949 repr: vec![],
950 pattern_lens: nnfa.pattern_lens_raw().to_vec(),
951 state_len: nnfa.states().len(),
952 prefilter: nnfa.prefilter().map(|p| p.clone()),
953 match_kind: nnfa.match_kind(),
954 alphabet_len: byte_classes.alphabet_len(),
955 byte_classes,
956 min_pattern_len: nnfa.min_pattern_len(),
957 max_pattern_len: nnfa.max_pattern_len(),
958 // The special state IDs are set later.
959 special: Special::zero(),
960 };
961 for (oldsid, state) in nnfa.states().iter().with_state_ids() {
962 // We don't actually encode a fail state since it isn't necessary.
963 // But we still want to make sure any FAIL ids are mapped
964 // correctly.
965 if oldsid == noncontiguous::NFA::FAIL {
966 index_to_state_id[oldsid] = NFA::FAIL;
967 continue;
968 }
969 let force_dense = state.depth().as_usize() < self.dense_depth;
970 let newsid = State::write(
971 nnfa,
972 oldsid,
973 state,
974 &nfa.byte_classes,
975 &mut nfa.repr,
976 force_dense,
977 )?;
978 index_to_state_id[oldsid] = newsid;
979 }
980 for &newsid in index_to_state_id.iter() {
981 if newsid == NFA::FAIL {
982 continue;
983 }
984 let state = &mut nfa.repr[newsid.as_usize()..];
985 State::remap(nfa.alphabet_len, &index_to_state_id, state)?;
986 }
987 // Now that we've remapped all the IDs in our states, all that's left
988 // is remapping the special state IDs.
989 let remap = &index_to_state_id;
990 let old = nnfa.special();
991 let new = &mut nfa.special;
992 new.max_special_id = remap[old.max_special_id];
993 new.max_match_id = remap[old.max_match_id];
994 new.start_unanchored_id = remap[old.start_unanchored_id];
995 new.start_anchored_id = remap[old.start_anchored_id];
996 debug!(
997 "contiguous NFA built, <states: {:?}, size: {:?}, \
998 alphabet len: {:?}>",
999 nfa.state_len,
1000 nfa.memory_usage(),
1001 nfa.byte_classes.alphabet_len(),
1002 );
1003 // The vectors can grow ~twice as big during construction because a
1004 // Vec amortizes growth. But here, let's shrink things back down to
1005 // what we actually need since we're never going to add more to it.
1006 nfa.repr.shrink_to_fit();
1007 nfa.pattern_lens.shrink_to_fit();
1008 Ok(nfa)
1009 }
1010
1011 /// Set the desired match semantics.
1012 ///
1013 /// This only applies when using [`Builder::build`] and not
1014 /// [`Builder::build_from_noncontiguous`].
1015 ///
1016 /// See
1017 /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
1018 /// for more documentation and examples.
1019 pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
1020 self.noncontiguous.match_kind(kind);
1021 self
1022 }
1023
1024 /// Enable ASCII-aware case insensitive matching.
1025 ///
1026 /// This only applies when using [`Builder::build`] and not
1027 /// [`Builder::build_from_noncontiguous`].
1028 ///
1029 /// See
1030 /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
1031 /// for more documentation and examples.
1032 pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
1033 self.noncontiguous.ascii_case_insensitive(yes);
1034 self
1035 }
1036
1037 /// Enable heuristic prefilter optimizations.
1038 ///
1039 /// This only applies when using [`Builder::build`] and not
1040 /// [`Builder::build_from_noncontiguous`].
1041 ///
1042 /// See
1043 /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
1044 /// for more documentation and examples.
1045 pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
1046 self.noncontiguous.prefilter(yes);
1047 self
1048 }
1049
1050 /// Set the limit on how many states use a dense representation for their
1051 /// transitions. Other states will generally use a sparse representation.
1052 ///
1053 /// See
1054 /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth)
1055 /// for more documentation and examples.
1056 pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
1057 self.dense_depth = depth;
1058 self
1059 }
1060
1061 /// A debug setting for whether to attempt to shrink the size of the
1062 /// automaton's alphabet or not.
1063 ///
1064 /// This should never be enabled unless you're debugging an automaton.
1065 /// Namely, disabling byte classes makes transitions easier to reason
1066 /// about, since they use the actual bytes instead of equivalence classes.
1067 /// Disabling this confers no performance benefit at search time.
1068 ///
1069 /// See
1070 /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
1071 /// for more documentation and examples.
1072 pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
1073 self.byte_classes = yes;
1074 self
1075 }
1076}
1077
1078/// Computes the number of u32 values needed to represent one byte per the
1079/// number of transitions given.
1080fn u32_len(ntrans: usize) -> usize {
1081 if ntrans % 4 == 0 {
1082 ntrans >> 2
1083 } else {
1084 (ntrans >> 2) + 1
1085 }
1086}
1087
1088#[cfg(test)]
1089mod tests {
1090 // This test demonstrates a SWAR technique I tried in the sparse transition
1091 // code inside of 'next_state'. Namely, sparse transitions work by
1092 // iterating over u32 chunks, with each chunk containing up to 4 classes
1093 // corresponding to 4 transitions. This SWAR technique lets us find a
1094 // matching transition without converting the u32 to a [u8; 4].
1095 //
1096 // It turned out to be a little slower unfortunately, which isn't too
1097 // surprising, since this is likely a throughput oriented optimization.
1098 // Loop unrolling doesn't really help us because the vast majority of
1099 // states have very few transitions.
1100 //
1101 // Anyway, this code was a little tricky to write, so I converted it to a
1102 // test in case someone figures out how to use it more effectively than
1103 // I could.
1104 //
1105 // (This also only works on little endian. So big endian would need to be
1106 // accounted for if we ever decided to use this I think.)
1107 #[cfg(target_endian = "little")]
1108 #[test]
1109 fn swar() {
1110 use super::*;
1111
1112 fn has_zero_byte(x: u32) -> u32 {
1113 const LO_U32: u32 = 0x01010101;
1114 const HI_U32: u32 = 0x80808080;
1115
1116 x.wrapping_sub(LO_U32) & !x & HI_U32
1117 }
1118
1119 fn broadcast(b: u8) -> u32 {
1120 (u32::from(b)) * (u32::MAX / 255)
1121 }
1122
1123 fn index_of(x: u32) -> usize {
1124 let o =
1125 (((x - 1) & 0x01010101).wrapping_mul(0x01010101) >> 24) - 1;
1126 o.as_usize()
1127 }
1128
1129 let bytes: [u8; 4] = [b'1', b'A', b'a', b'z'];
1130 let chunk = u32::from_ne_bytes(bytes);
1131
1132 let needle = broadcast(b'1');
1133 assert_eq!(0, index_of(has_zero_byte(needle ^ chunk)));
1134 let needle = broadcast(b'A');
1135 assert_eq!(1, index_of(has_zero_byte(needle ^ chunk)));
1136 let needle = broadcast(b'a');
1137 assert_eq!(2, index_of(has_zero_byte(needle ^ chunk)));
1138 let needle = broadcast(b'z');
1139 assert_eq!(3, index_of(has_zero_byte(needle ^ chunk)));
1140 }
1141}
1142