1/*!
2Provides a contiguous NFA implementation of Aho-Corasick.
3
4This is a low-level API that generally only needs to be used in niche
5circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
6instead of a contiguous NFA directly. Using an `NFA` directly is typically only
7necessary when one needs access to the [`Automaton`] trait implementation.
8*/
9
10use alloc::{vec, vec::Vec};
11
12use crate::{
13 automaton::Automaton,
14 nfa::noncontiguous,
15 util::{
16 alphabet::ByteClasses,
17 error::{BuildError, MatchError},
18 int::{Usize, U16, U32},
19 prefilter::Prefilter,
20 primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
21 search::{Anchored, MatchKind},
22 special::Special,
23 },
24};
25
26/// A contiguous NFA implementation of Aho-Corasick.
27///
28/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
29/// this type directly. Using an `NFA` directly is typically only necessary
30/// when one needs access to the [`Automaton`] trait implementation.
31///
32/// This NFA can only be built by first constructing a [`noncontiguous::NFA`].
33/// Both [`NFA::new`] and [`Builder::build`] do this for you automatically, but
34/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
35///
36/// The main difference between a noncontiguous NFA and a contiguous NFA is
37/// that the latter represents all of its states and transitions in a single
38/// allocation, where as the former uses a separate allocation for each state.
39/// Doing this at construction time while keeping a low memory footprint isn't
40/// feasible, which is primarily why there are two different NFA types: one
41/// that does the least amount of work possible to build itself, and another
42/// that does a little extra work to compact itself and make state transitions
43/// faster by making some states use a dense representation.
44///
45/// Because a contiguous NFA uses a single allocation, there is a lot more
46/// opportunity for compression tricks to reduce the heap memory used. Indeed,
47/// it is not uncommon for a contiguous NFA to use an order of magnitude less
48/// heap memory than a noncontiguous NFA. Since building a contiguous NFA
49/// usually only takes a fraction of the time it takes to build a noncontiguous
50/// NFA, the overall build time is not much slower. Thus, in most cases, a
51/// contiguous NFA is the best choice.
52///
53/// Since a contiguous NFA uses various tricks for compression and to achieve
54/// faster state transitions, currently, its limit on the number of states
55/// is somewhat smaller than what a noncontiguous NFA can achieve. Generally
56/// speaking, you shouldn't expect to run into this limit if the number of
57/// patterns is under 1 million. It is plausible that this limit will be
58/// increased in the future. If the limit is reached, building a contiguous NFA
59/// will return an error. Often, since building a contiguous NFA is relatively
60/// cheap, it can make sense to always try it even if you aren't sure if it
61/// will fail or not. If it does, you can always fall back to a noncontiguous
62/// NFA. (Indeed, the main [`AhoCorasick`](crate::AhoCorasick) type employs a
63/// strategy similar to this at construction time.)
64///
65/// # Example
66///
67/// This example shows how to build an `NFA` directly and use it to execute
68/// [`Automaton::try_find`]:
69///
70/// ```
71/// use aho_corasick::{
72/// automaton::Automaton,
73/// nfa::contiguous::NFA,
74/// Input, Match,
75/// };
76///
77/// let patterns = &["b", "abc", "abcd"];
78/// let haystack = "abcd";
79///
80/// let nfa = NFA::new(patterns).unwrap();
81/// assert_eq!(
82/// Some(Match::must(0, 1..2)),
83/// nfa.try_find(&Input::new(haystack))?,
84/// );
85/// # Ok::<(), Box<dyn std::error::Error>>(())
86/// ```
87///
88/// It is also possible to implement your own version of `try_find`. See the
89/// [`Automaton`] documentation for an example.
90#[derive(Clone)]
91pub struct NFA {
92 /// The raw NFA representation. Each state is packed with a header
93 /// (containing the format of the state, the failure transition and, for
94 /// a sparse state, the number of transitions), its transitions and any
95 /// matching pattern IDs for match states.
96 repr: Vec<u32>,
97 /// The length of each pattern. This is used to compute the start offset
98 /// of a match.
99 pattern_lens: Vec<SmallIndex>,
100 /// The total number of states in this NFA.
101 state_len: usize,
102 /// A prefilter for accelerating searches, if one exists.
103 prefilter: Option<Prefilter>,
104 /// The match semantics built into this NFA.
105 match_kind: MatchKind,
106 /// The alphabet size, or total number of equivalence classes, for this
107 /// NFA. Dense states always have this many transitions.
108 alphabet_len: usize,
109 /// The equivalence classes for this NFA. All transitions, dense and
110 /// sparse, are defined on equivalence classes and not on the 256 distinct
111 /// byte values.
112 byte_classes: ByteClasses,
113 /// The length of the shortest pattern in this automaton.
114 min_pattern_len: usize,
115 /// The length of the longest pattern in this automaton.
116 max_pattern_len: usize,
117 /// The information required to deduce which states are "special" in this
118 /// NFA.
119 special: Special,
120}
121
122impl NFA {
123 /// Create a new Aho-Corasick contiguous NFA using the default
124 /// configuration.
125 ///
126 /// Use a [`Builder`] if you want to change the configuration.
127 pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError>
128 where
129 I: IntoIterator<Item = P>,
130 P: AsRef<[u8]>,
131 {
132 NFA::builder().build(patterns)
133 }
134
135 /// A convenience method for returning a new Aho-Corasick contiguous NFA
136 /// builder.
137 ///
138 /// This usually permits one to just import the `NFA` type.
139 pub fn builder() -> Builder {
140 Builder::new()
141 }
142}
143
144impl NFA {
145 /// A sentinel state ID indicating that a search should stop once it has
146 /// entered this state. When a search stops, it returns a match if one
147 /// has been found, otherwise no match. A contiguous NFA always has an
148 /// actual dead state at this ID.
149 const DEAD: StateID = StateID::new_unchecked(0);
150 /// Another sentinel state ID indicating that a search should move through
151 /// current state's failure transition.
152 ///
153 /// Note that unlike DEAD, this does not actually point to a valid state
154 /// in a contiguous NFA. (noncontiguous::NFA::FAIL does point to a valid
155 /// state.) Instead, this points to the position that is guaranteed to
156 /// never be a valid state ID (by making sure it points to a place in the
157 /// middle of the encoding of the DEAD state). Since we never need to
158 /// actually look at the FAIL state itself, this works out.
159 ///
160 /// By why do it this way? So that FAIL is a constant. I don't have any
161 /// concrete evidence that this materially helps matters, but it's easy to
162 /// do. The alternative would be making the FAIL ID point to the second
163 /// state, which could be made a constant but is a little trickier to do.
164 /// The easiest path is to just make the FAIL state a runtime value, but
165 /// since comparisons with FAIL occur in perf critical parts of the search,
166 /// we want it to be as tight as possible and not waste any registers.
167 ///
168 /// Very hand wavy... But the code complexity that results from this is
169 /// very mild.
170 const FAIL: StateID = StateID::new_unchecked(1);
171}
172
173// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
174// returns a valid state ID given a valid state ID. We otherwise claim that
175// all other methods are correct as well.
176unsafe impl Automaton for NFA {
177 #[inline(always)]
178 fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
179 match anchored {
180 Anchored::No => Ok(self.special.start_unanchored_id),
181 Anchored::Yes => Ok(self.special.start_anchored_id),
182 }
183 }
184
185 #[inline(always)]
186 fn next_state(
187 &self,
188 anchored: Anchored,
189 mut sid: StateID,
190 byte: u8,
191 ) -> StateID {
192 let repr = &self.repr;
193 let class = self.byte_classes.get(byte);
194 let u32tosid = StateID::from_u32_unchecked;
195 loop {
196 let o = sid.as_usize();
197 let kind = repr[o] & 0xFF;
198 // I tried to encapsulate the "next transition" logic into its own
199 // function, but it seemed to always result in sub-optimal codegen
200 // that led to real and significant slowdowns. So we just inline
201 // the logic here.
202 //
203 // I've also tried a lot of different ways to speed up this
204 // routine, and most of them have failed.
205 if kind == State::KIND_DENSE {
206 let next = u32tosid(repr[o + 2 + usize::from(class)]);
207 if next != NFA::FAIL {
208 return next;
209 }
210 } else if kind == State::KIND_ONE {
211 if class == repr[o].low_u16().high_u8() {
212 return u32tosid(repr[o + 2]);
213 }
214 } else {
215 // NOTE: I tried a SWAR technique in the loop below, but found
216 // it slower. See the 'swar' test in the tests for this module.
217 let trans_len = kind.as_usize();
218 let classes_len = u32_len(trans_len);
219 let trans_offset = o + 2 + classes_len;
220 for (i, &chunk) in
221 repr[o + 2..][..classes_len].iter().enumerate()
222 {
223 let classes = chunk.to_ne_bytes();
224 if classes[0] == class {
225 return u32tosid(repr[trans_offset + i * 4]);
226 }
227 if classes[1] == class {
228 return u32tosid(repr[trans_offset + i * 4 + 1]);
229 }
230 if classes[2] == class {
231 return u32tosid(repr[trans_offset + i * 4 + 2]);
232 }
233 if classes[3] == class {
234 return u32tosid(repr[trans_offset + i * 4 + 3]);
235 }
236 }
237 }
238 // For an anchored search, we never follow failure transitions
239 // because failure transitions lead us down a path to matching
240 // a *proper* suffix of the path we were on. Thus, it can only
241 // produce matches that appear after the beginning of the search.
242 if anchored.is_anchored() {
243 return NFA::DEAD;
244 }
245 sid = u32tosid(repr[o + 1]);
246 }
247 }
248
249 #[inline(always)]
250 fn is_special(&self, sid: StateID) -> bool {
251 sid <= self.special.max_special_id
252 }
253
254 #[inline(always)]
255 fn is_dead(&self, sid: StateID) -> bool {
256 sid == NFA::DEAD
257 }
258
259 #[inline(always)]
260 fn is_match(&self, sid: StateID) -> bool {
261 !self.is_dead(sid) && sid <= self.special.max_match_id
262 }
263
264 #[inline(always)]
265 fn is_start(&self, sid: StateID) -> bool {
266 sid == self.special.start_unanchored_id
267 || sid == self.special.start_anchored_id
268 }
269
270 #[inline(always)]
271 fn match_kind(&self) -> MatchKind {
272 self.match_kind
273 }
274
275 #[inline(always)]
276 fn patterns_len(&self) -> usize {
277 self.pattern_lens.len()
278 }
279
280 #[inline(always)]
281 fn pattern_len(&self, pid: PatternID) -> usize {
282 self.pattern_lens[pid].as_usize()
283 }
284
285 #[inline(always)]
286 fn min_pattern_len(&self) -> usize {
287 self.min_pattern_len
288 }
289
290 #[inline(always)]
291 fn max_pattern_len(&self) -> usize {
292 self.max_pattern_len
293 }
294
295 #[inline(always)]
296 fn match_len(&self, sid: StateID) -> usize {
297 State::match_len(self.alphabet_len, &self.repr[sid.as_usize()..])
298 }
299
300 #[inline(always)]
301 fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
302 State::match_pattern(
303 self.alphabet_len,
304 &self.repr[sid.as_usize()..],
305 index,
306 )
307 }
308
309 #[inline(always)]
310 fn memory_usage(&self) -> usize {
311 use core::mem::size_of;
312
313 (self.repr.len() * size_of::<u32>())
314 + (self.pattern_lens.len() * size_of::<SmallIndex>())
315 + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
316 }
317
318 #[inline(always)]
319 fn prefilter(&self) -> Option<&Prefilter> {
320 self.prefilter.as_ref()
321 }
322}
323
324impl core::fmt::Debug for NFA {
325 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
326 use crate::automaton::fmt_state_indicator;
327
328 writeln!(f, "contiguous::NFA(")?;
329 let mut sid = NFA::DEAD; // always the first state and always present
330 loop {
331 let raw = &self.repr[sid.as_usize()..];
332 if raw.is_empty() {
333 break;
334 }
335 let is_match = self.is_match(sid);
336 let state = State::read(self.alphabet_len, is_match, raw);
337 fmt_state_indicator(f, self, sid)?;
338 write!(
339 f,
340 "{:06}({:06}): ",
341 sid.as_usize(),
342 state.fail.as_usize()
343 )?;
344 state.fmt(f)?;
345 write!(f, "\n")?;
346 if self.is_match(sid) {
347 write!(f, " matches: ")?;
348 for i in 0..state.match_len {
349 let pid = State::match_pattern(self.alphabet_len, raw, i);
350 if i > 0 {
351 write!(f, ", ")?;
352 }
353 write!(f, "{}", pid.as_usize())?;
354 }
355 write!(f, "\n")?;
356 }
357 // The FAIL state doesn't actually have space for a state allocated
358 // for it, so we have to treat it as a special case. write below
359 // the DEAD state.
360 if sid == NFA::DEAD {
361 writeln!(f, "F {:06}:", NFA::FAIL.as_usize())?;
362 }
363 let len = State::len(self.alphabet_len, is_match, raw);
364 sid = StateID::new(sid.as_usize().checked_add(len).unwrap())
365 .unwrap();
366 }
367 writeln!(f, "match kind: {:?}", self.match_kind)?;
368 writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
369 writeln!(f, "state length: {:?}", self.state_len)?;
370 writeln!(f, "pattern length: {:?}", self.patterns_len())?;
371 writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
372 writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
373 writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
374 writeln!(f, "byte classes: {:?}", self.byte_classes)?;
375 writeln!(f, "memory usage: {:?}", self.memory_usage())?;
376 writeln!(f, ")")?;
377
378 Ok(())
379 }
380}
381
382/// The "in memory" representation a single dense or sparse state.
383///
384/// A `State`'s in memory representation is not ever actually materialized
385/// during a search with a contiguous NFA. Doing so would be too slow. (Indeed,
386/// the only time a `State` is actually constructed is in `Debug` impls.)
387/// Instead, a `State` exposes a number of static methods for reading certain
388/// things from the raw binary encoding of the state.
389#[derive(Clone)]
390struct State<'a> {
391 /// The state to transition to when 'class_to_next' yields a transition
392 /// to the FAIL state.
393 fail: StateID,
394 /// The number of pattern IDs in this state. For a non-match state, this is
395 /// always zero. Otherwise it is always bigger than zero.
396 match_len: usize,
397 /// The sparse or dense representation of the transitions for this state.
398 trans: StateTrans<'a>,
399}
400
401/// The underlying representation of sparse or dense transitions for a state.
402///
403/// Note that like `State`, we don't typically construct values of this type
404/// during a search since we don't always need all values and thus would
405/// represent a lot of wasteful work.
406#[derive(Clone)]
407enum StateTrans<'a> {
408 /// A sparse representation of transitions for a state, where only non-FAIL
409 /// transitions are explicitly represented.
410 Sparse {
411 classes: &'a [u32],
412 /// The transitions for this state, where each transition is packed
413 /// into a u32. The low 8 bits correspond to the byte class for the
414 /// transition, and the high 24 bits correspond to the next state ID.
415 ///
416 /// This packing is why the max state ID allowed for a contiguous
417 /// NFA is 2^24-1.
418 nexts: &'a [u32],
419 },
420 /// A "one transition" state that is never a match state.
421 ///
422 /// These are by far the most common state, so we use a specialized and
423 /// very compact representation for them.
424 One {
425 /// The element of this NFA's alphabet that this transition is
426 /// defined for.
427 class: u8,
428 /// The state this should transition to if the current symbol is
429 /// equal to 'class'.
430 next: u32,
431 },
432 /// A dense representation of transitions for a state, where all
433 /// transitions are explicitly represented, including transitions to the
434 /// FAIL state.
435 Dense {
436 /// A dense set of transitions to other states. The transitions may
437 /// point to a FAIL state, in which case, the search should try the
438 /// same transition lookup at 'fail'.
439 ///
440 /// Note that this is indexed by byte equivalence classes and not
441 /// byte values. That means 'class_to_next[byte]' is wrong and
442 /// 'class_to_next[classes.get(byte)]' is correct. The number of
443 /// transitions is always equivalent to 'classes.alphabet_len()'.
444 class_to_next: &'a [u32],
445 },
446}
447
448impl<'a> State<'a> {
449 /// The offset of where the "kind" of a state is stored. If it isn't one
450 /// of the sentinel values below, then it's a sparse state and the kind
451 /// corresponds to the number of transitions in the state.
452 const KIND: usize = 0;
453
454 /// A sentinel value indicating that the state uses a dense representation.
455 const KIND_DENSE: u32 = 0xFF;
456 /// A sentinel value indicating that the state uses a special "one
457 /// transition" encoding. In practice, non-match states with one transition
458 /// make up the overwhelming majority of all states in any given
459 /// Aho-Corasick automaton, so we can specialize them using a very compact
460 /// representation.
461 const KIND_ONE: u32 = 0xFE;
462
463 /// The maximum number of transitions to encode as a sparse state. Usually
464 /// states with a lot of transitions are either very rare, or occur near
465 /// the start state. In the latter case, they are probably dense already
466 /// anyway. In the former case, making them dense is fine because they're
467 /// rare.
468 ///
469 /// This needs to be small enough to permit each of the sentinel values for
470 /// 'KIND' above. Namely, a sparse state embeds the number of transitions
471 /// into the 'KIND'. Basically, "sparse" is a state kind too, but it's the
472 /// "else" branch.
473 ///
474 /// N.B. There isn't anything particularly magical about 127 here. I
475 /// just picked it because I figured any sparse state with this many
476 /// transitions is going to be exceptionally rare, and if it did have this
477 /// many transitions, then it would be quite slow to do a linear scan on
478 /// the transitions during a search anyway.
479 const MAX_SPARSE_TRANSITIONS: usize = 127;
480
481 /// Remap state IDs in-place.
482 ///
483 /// `state` should be the the raw binary encoding of a state. (The start
484 /// of the slice must correspond to the start of the state, but the slice
485 /// may extend past the end of the encoding of the state.)
486 fn remap(
487 alphabet_len: usize,
488 old_to_new: &[StateID],
489 state: &mut [u32],
490 ) -> Result<(), BuildError> {
491 let kind = State::kind(state);
492 if kind == State::KIND_DENSE {
493 state[1] = old_to_new[state[1].as_usize()].as_u32();
494 for next in state[2..][..alphabet_len].iter_mut() {
495 *next = old_to_new[next.as_usize()].as_u32();
496 }
497 } else if kind == State::KIND_ONE {
498 state[1] = old_to_new[state[1].as_usize()].as_u32();
499 state[2] = old_to_new[state[2].as_usize()].as_u32();
500 } else {
501 let trans_len = State::sparse_trans_len(state);
502 let classes_len = u32_len(trans_len);
503 state[1] = old_to_new[state[1].as_usize()].as_u32();
504 for next in state[2 + classes_len..][..trans_len].iter_mut() {
505 *next = old_to_new[next.as_usize()].as_u32();
506 }
507 }
508 Ok(())
509 }
510
511 /// Returns the length, in number of u32s, of this state.
512 ///
513 /// This is useful for reading states consecutively, e.g., in the Debug
514 /// impl without needing to store a separate map from state index to state
515 /// identifier.
516 ///
517 /// `state` should be the the raw binary encoding of a state. (The start
518 /// of the slice must correspond to the start of the state, but the slice
519 /// may extend past the end of the encoding of the state.)
520 fn len(alphabet_len: usize, is_match: bool, state: &[u32]) -> usize {
521 let kind_len = 1;
522 let fail_len = 1;
523 let kind = State::kind(state);
524 let (classes_len, trans_len) = if kind == State::KIND_DENSE {
525 (0, alphabet_len)
526 } else if kind == State::KIND_ONE {
527 (0, 1)
528 } else {
529 let trans_len = State::sparse_trans_len(state);
530 let classes_len = u32_len(trans_len);
531 (classes_len, trans_len)
532 };
533 let match_len = if !is_match {
534 0
535 } else if State::match_len(alphabet_len, state) == 1 {
536 // This is a special case because when there is one pattern ID for
537 // a match state, it is represented by a single u32 with its high
538 // bit set (which is impossible for a valid pattern ID).
539 1
540 } else {
541 // We add 1 to include the u32 that indicates the number of
542 // pattern IDs that follow.
543 1 + State::match_len(alphabet_len, state)
544 };
545 kind_len + fail_len + classes_len + trans_len + match_len
546 }
547
548 /// Returns the kind of this state.
549 ///
550 /// This only includes the low byte.
551 #[inline(always)]
552 fn kind(state: &[u32]) -> u32 {
553 state[State::KIND] & 0xFF
554 }
555
556 /// Get the number of sparse transitions in this state. This can never
557 /// be more than State::MAX_SPARSE_TRANSITIONS, as all states with more
558 /// transitions are encoded as dense states.
559 ///
560 /// `state` should be the the raw binary encoding of a sparse state. (The
561 /// start of the slice must correspond to the start of the state, but the
562 /// slice may extend past the end of the encoding of the state.) If this
563 /// isn't a sparse state, then the return value is unspecified.
564 ///
565 /// Do note that this is only legal to call on a sparse state. So for
566 /// example, "one transition" state is not a sparse state, so it would not
567 /// be legal to call this method on such a state.
568 #[inline(always)]
569 fn sparse_trans_len(state: &[u32]) -> usize {
570 (state[State::KIND] & 0xFF).as_usize()
571 }
572
573 /// Returns the total number of matching pattern IDs in this state. Calling
574 /// this on a state that isn't a match results in unspecified behavior.
575 /// Thus, the returned number is never 0 for all correct calls.
576 ///
577 /// `state` should be the the raw binary encoding of a state. (The start
578 /// of the slice must correspond to the start of the state, but the slice
579 /// may extend past the end of the encoding of the state.)
580 #[inline(always)]
581 fn match_len(alphabet_len: usize, state: &[u32]) -> usize {
582 // We don't need to handle KIND_ONE here because it can never be a
583 // match state.
584 let packed = if State::kind(state) == State::KIND_DENSE {
585 let start = 2 + alphabet_len;
586 state[start].as_usize()
587 } else {
588 let trans_len = State::sparse_trans_len(state);
589 let classes_len = u32_len(trans_len);
590 let start = 2 + classes_len + trans_len;
591 state[start].as_usize()
592 };
593 if packed & (1 << 31) == 0 {
594 packed
595 } else {
596 1
597 }
598 }
599
600 /// Returns the pattern ID corresponding to the given index for the state
601 /// given. The `index` provided must be less than the number of pattern IDs
602 /// in this state.
603 ///
604 /// `state` should be the the raw binary encoding of a state. (The start of
605 /// the slice must correspond to the start of the state, but the slice may
606 /// extend past the end of the encoding of the state.)
607 ///
608 /// If the given state is not a match state or if the index is out of
609 /// bounds, then this has unspecified behavior.
610 #[inline(always)]
611 fn match_pattern(
612 alphabet_len: usize,
613 state: &[u32],
614 index: usize,
615 ) -> PatternID {
616 // We don't need to handle KIND_ONE here because it can never be a
617 // match state.
618 let start = if State::kind(state) == State::KIND_DENSE {
619 2 + alphabet_len
620 } else {
621 let trans_len = State::sparse_trans_len(state);
622 let classes_len = u32_len(trans_len);
623 2 + classes_len + trans_len
624 };
625 let packed = state[start];
626 let pid = if packed & (1 << 31) == 0 {
627 state[start + 1 + index]
628 } else {
629 assert_eq!(0, index);
630 packed & !(1 << 31)
631 };
632 PatternID::from_u32_unchecked(pid)
633 }
634
635 /// Read a state's binary encoding to its in-memory representation.
636 ///
637 /// `alphabet_len` should be the total number of transitions defined for
638 /// dense states.
639 ///
640 /// `is_match` should be true if this state is a match state and false
641 /// otherwise.
642 ///
643 /// `state` should be the the raw binary encoding of a state. (The start
644 /// of the slice must correspond to the start of the state, but the slice
645 /// may extend past the end of the encoding of the state.)
646 fn read(
647 alphabet_len: usize,
648 is_match: bool,
649 state: &'a [u32],
650 ) -> State<'a> {
651 let kind = State::kind(state);
652 let match_len =
653 if !is_match { 0 } else { State::match_len(alphabet_len, state) };
654 let (trans, fail) = if kind == State::KIND_DENSE {
655 let fail = StateID::from_u32_unchecked(state[1]);
656 let class_to_next = &state[2..][..alphabet_len];
657 (StateTrans::Dense { class_to_next }, fail)
658 } else if kind == State::KIND_ONE {
659 let fail = StateID::from_u32_unchecked(state[1]);
660 let class = state[State::KIND].low_u16().high_u8();
661 let next = state[2];
662 (StateTrans::One { class, next }, fail)
663 } else {
664 let fail = StateID::from_u32_unchecked(state[1]);
665 let trans_len = State::sparse_trans_len(state);
666 let classes_len = u32_len(trans_len);
667 let classes = &state[2..][..classes_len];
668 let nexts = &state[2 + classes_len..][..trans_len];
669 (StateTrans::Sparse { classes, nexts }, fail)
670 };
671 State { fail, match_len, trans }
672 }
673
674 /// Encode the "old" state from a noncontiguous NFA to its binary
675 /// representation to the given `dst` slice. `classes` should be the byte
676 /// classes computed for the noncontiguous NFA that the given state came
677 /// from.
678 ///
679 /// This returns an error if `dst` became so big that `StateID`s can no
680 /// longer be created for new states. Otherwise, it returns the state ID of
681 /// the new state created.
682 ///
683 /// When `force_dense` is true, then the encoded state will always use a
684 /// dense format. Otherwise, the choice between dense and sparse will be
685 /// automatically chosen based on the old state.
686 fn write(
687 old: &noncontiguous::State,
688 classes: &ByteClasses,
689 dst: &mut Vec<u32>,
690 force_dense: bool,
691 ) -> Result<StateID, BuildError> {
692 let sid = StateID::new(dst.len()).map_err(|e| {
693 BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
694 })?;
695 // For states with a lot of transitions, we might as well just make
696 // them dense. These kinds of hot states tend to be very rare, so we're
697 // okay with it. This also gives us more sentinels in the state's
698 // 'kind', which lets us create different state kinds to save on
699 // space.
700 let kind = if force_dense
701 || old.trans.len() > State::MAX_SPARSE_TRANSITIONS
702 {
703 State::KIND_DENSE
704 } else if old.trans.len() == 1 && old.matches.is_empty() {
705 State::KIND_ONE
706 } else {
707 // For a sparse state, the kind is just the number of transitions.
708 u32::try_from(old.trans.len()).unwrap()
709 };
710 if kind == State::KIND_DENSE {
711 dst.push(kind);
712 dst.push(old.fail.as_u32());
713 State::write_dense_trans(old, classes, dst)?;
714 } else if kind == State::KIND_ONE {
715 let class = u32::from(classes.get(old.trans[0].0));
716 dst.push(kind | (class << 8));
717 dst.push(old.fail.as_u32());
718 dst.push(old.trans[0].1.as_u32());
719 } else {
720 dst.push(kind);
721 dst.push(old.fail.as_u32());
722 State::write_sparse_trans(old, classes, dst)?;
723 }
724 // Now finally write the number of matches and the matches themselves.
725 if !old.matches.is_empty() {
726 if old.matches.len() == 1 {
727 let pid = old.matches[0].as_u32();
728 assert_eq!(0, pid & (1 << 31));
729 dst.push((1 << 31) | pid);
730 } else {
731 assert_eq!(0, old.matches.len() & (1 << 31));
732 dst.push(old.matches.len().as_u32());
733 dst.extend(old.matches.iter().map(|pid| pid.as_u32()));
734 }
735 }
736 Ok(sid)
737 }
738
739 /// Encode the "old" state transitions from a noncontiguous NFA to its
740 /// binary sparse representation to the given `dst` slice. `classes` should
741 /// be the byte classes computed for the noncontiguous NFA that the given
742 /// state came from.
743 ///
744 /// This returns an error if `dst` became so big that `StateID`s can no
745 /// longer be created for new states.
746 fn write_sparse_trans(
747 old: &noncontiguous::State,
748 classes: &ByteClasses,
749 dst: &mut Vec<u32>,
750 ) -> Result<(), BuildError> {
751 let (mut chunk, mut len) = ([0; 4], 0);
752 for &(byte, _) in old.trans.iter() {
753 chunk[len] = classes.get(byte);
754 len += 1;
755 if len == 4 {
756 dst.push(u32::from_ne_bytes(chunk));
757 chunk = [0; 4];
758 len = 0;
759 }
760 }
761 if len > 0 {
762 // In the case where the number of transitions isn't divisible
763 // by 4, the last u32 chunk will have some left over room. In
764 // this case, we "just" repeat the last equivalence class. By
765 // doing this, we know the leftover faux transitions will never
766 // be followed because if they were, it would have been followed
767 // prior to it in the last equivalence class. This saves us some
768 // branching in the search time state transition code.
769 let repeat = chunk[len - 1];
770 while len < 4 {
771 chunk[len] = repeat;
772 len += 1;
773 }
774 dst.push(u32::from_ne_bytes(chunk));
775 }
776 for &(_, next) in old.trans.iter() {
777 dst.push(next.as_u32());
778 }
779 Ok(())
780 }
781
782 /// Encode the "old" state transitions from a noncontiguous NFA to its
783 /// binary dense representation to the given `dst` slice. `classes` should
784 /// be the byte classes computed for the noncontiguous NFA that the given
785 /// state came from.
786 ///
787 /// This returns an error if `dst` became so big that `StateID`s can no
788 /// longer be created for new states.
789 fn write_dense_trans(
790 old: &noncontiguous::State,
791 classes: &ByteClasses,
792 dst: &mut Vec<u32>,
793 ) -> Result<(), BuildError> {
794 // Our byte classes let us shrink the size of our dense states to the
795 // number of equivalence classes instead of just fixing it to 256.
796 // Any non-explicitly defined transition is just a transition to the
797 // FAIL state, so we fill that in first and then overwrite them with
798 // explicitly defined transitions. (Most states probably only have one
799 // or two explicitly defined transitions.)
800 //
801 // N.B. Remember that while building the contiguous NFA, we use state
802 // IDs from the noncontiguous NFA. It isn't until we've added all
803 // states that we go back and map noncontiguous IDs to contiguous IDs.
804 let start = dst.len();
805 dst.extend(
806 core::iter::repeat(noncontiguous::NFA::FAIL.as_u32())
807 .take(classes.alphabet_len()),
808 );
809 assert!(start < dst.len(), "equivalence classes are never empty");
810 for &(byte, next) in old.trans.iter() {
811 dst[start + usize::from(classes.get(byte))] = next.as_u32();
812 }
813 Ok(())
814 }
815
816 /// Return an iterator over every explicitly defined transition in this
817 /// state.
818 fn transitions<'b>(&'b self) -> impl Iterator<Item = (u8, StateID)> + 'b {
819 let mut i = 0;
820 core::iter::from_fn(move || match self.trans {
821 StateTrans::Sparse { classes, nexts } => {
822 if i >= nexts.len() {
823 return None;
824 }
825 let chunk = classes[i / 4];
826 let class = chunk.to_ne_bytes()[i % 4];
827 let next = StateID::from_u32_unchecked(nexts[i]);
828 i += 1;
829 Some((class, next))
830 }
831 StateTrans::One { class, next } => {
832 if i == 0 {
833 i += 1;
834 Some((class, StateID::from_u32_unchecked(next)))
835 } else {
836 None
837 }
838 }
839 StateTrans::Dense { class_to_next } => {
840 if i >= class_to_next.len() {
841 return None;
842 }
843 let class = i.as_u8();
844 let next = StateID::from_u32_unchecked(class_to_next[i]);
845 i += 1;
846 Some((class, next))
847 }
848 })
849 }
850}
851
852impl<'a> core::fmt::Debug for State<'a> {
853 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
854 use crate::{automaton::sparse_transitions, util::debug::DebugByte};
855
856 let it = sparse_transitions(self.transitions())
857 // Writing out all FAIL transitions is quite noisy. Instead, we
858 // just require readers of the output to assume anything absent
859 // maps to the FAIL transition.
860 .filter(|&(_, _, sid)| sid != NFA::FAIL)
861 .enumerate();
862 for (i, (start, end, sid)) in it {
863 if i > 0 {
864 write!(f, ", ")?;
865 }
866 if start == end {
867 write!(f, "{:?} => {:?}", DebugByte(start), sid.as_usize())?;
868 } else {
869 write!(
870 f,
871 "{:?}-{:?} => {:?}",
872 DebugByte(start),
873 DebugByte(end),
874 sid.as_usize()
875 )?;
876 }
877 }
878 Ok(())
879 }
880}
881
882/// A builder for configuring an Aho-Corasick contiguous NFA.
883///
884/// This builder has a subset of the options available to a
885/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
886/// their behavior is identical.
887#[derive(Clone, Debug)]
888pub struct Builder {
889 noncontiguous: noncontiguous::Builder,
890 dense_depth: usize,
891 byte_classes: bool,
892}
893
894impl Default for Builder {
895 fn default() -> Builder {
896 Builder {
897 noncontiguous: noncontiguous::Builder::new(),
898 dense_depth: 2,
899 byte_classes: true,
900 }
901 }
902}
903
904impl Builder {
905 /// Create a new builder for configuring an Aho-Corasick contiguous NFA.
906 pub fn new() -> Builder {
907 Builder::default()
908 }
909
910 /// Build an Aho-Corasick contiguous NFA from the given iterator of
911 /// patterns.
912 ///
913 /// A builder may be reused to create more NFAs.
914 pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError>
915 where
916 I: IntoIterator<Item = P>,
917 P: AsRef<[u8]>,
918 {
919 let nnfa = self.noncontiguous.build(patterns)?;
920 self.build_from_noncontiguous(&nnfa)
921 }
922
923 /// Build an Aho-Corasick contiguous NFA from the given noncontiguous NFA.
924 ///
925 /// Note that when this method is used, only the `dense_depth` and
926 /// `byte_classes` settings on this builder are respected. The other
927 /// settings only apply to the initial construction of the Aho-Corasick
928 /// automaton. Since using this method requires that initial construction
929 /// has already completed, all settings impacting only initial construction
930 /// are no longer relevant.
931 pub fn build_from_noncontiguous(
932 &self,
933 nnfa: &noncontiguous::NFA,
934 ) -> Result<NFA, BuildError> {
935 debug!("building contiguous NFA");
936 let byte_classes = if self.byte_classes {
937 nnfa.byte_classes().clone()
938 } else {
939 ByteClasses::singletons()
940 };
941 let mut index_to_state_id = vec![NFA::DEAD; nnfa.states().len()];
942 let mut nfa = NFA {
943 repr: vec![],
944 pattern_lens: nnfa.pattern_lens_raw().to_vec(),
945 state_len: nnfa.states().len(),
946 prefilter: nnfa.prefilter().map(|p| p.clone()),
947 match_kind: nnfa.match_kind(),
948 alphabet_len: byte_classes.alphabet_len(),
949 byte_classes,
950 min_pattern_len: nnfa.min_pattern_len(),
951 max_pattern_len: nnfa.max_pattern_len(),
952 // The special state IDs are set later.
953 special: Special::zero(),
954 };
955 for (oldsid, state) in nnfa.states().iter().with_state_ids() {
956 // We don't actually encode a fail state since it isn't necessary.
957 // But we still want to make sure any FAIL ids are mapped
958 // correctly.
959 if oldsid == noncontiguous::NFA::FAIL {
960 index_to_state_id[oldsid] = NFA::FAIL;
961 continue;
962 }
963 let force_dense = state.depth.as_usize() < self.dense_depth;
964 let newsid = State::write(
965 state,
966 &nfa.byte_classes,
967 &mut nfa.repr,
968 force_dense,
969 )?;
970 index_to_state_id[oldsid] = newsid;
971 }
972 for &newsid in index_to_state_id.iter() {
973 if newsid == NFA::FAIL {
974 continue;
975 }
976 let state = &mut nfa.repr[newsid.as_usize()..];
977 State::remap(nfa.alphabet_len, &index_to_state_id, state)?;
978 }
979 // Now that we've remapped all the IDs in our states, all that's left
980 // is remapping the special state IDs.
981 let remap = &index_to_state_id;
982 let old = nnfa.special();
983 let new = &mut nfa.special;
984 new.max_special_id = remap[old.max_special_id];
985 new.max_match_id = remap[old.max_match_id];
986 new.start_unanchored_id = remap[old.start_unanchored_id];
987 new.start_anchored_id = remap[old.start_anchored_id];
988 debug!(
989 "contiguous NFA built, <states: {:?}, size: {:?}, \
990 alphabet len: {:?}>",
991 nfa.state_len,
992 nfa.memory_usage(),
993 nfa.byte_classes.alphabet_len(),
994 );
995 Ok(nfa)
996 }
997
998 /// Set the desired match semantics.
999 ///
1000 /// This only applies when using [`Builder::build`] and not
1001 /// [`Builder::build_from_noncontiguous`].
1002 ///
1003 /// See
1004 /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
1005 /// for more documentation and examples.
1006 pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
1007 self.noncontiguous.match_kind(kind);
1008 self
1009 }
1010
1011 /// Enable ASCII-aware case insensitive matching.
1012 ///
1013 /// This only applies when using [`Builder::build`] and not
1014 /// [`Builder::build_from_noncontiguous`].
1015 ///
1016 /// See
1017 /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
1018 /// for more documentation and examples.
1019 pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
1020 self.noncontiguous.ascii_case_insensitive(yes);
1021 self
1022 }
1023
1024 /// Enable heuristic prefilter optimizations.
1025 ///
1026 /// This only applies when using [`Builder::build`] and not
1027 /// [`Builder::build_from_noncontiguous`].
1028 ///
1029 /// See
1030 /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
1031 /// for more documentation and examples.
1032 pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
1033 self.noncontiguous.prefilter(yes);
1034 self
1035 }
1036
1037 /// Set the limit on how many states use a dense representation for their
1038 /// transitions. Other states will generally use a sparse representation.
1039 ///
1040 /// See
1041 /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth)
1042 /// for more documentation and examples.
1043 pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
1044 self.dense_depth = depth;
1045 self
1046 }
1047
1048 /// A debug setting for whether to attempt to shrink the size of the
1049 /// automaton's alphabet or not.
1050 ///
1051 /// This should never be enabled unless you're debugging an automaton.
1052 /// Namely, disabling byte classes makes transitions easier to reason
1053 /// about, since they use the actual bytes instead of equivalence classes.
1054 /// Disabling this confers no performance benefit at search time.
1055 ///
1056 /// See
1057 /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
1058 /// for more documentation and examples.
1059 pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
1060 self.byte_classes = yes;
1061 self
1062 }
1063}
1064
1065/// Computes the number of u32 values needed to represent one byte per the
1066/// number of transitions given.
1067fn u32_len(ntrans: usize) -> usize {
1068 if ntrans % 4 == 0 {
1069 ntrans >> 2
1070 } else {
1071 (ntrans >> 2) + 1
1072 }
1073}
1074
1075#[cfg(test)]
1076mod tests {
1077 // This test demonstrates a SWAR technique I tried in the sparse transition
1078 // code inside of 'next_state'. Namely, sparse transitions work by
1079 // iterating over u32 chunks, with each chunk containing up to 4 classes
1080 // corresponding to 4 transitions. This SWAR technique lets us find a
1081 // matching transition without converting the u32 to a [u8; 4].
1082 //
1083 // It turned out to be a little slower unfortunately, which isn't too
1084 // surprising, since this is likely a throughput oriented optimization.
1085 // Loop unrolling doesn't really help us because the vast majority of
1086 // states have very few transitions.
1087 //
1088 // Anyway, this code was a little tricky to write, so I converted it to a
1089 // test in case someone figures out how to use it more effectively than
1090 // I could.
1091 //
1092 // (This also only works on little endian. So big endian would need to be
1093 // accounted for if we ever decided to use this I think.)
1094 #[cfg(target_endian = "little")]
1095 #[test]
1096 fn swar() {
1097 use super::*;
1098
1099 fn has_zero_byte(x: u32) -> u32 {
1100 const LO_U32: u32 = 0x01010101;
1101 const HI_U32: u32 = 0x80808080;
1102
1103 x.wrapping_sub(LO_U32) & !x & HI_U32
1104 }
1105
1106 fn broadcast(b: u8) -> u32 {
1107 (u32::from(b)) * (u32::MAX / 255)
1108 }
1109
1110 fn index_of(x: u32) -> usize {
1111 let o =
1112 (((x - 1) & 0x01010101).wrapping_mul(0x01010101) >> 24) - 1;
1113 o.as_usize()
1114 }
1115
1116 let bytes: [u8; 4] = [b'1', b'A', b'a', b'z'];
1117 let chunk = u32::from_ne_bytes(bytes);
1118
1119 let needle = broadcast(b'1');
1120 assert_eq!(0, index_of(has_zero_byte(needle ^ chunk)));
1121 let needle = broadcast(b'A');
1122 assert_eq!(1, index_of(has_zero_byte(needle ^ chunk)));
1123 let needle = broadcast(b'a');
1124 assert_eq!(2, index_of(has_zero_byte(needle ^ chunk)));
1125 let needle = broadcast(b'z');
1126 assert_eq!(3, index_of(has_zero_byte(needle ^ chunk)));
1127 }
1128}
1129