/*!
Provides direct access to a DFA implementation of Aho-Corasick.

This is a low-level API that generally only needs to be used in niche
circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
instead of a DFA directly. Using a `DFA` directly is typically only necessary
when one needs access to the [`Automaton`] trait implementation.
*/

use alloc::{vec, vec::Vec};

use crate::{
    automaton::Automaton,
    nfa::noncontiguous,
    util::{
        alphabet::ByteClasses,
        error::{BuildError, MatchError},
        int::{Usize, U32},
        prefilter::Prefilter,
        primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
        search::{Anchored, MatchKind, StartKind},
        special::Special,
    },
};

/// A DFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using a `DFA` directly is typically only necessary when
/// one needs access to the [`Automaton`] trait implementation.
///
/// This DFA can only be built by first constructing a [`noncontiguous::NFA`].
/// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but
/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
///
/// A DFA provides the best possible search performance (in this crate) via two
/// mechanisms:
///
/// * All states use a dense representation for their transitions.
/// * All failure transitions are pre-computed such that they are never
/// explicitly handled at search time.
///
/// These two facts combined mean that every state transition is performed
/// using a constant number of instructions. However, this comes at
/// great cost. The memory usage of a DFA can be quite exorbitant.
/// It is potentially multiple orders of magnitude greater than a
/// [`contiguous::NFA`](crate::nfa::contiguous::NFA), for example. In exchange,
/// a DFA will typically have better search speed than a `contiguous::NFA`, but
/// not by orders of magnitude.
///
/// Unless you have a small number of patterns, or memory usage is not a
/// concern and search performance is critical, a DFA is usually not the best
/// choice.
///
/// Moreover, unlike the NFAs in this crate, it is costly for a DFA to
/// support both anchored and unanchored search configurations. Namely,
/// since failure transitions are pre-computed, supporting both anchored
/// and unanchored searches requires a duplication of the transition table,
/// making the memory usage of such a DFA even bigger. (The NFAs in this crate
/// unconditionally support both anchored and unanchored searches because there
/// is essentially no added cost for doing so.) It is for this reason that
/// a DFA's support for anchored and unanchored searches can be configured
/// via [`Builder::start_kind`]. By default, a DFA only supports unanchored
/// searches. (See the second example below for configuring both.)
///
/// # Example
///
/// This example shows how to build a `DFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     dfa::DFA,
///     Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let dfa = DFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     dfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
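///
/// # Example: anchored and unanchored searches
///
/// As a sketch of the configuration described above (using the default match
/// semantics), building a DFA with [`Builder::start_kind`] set to
/// `StartKind::Both` permits running both unanchored and anchored searches
/// against the same automaton:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     dfa::DFA,
///     Anchored, Input, Match, StartKind,
/// };
///
/// let dfa = DFA::builder()
///     .start_kind(StartKind::Both)
///     .build(&["b", "abc", "abcd"])
///     .unwrap();
///
/// // An unanchored search may find a match anywhere in the haystack.
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     dfa.try_find(&Input::new("abcd"))?,
/// );
/// // An anchored search only reports matches that begin where the search
/// // starts.
/// assert_eq!(
///     Some(Match::must(1, 0..3)),
///     dfa.try_find(&Input::new("abcd").anchored(Anchored::Yes))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```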
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive(Clone)]
pub struct DFA {
    /// The DFA transition table. IDs in this table are pre-multiplied. So
    /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride,
    /// 2*stride, 3*stride, ...
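    ///
    /// For example (purely illustrative): with stride2=2 (so stride=4), the
    /// state with index 3 has ID 3*4=12 and its transitions occupy
    /// trans[12..16].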
    trans: Vec<StateID>,
    /// The matches for every match state in this DFA. This is first indexed by
    /// state index (so that's `sid >> stride2`) and then by order in which the
    /// matches are meant to occur.
    matches: Vec<Vec<PatternID>>,
    /// The amount of heap memory used, in bytes, by the inner Vecs of
    /// 'matches'.
    matches_memory_usage: usize,
    /// The length of each pattern. This is used to compute the start offset
    /// of a match.
    pattern_lens: Vec<SmallIndex>,
    /// A prefilter for accelerating searches, if one exists.
    prefilter: Option<Prefilter>,
    /// The match semantics built into this DFA.
    match_kind: MatchKind,
    /// The total number of states in this DFA.
    state_len: usize,
    /// The alphabet size, or total number of equivalence classes, for this
    /// DFA. Note that the actual number of transitions in each state is
    /// stride=2^stride2, where stride is the smallest power of 2 greater than
    /// or equal to alphabet_len. We do things this way so that we can use
    /// bitshifting to go from a state ID to an index into 'matches'.
    alphabet_len: usize,
    /// The exponent with a base 2, such that stride=2^stride2. Given a state
    /// index 'i', its state identifier is 'i << stride2'. Given a state
    /// identifier 'sid', its state index is 'sid >> stride2'.
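    ///
    /// For example (illustrative), an alphabet_len of 5 yields stride2=3 and
    /// stride=8.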
    stride2: usize,
    /// The equivalence classes for this DFA. All transitions are defined on
    /// equivalence classes and not on the 256 distinct byte values.
    byte_classes: ByteClasses,
    /// The length of the shortest pattern in this automaton.
    min_pattern_len: usize,
    /// The length of the longest pattern in this automaton.
    max_pattern_len: usize,
    /// The information required to deduce which states are "special" in this
    /// DFA.
    special: Special,
}

impl DFA {
    /// Create a new Aho-Corasick DFA using the default configuration.
    ///
    /// Use a [`Builder`] if you want to change the configuration.
    pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        DFA::builder().build(patterns)
    }

    /// A convenience method for returning a new Aho-Corasick DFA builder.
    ///
    /// This usually permits one to just import the `DFA` type.
    pub fn builder() -> Builder {
        Builder::new()
    }
}

impl DFA {
    /// A sentinel state ID indicating that a search should stop once it has
    /// entered this state. When a search stops, it returns a match if one has
    /// been found, otherwise no match. A DFA always has an actual dead state
    /// at this ID.
    ///
    /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state.
    /// Namely, the whole point of a DFA is that the FAIL state is completely
    /// compiled away. That is, DFA construction involves pre-computing the
    /// failure transitions everywhere, such that failure transitions are no
    /// longer used at search time. These two properties, pre-computed failure
    /// transitions and a uniformly dense representation, are the most
    /// important reasons why a DFA is faster than the NFAs in this crate.
    const DEAD: StateID = StateID::new_unchecked(0);

    /// Adds the given pattern IDs as matches to the given state and also
    /// records the added memory usage.
    fn set_matches(
        &mut self,
        sid: StateID,
        pids: impl Iterator<Item = PatternID>,
    ) {
        let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap();
        let mut at_least_one = false;
        for pid in pids {
            self.matches[index].push(pid);
            self.matches_memory_usage += PatternID::SIZE;
            at_least_one = true;
        }
        assert!(at_least_one, "match state must have non-empty pids");
    }
}

// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
// returns a valid state ID given a valid state ID. We otherwise claim that
// all other methods are correct as well.
unsafe impl Automaton for DFA {
    #[inline(always)]
    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
        // Either of the start state IDs can be DEAD, in which case, support
        // for that type of search is not provided by this DFA. Which start
        // state IDs are inactive depends on the 'StartKind' configuration at
        // DFA construction time.
        match anchored {
            Anchored::No => {
                let start = self.special.start_unanchored_id;
                if start == DFA::DEAD {
                    Err(MatchError::invalid_input_unanchored())
                } else {
                    Ok(start)
                }
            }
            Anchored::Yes => {
                let start = self.special.start_anchored_id;
                if start == DFA::DEAD {
                    Err(MatchError::invalid_input_anchored())
                } else {
                    Ok(start)
                }
            }
        }
    }

    #[inline(always)]
    fn next_state(
        &self,
        _anchored: Anchored,
        sid: StateID,
        byte: u8,
    ) -> StateID {
        let class = self.byte_classes.get(byte);
        self.trans[(sid.as_u32() + u32::from(class)).as_usize()]
    }

    #[inline(always)]
    fn is_special(&self, sid: StateID) -> bool {
        sid <= self.special.max_special_id
    }

    #[inline(always)]
    fn is_dead(&self, sid: StateID) -> bool {
        sid == DFA::DEAD
    }

    #[inline(always)]
    fn is_match(&self, sid: StateID) -> bool {
        !self.is_dead(sid) && sid <= self.special.max_match_id
    }

    #[inline(always)]
    fn is_start(&self, sid: StateID) -> bool {
        sid == self.special.start_unanchored_id
            || sid == self.special.start_anchored_id
    }

    #[inline(always)]
    fn match_kind(&self) -> MatchKind {
        self.match_kind
    }

    #[inline(always)]
    fn patterns_len(&self) -> usize {
        self.pattern_lens.len()
    }

    #[inline(always)]
    fn pattern_len(&self, pid: PatternID) -> usize {
        self.pattern_lens[pid].as_usize()
    }

    #[inline(always)]
    fn min_pattern_len(&self) -> usize {
        self.min_pattern_len
    }

    #[inline(always)]
    fn max_pattern_len(&self) -> usize {
        self.max_pattern_len
    }

    #[inline(always)]
    fn match_len(&self, sid: StateID) -> usize {
        debug_assert!(self.is_match(sid));
        let offset = (sid.as_usize() >> self.stride2) - 2;
        self.matches[offset].len()
    }

    #[inline(always)]
    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
        debug_assert!(self.is_match(sid));
        let offset = (sid.as_usize() >> self.stride2) - 2;
        self.matches[offset][index]
    }

    #[inline(always)]
    fn memory_usage(&self) -> usize {
        use core::mem::size_of;

        (self.trans.len() * size_of::<u32>())
            + (self.matches.len() * size_of::<Vec<PatternID>>())
            + self.matches_memory_usage
            + (self.pattern_lens.len() * size_of::<SmallIndex>())
            + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
    }

    #[inline(always)]
    fn prefilter(&self) -> Option<&Prefilter> {
        self.prefilter.as_ref()
    }
}

impl core::fmt::Debug for DFA {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        use crate::{
            automaton::{fmt_state_indicator, sparse_transitions},
            util::debug::DebugByte,
        };

        writeln!(f, "dfa::DFA(")?;
        for index in 0..self.state_len {
            let sid = StateID::new_unchecked(index << self.stride2);
            // While we do currently include the FAIL state in the transition
            // table (to simplify construction), it is never actually used. It
            // poses problems with the code below because it gets treated as
            // a match state incidentally when it is, of course, not. So we
            // special case it. The fail state is always the first state after
            // the dead state.
            //
            // If the construction is changed to remove the fail state (it
            // probably should be), then this special case should be updated.
            if index == 1 {
                writeln!(f, "F {:06}:", sid.as_usize())?;
                continue;
            }
            fmt_state_indicator(f, self, sid)?;
            write!(f, "{:06}: ", sid.as_usize())?;

            let it = (0..self.byte_classes.alphabet_len()).map(|class| {
                (class.as_u8(), self.trans[sid.as_usize() + class])
            });
            for (i, (start, end, next)) in sparse_transitions(it).enumerate() {
                if i > 0 {
                    write!(f, ", ")?;
                }
                if start == end {
                    write!(
                        f,
                        "{:?} => {:?}",
                        DebugByte(start),
                        next.as_usize()
                    )?;
                } else {
                    write!(
                        f,
                        "{:?}-{:?} => {:?}",
                        DebugByte(start),
                        DebugByte(end),
                        next.as_usize()
                    )?;
                }
            }
            write!(f, "\n")?;
            if self.is_match(sid) {
                write!(f, " matches: ")?;
                for i in 0..self.match_len(sid) {
                    if i > 0 {
                        write!(f, ", ")?;
                    }
                    let pid = self.match_pattern(sid, i);
                    write!(f, "{}", pid.as_usize())?;
                }
                write!(f, "\n")?;
            }
        }
        writeln!(f, "match kind: {:?}", self.match_kind)?;
        writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
        writeln!(f, "state length: {:?}", self.state_len)?;
        writeln!(f, "pattern length: {:?}", self.patterns_len())?;
        writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
        writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
        writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
        writeln!(f, "stride: {:?}", 1 << self.stride2)?;
        writeln!(f, "byte classes: {:?}", self.byte_classes)?;
        writeln!(f, "memory usage: {:?}", self.memory_usage())?;
        writeln!(f, ")")?;
        Ok(())
    }
}

/// A builder for configuring an Aho-Corasick DFA.
///
/// This builder has a subset of the options available to an
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
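///
/// # Example
///
/// A small sketch of using this builder directly (here with leftmost-longest
/// match semantics):
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     dfa::DFA,
///     Input, Match, MatchKind,
/// };
///
/// let dfa = DFA::builder()
///     .match_kind(MatchKind::LeftmostLongest)
///     .build(&["b", "abc", "abcd"])
///     .unwrap();
/// // With leftmost-longest semantics, the longest pattern starting at the
/// // leftmost position wins: "abcd" (pattern 2) rather than "b" or "abc".
/// assert_eq!(
///     Some(Match::must(2, 0..4)),
///     dfa.try_find(&Input::new("abcd"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```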
#[derive(Clone, Debug)]
pub struct Builder {
    noncontiguous: noncontiguous::Builder,
    start_kind: StartKind,
    byte_classes: bool,
}

impl Default for Builder {
    fn default() -> Builder {
        Builder {
            noncontiguous: noncontiguous::Builder::new(),
            start_kind: StartKind::Unanchored,
            byte_classes: true,
        }
    }
}

impl Builder {
    /// Create a new builder for configuring an Aho-Corasick DFA.
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Build an Aho-Corasick DFA from the given iterator of patterns.
    ///
    /// A builder may be reused to create more DFAs.
    pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        let nnfa = self.noncontiguous.build(patterns)?;
        self.build_from_noncontiguous(&nnfa)
    }

    /// Build an Aho-Corasick DFA from the given noncontiguous NFA.
    ///
    /// Note that when this method is used, only the `start_kind` and
    /// `byte_classes` settings on this builder are respected. The other
    /// settings only apply to the initial construction of the Aho-Corasick
    /// automaton. Since using this method requires that initial construction
    /// has already completed, all settings impacting only initial construction
    /// are no longer relevant.
    pub fn build_from_noncontiguous(
        &self,
        nnfa: &noncontiguous::NFA,
    ) -> Result<DFA, BuildError> {
        debug!("building DFA");
        let byte_classes = if self.byte_classes {
            nnfa.byte_classes().clone()
        } else {
            ByteClasses::singletons()
        };
        let state_len = match self.start_kind {
            StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(),
            StartKind::Both => {
                // These unwraps are OK because we know that the number of
                // NFA states is < StateID::LIMIT which is in turn less than
                // i32::MAX. Thus, there is always room to multiply by 2.
                // Finally, the number of states is always at least 4 in the
                // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the
                // subtraction of 4 is okay.
                //
                // Note that we subtract 4 because the "anchored" part of
                // the DFA duplicates the unanchored part (without failure
                // transitions), but reuses the DEAD, FAIL and START states.
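                //
                // For example (purely illustrative), an NFA with 10 states
                // yields 2*10 - 4 = 16 DFA states when both start kinds are
                // supported.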
                nnfa.states()
                    .len()
                    .checked_mul(2)
                    .unwrap()
                    .checked_sub(4)
                    .unwrap()
            }
        };
        let trans_len =
            match state_len.checked_shl(byte_classes.stride2().as_u32()) {
                Some(trans_len) => trans_len,
                None => {
                    return Err(BuildError::state_id_overflow(
                        StateID::MAX.as_u64(),
                        usize::MAX.as_u64(),
                    ))
                }
            };
        StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap())
            .map_err(|e| {
                BuildError::state_id_overflow(
                    StateID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
        let num_match_states = match self.start_kind {
            StartKind::Unanchored | StartKind::Anchored => {
                nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap()
            }
            StartKind::Both => nnfa
                .special()
                .max_match_id
                .as_usize()
                .checked_sub(1)
                .unwrap()
                .checked_mul(2)
                .unwrap(),
        };
        let mut dfa = DFA {
            trans: vec![DFA::DEAD; trans_len],
            matches: vec![vec![]; num_match_states],
            matches_memory_usage: 0,
            pattern_lens: nnfa.pattern_lens_raw().to_vec(),
            prefilter: nnfa.prefilter().map(|p| p.clone()),
            match_kind: nnfa.match_kind(),
            state_len,
            alphabet_len: byte_classes.alphabet_len(),
            stride2: byte_classes.stride2(),
            byte_classes,
            min_pattern_len: nnfa.min_pattern_len(),
            max_pattern_len: nnfa.max_pattern_len(),
            // The special state IDs are set later.
            special: Special::zero(),
        };
        match self.start_kind {
            StartKind::Both => {
                self.finish_build_both_starts(nnfa, &mut dfa);
            }
            StartKind::Unanchored => {
                self.finish_build_one_start(Anchored::No, nnfa, &mut dfa);
            }
            StartKind::Anchored => {
                self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa)
            }
        }
        debug!(
            "DFA built, <states: {:?}, size: {:?}, \
             alphabet len: {:?}, stride: {:?}>",
            dfa.state_len,
            dfa.memory_usage(),
            dfa.byte_classes.alphabet_len(),
            dfa.byte_classes.stride(),
        );
        // The vectors can grow ~twice as big during construction because a
        // Vec amortizes growth. But here, let's shrink things back down to
        // what we actually need since we're never going to add more to it.
        dfa.trans.shrink_to_fit();
        dfa.pattern_lens.shrink_to_fit();
        dfa.matches.shrink_to_fit();
        // TODO: We might also want to shrink each Vec inside of `dfa.matches`,
        // or even better, convert it to one contiguous allocation. But I think
        // I went with nested allocs for good reason (can't remember), so this
        // may be tricky to do. I decided not to shrink them here because it
        // might require a fair bit of work to do. It's unclear whether it's
        // worth it.
        Ok(dfa)
    }

    /// Finishes building a DFA for either unanchored or anchored searches,
    /// but NOT both.
    fn finish_build_one_start(
        &self,
        anchored: Anchored,
        nnfa: &noncontiguous::NFA,
        dfa: &mut DFA,
    ) {
        // This function always succeeds because we check above that all of the
        // states in the NFA can be mapped to DFA state IDs.
        let stride2 = dfa.stride2;
        let old2new = |oldsid: StateID| {
            StateID::new_unchecked(oldsid.as_usize() << stride2)
        };
        for (oldsid, state) in nnfa.states().iter().with_state_ids() {
            let newsid = old2new(oldsid);
            if state.is_match() {
                dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
            }
            sparse_iter(
                nnfa,
                oldsid,
                &dfa.byte_classes,
                |byte, class, mut oldnextsid| {
                    if oldnextsid == noncontiguous::NFA::FAIL {
                        if anchored.is_anchored() {
                            oldnextsid = noncontiguous::NFA::DEAD;
                        } else if state.fail() == noncontiguous::NFA::DEAD {
                            // This is a special case that avoids following
                            // DEAD transitions in a non-contiguous NFA.
                            // Following these transitions is pretty slow
                            // because the non-contiguous NFA will always use
                            // a sparse representation for it (because the
                            // DEAD state is usually treated as a sentinel).
                            // The *vast* majority of failure states are DEAD
                            // states, so this winds up being pretty slow if
                            // we go through the non-contiguous NFA state
                            // transition logic. Instead, just do it ourselves.
                            oldnextsid = noncontiguous::NFA::DEAD;
                        } else {
                            oldnextsid = nnfa.next_state(
                                Anchored::No,
                                state.fail(),
                                byte,
                            );
                        }
                    }
                    dfa.trans[newsid.as_usize() + usize::from(class)] =
                        old2new(oldnextsid);
                },
            );
        }
        // Now that we've remapped all the IDs in our states, all that's left
        // is remapping the special state IDs.
        let old = nnfa.special();
        let new = &mut dfa.special;
        new.max_special_id = old2new(old.max_special_id);
        new.max_match_id = old2new(old.max_match_id);
        if anchored.is_anchored() {
            new.start_unanchored_id = DFA::DEAD;
            new.start_anchored_id = old2new(old.start_anchored_id);
        } else {
            new.start_unanchored_id = old2new(old.start_unanchored_id);
            new.start_anchored_id = DFA::DEAD;
        }
    }

    /// Finishes building a DFA that supports BOTH unanchored and anchored
    /// searches. It works by interleaving unanchored states with anchored
    /// states in the same transition table. This way, we avoid needing to
    /// re-shuffle states afterward to ensure that our states still look like
    /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ...
    ///
    /// Honestly this is pretty inscrutable... Simplifications are most
    /// welcome.
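    ///
    /// Roughly, and purely as an illustrative sketch of what the loop below
    /// aims for: if the NFA states are laid out as
    ///
    ///   DEAD, FAIL, m1, m2, ..., SU, SA, n1, n2, ...
    ///
    /// (match states `m*`, non-match states `n*`), then the DFA produced
    /// here is laid out as
    ///
    ///   DEAD, FAIL, m1u, m1a, m2u, m2a, ..., SU, SA, n1u, n1a, n2u, n2a, ...
    ///
    /// where the `u`/`a` suffixes denote the unanchored and anchored copies
    /// of each duplicated state.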
    fn finish_build_both_starts(
        &self,
        nnfa: &noncontiguous::NFA,
        dfa: &mut DFA,
    ) {
        let stride2 = dfa.stride2;
        let stride = 1 << stride2;
        let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()];
        let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()];
        let mut is_anchored = vec![false; dfa.state_len];
        let mut newsid = DFA::DEAD;
        let next_dfa_id =
            |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride);
        for (oldsid, state) in nnfa.states().iter().with_state_ids() {
            if oldsid == noncontiguous::NFA::DEAD
                || oldsid == noncontiguous::NFA::FAIL
            {
                remap_unanchored[oldsid] = newsid;
                remap_anchored[oldsid] = newsid;
                newsid = next_dfa_id(newsid);
            } else if oldsid == nnfa.special().start_unanchored_id
                || oldsid == nnfa.special().start_anchored_id
            {
                if oldsid == nnfa.special().start_unanchored_id {
                    remap_unanchored[oldsid] = newsid;
                    remap_anchored[oldsid] = DFA::DEAD;
                } else {
                    remap_unanchored[oldsid] = DFA::DEAD;
                    remap_anchored[oldsid] = newsid;
                    is_anchored[newsid.as_usize() >> stride2] = true;
                }
                if state.is_match() {
                    dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
                }
                sparse_iter(
                    nnfa,
                    oldsid,
                    &dfa.byte_classes,
                    |_, class, oldnextsid| {
                        let class = usize::from(class);
                        if oldnextsid == noncontiguous::NFA::FAIL {
                            dfa.trans[newsid.as_usize() + class] = DFA::DEAD;
                        } else {
                            dfa.trans[newsid.as_usize() + class] = oldnextsid;
                        }
                    },
                );
                newsid = next_dfa_id(newsid);
            } else {
                let unewsid = newsid;
                newsid = next_dfa_id(newsid);
                let anewsid = newsid;
                newsid = next_dfa_id(newsid);

                remap_unanchored[oldsid] = unewsid;
                remap_anchored[oldsid] = anewsid;
                is_anchored[anewsid.as_usize() >> stride2] = true;
                if state.is_match() {
                    dfa.set_matches(unewsid, nnfa.iter_matches(oldsid));
                    dfa.set_matches(anewsid, nnfa.iter_matches(oldsid));
                }
                sparse_iter(
                    nnfa,
                    oldsid,
                    &dfa.byte_classes,
                    |byte, class, oldnextsid| {
                        let class = usize::from(class);
                        if oldnextsid == noncontiguous::NFA::FAIL {
                            let oldnextsid =
                                if state.fail() == noncontiguous::NFA::DEAD {
                                    noncontiguous::NFA::DEAD
                                } else {
                                    nnfa.next_state(
                                        Anchored::No,
                                        state.fail(),
                                        byte,
                                    )
                                };
                            dfa.trans[unewsid.as_usize() + class] = oldnextsid;
                        } else {
                            dfa.trans[unewsid.as_usize() + class] = oldnextsid;
                            dfa.trans[anewsid.as_usize() + class] = oldnextsid;
                        }
                    },
                );
            }
        }
        for i in 0..dfa.state_len {
            let sid = i << stride2;
            if is_anchored[i] {
                for next in dfa.trans[sid..][..stride].iter_mut() {
                    *next = remap_anchored[*next];
                }
            } else {
                for next in dfa.trans[sid..][..stride].iter_mut() {
                    *next = remap_unanchored[*next];
                }
            }
        }
        // Now that we've remapped all the IDs in our states, all that's left
        // is remapping the special state IDs.
        let old = nnfa.special();
        let new = &mut dfa.special;
        new.max_special_id = remap_anchored[old.max_special_id];
        new.max_match_id = remap_anchored[old.max_match_id];
        new.start_unanchored_id = remap_unanchored[old.start_unanchored_id];
        new.start_anchored_id = remap_anchored[old.start_anchored_id];
    }

    /// Set the desired match semantics.
    ///
    /// This only applies when using [`Builder::build`] and not
    /// [`Builder::build_from_noncontiguous`].
    ///
    /// See
    /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
    /// for more documentation and examples.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
        self.noncontiguous.match_kind(kind);
        self
    }

    /// Enable ASCII-aware case insensitive matching.
    ///
    /// This only applies when using [`Builder::build`] and not
    /// [`Builder::build_from_noncontiguous`].
    ///
    /// See
    /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
    /// for more documentation and examples.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.noncontiguous.ascii_case_insensitive(yes);
        self
    }

    /// Enable heuristic prefilter optimizations.
    ///
    /// This only applies when using [`Builder::build`] and not
    /// [`Builder::build_from_noncontiguous`].
    ///
    /// See
    /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
    /// for more documentation and examples.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
        self.noncontiguous.prefilter(yes);
        self
    }

    /// Sets the starting state configuration for the automaton.
    ///
    /// See
    /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
    /// for more documentation and examples.
    pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder {
        self.start_kind = kind;
        self
    }
    /// A debug setting for whether to attempt to shrink the size of the
    /// automaton's alphabet or not.
    ///
    /// This should never be disabled unless you're debugging an automaton.
    /// Namely, disabling byte classes makes transitions easier to reason
    /// about, since they use the actual bytes instead of equivalence classes.
    /// Disabling this confers no performance benefit at search time.
    ///
    /// See
    /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
    /// for more documentation and examples.
    pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
        self.byte_classes = yes;
        self
    }
}

/// Iterate over all possible equivalence class transitions in this state.
/// The closure is called for all transitions with a distinct equivalence
/// class, even those not explicitly represented in this sparse state. For
/// any implicitly defined transitions, the given closure is called with
/// the fail state ID.
///
/// The closure is guaranteed to be called precisely
/// `byte_classes.alphabet_len()` times, once for every possible class in
/// ascending order.
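///
/// For instance (an illustrative case), if `classes` is the singleton mapping
/// (one class per byte) and the state only has explicit transitions on `b'a'`
/// and `b'b'`, then `f` is called 256 times: the calls for `b'a'` and `b'b'`
/// receive the explicit next states, and every other call receives
/// `noncontiguous::NFA::FAIL`.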
fn sparse_iter<F: FnMut(u8, u8, StateID)>(
    nnfa: &noncontiguous::NFA,
    oldsid: StateID,
    classes: &ByteClasses,
    mut f: F,
) {
    let mut prev_class = None;
    let mut byte = 0usize;
    for t in nnfa.iter_trans(oldsid) {
        while byte < usize::from(t.byte()) {
            let rep = byte.as_u8();
            let class = classes.get(rep);
            byte += 1;
            if prev_class != Some(class) {
                f(rep, class, noncontiguous::NFA::FAIL);
                prev_class = Some(class);
            }
        }
        let rep = t.byte();
        let class = classes.get(rep);
        byte += 1;
        if prev_class != Some(class) {
            f(rep, class, t.next());
            prev_class = Some(class);
        }
    }
    for b in byte..=255 {
        let rep = b.as_u8();
        let class = classes.get(rep);
        if prev_class != Some(class) {
            f(rep, class, noncontiguous::NFA::FAIL);
            prev_class = Some(class);
        }
    }
}