1/*!
2Provides direct access to a DFA implementation of Aho-Corasick.
3
4This is a low-level API that generally only needs to be used in niche
5circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
6instead of a DFA directly. Using an `DFA` directly is typically only necessary
7when one needs access to the [`Automaton`] trait implementation.
8*/
9
10use alloc::{vec, vec::Vec};
11
12use crate::{
13 automaton::Automaton,
14 nfa::noncontiguous,
15 util::{
16 alphabet::ByteClasses,
17 error::{BuildError, MatchError},
18 int::{Usize, U32},
19 prefilter::Prefilter,
20 primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
21 search::{Anchored, MatchKind, StartKind},
22 special::Special,
23 },
24};
25
26/// A DFA implementation of Aho-Corasick.
27///
28/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
29/// this type directly. Using a `DFA` directly is typically only necessary when
30/// one needs access to the [`Automaton`] trait implementation.
31///
32/// This DFA can only be built by first constructing a [`noncontiguous::NFA`].
33/// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but
34/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
35///
36/// A DFA provides the best possible search performance (in this crate) via two
37/// mechanisms:
38///
39/// * All states use a dense representation for their transitions.
40/// * All failure transitions are pre-computed such that they are never
41/// explicitly handled at search time.
42///
43/// These two facts combined mean that every state transition is performed
44/// using a constant number of instructions. However, this comes at
45/// great cost. The memory usage of a DFA can be quite exorbitant.
46/// It is potentially multiple orders of magnitude greater than a
47/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange,
48/// a DFA will typically have better search speed than a `contiguous::NFA`, but
49/// not by orders of magnitude.
50///
51/// Unless you have a small number of patterns or memory usage is not a concern
52/// and search performance is critical, a DFA is usually not the best choice.
53///
54/// Moreover, unlike the NFAs in this crate, it is costly for a DFA to
55/// support for anchored and unanchored search configurations. Namely,
56/// since failure transitions are pre-computed, supporting both anchored
57/// and unanchored searches requires a duplication of the transition table,
58/// making the memory usage of such a DFA ever bigger. (The NFAs in this crate
59/// unconditionally support both anchored and unanchored searches because there
60/// is essentially no added cost for doing so.) It is for this reason that
61/// a DFA's support for anchored and unanchored searches can be configured
62/// via [`Builder::start_kind`]. By default, a DFA only supports unanchored
63/// searches.
64///
65/// # Example
66///
67/// This example shows how to build an `DFA` directly and use it to execute
68/// [`Automaton::try_find`]:
69///
70/// ```
71/// use aho_corasick::{
72/// automaton::Automaton,
73/// dfa::DFA,
74/// Input, Match,
75/// };
76///
77/// let patterns = &["b", "abc", "abcd"];
78/// let haystack = "abcd";
79///
80/// let nfa = DFA::new(patterns).unwrap();
81/// assert_eq!(
82/// Some(Match::must(0, 1..2)),
83/// nfa.try_find(&Input::new(haystack))?,
84/// );
85/// # Ok::<(), Box<dyn std::error::Error>>(())
86/// ```
87///
88/// It is also possible to implement your own version of `try_find`. See the
89/// [`Automaton`] documentation for an example.
90#[derive(Clone)]
91pub struct DFA {
92 /// The DFA transition table. IDs in this table are pre-multiplied. So
93 /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride,
94 /// 2*stride, 3*stride, ...
95 trans: Vec<StateID>,
96 /// The matches for every match state in this DFA. This is indexed by order
97 /// of match states in the DFA. Namely, as constructed, match states are
98 /// always laid out sequentially and contiguously in memory. Thus, after
99 /// converting a match state ID to a match state index, the indices are
100 /// all adjacent.
101 ///
102 /// More concretely, when a search enters a match state with id 'sid', then
103 /// the matching patterns are at 'matches[(sid >> stride2) - 2]'. The '- 2'
104 /// is to offset the first two states of a DFA: the dead and fail states.
105 matches: Vec<Vec<PatternID>>,
106 /// The amount of heap memory used, in bytes, by the inner Vecs of
107 /// 'matches'.
108 matches_memory_usage: usize,
109 /// The length of each pattern. This is used to compute the start offset
110 /// of a match.
111 pattern_lens: Vec<SmallIndex>,
112 /// A prefilter for accelerating searches, if one exists.
113 prefilter: Option<Prefilter>,
114 /// The match semantics built into this DFA.
115 match_kind: MatchKind,
116 /// The total number of states in this DFA.
117 state_len: usize,
118 /// The alphabet size, or total number of equivalence classes, for this
119 /// DFA. Note that the actual number of transitions in each state is
120 /// stride=2^stride2, where stride is the smallest power of 2 greater than
121 /// or equal to alphabet_len. We do things this way so that we can use
122 /// bitshifting to go from a state ID to an index into 'matches'.
123 alphabet_len: usize,
124 /// The exponent with a base 2, such that stride=2^stride2. Given a state
125 /// index 'i', its state identifier is 'i << stride2'. Given a state
126 /// identifier 'sid', its state index is 'sid >> stride2'.
127 stride2: usize,
128 /// The equivalence classes for this DFA. All transitions are defined on
129 /// equivalence classes and not on the 256 distinct byte values.
130 byte_classes: ByteClasses,
131 /// The length of the shortest pattern in this automaton.
132 min_pattern_len: usize,
133 /// The length of the longest pattern in this automaton.
134 max_pattern_len: usize,
135 /// The information required to deduce which states are "special" in this
136 /// DFA.
137 special: Special,
138}
139
140impl DFA {
141 /// Create a new Aho-Corasick DFA using the default configuration.
142 ///
143 /// Use a [`Builder`] if you want to change the configuration.
144 pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError>
145 where
146 I: IntoIterator<Item = P>,
147 P: AsRef<[u8]>,
148 {
149 DFA::builder().build(patterns)
150 }
151
152 /// A convenience method for returning a new Aho-Corasick DFA builder.
153 ///
154 /// This usually permits one to just import the `DFA` type.
155 pub fn builder() -> Builder {
156 Builder::new()
157 }
158}
159
160impl DFA {
161 /// A sentinel state ID indicating that a search should stop once it has
162 /// entered this state. When a search stops, it returns a match if one has
163 /// been found, otherwise no match. A DFA always has an actual dead state
164 /// at this ID.
165 ///
166 /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state.
167 /// Namely, the whole point of a DFA is that the FAIL state is completely
168 /// compiled away. That is, DFA construction involves pre-computing the
169 /// failure transitions everywhere, such that failure transitions are no
170 /// longer used at search time. This, combined with its uniformly dense
171 /// representation, are the two most important factors in why it's faster
172 /// than the NFAs in this crate.
173 const DEAD: StateID = StateID::new_unchecked(0);
174
175 /// Adds the given pattern IDs as matches to the given state and also
176 /// records the added memory usage.
177 fn set_matches(&mut self, sid: StateID, pids: &[PatternID]) {
178 use core::mem::size_of;
179
180 assert!(!pids.is_empty(), "match state must have non-empty pids");
181 let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap();
182 self.matches[index].extend_from_slice(pids);
183 self.matches_memory_usage += size_of::<PatternID>() * pids.len();
184 }
185}
186
187// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
188// returns a valid state ID given a valid state ID. We otherwise claim that
189// all other methods are correct as well.
190unsafe impl Automaton for DFA {
191 #[inline(always)]
192 fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
193 // Either of the start state IDs can be DEAD, in which case, support
194 // for that type of search is not provided by this DFA. Which start
195 // state IDs are inactive depends on the 'StartKind' configuration at
196 // DFA construction time.
197 match anchored {
198 Anchored::No => {
199 let start = self.special.start_unanchored_id;
200 if start == DFA::DEAD {
201 Err(MatchError::invalid_input_unanchored())
202 } else {
203 Ok(start)
204 }
205 }
206 Anchored::Yes => {
207 let start = self.special.start_anchored_id;
208 if start == DFA::DEAD {
209 Err(MatchError::invalid_input_anchored())
210 } else {
211 Ok(start)
212 }
213 }
214 }
215 }
216
217 #[inline(always)]
218 fn next_state(
219 &self,
220 _anchored: Anchored,
221 sid: StateID,
222 byte: u8,
223 ) -> StateID {
224 let class = self.byte_classes.get(byte);
225 self.trans[(sid.as_u32() + u32::from(class)).as_usize()]
226 }
227
228 #[inline(always)]
229 fn is_special(&self, sid: StateID) -> bool {
230 sid <= self.special.max_special_id
231 }
232
233 #[inline(always)]
234 fn is_dead(&self, sid: StateID) -> bool {
235 sid == DFA::DEAD
236 }
237
238 #[inline(always)]
239 fn is_match(&self, sid: StateID) -> bool {
240 !self.is_dead(sid) && sid <= self.special.max_match_id
241 }
242
243 #[inline(always)]
244 fn is_start(&self, sid: StateID) -> bool {
245 sid == self.special.start_unanchored_id
246 || sid == self.special.start_anchored_id
247 }
248
249 #[inline(always)]
250 fn match_kind(&self) -> MatchKind {
251 self.match_kind
252 }
253
254 #[inline(always)]
255 fn patterns_len(&self) -> usize {
256 self.pattern_lens.len()
257 }
258
259 #[inline(always)]
260 fn pattern_len(&self, pid: PatternID) -> usize {
261 self.pattern_lens[pid].as_usize()
262 }
263
264 #[inline(always)]
265 fn min_pattern_len(&self) -> usize {
266 self.min_pattern_len
267 }
268
269 #[inline(always)]
270 fn max_pattern_len(&self) -> usize {
271 self.max_pattern_len
272 }
273
274 #[inline(always)]
275 fn match_len(&self, sid: StateID) -> usize {
276 debug_assert!(self.is_match(sid));
277 let offset = (sid.as_usize() >> self.stride2) - 2;
278 self.matches[offset].len()
279 }
280
281 #[inline(always)]
282 fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
283 debug_assert!(self.is_match(sid));
284 let offset = (sid.as_usize() >> self.stride2) - 2;
285 self.matches[offset][index]
286 }
287
288 #[inline(always)]
289 fn memory_usage(&self) -> usize {
290 use core::mem::size_of;
291
292 (self.trans.len() * size_of::<u32>())
293 + (self.matches.len() * size_of::<Vec<PatternID>>())
294 + self.matches_memory_usage
295 + (self.pattern_lens.len() * size_of::<SmallIndex>())
296 + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
297 }
298
299 #[inline(always)]
300 fn prefilter(&self) -> Option<&Prefilter> {
301 self.prefilter.as_ref()
302 }
303}
304
305impl core::fmt::Debug for DFA {
306 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
307 use crate::{
308 automaton::{fmt_state_indicator, sparse_transitions},
309 util::debug::DebugByte,
310 };
311
312 writeln!(f, "dfa::DFA(")?;
313 for index in 0..self.state_len {
314 let sid = StateID::new_unchecked(index << self.stride2);
315 // While we do currently include the FAIL state in the transition
316 // table (to simplify construction), it is never actually used. It
317 // poses problems with the code below because it gets treated as
318 // a match state incidentally when it is, of course, not. So we
319 // special case it. The fail state is always the first state after
320 // the dead state.
321 //
322 // If the construction is changed to remove the fail state (it
323 // probably should be), then this special case should be updated.
324 if index == 1 {
325 writeln!(f, "F {:06}:", sid.as_usize())?;
326 continue;
327 }
328 fmt_state_indicator(f, self, sid)?;
329 write!(f, "{:06}: ", sid.as_usize())?;
330
331 let it = (0..self.byte_classes.alphabet_len()).map(|class| {
332 (class.as_u8(), self.trans[sid.as_usize() + class])
333 });
334 for (i, (start, end, next)) in sparse_transitions(it).enumerate() {
335 if i > 0 {
336 write!(f, ", ")?;
337 }
338 if start == end {
339 write!(
340 f,
341 "{:?} => {:?}",
342 DebugByte(start),
343 next.as_usize()
344 )?;
345 } else {
346 write!(
347 f,
348 "{:?}-{:?} => {:?}",
349 DebugByte(start),
350 DebugByte(end),
351 next.as_usize()
352 )?;
353 }
354 }
355 write!(f, "\n")?;
356 if self.is_match(sid) {
357 write!(f, " matches: ")?;
358 for i in 0..self.match_len(sid) {
359 if i > 0 {
360 write!(f, ", ")?;
361 }
362 let pid = self.match_pattern(sid, i);
363 write!(f, "{}", pid.as_usize())?;
364 }
365 write!(f, "\n")?;
366 }
367 }
368 writeln!(f, "match kind: {:?}", self.match_kind)?;
369 writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
370 writeln!(f, "state length: {:?}", self.state_len)?;
371 writeln!(f, "pattern length: {:?}", self.patterns_len())?;
372 writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
373 writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
374 writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
375 writeln!(f, "stride: {:?}", 1 << self.stride2)?;
376 writeln!(f, "byte classes: {:?}", self.byte_classes)?;
377 writeln!(f, "memory usage: {:?}", self.memory_usage())?;
378 writeln!(f, ")")?;
379 Ok(())
380 }
381}
382
383/// A builder for configuring an Aho-Corasick DFA.
384///
385/// This builder has a subset of the options available to a
386/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
387/// their behavior is identical.
388#[derive(Clone, Debug)]
389pub struct Builder {
390 noncontiguous: noncontiguous::Builder,
391 start_kind: StartKind,
392 byte_classes: bool,
393}
394
395impl Default for Builder {
396 fn default() -> Builder {
397 Builder {
398 noncontiguous: noncontiguous::Builder::new(),
399 start_kind: StartKind::Unanchored,
400 byte_classes: true,
401 }
402 }
403}
404
405impl Builder {
406 /// Create a new builder for configuring an Aho-Corasick DFA.
407 pub fn new() -> Builder {
408 Builder::default()
409 }
410
411 /// Build an Aho-Corasick DFA from the given iterator of patterns.
412 ///
413 /// A builder may be reused to create more DFAs.
414 pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError>
415 where
416 I: IntoIterator<Item = P>,
417 P: AsRef<[u8]>,
418 {
419 let nnfa = self.noncontiguous.build(patterns)?;
420 self.build_from_noncontiguous(&nnfa)
421 }
422
423 /// Build an Aho-Corasick DFA from the given noncontiguous NFA.
424 ///
425 /// Note that when this method is used, only the `start_kind` and
426 /// `byte_classes` settings on this builder are respected. The other
427 /// settings only apply to the initial construction of the Aho-Corasick
428 /// automaton. Since using this method requires that initial construction
429 /// has already completed, all settings impacting only initial construction
430 /// are no longer relevant.
431 pub fn build_from_noncontiguous(
432 &self,
433 nnfa: &noncontiguous::NFA,
434 ) -> Result<DFA, BuildError> {
435 debug!("building DFA");
436 let byte_classes = if self.byte_classes {
437 nnfa.byte_classes().clone()
438 } else {
439 ByteClasses::singletons()
440 };
441 let state_len = match self.start_kind {
442 StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(),
443 StartKind::Both => {
444 // These unwraps are OK because we know that the number of
445 // NFA states is < StateID::LIMIT which is in turn less than
446 // i32::MAX. Thus, there is always room to multiply by 2.
447 // Finally, the number of states is always at least 4 in the
448 // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the
449 // subtraction of 4 is okay.
450 //
451 // Note that we subtract 4 because the "anchored" part of
452 // the DFA duplicates the unanchored part (without failure
453 // transitions), but reuses the DEAD, FAIL and START states.
454 nnfa.states()
455 .len()
456 .checked_mul(2)
457 .unwrap()
458 .checked_sub(4)
459 .unwrap()
460 }
461 };
462 let trans_len =
463 match state_len.checked_shl(byte_classes.stride2().as_u32()) {
464 Some(trans_len) => trans_len,
465 None => {
466 return Err(BuildError::state_id_overflow(
467 StateID::MAX.as_u64(),
468 usize::MAX.as_u64(),
469 ))
470 }
471 };
472 StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap())
473 .map_err(|e| {
474 BuildError::state_id_overflow(
475 StateID::MAX.as_u64(),
476 e.attempted(),
477 )
478 })?;
479 let num_match_states = match self.start_kind {
480 StartKind::Unanchored | StartKind::Anchored => {
481 nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap()
482 }
483 StartKind::Both => nnfa
484 .special()
485 .max_match_id
486 .as_usize()
487 .checked_sub(1)
488 .unwrap()
489 .checked_mul(2)
490 .unwrap(),
491 };
492 let mut dfa = DFA {
493 trans: vec![DFA::DEAD; trans_len],
494 matches: vec![vec![]; num_match_states],
495 matches_memory_usage: 0,
496 pattern_lens: nnfa.pattern_lens_raw().to_vec(),
497 prefilter: nnfa.prefilter().map(|p| p.clone()),
498 match_kind: nnfa.match_kind(),
499 state_len,
500 alphabet_len: byte_classes.alphabet_len(),
501 stride2: byte_classes.stride2(),
502 byte_classes,
503 min_pattern_len: nnfa.min_pattern_len(),
504 max_pattern_len: nnfa.max_pattern_len(),
505 // The special state IDs are set later.
506 special: Special::zero(),
507 };
508 match self.start_kind {
509 StartKind::Both => {
510 self.finish_build_both_starts(nnfa, &mut dfa);
511 }
512 StartKind::Unanchored => {
513 self.finish_build_one_start(Anchored::No, nnfa, &mut dfa);
514 }
515 StartKind::Anchored => {
516 self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa)
517 }
518 }
519 debug!(
520 "DFA built, <states: {:?}, size: {:?}, \
521 alphabet len: {:?}, stride: {:?}>",
522 dfa.state_len,
523 dfa.memory_usage(),
524 dfa.byte_classes.alphabet_len(),
525 dfa.byte_classes.stride(),
526 );
527 Ok(dfa)
528 }
529
530 /// Finishes building a DFA for either unanchored or anchored searches,
531 /// but NOT both.
532 fn finish_build_one_start(
533 &self,
534 anchored: Anchored,
535 nnfa: &noncontiguous::NFA,
536 dfa: &mut DFA,
537 ) {
538 // This function always succeeds because we check above that all of the
539 // states in the NFA can be mapped to DFA state IDs.
540 let stride2 = dfa.stride2;
541 let old2new = |oldsid: StateID| {
542 StateID::new_unchecked(oldsid.as_usize() << stride2)
543 };
544 for (oldsid, state) in nnfa.states().iter().with_state_ids() {
545 let newsid = old2new(oldsid);
546 if !state.matches.is_empty() {
547 dfa.set_matches(newsid, &state.matches);
548 }
549 sparse_iter(
550 state,
551 &dfa.byte_classes,
552 |byte, class, mut oldnextsid| {
553 if oldnextsid == noncontiguous::NFA::FAIL {
554 if anchored.is_anchored() {
555 oldnextsid = noncontiguous::NFA::DEAD;
556 } else {
557 oldnextsid = nnfa.next_state(
558 Anchored::No,
559 state.fail,
560 byte,
561 );
562 }
563 }
564 dfa.trans[newsid.as_usize() + usize::from(class)] =
565 old2new(oldnextsid);
566 },
567 );
568 }
569 // Now that we've remapped all the IDs in our states, all that's left
570 // is remapping the special state IDs.
571 let old = nnfa.special();
572 let new = &mut dfa.special;
573 new.max_special_id = old2new(old.max_special_id);
574 new.max_match_id = old2new(old.max_match_id);
575 if anchored.is_anchored() {
576 new.start_unanchored_id = DFA::DEAD;
577 new.start_anchored_id = old2new(old.start_anchored_id);
578 } else {
579 new.start_unanchored_id = old2new(old.start_unanchored_id);
580 new.start_anchored_id = DFA::DEAD;
581 }
582 }
583
584 /// Finishes building a DFA that supports BOTH unanchored and anchored
585 /// searches. It works by inter-leaving unanchored states with anchored
586 /// states in the same transition table. This way, we avoid needing to
587 /// re-shuffle states afterward to ensure that our states still look like
588 /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ...
589 ///
590 /// Honestly this is pretty inscrutable... Simplifications are most
591 /// welcome.
592 fn finish_build_both_starts(
593 &self,
594 nnfa: &noncontiguous::NFA,
595 dfa: &mut DFA,
596 ) {
597 let stride2 = dfa.stride2;
598 let stride = 1 << stride2;
599 let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()];
600 let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()];
601 let mut is_anchored = vec![false; dfa.state_len];
602 let mut newsid = DFA::DEAD;
603 let next_dfa_id =
604 |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride);
605 for (oldsid, state) in nnfa.states().iter().with_state_ids() {
606 if oldsid == noncontiguous::NFA::DEAD
607 || oldsid == noncontiguous::NFA::FAIL
608 {
609 remap_unanchored[oldsid] = newsid;
610 remap_anchored[oldsid] = newsid;
611 newsid = next_dfa_id(newsid);
612 } else if oldsid == nnfa.special().start_unanchored_id
613 || oldsid == nnfa.special().start_anchored_id
614 {
615 if oldsid == nnfa.special().start_unanchored_id {
616 remap_unanchored[oldsid] = newsid;
617 remap_anchored[oldsid] = DFA::DEAD;
618 } else {
619 remap_unanchored[oldsid] = DFA::DEAD;
620 remap_anchored[oldsid] = newsid;
621 is_anchored[newsid.as_usize() >> stride2] = true;
622 }
623 if !state.matches.is_empty() {
624 dfa.set_matches(newsid, &state.matches);
625 }
626 sparse_iter(
627 state,
628 &dfa.byte_classes,
629 |_, class, oldnextsid| {
630 let class = usize::from(class);
631 if oldnextsid == noncontiguous::NFA::FAIL {
632 dfa.trans[newsid.as_usize() + class] = DFA::DEAD;
633 } else {
634 dfa.trans[newsid.as_usize() + class] = oldnextsid;
635 }
636 },
637 );
638 newsid = next_dfa_id(newsid);
639 } else {
640 let unewsid = newsid;
641 newsid = next_dfa_id(newsid);
642 let anewsid = newsid;
643 newsid = next_dfa_id(newsid);
644
645 remap_unanchored[oldsid] = unewsid;
646 remap_anchored[oldsid] = anewsid;
647 is_anchored[anewsid.as_usize() >> stride2] = true;
648 if !state.matches.is_empty() {
649 dfa.set_matches(unewsid, &state.matches);
650 dfa.set_matches(anewsid, &state.matches);
651 }
652 sparse_iter(
653 state,
654 &dfa.byte_classes,
655 |byte, class, oldnextsid| {
656 let class = usize::from(class);
657 if oldnextsid == noncontiguous::NFA::FAIL {
658 dfa.trans[unewsid.as_usize() + class] = nnfa
659 .next_state(Anchored::No, state.fail, byte);
660 } else {
661 dfa.trans[unewsid.as_usize() + class] = oldnextsid;
662 dfa.trans[anewsid.as_usize() + class] = oldnextsid;
663 }
664 },
665 );
666 }
667 }
668 for i in 0..dfa.state_len {
669 let sid = i << stride2;
670 if is_anchored[i] {
671 for next in dfa.trans[sid..][..stride].iter_mut() {
672 *next = remap_anchored[*next];
673 }
674 } else {
675 for next in dfa.trans[sid..][..stride].iter_mut() {
676 *next = remap_unanchored[*next];
677 }
678 }
679 }
680 // Now that we've remapped all the IDs in our states, all that's left
681 // is remapping the special state IDs.
682 let old = nnfa.special();
683 let new = &mut dfa.special;
684 new.max_special_id = remap_anchored[old.max_special_id];
685 new.max_match_id = remap_anchored[old.max_match_id];
686 new.start_unanchored_id = remap_unanchored[old.start_unanchored_id];
687 new.start_anchored_id = remap_anchored[old.start_anchored_id];
688 }
689
690 /// Set the desired match semantics.
691 ///
692 /// This only applies when using [`Builder::build`] and not
693 /// [`Builder::build_from_noncontiguous`].
694 ///
695 /// See
696 /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
697 /// for more documentation and examples.
698 pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
699 self.noncontiguous.match_kind(kind);
700 self
701 }
702
703 /// Enable ASCII-aware case insensitive matching.
704 ///
705 /// This only applies when using [`Builder::build`] and not
706 /// [`Builder::build_from_noncontiguous`].
707 ///
708 /// See
709 /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
710 /// for more documentation and examples.
711 pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
712 self.noncontiguous.ascii_case_insensitive(yes);
713 self
714 }
715
716 /// Enable heuristic prefilter optimizations.
717 ///
718 /// This only applies when using [`Builder::build`] and not
719 /// [`Builder::build_from_noncontiguous`].
720 ///
721 /// See
722 /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
723 /// for more documentation and examples.
724 pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
725 self.noncontiguous.prefilter(yes);
726 self
727 }
728
729 /// Sets the starting state configuration for the automaton.
730 ///
731 /// See
732 /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
733 /// for more documentation and examples.
734 pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder {
735 self.start_kind = kind;
736 self
737 }
738
739 /// A debug setting for whether to attempt to shrink the size of the
740 /// automaton's alphabet or not.
741 ///
742 /// This should never be enabled unless you're debugging an automaton.
743 /// Namely, disabling byte classes makes transitions easier to reason
744 /// about, since they use the actual bytes instead of equivalence classes.
745 /// Disabling this confers no performance benefit at search time.
746 ///
747 /// See
748 /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
749 /// for more documentation and examples.
750 pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
751 self.byte_classes = yes;
752 self
753 }
754}
755
756/// Iterate over all possible equivalence class transitions in this state.
757/// The closure is called for all transitions with a distinct equivalence
758/// class, even those not explicitly represented in this sparse state. For
759/// any implicitly defined transitions, the given closure is called with
760/// the fail state ID.
761///
762/// The closure is guaranteed to be called precisely
763/// `byte_classes.alphabet_len()` times, once for every possible class in
764/// ascending order.
765fn sparse_iter<F: FnMut(u8, u8, StateID)>(
766 state: &noncontiguous::State,
767 classes: &ByteClasses,
768 mut f: F,
769) {
770 let mut prev_class = None;
771 let mut byte = 0usize;
772 for &(b, id) in state.trans.iter() {
773 while byte < usize::from(b) {
774 let rep = byte.as_u8();
775 let class = classes.get(rep);
776 byte += 1;
777 if prev_class != Some(class) {
778 f(rep, class, noncontiguous::NFA::FAIL);
779 prev_class = Some(class);
780 }
781 }
782 let rep = b;
783 let class = classes.get(rep);
784 byte += 1;
785 if prev_class != Some(class) {
786 f(rep, class, id);
787 prev_class = Some(class);
788 }
789 }
790 for b in byte..=255 {
791 let rep = b.as_u8();
792 let class = classes.get(rep);
793 if prev_class != Some(class) {
794 f(rep, class, noncontiguous::NFA::FAIL);
795 prev_class = Some(class);
796 }
797 }
798}
799