1 | /*! |
2 | Provides direct access to a DFA implementation of Aho-Corasick. |
3 | |
4 | This is a low-level API that generally only needs to be used in niche |
5 | circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) |
instead of a DFA directly. Using a `DFA` directly is typically only necessary
7 | when one needs access to the [`Automaton`] trait implementation. |
8 | */ |
9 | |
10 | use alloc::{vec, vec::Vec}; |
11 | |
12 | use crate::{ |
13 | automaton::Automaton, |
14 | nfa::noncontiguous, |
15 | util::{ |
16 | alphabet::ByteClasses, |
17 | error::{BuildError, MatchError}, |
18 | int::{Usize, U32}, |
19 | prefilter::Prefilter, |
20 | primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, |
21 | search::{Anchored, MatchKind, StartKind}, |
22 | special::Special, |
23 | }, |
24 | }; |
25 | |
/// A DFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using a `DFA` directly is typically only necessary when
/// one needs access to the [`Automaton`] trait implementation.
///
/// This DFA can only be built by first constructing a [`noncontiguous::NFA`].
/// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but
/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
///
/// A DFA provides the best possible search performance (in this crate) via two
/// mechanisms:
///
/// * All states use a dense representation for their transitions.
/// * All failure transitions are pre-computed such that they are never
/// explicitly handled at search time.
///
/// These two facts combined mean that every state transition is performed
/// using a constant number of instructions. However, this comes at
/// great cost. The memory usage of a DFA can be quite exorbitant.
/// It is potentially multiple orders of magnitude greater than a
/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange,
/// a DFA will typically have better search speed than a `contiguous::NFA`, but
/// not by orders of magnitude.
///
/// Unless you have a small number of patterns or memory usage is not a concern
/// and search performance is critical, a DFA is usually not the best choice.
///
/// Moreover, unlike the NFAs in this crate, it is costly for a DFA to
/// support both anchored and unanchored search configurations. Namely,
/// since failure transitions are pre-computed, supporting both anchored
/// and unanchored searches requires a duplication of the transition table,
/// making the memory usage of such a DFA ever bigger. (The NFAs in this crate
/// unconditionally support both anchored and unanchored searches because there
/// is essentially no added cost for doing so.) It is for this reason that
/// a DFA's support for anchored and unanchored searches can be configured
/// via [`Builder::start_kind`]. By default, a DFA only supports unanchored
/// searches.
///
/// # Example
///
/// This example shows how to build a `DFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     dfa::DFA,
///     Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let dfa = DFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     dfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive(Clone)]
pub struct DFA {
    /// The DFA transition table. IDs in this table are pre-multiplied. So
    /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride,
    /// 2*stride, 3*stride, ...
    trans: Vec<StateID>,
    /// The matches for every match state in this DFA. This is first indexed by
    /// state index (so that's `sid >> stride2`) and then by order in which the
    /// matches are meant to occur.
    matches: Vec<Vec<PatternID>>,
    /// The amount of heap memory used, in bytes, by the inner Vecs of
    /// 'matches'.
    matches_memory_usage: usize,
    /// The length of each pattern. This is used to compute the start offset
    /// of a match.
    pattern_lens: Vec<SmallIndex>,
    /// A prefilter for accelerating searches, if one exists.
    prefilter: Option<Prefilter>,
    /// The match semantics built into this DFA.
    match_kind: MatchKind,
    /// The total number of states in this DFA.
    state_len: usize,
    /// The alphabet size, or total number of equivalence classes, for this
    /// DFA. Note that the actual number of transitions in each state is
    /// stride=2^stride2, where stride is the smallest power of 2 greater than
    /// or equal to alphabet_len. We do things this way so that we can use
    /// bitshifting to go from a state ID to an index into 'matches'.
    alphabet_len: usize,
    /// The exponent with a base 2, such that stride=2^stride2. Given a state
    /// index 'i', its state identifier is 'i << stride2'. Given a state
    /// identifier 'sid', its state index is 'sid >> stride2'.
    stride2: usize,
    /// The equivalence classes for this DFA. All transitions are defined on
    /// equivalence classes and not on the 256 distinct byte values.
    byte_classes: ByteClasses,
    /// The length of the shortest pattern in this automaton.
    min_pattern_len: usize,
    /// The length of the longest pattern in this automaton.
    max_pattern_len: usize,
    /// The information required to deduce which states are "special" in this
    /// DFA.
    special: Special,
}
133 | |
134 | impl DFA { |
135 | /// Create a new Aho-Corasick DFA using the default configuration. |
136 | /// |
137 | /// Use a [`Builder`] if you want to change the configuration. |
138 | pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError> |
139 | where |
140 | I: IntoIterator<Item = P>, |
141 | P: AsRef<[u8]>, |
142 | { |
143 | DFA::builder().build(patterns) |
144 | } |
145 | |
146 | /// A convenience method for returning a new Aho-Corasick DFA builder. |
147 | /// |
148 | /// This usually permits one to just import the `DFA` type. |
149 | pub fn builder() -> Builder { |
150 | Builder::new() |
151 | } |
152 | } |
153 | |
154 | impl DFA { |
155 | /// A sentinel state ID indicating that a search should stop once it has |
156 | /// entered this state. When a search stops, it returns a match if one has |
157 | /// been found, otherwise no match. A DFA always has an actual dead state |
158 | /// at this ID. |
159 | /// |
160 | /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state. |
161 | /// Namely, the whole point of a DFA is that the FAIL state is completely |
162 | /// compiled away. That is, DFA construction involves pre-computing the |
163 | /// failure transitions everywhere, such that failure transitions are no |
164 | /// longer used at search time. This, combined with its uniformly dense |
165 | /// representation, are the two most important factors in why it's faster |
166 | /// than the NFAs in this crate. |
167 | const DEAD: StateID = StateID::new_unchecked(0); |
168 | |
169 | /// Adds the given pattern IDs as matches to the given state and also |
170 | /// records the added memory usage. |
171 | fn set_matches( |
172 | &mut self, |
173 | sid: StateID, |
174 | pids: impl Iterator<Item = PatternID>, |
175 | ) { |
176 | let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap(); |
177 | let mut at_least_one = false; |
178 | for pid in pids { |
179 | self.matches[index].push(pid); |
180 | self.matches_memory_usage += PatternID::SIZE; |
181 | at_least_one = true; |
182 | } |
183 | assert!(at_least_one, "match state must have non-empty pids" ); |
184 | } |
185 | } |
186 | |
// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
// returns a valid state ID given a valid state ID. We otherwise claim that
// all other methods are correct as well.
unsafe impl Automaton for DFA {
    /// Returns the starting state for the given anchor mode, or an error if
    /// this DFA was not built with support for that mode.
    #[inline(always)]
    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
        // Either of the start state IDs can be DEAD, in which case, support
        // for that type of search is not provided by this DFA. Which start
        // state IDs are inactive depends on the 'StartKind' configuration at
        // DFA construction time.
        match anchored {
            Anchored::No => {
                let start = self.special.start_unanchored_id;
                if start == DFA::DEAD {
                    Err(MatchError::invalid_input_unanchored())
                } else {
                    Ok(start)
                }
            }
            Anchored::Yes => {
                let start = self.special.start_anchored_id;
                if start == DFA::DEAD {
                    Err(MatchError::invalid_input_anchored())
                } else {
                    Ok(start)
                }
            }
        }
    }

    /// Returns the state to transition to, given the current state and the
    /// current haystack byte.
    ///
    /// The anchor mode is ignored here because it is fully baked into the
    /// start state (and, for `StartKind::Both`, into the duplicated anchored
    /// portion of the transition table).
    #[inline(always)]
    fn next_state(
        &self,
        _anchored: Anchored,
        sid: StateID,
        byte: u8,
    ) -> StateID {
        let class = self.byte_classes.get(byte);
        // State IDs are pre-multiplied by the stride, so adding the byte's
        // equivalence class to the ID directly yields the index into the
        // transition table. No failure transitions to chase: this is a DFA.
        self.trans[(sid.as_u32() + u32::from(class)).as_usize()]
    }

    /// Returns true if `sid` is a "special" state (dead, match or start).
    /// Special states are assigned the lowest IDs, so one comparison
    /// suffices.
    #[inline(always)]
    fn is_special(&self, sid: StateID) -> bool {
        sid <= self.special.max_special_id
    }

    /// Returns true if `sid` is the dead state.
    #[inline(always)]
    fn is_dead(&self, sid: StateID) -> bool {
        sid == DFA::DEAD
    }

    /// Returns true if `sid` is a match state. The dead state is explicitly
    /// excluded since its ID (0) is below 'max_match_id'.
    #[inline(always)]
    fn is_match(&self, sid: StateID) -> bool {
        !self.is_dead(sid) && sid <= self.special.max_match_id
    }

    /// Returns true if `sid` is one of the two start states.
    #[inline(always)]
    fn is_start(&self, sid: StateID) -> bool {
        sid == self.special.start_unanchored_id
            || sid == self.special.start_anchored_id
    }

    /// Returns the match semantics this DFA was built with.
    #[inline(always)]
    fn match_kind(&self) -> MatchKind {
        self.match_kind
    }

    /// Returns the total number of patterns in this automaton.
    #[inline(always)]
    fn patterns_len(&self) -> usize {
        self.pattern_lens.len()
    }

    /// Returns the length, in bytes, of the pattern with the given ID.
    #[inline(always)]
    fn pattern_len(&self, pid: PatternID) -> usize {
        self.pattern_lens[pid].as_usize()
    }

    /// Returns the length of the shortest pattern in this automaton.
    #[inline(always)]
    fn min_pattern_len(&self) -> usize {
        self.min_pattern_len
    }

    /// Returns the length of the longest pattern in this automaton.
    #[inline(always)]
    fn max_pattern_len(&self) -> usize {
        self.max_pattern_len
    }

    /// Returns the number of patterns that match in the given match state.
    #[inline(always)]
    fn match_len(&self, sid: StateID) -> usize {
        debug_assert!(self.is_match(sid));
        // 'matches' is only indexed over match states, which start at state
        // index 2 (after DEAD and FAIL). Hence the subtraction.
        let offset = (sid.as_usize() >> self.stride2) - 2;
        self.matches[offset].len()
    }

    /// Returns the `index`th pattern ID matching in the given match state.
    #[inline(always)]
    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
        debug_assert!(self.is_match(sid));
        // Same offset computation as 'match_len' above.
        let offset = (sid.as_usize() >> self.stride2) - 2;
        self.matches[offset][index]
    }

    /// Returns the heap memory usage, in bytes, of this DFA.
    #[inline(always)]
    fn memory_usage(&self) -> usize {
        use core::mem::size_of;

        // Note: 'StateID' occupies the same space as a 'u32', so sizing the
        // transition table by 'u32' is accurate.
        (self.trans.len() * size_of::<u32>())
            + (self.matches.len() * size_of::<Vec<PatternID>>())
            + self.matches_memory_usage
            + (self.pattern_lens.len() * size_of::<SmallIndex>())
            + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
    }

    /// Returns the prefilter attached to this DFA, if one exists.
    #[inline(always)]
    fn prefilter(&self) -> Option<&Prefilter> {
        self.prefilter.as_ref()
    }
}
304 | |
impl core::fmt::Debug for DFA {
    /// Writes a human-readable dump of this DFA: one line per state showing
    /// its (compressed) transitions, followed by summary metadata.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        use crate::{
            automaton::{fmt_state_indicator, sparse_transitions},
            util::debug::DebugByte,
        };

        writeln!(f, "dfa::DFA(")?;
        for index in 0..self.state_len {
            // Recover the pre-multiplied state ID from the state index.
            let sid = StateID::new_unchecked(index << self.stride2);
            // While we do currently include the FAIL state in the transition
            // table (to simplify construction), it is never actually used. It
            // poses problems with the code below because it gets treated as
            // a match state incidentally when it is, of course, not. So we
            // special case it. The fail state is always the first state after
            // the dead state.
            //
            // If the construction is changed to remove the fail state (it
            // probably should be), then this special case should be updated.
            if index == 1 {
                writeln!(f, "F {:06}:", sid.as_usize())?;
                continue;
            }
            fmt_state_indicator(f, self, sid)?;
            write!(f, "{:06}: ", sid.as_usize())?;

            // Emit one (byte, next state) pair per equivalence class;
            // 'sparse_transitions' then coalesces contiguous byte ranges that
            // lead to the same next state.
            let it = (0..self.byte_classes.alphabet_len()).map(|class| {
                (class.as_u8(), self.trans[sid.as_usize() + class])
            });
            for (i, (start, end, next)) in sparse_transitions(it).enumerate() {
                if i > 0 {
                    write!(f, ", ")?;
                }
                if start == end {
                    write!(
                        f,
                        "{:?} => {:?}",
                        DebugByte(start),
                        next.as_usize()
                    )?;
                } else {
                    write!(
                        f,
                        "{:?}-{:?} => {:?}",
                        DebugByte(start),
                        DebugByte(end),
                        next.as_usize()
                    )?;
                }
            }
            write!(f, "\n")?;
            if self.is_match(sid) {
                write!(f, "         matches: ")?;
                for i in 0..self.match_len(sid) {
                    if i > 0 {
                        write!(f, ", ")?;
                    }
                    let pid = self.match_pattern(sid, i);
                    write!(f, "{}", pid.as_usize())?;
                }
                write!(f, "\n")?;
            }
        }
        writeln!(f, "match kind: {:?}", self.match_kind)?;
        writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
        writeln!(f, "state length: {:?}", self.state_len)?;
        writeln!(f, "pattern length: {:?}", self.patterns_len())?;
        writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
        writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
        writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
        writeln!(f, "stride: {:?}", 1 << self.stride2)?;
        writeln!(f, "byte classes: {:?}", self.byte_classes)?;
        writeln!(f, "memory usage: {:?}", self.memory_usage())?;
        writeln!(f, ")")?;
        Ok(())
    }
}
382 | |
/// A builder for configuring an Aho-Corasick DFA.
///
/// This builder has a subset of the options available to a
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive(Clone, Debug)]
pub struct Builder {
    /// The builder for the noncontiguous NFA that a DFA is compiled from.
    noncontiguous: noncontiguous::Builder,
    /// Whether the DFA should support unanchored searches, anchored searches
    /// or both.
    start_kind: StartKind,
    /// Whether to shrink the DFA's alphabet using byte equivalence classes.
    byte_classes: bool,
}
394 | |
395 | impl Default for Builder { |
396 | fn default() -> Builder { |
397 | Builder { |
398 | noncontiguous: noncontiguous::Builder::new(), |
399 | start_kind: StartKind::Unanchored, |
400 | byte_classes: true, |
401 | } |
402 | } |
403 | } |
404 | |
impl Builder {
    /// Create a new builder for configuring an Aho-Corasick DFA.
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Build an Aho-Corasick DFA from the given iterator of patterns.
    ///
    /// A builder may be reused to create more DFAs.
    pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        let nnfa = self.noncontiguous.build(patterns)?;
        self.build_from_noncontiguous(&nnfa)
    }

    /// Build an Aho-Corasick DFA from the given noncontiguous NFA.
    ///
    /// Note that when this method is used, only the `start_kind` and
    /// `byte_classes` settings on this builder are respected. The other
    /// settings only apply to the initial construction of the Aho-Corasick
    /// automaton. Since using this method requires that initial construction
    /// has already completed, all settings impacting only initial construction
    /// are no longer relevant.
    pub fn build_from_noncontiguous(
        &self,
        nnfa: &noncontiguous::NFA,
    ) -> Result<DFA, BuildError> {
        debug!("building DFA");
        // When byte classes are disabled, every one of the 256 byte values
        // gets its own singleton class. This makes the DFA bigger but its
        // transitions easier to read when debugging.
        let byte_classes = if self.byte_classes {
            nnfa.byte_classes().clone()
        } else {
            ByteClasses::singletons()
        };
        let state_len = match self.start_kind {
            StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(),
            StartKind::Both => {
                // These unwraps are OK because we know that the number of
                // NFA states is < StateID::LIMIT which is in turn less than
                // i32::MAX. Thus, there is always room to multiply by 2.
                // Finally, the number of states is always at least 4 in the
                // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the
                // subtraction of 4 is okay.
                //
                // Note that we subtract 4 because the "anchored" part of
                // the DFA duplicates the unanchored part (without failure
                // transitions), but reuses the DEAD, FAIL and START states.
                nnfa.states()
                    .len()
                    .checked_mul(2)
                    .unwrap()
                    .checked_sub(4)
                    .unwrap()
            }
        };
        // Total transition table size: one stride's worth of transitions per
        // state. A shift overflow means the table can't fit in memory.
        let trans_len =
            match state_len.checked_shl(byte_classes.stride2().as_u32()) {
                Some(trans_len) => trans_len,
                None => {
                    return Err(BuildError::state_id_overflow(
                        StateID::MAX.as_u64(),
                        usize::MAX.as_u64(),
                    ))
                }
            };
        // Since state IDs are pre-multiplied, the biggest ID we hand out is
        // the index of the last state's first transition. Verify here, once,
        // that it is representable as a StateID; everything below relies on
        // this check having passed.
        StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap())
            .map_err(|e| {
                BuildError::state_id_overflow(
                    StateID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
        // Match states occupy the state indices 2..=max_match_id (right after
        // DEAD and FAIL), so there are 'max_match_id - 1' of them. That count
        // doubles when both unanchored and anchored searches are supported.
        let num_match_states = match self.start_kind {
            StartKind::Unanchored | StartKind::Anchored => {
                nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap()
            }
            StartKind::Both => nnfa
                .special()
                .max_match_id
                .as_usize()
                .checked_sub(1)
                .unwrap()
                .checked_mul(2)
                .unwrap(),
        };
        let mut dfa = DFA {
            trans: vec![DFA::DEAD; trans_len],
            matches: vec![vec![]; num_match_states],
            matches_memory_usage: 0,
            pattern_lens: nnfa.pattern_lens_raw().to_vec(),
            prefilter: nnfa.prefilter().map(|p| p.clone()),
            match_kind: nnfa.match_kind(),
            state_len,
            alphabet_len: byte_classes.alphabet_len(),
            stride2: byte_classes.stride2(),
            byte_classes,
            min_pattern_len: nnfa.min_pattern_len(),
            max_pattern_len: nnfa.max_pattern_len(),
            // The special state IDs are set later.
            special: Special::zero(),
        };
        match self.start_kind {
            StartKind::Both => {
                self.finish_build_both_starts(nnfa, &mut dfa);
            }
            StartKind::Unanchored => {
                self.finish_build_one_start(Anchored::No, nnfa, &mut dfa);
            }
            StartKind::Anchored => {
                self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa)
            }
        }
        debug!(
            "DFA built, <states: {:?}, size: {:?}, \
             alphabet len: {:?}, stride: {:?}>",
            dfa.state_len,
            dfa.memory_usage(),
            dfa.byte_classes.alphabet_len(),
            dfa.byte_classes.stride(),
        );
        // The vectors can grow ~twice as big during construction because a
        // Vec amortizes growth. But here, let's shrink things back down to
        // what we actually need since we're never going to add more to it.
        dfa.trans.shrink_to_fit();
        dfa.pattern_lens.shrink_to_fit();
        dfa.matches.shrink_to_fit();
        // TODO: We might also want to shrink each Vec inside of `dfa.matches`,
        // or even better, convert it to one contiguous allocation. But I think
        // I went with nested allocs for good reason (can't remember), so this
        // may be tricky to do. I decided not to shrink them here because it
        // might require a fair bit of work to do. It's unclear whether it's
        // worth it.
        Ok(dfa)
    }

    /// Finishes building a DFA for either unanchored or anchored searches,
    /// but NOT both.
    ///
    /// Since only one search mode is supported, the DFA keeps the NFA's state
    /// layout: each NFA state maps to the DFA state at the same index, with
    /// the ID pre-multiplied by the stride.
    fn finish_build_one_start(
        &self,
        anchored: Anchored,
        nnfa: &noncontiguous::NFA,
        dfa: &mut DFA,
    ) {
        // This function always succeeds because we check above that all of the
        // states in the NFA can be mapped to DFA state IDs.
        let stride2 = dfa.stride2;
        let old2new = |oldsid: StateID| {
            StateID::new_unchecked(oldsid.as_usize() << stride2)
        };
        for (oldsid, state) in nnfa.states().iter().with_state_ids() {
            let newsid = old2new(oldsid);
            if state.is_match() {
                dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
            }
            // Resolve every failure transition now so that the search loop
            // never has to.
            sparse_iter(
                nnfa,
                oldsid,
                &dfa.byte_classes,
                |byte, class, mut oldnextsid| {
                    if oldnextsid == noncontiguous::NFA::FAIL {
                        if anchored.is_anchored() {
                            // Anchored searches must not fall back to a
                            // failure state; a missing transition is simply
                            // the end of the search.
                            oldnextsid = noncontiguous::NFA::DEAD;
                        } else if state.fail() == noncontiguous::NFA::DEAD {
                            // This is a special case that avoids following
                            // DEAD transitions in a non-contiguous NFA.
                            // Following these transitions is pretty slow
                            // because the non-contiguous NFA will always use
                            // a sparse representation for it (because the
                            // DEAD state is usually treated as a sentinel).
                            // The *vast* majority of failure states are DEAD
                            // states, so this winds up being pretty slow if
                            // we go through the non-contiguous NFA state
                            // transition logic. Instead, just do it ourselves.
                            oldnextsid = noncontiguous::NFA::DEAD;
                        } else {
                            oldnextsid = nnfa.next_state(
                                Anchored::No,
                                state.fail(),
                                byte,
                            );
                        }
                    }
                    dfa.trans[newsid.as_usize() + usize::from(class)] =
                        old2new(oldnextsid);
                },
            );
        }
        // Now that we've remapped all the IDs in our states, all that's left
        // is remapping the special state IDs.
        let old = nnfa.special();
        let new = &mut dfa.special;
        new.max_special_id = old2new(old.max_special_id);
        new.max_match_id = old2new(old.max_match_id);
        // The start state for the unsupported search mode is marked DEAD,
        // which is how 'Automaton::start_state' reports the mode as invalid.
        if anchored.is_anchored() {
            new.start_unanchored_id = DFA::DEAD;
            new.start_anchored_id = old2new(old.start_anchored_id);
        } else {
            new.start_unanchored_id = old2new(old.start_unanchored_id);
            new.start_anchored_id = DFA::DEAD;
        }
    }

    /// Finishes building a DFA that supports BOTH unanchored and anchored
    /// searches. It works by inter-leaving unanchored states with anchored
    /// states in the same transition table. This way, we avoid needing to
    /// re-shuffle states afterward to ensure that our states still look like
    /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ...
    ///
    /// Honestly this is pretty inscrutable... Simplifications are most
    /// welcome.
    fn finish_build_both_starts(
        &self,
        nnfa: &noncontiguous::NFA,
        dfa: &mut DFA,
    ) {
        let stride2 = dfa.stride2;
        let stride = 1 << stride2;
        // Each NFA state maps to up to two DFA states: one for the unanchored
        // half and one for the anchored half. The remap tables are filled in
        // the first pass and applied to the transition table in a second
        // pass, since a transition may point at a state not yet assigned.
        let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()];
        let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()];
        let mut is_anchored = vec![false; dfa.state_len];
        let mut newsid = DFA::DEAD;
        let next_dfa_id =
            |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride);
        for (oldsid, state) in nnfa.states().iter().with_state_ids() {
            if oldsid == noncontiguous::NFA::DEAD
                || oldsid == noncontiguous::NFA::FAIL
            {
                // DEAD and FAIL are shared between both halves rather than
                // duplicated.
                remap_unanchored[oldsid] = newsid;
                remap_anchored[oldsid] = newsid;
                newsid = next_dfa_id(newsid);
            } else if oldsid == nnfa.special().start_unanchored_id
                || oldsid == nnfa.special().start_anchored_id
            {
                // Each start state exists exactly once; the remap entry for
                // the "other" half is DEAD so it is never reachable from it.
                if oldsid == nnfa.special().start_unanchored_id {
                    remap_unanchored[oldsid] = newsid;
                    remap_anchored[oldsid] = DFA::DEAD;
                } else {
                    remap_unanchored[oldsid] = DFA::DEAD;
                    remap_anchored[oldsid] = newsid;
                    is_anchored[newsid.as_usize() >> stride2] = true;
                }
                if state.is_match() {
                    dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
                }
                sparse_iter(
                    nnfa,
                    oldsid,
                    &dfa.byte_classes,
                    |_, class, oldnextsid| {
                        let class = usize::from(class);
                        if oldnextsid == noncontiguous::NFA::FAIL {
                            dfa.trans[newsid.as_usize() + class] = DFA::DEAD;
                        } else {
                            // Store the OLD id for now; the remap pass below
                            // translates it to the final DFA id.
                            dfa.trans[newsid.as_usize() + class] = oldnextsid;
                        }
                    },
                );
                newsid = next_dfa_id(newsid);
            } else {
                // Every other NFA state is duplicated: an unanchored copy
                // immediately followed by its anchored twin.
                let unewsid = newsid;
                newsid = next_dfa_id(newsid);
                let anewsid = newsid;
                newsid = next_dfa_id(newsid);

                remap_unanchored[oldsid] = unewsid;
                remap_anchored[oldsid] = anewsid;
                is_anchored[anewsid.as_usize() >> stride2] = true;
                if state.is_match() {
                    dfa.set_matches(unewsid, nnfa.iter_matches(oldsid));
                    dfa.set_matches(anewsid, nnfa.iter_matches(oldsid));
                }
                sparse_iter(
                    nnfa,
                    oldsid,
                    &dfa.byte_classes,
                    |byte, class, oldnextsid| {
                        let class = usize::from(class);
                        if oldnextsid == noncontiguous::NFA::FAIL {
                            // Only the unanchored copy follows failure
                            // transitions; the anchored copy keeps DEAD (its
                            // 'trans' default) for missing transitions.
                            let oldnextsid =
                                if state.fail() == noncontiguous::NFA::DEAD {
                                    noncontiguous::NFA::DEAD
                                } else {
                                    nnfa.next_state(
                                        Anchored::No,
                                        state.fail(),
                                        byte,
                                    )
                                };
                            dfa.trans[unewsid.as_usize() + class] = oldnextsid;
                        } else {
                            dfa.trans[unewsid.as_usize() + class] = oldnextsid;
                            dfa.trans[anewsid.as_usize() + class] = oldnextsid;
                        }
                    },
                );
            }
        }
        // Second pass: translate every stored OLD NFA id into its final DFA
        // id, using the anchored remap table for anchored states so that the
        // anchored half of the DFA stays within itself.
        for i in 0..dfa.state_len {
            let sid = i << stride2;
            if is_anchored[i] {
                for next in dfa.trans[sid..][..stride].iter_mut() {
                    *next = remap_anchored[*next];
                }
            } else {
                for next in dfa.trans[sid..][..stride].iter_mut() {
                    *next = remap_unanchored[*next];
                }
            }
        }
        // Now that we've remapped all the IDs in our states, all that's left
        // is remapping the special state IDs.
        let old = nnfa.special();
        let new = &mut dfa.special;
        new.max_special_id = remap_anchored[old.max_special_id];
        new.max_match_id = remap_anchored[old.max_match_id];
        new.start_unanchored_id = remap_unanchored[old.start_unanchored_id];
        new.start_anchored_id = remap_anchored[old.start_anchored_id];
    }

    /// Set the desired match semantics.
    ///
    /// This only applies when using [`Builder::build`] and not
    /// [`Builder::build_from_noncontiguous`].
    ///
    /// See
    /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
    /// for more documentation and examples.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
        self.noncontiguous.match_kind(kind);
        self
    }

    /// Enable ASCII-aware case insensitive matching.
    ///
    /// This only applies when using [`Builder::build`] and not
    /// [`Builder::build_from_noncontiguous`].
    ///
    /// See
    /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
    /// for more documentation and examples.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.noncontiguous.ascii_case_insensitive(yes);
        self
    }

    /// Enable heuristic prefilter optimizations.
    ///
    /// This only applies when using [`Builder::build`] and not
    /// [`Builder::build_from_noncontiguous`].
    ///
    /// See
    /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
    /// for more documentation and examples.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
        self.noncontiguous.prefilter(yes);
        self
    }

    /// Sets the starting state configuration for the automaton.
    ///
    /// See
    /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
    /// for more documentation and examples.
    pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder {
        self.start_kind = kind;
        self
    }

    /// A debug setting for whether to attempt to shrink the size of the
    /// automaton's alphabet or not.
    ///
    /// This should never be enabled unless you're debugging an automaton.
    /// Namely, disabling byte classes makes transitions easier to reason
    /// about, since they use the actual bytes instead of equivalence classes.
    /// Disabling this confers no performance benefit at search time.
    ///
    /// See
    /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
    /// for more documentation and examples.
    pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
        self.byte_classes = yes;
        self
    }
}
791 | |
792 | /// Iterate over all possible equivalence class transitions in this state. |
793 | /// The closure is called for all transitions with a distinct equivalence |
794 | /// class, even those not explicitly represented in this sparse state. For |
795 | /// any implicitly defined transitions, the given closure is called with |
796 | /// the fail state ID. |
797 | /// |
798 | /// The closure is guaranteed to be called precisely |
799 | /// `byte_classes.alphabet_len()` times, once for every possible class in |
800 | /// ascending order. |
801 | fn sparse_iter<F: FnMut(u8, u8, StateID)>( |
802 | nnfa: &noncontiguous::NFA, |
803 | oldsid: StateID, |
804 | classes: &ByteClasses, |
805 | mut f: F, |
806 | ) { |
807 | let mut prev_class = None; |
808 | let mut byte = 0usize; |
809 | for t in nnfa.iter_trans(oldsid) { |
810 | while byte < usize::from(t.byte()) { |
811 | let rep = byte.as_u8(); |
812 | let class = classes.get(rep); |
813 | byte += 1; |
814 | if prev_class != Some(class) { |
815 | f(rep, class, noncontiguous::NFA::FAIL); |
816 | prev_class = Some(class); |
817 | } |
818 | } |
819 | let rep = t.byte(); |
820 | let class = classes.get(rep); |
821 | byte += 1; |
822 | if prev_class != Some(class) { |
823 | f(rep, class, t.next()); |
824 | prev_class = Some(class); |
825 | } |
826 | } |
827 | for b in byte..=255 { |
828 | let rep = b.as_u8(); |
829 | let class = classes.get(rep); |
830 | if prev_class != Some(class) { |
831 | f(rep, class, noncontiguous::NFA::FAIL); |
832 | prev_class = Some(class); |
833 | } |
834 | } |
835 | } |
836 | |