1 | /*! |
2 | Provides direct access to a DFA implementation of Aho-Corasick. |
3 | |
4 | This is a low-level API that generally only needs to be used in niche |
5 | circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) |
instead of a DFA directly. Using a `DFA` directly is typically only necessary
7 | when one needs access to the [`Automaton`] trait implementation. |
8 | */ |
9 | |
10 | use alloc::{vec, vec::Vec}; |
11 | |
12 | use crate::{ |
13 | automaton::Automaton, |
14 | nfa::noncontiguous, |
15 | util::{ |
16 | alphabet::ByteClasses, |
17 | error::{BuildError, MatchError}, |
18 | int::{Usize, U32}, |
19 | prefilter::Prefilter, |
20 | primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, |
21 | search::{Anchored, MatchKind, StartKind}, |
22 | special::Special, |
23 | }, |
24 | }; |
25 | |
26 | /// A DFA implementation of Aho-Corasick. |
27 | /// |
28 | /// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of |
29 | /// this type directly. Using a `DFA` directly is typically only necessary when |
30 | /// one needs access to the [`Automaton`] trait implementation. |
31 | /// |
32 | /// This DFA can only be built by first constructing a [`noncontiguous::NFA`]. |
33 | /// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but |
34 | /// [`Builder::build_from_noncontiguous`] permits doing it explicitly. |
35 | /// |
36 | /// A DFA provides the best possible search performance (in this crate) via two |
37 | /// mechanisms: |
38 | /// |
39 | /// * All states use a dense representation for their transitions. |
40 | /// * All failure transitions are pre-computed such that they are never |
41 | /// explicitly handled at search time. |
42 | /// |
43 | /// These two facts combined mean that every state transition is performed |
44 | /// using a constant number of instructions. However, this comes at |
45 | /// great cost. The memory usage of a DFA can be quite exorbitant. |
46 | /// It is potentially multiple orders of magnitude greater than a |
47 | /// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange, |
48 | /// a DFA will typically have better search speed than a `contiguous::NFA`, but |
49 | /// not by orders of magnitude. |
50 | /// |
51 | /// Unless you have a small number of patterns or memory usage is not a concern |
52 | /// and search performance is critical, a DFA is usually not the best choice. |
53 | /// |
54 | /// Moreover, unlike the NFAs in this crate, it is costly for a DFA to |
/// support both anchored and unanchored search configurations. Namely,
56 | /// since failure transitions are pre-computed, supporting both anchored |
57 | /// and unanchored searches requires a duplication of the transition table, |
/// making the memory usage of such a DFA even bigger. (The NFAs in this crate
59 | /// unconditionally support both anchored and unanchored searches because there |
60 | /// is essentially no added cost for doing so.) It is for this reason that |
61 | /// a DFA's support for anchored and unanchored searches can be configured |
62 | /// via [`Builder::start_kind`]. By default, a DFA only supports unanchored |
63 | /// searches. |
64 | /// |
65 | /// # Example |
66 | /// |
/// This example shows how to build a `DFA` directly and use it to execute
68 | /// [`Automaton::try_find`]: |
69 | /// |
70 | /// ``` |
71 | /// use aho_corasick::{ |
72 | /// automaton::Automaton, |
73 | /// dfa::DFA, |
74 | /// Input, Match, |
75 | /// }; |
76 | /// |
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
79 | /// |
/// let dfa = DFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     dfa.try_find(&Input::new(haystack))?,
84 | /// ); |
85 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
86 | /// ``` |
87 | /// |
88 | /// It is also possible to implement your own version of `try_find`. See the |
89 | /// [`Automaton`] documentation for an example. |
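///
/// # Example: anchored and unanchored searches
///
/// A DFA only supports unanchored searches by default, so supporting both
/// kinds requires opting in via [`Builder::start_kind`]. The following is a
/// minimal sketch of that configuration; it assumes the crate-root re-exports
/// of `Anchored`, `Input`, `Match` and `StartKind`:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     dfa::DFA,
///     Anchored, Input, Match, StartKind,
/// };
///
/// let dfa = DFA::builder()
///     .start_kind(StartKind::Both)
///     .build(&["b", "abc", "abcd"])
///     .unwrap();
///
/// // An unanchored search may find a match anywhere in the haystack...
/// let input = Input::new("foo abcd");
/// assert_eq!(Some(Match::must(0, 5..6)), dfa.try_find(&input)?);
/// // ...while an anchored search must begin matching at the first byte.
/// let input = Input::new("foo abcd").anchored(Anchored::Yes);
/// assert_eq!(None, dfa.try_find(&input)?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```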
#[derive(Clone)]
91 | pub struct DFA { |
92 | /// The DFA transition table. IDs in this table are pre-multiplied. So |
93 | /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride, |
94 | /// 2*stride, 3*stride, ... |
95 | trans: Vec<StateID>, |
96 | /// The matches for every match state in this DFA. This is indexed by order |
97 | /// of match states in the DFA. Namely, as constructed, match states are |
98 | /// always laid out sequentially and contiguously in memory. Thus, after |
99 | /// converting a match state ID to a match state index, the indices are |
100 | /// all adjacent. |
101 | /// |
102 | /// More concretely, when a search enters a match state with id 'sid', then |
103 | /// the matching patterns are at 'matches[(sid >> stride2) - 2]'. The '- 2' |
104 | /// is to offset the first two states of a DFA: the dead and fail states. |
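    ///
    /// For example, if stride2 is 3 (so stride = 8), then the match state
    /// with id 24 has state index 24 >> 3 = 3, and its pattern IDs live at
    /// 'matches[3 - 2] = matches[1]'.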
105 | matches: Vec<Vec<PatternID>>, |
106 | /// The amount of heap memory used, in bytes, by the inner Vecs of |
107 | /// 'matches'. |
108 | matches_memory_usage: usize, |
109 | /// The length of each pattern. This is used to compute the start offset |
110 | /// of a match. |
111 | pattern_lens: Vec<SmallIndex>, |
112 | /// A prefilter for accelerating searches, if one exists. |
113 | prefilter: Option<Prefilter>, |
114 | /// The match semantics built into this DFA. |
115 | match_kind: MatchKind, |
116 | /// The total number of states in this DFA. |
117 | state_len: usize, |
118 | /// The alphabet size, or total number of equivalence classes, for this |
119 | /// DFA. Note that the actual number of transitions in each state is |
120 | /// stride=2^stride2, where stride is the smallest power of 2 greater than |
121 | /// or equal to alphabet_len. We do things this way so that we can use |
122 | /// bitshifting to go from a state ID to an index into 'matches'. |
123 | alphabet_len: usize, |
124 | /// The exponent with a base 2, such that stride=2^stride2. Given a state |
125 | /// index 'i', its state identifier is 'i << stride2'. Given a state |
126 | /// identifier 'sid', its state index is 'sid >> stride2'. |
127 | stride2: usize, |
128 | /// The equivalence classes for this DFA. All transitions are defined on |
129 | /// equivalence classes and not on the 256 distinct byte values. |
130 | byte_classes: ByteClasses, |
131 | /// The length of the shortest pattern in this automaton. |
132 | min_pattern_len: usize, |
133 | /// The length of the longest pattern in this automaton. |
134 | max_pattern_len: usize, |
135 | /// The information required to deduce which states are "special" in this |
136 | /// DFA. |
137 | special: Special, |
138 | } |
139 | |
140 | impl DFA { |
141 | /// Create a new Aho-Corasick DFA using the default configuration. |
142 | /// |
143 | /// Use a [`Builder`] if you want to change the configuration. |
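    ///
    /// # Example
    ///
    /// A minimal sketch of building and searching with the default
    /// configuration; it assumes the crate-root re-exports of `Input` and
    /// `Match`. The default (standard) match semantics report the first
    /// match found, which is why `sam` wins here:
    ///
    /// ```
    /// use aho_corasick::{automaton::Automaton, dfa::DFA, Input, Match};
    ///
    /// let dfa = DFA::new(&["samwise", "sam"]).unwrap();
    /// assert_eq!(
    ///     Some(Match::must(1, 0..3)),
    ///     dfa.try_find(&Input::new("samwise"))?,
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```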
144 | pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError> |
145 | where |
146 | I: IntoIterator<Item = P>, |
147 | P: AsRef<[u8]>, |
148 | { |
149 | DFA::builder().build(patterns) |
150 | } |
151 | |
152 | /// A convenience method for returning a new Aho-Corasick DFA builder. |
153 | /// |
154 | /// This usually permits one to just import the `DFA` type. |
155 | pub fn builder() -> Builder { |
156 | Builder::new() |
157 | } |
158 | } |
159 | |
160 | impl DFA { |
161 | /// A sentinel state ID indicating that a search should stop once it has |
162 | /// entered this state. When a search stops, it returns a match if one has |
163 | /// been found, otherwise no match. A DFA always has an actual dead state |
164 | /// at this ID. |
165 | /// |
166 | /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state. |
167 | /// Namely, the whole point of a DFA is that the FAIL state is completely |
168 | /// compiled away. That is, DFA construction involves pre-computing the |
169 | /// failure transitions everywhere, such that failure transitions are no |
    /// longer used at search time. This, combined with its uniformly dense
    /// representation, is the main reason why it's faster than the NFAs in
    /// this crate.
173 | const DEAD: StateID = StateID::new_unchecked(0); |
174 | |
175 | /// Adds the given pattern IDs as matches to the given state and also |
176 | /// records the added memory usage. |
177 | fn set_matches(&mut self, sid: StateID, pids: &[PatternID]) { |
178 | use core::mem::size_of; |
179 | |
        assert!(!pids.is_empty(), "match state must have non-empty pids");
181 | let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap(); |
182 | self.matches[index].extend_from_slice(pids); |
183 | self.matches_memory_usage += size_of::<PatternID>() * pids.len(); |
184 | } |
185 | } |
186 | |
187 | // SAFETY: 'start_state' always returns a valid state ID, 'next_state' always |
188 | // returns a valid state ID given a valid state ID. We otherwise claim that |
189 | // all other methods are correct as well. |
190 | unsafe impl Automaton for DFA { |
    #[inline(always)]
192 | fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { |
193 | // Either of the start state IDs can be DEAD, in which case, support |
194 | // for that type of search is not provided by this DFA. Which start |
195 | // state IDs are inactive depends on the 'StartKind' configuration at |
196 | // DFA construction time. |
197 | match anchored { |
198 | Anchored::No => { |
199 | let start = self.special.start_unanchored_id; |
200 | if start == DFA::DEAD { |
201 | Err(MatchError::invalid_input_unanchored()) |
202 | } else { |
203 | Ok(start) |
204 | } |
205 | } |
206 | Anchored::Yes => { |
207 | let start = self.special.start_anchored_id; |
208 | if start == DFA::DEAD { |
209 | Err(MatchError::invalid_input_anchored()) |
210 | } else { |
211 | Ok(start) |
212 | } |
213 | } |
214 | } |
215 | } |
216 | |
    #[inline(always)]
218 | fn next_state( |
219 | &self, |
220 | _anchored: Anchored, |
221 | sid: StateID, |
222 | byte: u8, |
223 | ) -> StateID { |
224 | let class = self.byte_classes.get(byte); |
225 | self.trans[(sid.as_u32() + u32::from(class)).as_usize()] |
226 | } |
227 | |
    #[inline(always)]
229 | fn is_special(&self, sid: StateID) -> bool { |
230 | sid <= self.special.max_special_id |
231 | } |
232 | |
    #[inline(always)]
234 | fn is_dead(&self, sid: StateID) -> bool { |
235 | sid == DFA::DEAD |
236 | } |
237 | |
    #[inline(always)]
239 | fn is_match(&self, sid: StateID) -> bool { |
240 | !self.is_dead(sid) && sid <= self.special.max_match_id |
241 | } |
242 | |
    #[inline(always)]
244 | fn is_start(&self, sid: StateID) -> bool { |
245 | sid == self.special.start_unanchored_id |
246 | || sid == self.special.start_anchored_id |
247 | } |
248 | |
    #[inline(always)]
250 | fn match_kind(&self) -> MatchKind { |
251 | self.match_kind |
252 | } |
253 | |
    #[inline(always)]
255 | fn patterns_len(&self) -> usize { |
256 | self.pattern_lens.len() |
257 | } |
258 | |
    #[inline(always)]
260 | fn pattern_len(&self, pid: PatternID) -> usize { |
261 | self.pattern_lens[pid].as_usize() |
262 | } |
263 | |
    #[inline(always)]
265 | fn min_pattern_len(&self) -> usize { |
266 | self.min_pattern_len |
267 | } |
268 | |
    #[inline(always)]
270 | fn max_pattern_len(&self) -> usize { |
271 | self.max_pattern_len |
272 | } |
273 | |
    #[inline(always)]
275 | fn match_len(&self, sid: StateID) -> usize { |
276 | debug_assert!(self.is_match(sid)); |
277 | let offset = (sid.as_usize() >> self.stride2) - 2; |
278 | self.matches[offset].len() |
279 | } |
280 | |
    #[inline(always)]
282 | fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { |
283 | debug_assert!(self.is_match(sid)); |
284 | let offset = (sid.as_usize() >> self.stride2) - 2; |
285 | self.matches[offset][index] |
286 | } |
287 | |
    #[inline(always)]
289 | fn memory_usage(&self) -> usize { |
290 | use core::mem::size_of; |
291 | |
292 | (self.trans.len() * size_of::<u32>()) |
293 | + (self.matches.len() * size_of::<Vec<PatternID>>()) |
294 | + self.matches_memory_usage |
295 | + (self.pattern_lens.len() * size_of::<SmallIndex>()) |
296 | + self.prefilter.as_ref().map_or(0, |p| p.memory_usage()) |
297 | } |
298 | |
    #[inline(always)]
300 | fn prefilter(&self) -> Option<&Prefilter> { |
301 | self.prefilter.as_ref() |
302 | } |
303 | } |
304 | |
305 | impl core::fmt::Debug for DFA { |
306 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
307 | use crate::{ |
308 | automaton::{fmt_state_indicator, sparse_transitions}, |
309 | util::debug::DebugByte, |
310 | }; |
311 | |
312 | writeln!(f, "dfa::DFA(" )?; |
313 | for index in 0..self.state_len { |
314 | let sid = StateID::new_unchecked(index << self.stride2); |
315 | // While we do currently include the FAIL state in the transition |
316 | // table (to simplify construction), it is never actually used. It |
317 | // poses problems with the code below because it gets treated as |
318 | // a match state incidentally when it is, of course, not. So we |
319 | // special case it. The fail state is always the first state after |
320 | // the dead state. |
321 | // |
322 | // If the construction is changed to remove the fail state (it |
323 | // probably should be), then this special case should be updated. |
324 | if index == 1 { |
325 | writeln!(f, "F {:06}:" , sid.as_usize())?; |
326 | continue; |
327 | } |
328 | fmt_state_indicator(f, self, sid)?; |
            write!(f, "{:06}: ", sid.as_usize())?;
330 | |
331 | let it = (0..self.byte_classes.alphabet_len()).map(|class| { |
332 | (class.as_u8(), self.trans[sid.as_usize() + class]) |
333 | }); |
334 | for (i, (start, end, next)) in sparse_transitions(it).enumerate() { |
335 | if i > 0 { |
                    write!(f, ", ")?;
337 | } |
338 | if start == end { |
339 | write!( |
340 | f, |
341 | " {:?} => {:?}" , |
342 | DebugByte(start), |
343 | next.as_usize() |
344 | )?; |
345 | } else { |
346 | write!( |
347 | f, |
348 | " {:?}- {:?} => {:?}" , |
349 | DebugByte(start), |
350 | DebugByte(end), |
351 | next.as_usize() |
352 | )?; |
353 | } |
354 | } |
            write!(f, "\n")?;
356 | if self.is_match(sid) { |
                write!(f, " matches: ")?;
358 | for i in 0..self.match_len(sid) { |
359 | if i > 0 { |
                        write!(f, ", ")?;
361 | } |
362 | let pid = self.match_pattern(sid, i); |
                write!(f, "{}", pid.as_usize())?;
364 | } |
                write!(f, "\n")?;
366 | } |
367 | } |
368 | writeln!(f, "match kind: {:?}" , self.match_kind)?; |
369 | writeln!(f, "prefilter: {:?}" , self.prefilter.is_some())?; |
370 | writeln!(f, "state length: {:?}" , self.state_len)?; |
371 | writeln!(f, "pattern length: {:?}" , self.patterns_len())?; |
372 | writeln!(f, "shortest pattern length: {:?}" , self.min_pattern_len)?; |
373 | writeln!(f, "longest pattern length: {:?}" , self.max_pattern_len)?; |
374 | writeln!(f, "alphabet length: {:?}" , self.alphabet_len)?; |
375 | writeln!(f, "stride: {:?}" , 1 << self.stride2)?; |
376 | writeln!(f, "byte classes: {:?}" , self.byte_classes)?; |
377 | writeln!(f, "memory usage: {:?}" , self.memory_usage())?; |
378 | writeln!(f, ")" )?; |
379 | Ok(()) |
380 | } |
381 | } |
382 | |
383 | /// A builder for configuring an Aho-Corasick DFA. |
384 | /// |
385 | /// This builder has a subset of the options available to a |
386 | /// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options, |
387 | /// their behavior is identical. |
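///
/// # Example
///
/// A minimal sketch of configuring a few of the shared options; it assumes
/// the crate-root re-exports of `Input`, `Match` and `MatchKind`:
///
/// ```
/// use aho_corasick::{automaton::Automaton, dfa::DFA, Input, Match, MatchKind};
///
/// let dfa = DFA::builder()
///     .match_kind(MatchKind::LeftmostLongest)
///     .ascii_case_insensitive(true)
///     .build(&["abc", "abcd"])
///     .unwrap();
/// assert_eq!(
///     Some(Match::must(1, 0..4)),
///     dfa.try_find(&Input::new("ABCD"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```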
#[derive(Clone, Debug)]
389 | pub struct Builder { |
390 | noncontiguous: noncontiguous::Builder, |
391 | start_kind: StartKind, |
392 | byte_classes: bool, |
393 | } |
394 | |
395 | impl Default for Builder { |
396 | fn default() -> Builder { |
397 | Builder { |
398 | noncontiguous: noncontiguous::Builder::new(), |
399 | start_kind: StartKind::Unanchored, |
400 | byte_classes: true, |
401 | } |
402 | } |
403 | } |
404 | |
405 | impl Builder { |
406 | /// Create a new builder for configuring an Aho-Corasick DFA. |
407 | pub fn new() -> Builder { |
408 | Builder::default() |
409 | } |
410 | |
411 | /// Build an Aho-Corasick DFA from the given iterator of patterns. |
412 | /// |
413 | /// A builder may be reused to create more DFAs. |
414 | pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError> |
415 | where |
416 | I: IntoIterator<Item = P>, |
417 | P: AsRef<[u8]>, |
418 | { |
419 | let nnfa = self.noncontiguous.build(patterns)?; |
420 | self.build_from_noncontiguous(&nnfa) |
421 | } |
422 | |
423 | /// Build an Aho-Corasick DFA from the given noncontiguous NFA. |
424 | /// |
425 | /// Note that when this method is used, only the `start_kind` and |
426 | /// `byte_classes` settings on this builder are respected. The other |
427 | /// settings only apply to the initial construction of the Aho-Corasick |
428 | /// automaton. Since using this method requires that initial construction |
429 | /// has already completed, all settings impacting only initial construction |
430 | /// are no longer relevant. |
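    ///
    /// # Example
    ///
    /// A minimal sketch of building the noncontiguous NFA explicitly and then
    /// converting it to a DFA; it assumes the public `noncontiguous::NFA::new`
    /// constructor and the crate-root re-exports of `Input` and `Match`:
    ///
    /// ```
    /// use aho_corasick::{
    ///     automaton::Automaton,
    ///     dfa::DFA,
    ///     nfa::noncontiguous,
    ///     Input, Match,
    /// };
    ///
    /// let nnfa = noncontiguous::NFA::new(&["b", "abc", "abcd"]).unwrap();
    /// let dfa = DFA::builder().build_from_noncontiguous(&nnfa).unwrap();
    /// assert_eq!(
    ///     Some(Match::must(0, 1..2)),
    ///     dfa.try_find(&Input::new("abcd"))?,
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```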
431 | pub fn build_from_noncontiguous( |
432 | &self, |
433 | nnfa: &noncontiguous::NFA, |
434 | ) -> Result<DFA, BuildError> { |
435 | debug!("building DFA" ); |
436 | let byte_classes = if self.byte_classes { |
437 | nnfa.byte_classes().clone() |
438 | } else { |
439 | ByteClasses::singletons() |
440 | }; |
441 | let state_len = match self.start_kind { |
442 | StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(), |
443 | StartKind::Both => { |
444 | // These unwraps are OK because we know that the number of |
445 | // NFA states is < StateID::LIMIT which is in turn less than |
446 | // i32::MAX. Thus, there is always room to multiply by 2. |
447 | // Finally, the number of states is always at least 4 in the |
448 | // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the |
449 | // subtraction of 4 is okay. |
450 | // |
451 | // Note that we subtract 4 because the "anchored" part of |
452 | // the DFA duplicates the unanchored part (without failure |
453 | // transitions), but reuses the DEAD, FAIL and START states. |
454 | nnfa.states() |
455 | .len() |
456 | .checked_mul(2) |
457 | .unwrap() |
458 | .checked_sub(4) |
459 | .unwrap() |
460 | } |
461 | }; |
462 | let trans_len = |
463 | match state_len.checked_shl(byte_classes.stride2().as_u32()) { |
464 | Some(trans_len) => trans_len, |
465 | None => { |
466 | return Err(BuildError::state_id_overflow( |
467 | StateID::MAX.as_u64(), |
468 | usize::MAX.as_u64(), |
469 | )) |
470 | } |
471 | }; |
472 | StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap()) |
473 | .map_err(|e| { |
474 | BuildError::state_id_overflow( |
475 | StateID::MAX.as_u64(), |
476 | e.attempted(), |
477 | ) |
478 | })?; |
479 | let num_match_states = match self.start_kind { |
480 | StartKind::Unanchored | StartKind::Anchored => { |
481 | nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap() |
482 | } |
483 | StartKind::Both => nnfa |
484 | .special() |
485 | .max_match_id |
486 | .as_usize() |
487 | .checked_sub(1) |
488 | .unwrap() |
489 | .checked_mul(2) |
490 | .unwrap(), |
491 | }; |
492 | let mut dfa = DFA { |
493 | trans: vec![DFA::DEAD; trans_len], |
494 | matches: vec![vec![]; num_match_states], |
495 | matches_memory_usage: 0, |
496 | pattern_lens: nnfa.pattern_lens_raw().to_vec(), |
497 | prefilter: nnfa.prefilter().map(|p| p.clone()), |
498 | match_kind: nnfa.match_kind(), |
499 | state_len, |
500 | alphabet_len: byte_classes.alphabet_len(), |
501 | stride2: byte_classes.stride2(), |
502 | byte_classes, |
503 | min_pattern_len: nnfa.min_pattern_len(), |
504 | max_pattern_len: nnfa.max_pattern_len(), |
505 | // The special state IDs are set later. |
506 | special: Special::zero(), |
507 | }; |
508 | match self.start_kind { |
509 | StartKind::Both => { |
510 | self.finish_build_both_starts(nnfa, &mut dfa); |
511 | } |
512 | StartKind::Unanchored => { |
513 | self.finish_build_one_start(Anchored::No, nnfa, &mut dfa); |
514 | } |
515 | StartKind::Anchored => { |
516 | self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa) |
517 | } |
518 | } |
519 | debug!( |
520 | "DFA built, <states: {:?}, size: {:?}, \ |
             alphabet len: {:?}, stride: {:?}>",
522 | dfa.state_len, |
523 | dfa.memory_usage(), |
524 | dfa.byte_classes.alphabet_len(), |
525 | dfa.byte_classes.stride(), |
526 | ); |
527 | Ok(dfa) |
528 | } |
529 | |
530 | /// Finishes building a DFA for either unanchored or anchored searches, |
531 | /// but NOT both. |
532 | fn finish_build_one_start( |
533 | &self, |
534 | anchored: Anchored, |
535 | nnfa: &noncontiguous::NFA, |
536 | dfa: &mut DFA, |
537 | ) { |
538 | // This function always succeeds because we check above that all of the |
539 | // states in the NFA can be mapped to DFA state IDs. |
540 | let stride2 = dfa.stride2; |
541 | let old2new = |oldsid: StateID| { |
542 | StateID::new_unchecked(oldsid.as_usize() << stride2) |
543 | }; |
544 | for (oldsid, state) in nnfa.states().iter().with_state_ids() { |
545 | let newsid = old2new(oldsid); |
546 | if !state.matches.is_empty() { |
547 | dfa.set_matches(newsid, &state.matches); |
548 | } |
549 | sparse_iter( |
550 | state, |
551 | &dfa.byte_classes, |
552 | |byte, class, mut oldnextsid| { |
553 | if oldnextsid == noncontiguous::NFA::FAIL { |
554 | if anchored.is_anchored() { |
555 | oldnextsid = noncontiguous::NFA::DEAD; |
556 | } else { |
557 | oldnextsid = nnfa.next_state( |
558 | Anchored::No, |
559 | state.fail, |
560 | byte, |
561 | ); |
562 | } |
563 | } |
564 | dfa.trans[newsid.as_usize() + usize::from(class)] = |
565 | old2new(oldnextsid); |
566 | }, |
567 | ); |
568 | } |
569 | // Now that we've remapped all the IDs in our states, all that's left |
570 | // is remapping the special state IDs. |
571 | let old = nnfa.special(); |
572 | let new = &mut dfa.special; |
573 | new.max_special_id = old2new(old.max_special_id); |
574 | new.max_match_id = old2new(old.max_match_id); |
575 | if anchored.is_anchored() { |
576 | new.start_unanchored_id = DFA::DEAD; |
577 | new.start_anchored_id = old2new(old.start_anchored_id); |
578 | } else { |
579 | new.start_unanchored_id = old2new(old.start_unanchored_id); |
580 | new.start_anchored_id = DFA::DEAD; |
581 | } |
582 | } |
583 | |
584 | /// Finishes building a DFA that supports BOTH unanchored and anchored |
    /// searches. It works by interleaving unanchored states with anchored
586 | /// states in the same transition table. This way, we avoid needing to |
587 | /// re-shuffle states afterward to ensure that our states still look like |
588 | /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ... |
589 | /// |
590 | /// Honestly this is pretty inscrutable... Simplifications are most |
591 | /// welcome. |
592 | fn finish_build_both_starts( |
593 | &self, |
594 | nnfa: &noncontiguous::NFA, |
595 | dfa: &mut DFA, |
596 | ) { |
597 | let stride2 = dfa.stride2; |
598 | let stride = 1 << stride2; |
599 | let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()]; |
600 | let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()]; |
601 | let mut is_anchored = vec![false; dfa.state_len]; |
602 | let mut newsid = DFA::DEAD; |
603 | let next_dfa_id = |
604 | |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride); |
605 | for (oldsid, state) in nnfa.states().iter().with_state_ids() { |
606 | if oldsid == noncontiguous::NFA::DEAD |
607 | || oldsid == noncontiguous::NFA::FAIL |
608 | { |
609 | remap_unanchored[oldsid] = newsid; |
610 | remap_anchored[oldsid] = newsid; |
611 | newsid = next_dfa_id(newsid); |
612 | } else if oldsid == nnfa.special().start_unanchored_id |
613 | || oldsid == nnfa.special().start_anchored_id |
614 | { |
615 | if oldsid == nnfa.special().start_unanchored_id { |
616 | remap_unanchored[oldsid] = newsid; |
617 | remap_anchored[oldsid] = DFA::DEAD; |
618 | } else { |
619 | remap_unanchored[oldsid] = DFA::DEAD; |
620 | remap_anchored[oldsid] = newsid; |
621 | is_anchored[newsid.as_usize() >> stride2] = true; |
622 | } |
623 | if !state.matches.is_empty() { |
624 | dfa.set_matches(newsid, &state.matches); |
625 | } |
626 | sparse_iter( |
627 | state, |
628 | &dfa.byte_classes, |
629 | |_, class, oldnextsid| { |
630 | let class = usize::from(class); |
631 | if oldnextsid == noncontiguous::NFA::FAIL { |
632 | dfa.trans[newsid.as_usize() + class] = DFA::DEAD; |
633 | } else { |
634 | dfa.trans[newsid.as_usize() + class] = oldnextsid; |
635 | } |
636 | }, |
637 | ); |
638 | newsid = next_dfa_id(newsid); |
639 | } else { |
640 | let unewsid = newsid; |
641 | newsid = next_dfa_id(newsid); |
642 | let anewsid = newsid; |
643 | newsid = next_dfa_id(newsid); |
644 | |
645 | remap_unanchored[oldsid] = unewsid; |
646 | remap_anchored[oldsid] = anewsid; |
647 | is_anchored[anewsid.as_usize() >> stride2] = true; |
648 | if !state.matches.is_empty() { |
649 | dfa.set_matches(unewsid, &state.matches); |
650 | dfa.set_matches(anewsid, &state.matches); |
651 | } |
652 | sparse_iter( |
653 | state, |
654 | &dfa.byte_classes, |
655 | |byte, class, oldnextsid| { |
656 | let class = usize::from(class); |
657 | if oldnextsid == noncontiguous::NFA::FAIL { |
658 | dfa.trans[unewsid.as_usize() + class] = nnfa |
659 | .next_state(Anchored::No, state.fail, byte); |
660 | } else { |
661 | dfa.trans[unewsid.as_usize() + class] = oldnextsid; |
662 | dfa.trans[anewsid.as_usize() + class] = oldnextsid; |
663 | } |
664 | }, |
665 | ); |
666 | } |
667 | } |
668 | for i in 0..dfa.state_len { |
669 | let sid = i << stride2; |
670 | if is_anchored[i] { |
671 | for next in dfa.trans[sid..][..stride].iter_mut() { |
672 | *next = remap_anchored[*next]; |
673 | } |
674 | } else { |
675 | for next in dfa.trans[sid..][..stride].iter_mut() { |
676 | *next = remap_unanchored[*next]; |
677 | } |
678 | } |
679 | } |
680 | // Now that we've remapped all the IDs in our states, all that's left |
681 | // is remapping the special state IDs. |
682 | let old = nnfa.special(); |
683 | let new = &mut dfa.special; |
684 | new.max_special_id = remap_anchored[old.max_special_id]; |
685 | new.max_match_id = remap_anchored[old.max_match_id]; |
686 | new.start_unanchored_id = remap_unanchored[old.start_unanchored_id]; |
687 | new.start_anchored_id = remap_anchored[old.start_anchored_id]; |
688 | } |
689 | |
690 | /// Set the desired match semantics. |
691 | /// |
692 | /// This only applies when using [`Builder::build`] and not |
693 | /// [`Builder::build_from_noncontiguous`]. |
694 | /// |
695 | /// See |
696 | /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) |
697 | /// for more documentation and examples. |
698 | pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { |
699 | self.noncontiguous.match_kind(kind); |
700 | self |
701 | } |
702 | |
703 | /// Enable ASCII-aware case insensitive matching. |
704 | /// |
705 | /// This only applies when using [`Builder::build`] and not |
706 | /// [`Builder::build_from_noncontiguous`]. |
707 | /// |
708 | /// See |
709 | /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) |
710 | /// for more documentation and examples. |
711 | pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { |
712 | self.noncontiguous.ascii_case_insensitive(yes); |
713 | self |
714 | } |
715 | |
716 | /// Enable heuristic prefilter optimizations. |
717 | /// |
718 | /// This only applies when using [`Builder::build`] and not |
719 | /// [`Builder::build_from_noncontiguous`]. |
720 | /// |
721 | /// See |
722 | /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) |
723 | /// for more documentation and examples. |
724 | pub fn prefilter(&mut self, yes: bool) -> &mut Builder { |
725 | self.noncontiguous.prefilter(yes); |
726 | self |
727 | } |
728 | |
729 | /// Sets the starting state configuration for the automaton. |
730 | /// |
731 | /// See |
732 | /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) |
733 | /// for more documentation and examples. |
734 | pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder { |
735 | self.start_kind = kind; |
736 | self |
737 | } |
738 | |
739 | /// A debug setting for whether to attempt to shrink the size of the |
740 | /// automaton's alphabet or not. |
741 | /// |
742 | /// This should never be enabled unless you're debugging an automaton. |
743 | /// Namely, disabling byte classes makes transitions easier to reason |
744 | /// about, since they use the actual bytes instead of equivalence classes. |
745 | /// Disabling this confers no performance benefit at search time. |
746 | /// |
747 | /// See |
748 | /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes) |
749 | /// for more documentation and examples. |
750 | pub fn byte_classes(&mut self, yes: bool) -> &mut Builder { |
751 | self.byte_classes = yes; |
752 | self |
753 | } |
754 | } |
755 | |
756 | /// Iterate over all possible equivalence class transitions in this state. |
757 | /// The closure is called for all transitions with a distinct equivalence |
758 | /// class, even those not explicitly represented in this sparse state. For |
759 | /// any implicitly defined transitions, the given closure is called with |
760 | /// the fail state ID. |
761 | /// |
762 | /// The closure is guaranteed to be called precisely |
763 | /// `byte_classes.alphabet_len()` times, once for every possible class in |
764 | /// ascending order. |
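///
/// For example, suppose the byte classes are {[\x00-a] => 0, [b] => 1,
/// [c] => 2, [d-\xFF] => 3} and the given state only has explicit transitions
/// for 'b' and 'c'. Then the closure is called exactly four times: with
/// (0x00, 0, FAIL), (b'b', 1, next_b), (b'c', 2, next_c) and (b'd', 3, FAIL),
/// where 'next_b' and 'next_c' are the state's transition targets for 'b'
/// and 'c'.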
765 | fn sparse_iter<F: FnMut(u8, u8, StateID)>( |
766 | state: &noncontiguous::State, |
767 | classes: &ByteClasses, |
768 | mut f: F, |
769 | ) { |
770 | let mut prev_class = None; |
771 | let mut byte = 0usize; |
772 | for &(b, id) in state.trans.iter() { |
773 | while byte < usize::from(b) { |
774 | let rep = byte.as_u8(); |
775 | let class = classes.get(rep); |
776 | byte += 1; |
777 | if prev_class != Some(class) { |
778 | f(rep, class, noncontiguous::NFA::FAIL); |
779 | prev_class = Some(class); |
780 | } |
781 | } |
782 | let rep = b; |
783 | let class = classes.get(rep); |
784 | byte += 1; |
785 | if prev_class != Some(class) { |
786 | f(rep, class, id); |
787 | prev_class = Some(class); |
788 | } |
789 | } |
790 | for b in byte..=255 { |
791 | let rep = b.as_u8(); |
792 | let class = classes.get(rep); |
793 | if prev_class != Some(class) { |
794 | f(rep, class, noncontiguous::NFA::FAIL); |
795 | prev_class = Some(class); |
796 | } |
797 | } |
798 | } |
799 | |