1 | /*! |
2 | Provides a noncontiguous NFA implementation of Aho-Corasick. |
3 | |
4 | This is a low-level API that generally only needs to be used in niche |
5 | circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) |
6 | instead of a noncontiguous NFA directly. Using an `NFA` directly is typically |
7 | only necessary when one needs access to the [`Automaton`] trait implementation. |
8 | */ |
9 | |
10 | use alloc::{ |
11 | collections::{BTreeSet, VecDeque}, |
12 | vec, |
13 | vec::Vec, |
14 | }; |
15 | |
16 | use crate::{ |
17 | automaton::Automaton, |
18 | util::{ |
19 | alphabet::{ByteClassSet, ByteClasses}, |
20 | error::{BuildError, MatchError}, |
21 | prefilter::{self, opposite_ascii_case, Prefilter}, |
22 | primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, |
23 | remapper::Remapper, |
24 | search::{Anchored, MatchKind}, |
25 | special::Special, |
26 | }, |
27 | }; |
28 | |
/// A noncontiguous NFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using an `NFA` directly is typically only necessary
/// when one needs access to the [`Automaton`] trait implementation.
///
/// This NFA represents the "core" implementation of Aho-Corasick in this
/// crate. Namely, constructing this NFA involves building a trie and then
/// filling in the failure transitions between states, similar to what is
/// described in any standard textbook description of Aho-Corasick.
///
/// In order to minimize heap usage and to avoid additional construction costs,
/// this implementation represents the transitions of all states as distinct
/// sparse memory allocations. This is where it gets its name from. That is,
/// this NFA has no contiguous memory allocation for its transition table. Each
/// state gets its own allocation.
///
/// While the sparse representation keeps memory usage to somewhat reasonable
/// levels, it is still quite large and also results in somewhat mediocre
/// search performance. For this reason, it is almost always a good idea to
/// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is
/// marginally slower to build, but has higher throughput and can sometimes use
/// an order of magnitude less memory. The main reason to use a noncontiguous
/// NFA is when you need the fastest possible construction time, or when a
/// contiguous NFA does not have the desired capacity. (The total number of NFA
/// states it can have is fewer than a noncontiguous NFA.)
///
/// # Example
///
/// This example shows how to build an `NFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     nfa::noncontiguous::NFA,
///     Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let nfa = NFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     nfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive(Clone)]
pub struct NFA {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State>,
    /// Transitions stored in a sparse representation via a linked list.
    ///
    /// Each transition contains three pieces of information: the byte it
    /// is defined for, the state it transitions to and a link to the next
    /// transition in the same state (or `StateID::ZERO` if it is the last
    /// transition).
    ///
    /// The first transition for each state is determined by `State::sparse`.
    ///
    /// Note that this contains a complete set of all transitions in this NFA,
    /// including states that have a dense representation for transitions.
    /// (Adding dense transitions for a state doesn't remove its sparse
    /// transitions, since deleting transitions from this particular sparse
    /// representation would be fairly expensive.)
    sparse: Vec<Transition>,
    /// Transitions stored in a dense representation.
    ///
    /// A state has a row in this table if and only if `State::dense` is
    /// not equal to `StateID::ZERO`. When not zero, there are precisely
    /// `NFA::byte_classes::alphabet_len()` entries beginning at `State::dense`
    /// in this table.
    ///
    /// Generally a very small minority of states have a dense representation
    /// since it uses so much memory.
    dense: Vec<StateID>,
    /// Matches stored in linked list for each state.
    ///
    /// Like sparse transitions, each match has a link to the next match in the
    /// state.
    ///
    /// The first match for each state is determined by `State::matches`.
    matches: Vec<Match>,
    /// The length, in bytes, of each pattern in this NFA. This slice is
    /// indexed by `PatternID`.
    ///
    /// The number of entries in this vector corresponds to the total number of
    /// patterns in this automaton.
    pattern_lens: Vec<SmallIndex>,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<Prefilter>,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// The length, in bytes, of the shortest pattern in this automaton. This
    /// information is useful for detecting whether an automaton matches the
    /// empty string or not.
    min_pattern_len: usize,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The information required to deduce which states are "special" in this
    /// NFA.
    ///
    /// Since the DEAD and FAIL states are always the first two states and
    /// there are only ever two start states (which follow all of the match
    /// states), it follows that we can determine whether a state is a fail,
    /// dead, match or start with just a few comparisons on the ID itself:
    ///
    ///    is_dead(sid): sid == NFA::DEAD
    ///    is_fail(sid): sid == NFA::FAIL
    ///   is_match(sid): NFA::FAIL < sid && sid <= max_match_id
    ///   is_start(sid): sid == start_unanchored_id || sid == start_anchored_id
    ///
    /// Note that this only applies to the NFA after it has been constructed.
    /// During construction, the start states are the first ones added and the
    /// match states are inter-leaved with non-match states. Once all of the
    /// states have been added, the states are shuffled such that the above
    /// predicates hold.
    special: Special,
}
176 | |
177 | impl NFA { |
178 | /// Create a new Aho-Corasick noncontiguous NFA using the default |
179 | /// configuration. |
180 | /// |
181 | /// Use a [`Builder`] if you want to change the configuration. |
182 | pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError> |
183 | where |
184 | I: IntoIterator<Item = P>, |
185 | P: AsRef<[u8]>, |
186 | { |
187 | NFA::builder().build(patterns) |
188 | } |
189 | |
190 | /// A convenience method for returning a new Aho-Corasick noncontiguous NFA |
191 | /// builder. |
192 | /// |
193 | /// This usually permits one to just import the `NFA` type. |
194 | pub fn builder() -> Builder { |
195 | Builder::new() |
196 | } |
197 | } |
198 | |
impl NFA {
    /// The DEAD state is a sentinel state like the FAIL state. The DEAD state
    /// instructs any search to stop and return any currently recorded match,
    /// or no match otherwise. Generally speaking, it is impossible for an
    /// unanchored standard search to enter a DEAD state. But an anchored
    /// search can, and so too can a leftmost search.
    ///
    /// We put DEAD before FAIL so that DEAD is always 0. We repeat this
    /// decision across the other Aho-Corasick automata, so that DEAD
    /// states there are always 0 too. It's not that we need all of the
    /// implementations to agree, but rather, the contiguous NFA and the DFA
    /// use a sort of "premultiplied" state identifier where the only state
    /// whose ID is always known and constant is the first state. Subsequent
    /// state IDs depend on how much space has already been used in the
    /// transition table.
    pub(crate) const DEAD: StateID = StateID::new_unchecked(0);
    /// The FAIL state mostly just corresponds to the ID of any transition on a
    /// state that isn't explicitly defined. When one transitions into the FAIL
    /// state, one must follow the previous state's failure transition before
    /// doing the next state lookup. In this way, FAIL is more of a sentinel
    /// than a state that one actually transitions into. In particular, it is
    /// never exposed in the `Automaton` interface.
    pub(crate) const FAIL: StateID = StateID::new_unchecked(1);

    /// Returns the equivalence classes of bytes found while constructing
    /// this NFA.
    ///
    /// Note that the NFA doesn't actually make use of these equivalence
    /// classes. Instead, these are useful for building the DFA when desired.
    pub(crate) fn byte_classes(&self) -> &ByteClasses {
        &self.byte_classes
    }

    /// Returns a slice containing the length of each pattern in this searcher.
    /// It is indexed by `PatternID` and has length `NFA::patterns_len`.
    ///
    /// This is exposed for convenience when building a contiguous NFA. But it
    /// can be reconstructed from the `Automaton` API if necessary.
    pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] {
        &self.pattern_lens
    }

    /// Returns a slice of all states in this non-contiguous NFA.
    pub(crate) fn states(&self) -> &[State] {
        &self.states
    }

    /// Returns the underlying "special" state information for this NFA.
    pub(crate) fn special(&self) -> &Special {
        &self.special
    }

    /// Swaps the states at `id1` and `id2`.
    ///
    /// This does not update the transitions of any state to account for the
    /// state swap. Callers are expected to fix up transitions themselves
    /// (e.g., via `remap`).
    pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
        self.states.swap(id1.as_usize(), id2.as_usize());
    }

    /// Re-maps all state IDs in this NFA according to the `map` function
    /// given.
    ///
    /// For every state, this rewrites its failure transition, every entry in
    /// its sparse transition chain and, if the state has been densified, its
    /// entire dense row.
    pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
        let alphabet_len = self.byte_classes.alphabet_len();
        for state in self.states.iter_mut() {
            state.fail = map(state.fail);
            // Walk this state's sparse linked list, remapping each
            // transition's target. The links themselves are indices into
            // `self.sparse` (not state IDs), so they are left untouched.
            let mut link = state.sparse;
            while link != StateID::ZERO {
                let t = &mut self.sparse[link];
                t.next = map(t.next);
                link = t.link;
            }
            if state.dense != StateID::ZERO {
                let start = state.dense.as_usize();
                for next in self.dense[start..][..alphabet_len].iter_mut() {
                    *next = map(*next);
                }
            }
        }
    }

    /// Iterate over all of the transitions for the given state ID.
    pub(crate) fn iter_trans(
        &self,
        sid: StateID,
    ) -> impl Iterator<Item = Transition> + '_ {
        let mut link = self.states[sid].sparse;
        core::iter::from_fn(move || {
            if link == StateID::ZERO {
                return None;
            }
            let t = self.sparse[link];
            link = t.link;
            Some(t)
        })
    }

    /// Iterate over all of the matches for the given state ID.
    pub(crate) fn iter_matches(
        &self,
        sid: StateID,
    ) -> impl Iterator<Item = PatternID> + '_ {
        let mut link = self.states[sid].matches;
        core::iter::from_fn(move || {
            if link == StateID::ZERO {
                return None;
            }
            let m = self.matches[link];
            link = m.link;
            Some(m.pid)
        })
    }

    /// Return the link following the one given. If the one given is the last
    /// link for the given state, then return `None`.
    ///
    /// If no previous link is given, then this returns the first link in the
    /// state, if one exists.
    ///
    /// This is useful for manually iterating over the transitions in a single
    /// state without borrowing the NFA. This permits mutating other parts of
    /// the NFA during iteration. Namely, one can access the transition pointed
    /// to by the link via `self.sparse[link]`.
    fn next_link(
        &self,
        sid: StateID,
        prev: Option<StateID>,
    ) -> Option<StateID> {
        let link =
            prev.map_or(self.states[sid].sparse, |p| self.sparse[p].link);
        if link == StateID::ZERO {
            None
        } else {
            Some(link)
        }
    }

    /// Follow the transition for the given byte in the given state. If no such
    /// transition exists, then the FAIL state ID is returned.
    #[inline(always)]
    fn follow_transition(&self, sid: StateID, byte: u8) -> StateID {
        let s = &self.states[sid];
        // This is a special case that targets starting states and states
        // near a start state. Namely, after the initial trie is constructed,
        // we look for states close to the start state to convert to a dense
        // representation for their transitions. This winds up using a lot more
        // memory per state in exchange for faster transition lookups. But
        // since we only do this for a small number of states (by default), the
        // memory usage is usually minimal.
        //
        // This has *massive* benefit when executing searches because the
        // unanchored starting state is by far the hottest state and is
        // frequently visited. Moreover, the 'for' loop below that works
        // decently on an actually sparse state is disastrous on a state that
        // is nearly or completely dense.
        if s.dense == StateID::ZERO {
            self.follow_transition_sparse(sid, byte)
        } else {
            let class = usize::from(self.byte_classes.get(byte));
            self.dense[s.dense.as_usize() + class]
        }
    }

    /// Like `follow_transition`, but always uses the sparse representation.
    #[inline(always)]
    fn follow_transition_sparse(&self, sid: StateID, byte: u8) -> StateID {
        // The sparse chain is kept sorted in ascending byte order (see
        // 'add_transition'), so we can stop as soon as we see a transition
        // byte greater than the one we're looking for.
        for t in self.iter_trans(sid) {
            if byte <= t.byte {
                if byte == t.byte {
                    return t.next;
                }
                break;
            }
        }
        NFA::FAIL
    }

    /// Set the transition for the given byte to the state ID given.
    ///
    /// Note that one should not set transitions to the FAIL state. It is not
    /// technically incorrect, but it wastes space. If a transition is not
    /// defined, then it is automatically assumed to lead to the FAIL state.
    ///
    /// The sparse transition chain for each state is maintained in ascending
    /// byte order, which is what permits the early bail-out in
    /// `follow_transition_sparse`.
    fn add_transition(
        &mut self,
        prev: StateID,
        byte: u8,
        next: StateID,
    ) -> Result<(), BuildError> {
        // If this state has already been densified, keep its dense row in
        // sync. (The sparse chain below is updated as well, since it remains
        // the complete record of transitions.)
        if self.states[prev].dense != StateID::ZERO {
            let dense = self.states[prev].dense;
            let class = usize::from(self.byte_classes.get(byte));
            self.dense[dense.as_usize() + class] = next;
        }

        let head = self.states[prev].sparse;
        if head == StateID::ZERO || byte < self.sparse[head].byte {
            // Empty chain, or the new byte sorts before the current head:
            // prepend a fresh transition.
            let new_link = self.alloc_transition()?;
            self.sparse[new_link] = Transition { byte, next, link: head };
            self.states[prev].sparse = new_link;
            return Ok(());
        } else if byte == self.sparse[head].byte {
            // The head already covers this byte; just overwrite its target.
            self.sparse[head].next = next;
            return Ok(());
        }

        // We handled the only cases where the beginning of the transition
        // chain needs to change. At this point, we now know that there is
        // at least one entry in the transition chain and the byte for that
        // transition is less than the byte for the transition we're adding.
        let (mut link_prev, mut link_next) = (head, self.sparse[head].link);
        while link_next != StateID::ZERO && byte > self.sparse[link_next].byte
        {
            link_prev = link_next;
            link_next = self.sparse[link_next].link;
        }
        if link_next == StateID::ZERO || byte < self.sparse[link_next].byte {
            // Splice a new transition between link_prev and link_next to
            // keep the chain sorted.
            let link = self.alloc_transition()?;
            self.sparse[link] = Transition { byte, next, link: link_next };
            self.sparse[link_prev].link = link;
        } else {
            assert_eq!(byte, self.sparse[link_next].byte);
            self.sparse[link_next].next = next;
        }
        Ok(())
    }

    /// This sets every possible transition (all 256 of them) for the given
    /// state to the same `next` value.
    ///
    /// This is useful for efficiently initializing start/dead states.
    ///
    /// # Panics
    ///
    /// This requires that the state has no transitions added to it already.
    /// If it has any transitions, then this panics. It will also panic if
    /// the state has been densified prior to calling this.
    fn init_full_state(
        &mut self,
        prev: StateID,
        next: StateID,
    ) -> Result<(), BuildError> {
        assert_eq!(
            StateID::ZERO,
            self.states[prev].dense,
            "state must not be dense yet"
        );
        assert_eq!(
            StateID::ZERO,
            self.states[prev].sparse,
            "state must have zero transitions"
        );
        // Append one transition per byte value, in ascending order, chaining
        // each new link onto the previous one. This preserves the sorted
        // invariant that 'add_transition' relies on.
        let mut prev_link = StateID::ZERO;
        for byte in 0..=255 {
            let new_link = self.alloc_transition()?;
            self.sparse[new_link] =
                Transition { byte, next, link: StateID::ZERO };
            if prev_link == StateID::ZERO {
                self.states[prev].sparse = new_link;
            } else {
                self.sparse[prev_link].link = new_link;
            }
            prev_link = new_link;
        }
        Ok(())
    }

    /// Add a match for the given pattern ID to the state for the given ID.
    ///
    /// The new match is appended to the end of the state's match linked list.
    fn add_match(
        &mut self,
        sid: StateID,
        pid: PatternID,
    ) -> Result<(), BuildError> {
        // NOTE(review): when 'head' is ZERO this reads 'self.matches[0]',
        // which appears to rely on index 0 being a sentinel entry whose link
        // is 'StateID::ZERO' — presumably established at construction time;
        // confirm against the builder code.
        let head = self.states[sid].matches;
        let mut link = head;
        while self.matches[link].link != StateID::ZERO {
            link = self.matches[link].link;
        }
        let new_match_link = self.alloc_match()?;
        self.matches[new_match_link].pid = pid;
        if link == StateID::ZERO {
            self.states[sid].matches = new_match_link;
        } else {
            self.matches[link].link = new_match_link;
        }
        Ok(())
    }

    /// Copy matches from the `src` state to the `dst` state. This is useful
    /// when a match state can be reached via a failure transition. In which
    /// case, you'll want to copy the matches (if any) from the state reached
    /// by the failure transition to the original state you were at.
    fn copy_matches(
        &mut self,
        src: StateID,
        dst: StateID,
    ) -> Result<(), BuildError> {
        // Find the tail of dst's match list so we can append to it. (As in
        // 'add_match', a ZERO head reads the sentinel at index 0.)
        let head_dst = self.states[dst].matches;
        let mut link_dst = head_dst;
        while self.matches[link_dst].link != StateID::ZERO {
            link_dst = self.matches[link_dst].link;
        }
        let mut link_src = self.states[src].matches;
        while link_src != StateID::ZERO {
            // Allocation is inlined (rather than calling 'alloc_match') so
            // the pid can be pushed in the same step.
            let new_match_link =
                StateID::new(self.matches.len()).map_err(|e| {
                    BuildError::state_id_overflow(
                        StateID::MAX.as_u64(),
                        e.attempted(),
                    )
                })?;
            self.matches.push(Match {
                pid: self.matches[link_src].pid,
                link: StateID::ZERO,
            });
            if link_dst == StateID::ZERO {
                self.states[dst].matches = new_match_link;
            } else {
                self.matches[link_dst].link = new_match_link;
            }

            link_dst = new_match_link;
            link_src = self.matches[link_src].link;
        }
        Ok(())
    }

    /// Create a new entry in `NFA::sparse`, if there's room, and return that
    /// entry's ID. If there's no room, then an error is returned.
    fn alloc_transition(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.sparse.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.sparse.push(Transition::default());
        Ok(id)
    }

    /// Create a new entry in `NFA::matches`, if there's room, and return that
    /// entry's ID. If there's no room, then an error is returned.
    fn alloc_match(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.matches.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.matches.push(Match::default());
        Ok(id)
    }

    /// Create a new set of `N` transitions in this NFA's dense transition
    /// table. The ID return corresponds to the index at which the `N`
    /// transitions begin. So `id+0` is the first transition and `id+(N-1)` is
    /// the last.
    ///
    /// `N` is determined via `NFA::byte_classes::alphabet_len`.
    fn alloc_dense_state(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.dense.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        // We use FAIL because it's the correct default. If a state doesn't
        // have a transition defined for every possible byte value, then the
        // transition function should return NFA::FAIL.
        self.dense.extend(
            core::iter::repeat(NFA::FAIL)
                .take(self.byte_classes.alphabet_len()),
        );
        Ok(id)
    }

    /// Allocate and add a fresh state to the underlying NFA and return its
    /// ID (guaranteed to be one more than the ID of the previously allocated
    /// state). If the ID would overflow `StateID`, then this returns an error.
    fn alloc_state(&mut self, depth: usize) -> Result<StateID, BuildError> {
        // This is OK because we error when building the trie if we see a
        // pattern whose length cannot fit into a 'SmallIndex', and the longest
        // possible depth corresponds to the length of the longest pattern.
        let depth = SmallIndex::new(depth)
            .expect("patterns longer than SmallIndex::MAX are not allowed");
        let id = StateID::new(self.states.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.states.push(State {
            sparse: StateID::ZERO,
            dense: StateID::ZERO,
            matches: StateID::ZERO,
            fail: self.special.start_unanchored_id,
            depth,
        });
        Ok(id)
    }
}
587 | |
588 | // SAFETY: 'start_state' always returns a valid state ID, 'next_state' always |
589 | // returns a valid state ID given a valid state ID. We otherwise claim that |
590 | // all other methods are correct as well. |
591 | unsafe impl Automaton for NFA { |
592 | #[inline (always)] |
593 | fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { |
594 | match anchored { |
595 | Anchored::No => Ok(self.special.start_unanchored_id), |
596 | Anchored::Yes => Ok(self.special.start_anchored_id), |
597 | } |
598 | } |
599 | |
600 | #[inline (always)] |
601 | fn next_state( |
602 | &self, |
603 | anchored: Anchored, |
604 | mut sid: StateID, |
605 | byte: u8, |
606 | ) -> StateID { |
607 | // This terminates since: |
608 | // |
609 | // 1. state.fail never points to the FAIL state. |
610 | // 2. All state.fail values point to a state closer to the start state. |
611 | // 3. The start state has no transitions to the FAIL state. |
612 | loop { |
613 | let next = self.follow_transition(sid, byte); |
614 | if next != NFA::FAIL { |
615 | return next; |
616 | } |
617 | // For an anchored search, we never follow failure transitions |
618 | // because failure transitions lead us down a path to matching |
619 | // a *proper* suffix of the path we were on. Thus, it can only |
620 | // produce matches that appear after the beginning of the search. |
621 | if anchored.is_anchored() { |
622 | return NFA::DEAD; |
623 | } |
624 | sid = self.states[sid].fail(); |
625 | } |
626 | } |
627 | |
628 | #[inline (always)] |
629 | fn is_special(&self, sid: StateID) -> bool { |
630 | sid <= self.special.max_special_id |
631 | } |
632 | |
633 | #[inline (always)] |
634 | fn is_dead(&self, sid: StateID) -> bool { |
635 | sid == NFA::DEAD |
636 | } |
637 | |
638 | #[inline (always)] |
639 | fn is_match(&self, sid: StateID) -> bool { |
640 | // N.B. This returns true when sid==NFA::FAIL but that's okay because |
641 | // NFA::FAIL is not actually a valid state ID from the perspective of |
642 | // the Automaton trait. Namely, it is never returned by 'start_state' |
643 | // or by 'next_state'. So we don't need to care about it here. |
644 | !self.is_dead(sid) && sid <= self.special.max_match_id |
645 | } |
646 | |
647 | #[inline (always)] |
648 | fn is_start(&self, sid: StateID) -> bool { |
649 | sid == self.special.start_unanchored_id |
650 | || sid == self.special.start_anchored_id |
651 | } |
652 | |
653 | #[inline (always)] |
654 | fn match_kind(&self) -> MatchKind { |
655 | self.match_kind |
656 | } |
657 | |
658 | #[inline (always)] |
659 | fn patterns_len(&self) -> usize { |
660 | self.pattern_lens.len() |
661 | } |
662 | |
663 | #[inline (always)] |
664 | fn pattern_len(&self, pid: PatternID) -> usize { |
665 | self.pattern_lens[pid].as_usize() |
666 | } |
667 | |
668 | #[inline (always)] |
669 | fn min_pattern_len(&self) -> usize { |
670 | self.min_pattern_len |
671 | } |
672 | |
673 | #[inline (always)] |
674 | fn max_pattern_len(&self) -> usize { |
675 | self.max_pattern_len |
676 | } |
677 | |
678 | #[inline (always)] |
679 | fn match_len(&self, sid: StateID) -> usize { |
680 | self.iter_matches(sid).count() |
681 | } |
682 | |
683 | #[inline (always)] |
684 | fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { |
685 | self.iter_matches(sid).nth(index).unwrap() |
686 | } |
687 | |
688 | #[inline (always)] |
689 | fn memory_usage(&self) -> usize { |
690 | self.states.len() * core::mem::size_of::<State>() |
691 | + self.sparse.len() * core::mem::size_of::<Transition>() |
692 | + self.matches.len() * core::mem::size_of::<Match>() |
693 | + self.dense.len() * StateID::SIZE |
694 | + self.pattern_lens.len() * SmallIndex::SIZE |
695 | + self.prefilter.as_ref().map_or(0, |p| p.memory_usage()) |
696 | } |
697 | |
698 | #[inline (always)] |
699 | fn prefilter(&self) -> Option<&Prefilter> { |
700 | self.prefilter.as_ref() |
701 | } |
702 | } |
703 | |
/// A representation of a sparse NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte
/// and the matches implied by visiting this state (if any).
#[derive(Clone, Debug)]
pub(crate) struct State {
    /// A pointer to `NFA::sparse` corresponding to the head of a linked list
    /// containing all of the transitions for this state.
    ///
    /// This is `StateID::ZERO` if and only if this state has zero transitions.
    sparse: StateID,
    /// A pointer to a row of `N` transitions in `NFA::dense`. These
    /// transitions correspond precisely to what is obtained by traversing
    /// `sparse`, but permits constant time lookup.
    ///
    /// When this is zero (which is true for most states in the default
    /// configuration), then this state has no dense representation.
    ///
    /// Note that `N` is equal to `NFA::byte_classes::alphabet_len()`. This is
    /// typically much less than 256 (the maximum value).
    dense: StateID,
    /// A pointer to `NFA::matches` corresponding to the head of a linked list
    /// containing all of the matches for this state.
    ///
    /// This is `StateID::ZERO` if and only if this state is not a match state.
    matches: StateID,
    /// The state that should be transitioned to if the current byte in the
    /// haystack does not have a corresponding transition defined in this
    /// state.
    fail: StateID,
    /// The depth of this state. Specifically, this is the distance from this
    /// state to the starting state. (For the special sentinel states DEAD and
    /// FAIL, their depth is always 0.) The depth of a starting state is 0.
    ///
    /// Note that depth is currently not used in this non-contiguous NFA. It
    /// may in the future, but it is used in the contiguous NFA. Namely, it
    /// permits an optimization where states near the starting state have their
    /// transitions stored in a dense fashion, but all other states have their
    /// transitions stored in a sparse fashion. (This non-contiguous NFA uses
    /// a sparse representation for all states unconditionally.) In any case,
    /// this is really the only convenient place to compute and store this
    /// information, which we need when building the contiguous NFA.
    depth: SmallIndex,
}
749 | |
750 | impl State { |
751 | /// Return true if and only if this state is a match state. |
752 | pub(crate) fn is_match(&self) -> bool { |
753 | self.matches != StateID::ZERO |
754 | } |
755 | |
756 | /// Returns the failure transition for this state. |
757 | pub(crate) fn fail(&self) -> StateID { |
758 | self.fail |
759 | } |
760 | |
761 | /// Returns the depth of this state. That is, the number of transitions |
762 | /// this state is from the start state of the NFA. |
763 | pub(crate) fn depth(&self) -> SmallIndex { |
764 | self.depth |
765 | } |
766 | } |
767 | |
768 | /// A single transition in a non-contiguous NFA. |
769 | #[derive (Clone, Copy, Default)] |
770 | #[repr (packed)] |
771 | pub(crate) struct Transition { |
772 | byte: u8, |
773 | next: StateID, |
774 | link: StateID, |
775 | } |
776 | |
777 | impl Transition { |
778 | /// Return the byte for which this transition is defined. |
779 | pub(crate) fn byte(&self) -> u8 { |
780 | self.byte |
781 | } |
782 | |
783 | /// Return the ID of the state that this transition points to. |
784 | pub(crate) fn next(&self) -> StateID { |
785 | self.next |
786 | } |
787 | |
788 | /// Return the ID of the next transition. |
789 | fn link(&self) -> StateID { |
790 | self.link |
791 | } |
792 | } |
793 | |
794 | impl core::fmt::Debug for Transition { |
795 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
796 | write!( |
797 | f, |
798 | "Transition(byte: {:X?}, next: {:?}, link: {:?})" , |
799 | self.byte, |
800 | self.next().as_usize(), |
801 | self.link().as_usize() |
802 | ) |
803 | } |
804 | } |
805 | |
806 | /// A single match in a non-contiguous NFA. |
807 | #[derive (Clone, Copy, Default)] |
808 | struct Match { |
809 | pid: PatternID, |
810 | link: StateID, |
811 | } |
812 | |
813 | impl Match { |
814 | /// Return the pattern ID for this match. |
815 | pub(crate) fn pattern(&self) -> PatternID { |
816 | self.pid |
817 | } |
818 | |
819 | /// Return the ID of the next match. |
820 | fn link(&self) -> StateID { |
821 | self.link |
822 | } |
823 | } |
824 | |
825 | impl core::fmt::Debug for Match { |
826 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
827 | write!( |
828 | f, |
829 | "Match(pid: {:?}, link: {:?})" , |
830 | self.pattern().as_usize(), |
831 | self.link().as_usize() |
832 | ) |
833 | } |
834 | } |
835 | |
/// A builder for configuring an Aho-Corasick noncontiguous NFA.
///
/// This builder has a subset of the options available to a
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive (Clone, Debug)]
pub struct Builder {
    /// The match semantics to compile the NFA with. See
    /// [`Builder::match_kind`].
    match_kind: MatchKind,
    /// Whether to attempt building a prefilter from the patterns. See
    /// [`Builder::prefilter`].
    prefilter: bool,
    /// Whether to match with ASCII-aware case insensitivity. See
    /// [`Builder::ascii_case_insensitive`].
    ascii_case_insensitive: bool,
    /// The limit on how many states use a dense representation for their
    /// transitions. See [`Builder::dense_depth`].
    dense_depth: usize,
}
848 | |
849 | impl Default for Builder { |
850 | fn default() -> Builder { |
851 | Builder { |
852 | match_kind: MatchKind::default(), |
853 | prefilter: true, |
854 | ascii_case_insensitive: false, |
855 | dense_depth: 3, |
856 | } |
857 | } |
858 | } |
859 | |
860 | impl Builder { |
861 | /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA. |
862 | pub fn new() -> Builder { |
863 | Builder::default() |
864 | } |
865 | |
866 | /// Build an Aho-Corasick noncontiguous NFA from the given iterator of |
867 | /// patterns. |
868 | /// |
869 | /// A builder may be reused to create more NFAs. |
870 | pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError> |
871 | where |
872 | I: IntoIterator<Item = P>, |
873 | P: AsRef<[u8]>, |
874 | { |
875 | debug!("building non-contiguous NFA" ); |
876 | let nfa = Compiler::new(self)?.compile(patterns)?; |
877 | debug!( |
878 | "non-contiguous NFA built, <states: {:?}, size: {:?}>" , |
879 | nfa.states.len(), |
880 | nfa.memory_usage() |
881 | ); |
882 | Ok(nfa) |
883 | } |
884 | |
885 | /// Set the desired match semantics. |
886 | /// |
887 | /// See |
888 | /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) |
889 | /// for more documentation and examples. |
890 | pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { |
891 | self.match_kind = kind; |
892 | self |
893 | } |
894 | |
895 | /// Enable ASCII-aware case insensitive matching. |
896 | /// |
897 | /// See |
898 | /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) |
899 | /// for more documentation and examples. |
900 | pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { |
901 | self.ascii_case_insensitive = yes; |
902 | self |
903 | } |
904 | |
905 | /// Set the limit on how many states use a dense representation for their |
906 | /// transitions. Other states will generally use a sparse representation. |
907 | /// |
908 | /// See |
909 | /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth) |
910 | /// for more documentation and examples. |
911 | pub fn dense_depth(&mut self, depth: usize) -> &mut Builder { |
912 | self.dense_depth = depth; |
913 | self |
914 | } |
915 | |
916 | /// Enable heuristic prefilter optimizations. |
917 | /// |
918 | /// See |
919 | /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) |
920 | /// for more documentation and examples. |
921 | pub fn prefilter(&mut self, yes: bool) -> &mut Builder { |
922 | self.prefilter = yes; |
923 | self |
924 | } |
925 | } |
926 | |
/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks, with some tweaks to support leftmost
/// searching.
#[derive (Debug)]
struct Compiler<'a> {
    /// The configuration to compile with.
    builder: &'a Builder,
    /// A builder for the prefilter that is (optionally) attached to the
    /// finished NFA.
    prefilter: prefilter::Builder,
    /// The NFA under construction.
    nfa: NFA,
    /// The set of byte ranges seen in the patterns, used to compute the
    /// NFA's byte equivalence classes.
    byteset: ByteClassSet,
}
938 | |
939 | impl<'a> Compiler<'a> { |
940 | fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> { |
941 | let prefilter = prefilter::Builder::new(builder.match_kind) |
942 | .ascii_case_insensitive(builder.ascii_case_insensitive); |
943 | Ok(Compiler { |
944 | builder, |
945 | prefilter, |
946 | nfa: NFA { |
947 | match_kind: builder.match_kind, |
948 | states: vec![], |
949 | sparse: vec![], |
950 | dense: vec![], |
951 | matches: vec![], |
952 | pattern_lens: vec![], |
953 | prefilter: None, |
954 | byte_classes: ByteClasses::singletons(), |
955 | min_pattern_len: usize::MAX, |
956 | max_pattern_len: 0, |
957 | special: Special::zero(), |
958 | }, |
959 | byteset: ByteClassSet::empty(), |
960 | }) |
961 | } |
962 | |
    /// Runs the entire compilation pipeline on the given patterns and
    /// returns the finished NFA.
    ///
    /// This allocates the sentinel and start states, builds the trie,
    /// computes byte classes, densifies some states, fills in failure
    /// transitions and finally shuffles states into their canonical order.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        // Add dummy transition/match links, so that no valid link will point
        // to another link at index 0.
        self.nfa.sparse.push(Transition::default());
        self.nfa.matches.push(Match::default());
        // Add a dummy dense transition so that no states can have dense==0
        // represent a valid pointer to dense transitions. This permits
        // dense==0 to be a sentinel indicating "no dense transitions."
        self.nfa.dense.push(NFA::DEAD);
        // the dead state, only used for leftmost and fixed to id==0
        self.nfa.alloc_state(0)?;
        // the fail state, which is never entered and fixed to id==1
        self.nfa.alloc_state(0)?;
        // unanchored start state, initially fixed to id==2 but later shuffled
        // to appear after all non-start match states.
        self.nfa.special.start_unanchored_id = self.nfa.alloc_state(0)?;
        // anchored start state, initially fixed to id==3 but later shuffled
        // to appear after unanchored start state.
        self.nfa.special.start_anchored_id = self.nfa.alloc_state(0)?;
        // Initialize the unanchored starting state in order to make it dense,
        // and thus make transition lookups on this state faster.
        self.init_unanchored_start_state()?;
        // Set all transitions on the DEAD state to point to itself. This way,
        // the DEAD state can never be escaped. It MUST be used as a sentinel
        // in any correct search.
        self.add_dead_state_loop()?;
        // Build the base trie from the given patterns.
        self.build_trie(patterns)?;
        self.nfa.states.shrink_to_fit();
        // Turn our set of bytes into equivalent classes. This NFA
        // implementation uses byte classes only for states that use a dense
        // representation of transitions. (And that's why this comes before
        // `self.densify()`, as the byte classes need to be set first.)
        self.nfa.byte_classes = self.byteset.byte_classes();
        // Add transitions (and maybe matches) to the anchored starting state.
        // The anchored starting state is used for anchored searches. The only
        // mechanical difference between it and the unanchored start state is
        // that missing transitions map to the DEAD state instead of the FAIL
        // state.
        self.set_anchored_start_state()?;
        // Rewrite transitions to the FAIL state on the unanchored start state
        // as self-transitions. This keeps the start state active at all times.
        self.add_unanchored_start_state_loop();
        // Make some (possibly zero) states use a dense representation for
        // transitions. It's important to do this right after the states
        // and non-failure transitions are solidified. That way, subsequent
        // accesses (particularly `fill_failure_transitions`) will benefit from
        // the faster transition lookup in densified states.
        self.densify()?;
        // The meat of the Aho-Corasick algorithm: compute and write failure
        // transitions. i.e., the state to move to when a transition isn't
        // defined in the current state. These are epsilon transitions and thus
        // make this formulation an NFA.
        self.fill_failure_transitions()?;
        // Handle a special case under leftmost semantics when at least one
        // of the patterns is the empty string.
        self.close_start_state_loop_for_leftmost();
        // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
        // NON-MATCH, ... This permits us to very quickly query the type of
        // the state we're currently in during a search.
        self.shuffle();
        self.nfa.prefilter = self.prefilter.build();
        // Store the maximum ID of all *relevant* special states. Start states
        // are only relevant when we have a prefilter, otherwise, there is zero
        // reason to care about whether a state is a start state or not during
        // a search. Indeed, without a prefilter, we are careful to explicitly
        // NOT care about start states, otherwise the search can ping pong
        // between the unrolled loop and the handling of special-status states
        // and destroy perf.
        self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
            // Why the anchored starting state? Because we always put it
            // after the unanchored starting state and it is therefore the
            // maximum. Why put unanchored followed by anchored? No particular
            // reason, but that's how the states are logically organized in the
            // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
            self.nfa.special.start_anchored_id
        } else {
            self.nfa.special.max_match_id
        };
        // Release any excess capacity held by the NFA's internal vectors,
        // since no further states/transitions/matches will be added.
        self.nfa.sparse.shrink_to_fit();
        self.nfa.dense.shrink_to_fit();
        self.nfa.matches.shrink_to_fit();
        self.nfa.pattern_lens.shrink_to_fit();
        Ok(self.nfa)
    }
1052 | |
    /// This sets up the initial prefix trie that makes up the Aho-Corasick
    /// automaton. Effectively, it creates the basic structure of the
    /// automaton, where every pattern given has a path from the start state to
    /// the end of the pattern.
    ///
    /// Along the way, this also records each pattern's length, updates the
    /// NFA's min/max pattern lengths, feeds every pattern to the prefilter
    /// builder (when enabled) and accumulates byte equivalence classes.
    ///
    /// This returns an error if a pattern ID would overflow or if a single
    /// pattern is too long.
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
            // A pattern's ID is just its position in the given order.
            let pid = PatternID::new(i).map_err(|e| {
                BuildError::pattern_id_overflow(
                    PatternID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
            let pat = pat.as_ref();
            // Pattern lengths are stored as 'SmallIndex', so patterns whose
            // length doesn't fit are rejected here.
            let patlen = SmallIndex::new(pat.len())
                .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
            self.nfa.min_pattern_len =
                core::cmp::min(self.nfa.min_pattern_len, pat.len());
            self.nfa.max_pattern_len =
                core::cmp::max(self.nfa.max_pattern_len, pat.len());
            assert_eq!(
                i,
                self.nfa.pattern_lens.len(),
                "expected number of patterns to match pattern ID"
            );
            self.nfa.pattern_lens.push(patlen);
            // We add the pattern to the prefilter here because the pattern
            // ID in the prefilter is determined with respect to the patterns
            // added to the prefilter. That is, it isn't the ID we have here,
            // but the one determined by its own accounting of patterns.
            // To ensure they line up, we add every pattern we see to the
            // prefilter, even if some patterns ultimately are impossible to
            // match (in leftmost-first semantics specifically).
            //
            // Another way of doing this would be to expose an API in the
            // prefilter to permit setting your own pattern IDs. Or to just use
            // our own map and go between them. But this case is sufficiently
            // rare that we don't bother and just make sure they're in sync.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }

            let mut prev = self.nfa.special.start_unanchored_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.states[prev].is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. These don't
                // get used while building the trie, but other Aho-Corasick
                // implementations may use them.
                self.byteset.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byteset.set_range(b, b);
                }

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.follow_transition(prev, b);
                if next != NFA::FAIL {
                    prev = next;
                } else {
                    let next = self.nfa.alloc_state(depth)?;
                    self.nfa.add_transition(prev, b, next)?;
                    if self.builder.ascii_case_insensitive {
                        let b = opposite_ascii_case(b);
                        self.nfa.add_transition(prev, b, next)?;
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.add_match(prev, pid)?;
        }
        Ok(())
    }
1151 | |
    /// This routine creates failure transitions according to the standard
    /// textbook formulation of the Aho-Corasick algorithm, with a couple small
    /// tweaks to support "leftmost" semantics.
    ///
    /// Building failure transitions is the most interesting part of building
    /// the Aho-Corasick automaton, because they are what allow searches to
    /// be performed in linear time. Specifically, a failure transition is
    /// a single transition associated with each state that points back to
    /// the longest proper suffix of the pattern being searched. The failure
    /// transition is followed whenever there exists no transition on the
    /// current state for the current input byte. If there is no other proper
    /// suffix, then the failure transition points back to the starting state.
    ///
    /// For example, let's say we built an Aho-Corasick automaton with the
    /// following patterns: 'abcd' and 'cef'. The trie looks like this:
    ///
    /// ```ignore
    ///          a - S1 - b - S2 - c - S3 - d - S4*
    ///         /
    ///     S0 - c - S5 - e - S6 - f - S7*
    /// ```
    ///
    /// At this point, it should be fairly straight-forward to see how this
    /// trie can be used in a simplistic way. At any given position in the
    /// text we're searching (called the "subject" string), all we need to do
    /// is follow the transitions in the trie by consuming one transition for
    /// each byte in the subject string. If we reach a match state, then we can
    /// report that location as a match.
    ///
    /// The trick comes when searching a subject string like 'abcef'. We'll
    /// initially follow the transition from S0 to S1 and wind up in S3 after
    /// observing the 'c' byte. At this point, the next byte is 'e' but state
    /// S3 has no transition for 'e', so the search fails. We then would need
    /// to restart the search at the next position in 'abcef', which
    /// corresponds to 'b'. The match would fail, but the next search starting
    /// at 'c' would finally succeed. The problem with this approach is that
    /// we wind up searching the subject string potentially many times. In
    /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
    /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
    /// like to achieve a `O(n + m)` worst case complexity.
    ///
    /// This is where failure transitions come in. Instead of dying at S3 in
    /// the first search, the automaton can instruct the search to move to
    /// another part of the automaton that corresponds to a suffix of what
    /// we've seen so far. Recall that we've seen 'abc' in the subject string,
    /// and the automaton does indeed have a non-empty suffix, 'c', that could
    /// potentially lead to another match. Thus, the actual Aho-Corasick
    /// automaton for our patterns in this case looks like this:
    ///
    /// ```ignore
    ///          a - S1 - b - S2 - c - S3 - d - S4*
    ///         /                      /
    ///        /       ----------------
    ///       /       /
    ///     S0 - c - S5 - e - S6 - f - S7*
    /// ```
    ///
    /// That is, we have a failure transition from S3 to S5, which is followed
    /// exactly in cases when we are in state S3 but see any byte other than
    /// 'd' (that is, we've "failed" to find a match in this portion of our
    /// trie). We know we can transition back to S5 because we've already seen
    /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
    /// with the search starting at S5 and complete our match.
    ///
    /// Adding failure transitions to a trie is fairly simple, but subtle. The
    /// key issue is that you might have multiple failure transitions that you
    /// need to follow. For example, look at the trie for the patterns
    /// 'abcd', 'b', 'bcd' and 'cd':
    ///
    /// ```ignore
    ///          - a - S1 - b - S2* - c - S3 - d - S4*
    ///         /               /         /
    ///        /         -------   -------
    ///       /         /         /
    ///     S0 --- b - S5* - c - S6 - d - S7*
    ///       \                  /
    ///        \         --------
    ///         \       /
    ///          - c - S8 - d - S9*
    /// ```
    ///
    /// The failure transitions for this trie are defined from S2 to S5,
    /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
    /// corresponds to a match, since its failure transition to S5 is itself
    /// a match state.
    ///
    /// Perhaps the simplest way to think about adding these failure
    /// transitions is recursively. That is, if you know the failure
    /// transitions for every possible previous state that could be visited
    /// (e.g., when computing the failure transition for S3, you already know
    /// the failure transitions for S0, S1 and S2), then you can simply follow
    /// the failure transition of the previous state and check whether the
    /// incoming transition is defined after following the failure transition.
    ///
    /// For example, when determining the failure state for S3, by our
    /// assumptions, we already know that there is a failure transition from
    /// S2 (the previous state) to S5. So we follow that transition and check
    /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
    /// as there is a transition from S5 to S6 for the byte 'c'. If no such
    /// transition existed, we could keep following the failure transitions
    /// until we reach the start state, which is the failure transition for
    /// every state that has no corresponding proper suffix.
    ///
    /// We don't actually use recursion to implement this, but instead, use a
    /// breadth first search of the automaton. Our base case is the start
    /// state, whose failure transition is just a transition to itself.
    ///
    /// When building a leftmost automaton, we proceed as above, but only
    /// include a subset of failure transitions. Namely, we omit any failure
    /// transitions that appear after a match state in the trie. This is
    /// because failure transitions always point back to a proper suffix of
    /// what has been seen so far. Thus, following a failure transition after
    /// a match implies looking for a match that starts after the one that has
    /// already been seen, which is of course therefore not the leftmost match.
    ///
    /// N.B. I came up with this algorithm on my own, and after scouring all of
    /// the other AC implementations I know of (Perl, Snort, many on GitHub).
    /// I couldn't find any that implement leftmost semantics like this.
    /// Perl of course needs leftmost-first semantics, but they implement it
    /// with a seeming hack at *search* time instead of encoding it into the
    /// automaton. There are also a couple Java libraries that support leftmost
    /// longest semantics, but they do it by building a queue of matches at
    /// search time, which is even worse than what Perl is doing. ---AG
    fn fill_failure_transitions(&mut self) -> Result<(), BuildError> {
        let is_leftmost = self.builder.match_kind.is_leftmost();
        let start_uid = self.nfa.special.start_unanchored_id;
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        let mut prev_link = None;
        while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
            prev_link = Some(link);
            let t = self.nfa.sparse[link];

            // Skip anything we've seen before and any self-transitions on the
            // start state.
            if start_uid == t.next() || seen.contains(t.next) {
                continue;
            }
            queue.push_back(t.next);
            seen.insert(t.next);
            // Under leftmost semantics, if a state immediately following
            // the start state is a match state, then we never want to
            // follow its failure transition since the failure transition
            // necessarily leads back to the start state, which we never
            // want to do for leftmost matching after a match has been
            // found.
            //
            // We apply the same logic to non-start states below as well.
            if is_leftmost && self.nfa.states[t.next].is_match() {
                self.nfa.states[t.next].fail = NFA::DEAD;
            }
        }
        while let Some(id) = queue.pop_front() {
            let mut prev_link = None;
            while let Some(link) = self.nfa.next_link(id, prev_link) {
                prev_link = Some(link);
                let t = self.nfa.sparse[link];

                if seen.contains(t.next) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(t.next);
                seen.insert(t.next);

                // As above for start states, under leftmost semantics, once
                // we see a match all subsequent states should have no failure
                // transitions because failure transitions always imply looking
                // for a match that is a suffix of what has been seen so far
                // (where "seen so far" corresponds to the string formed by
                // following the transitions from the start state to the
                // current state). Under leftmost semantics, we specifically do
                // not want to allow this to happen because we always want to
                // report the match found at the leftmost position.
                //
                // The difference between leftmost-first and leftmost-longest
                // occurs previously while we build the trie. For
                // leftmost-first, we simply omit any entries that would
                // otherwise require passing through a match state.
                //
                // Note that for correctness, the failure transition has to be
                // set to the dead state for ALL states following a match, not
                // just the match state itself. However, by setting the failure
                // transition to the dead state on all match states, the dead
                // state will automatically propagate to all subsequent states
                // via the failure state computation below.
                if is_leftmost && self.nfa.states[t.next].is_match() {
                    self.nfa.states[t.next].fail = NFA::DEAD;
                    continue;
                }
                // The standard BFS step: follow the parent's failure chain
                // until a state with a transition on this byte is found.
                let mut fail = self.nfa.states[id].fail;
                while self.nfa.follow_transition(fail, t.byte) == NFA::FAIL {
                    fail = self.nfa.states[fail].fail;
                }
                fail = self.nfa.follow_transition(fail, t.byte);
                self.nfa.states[t.next].fail = fail;
                self.nfa.copy_matches(fail, t.next)?;
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            if !is_leftmost {
                self.nfa
                    .copy_matches(self.nfa.special.start_unanchored_id, id)?;
            }
        }
        Ok(())
    }
1375 | |
    /// Shuffle the states so that they appear in this sequence:
    ///
    ///   DEAD, FAIL, MATCH..., START, START, NON-MATCH...
    ///
    /// The idea here is that if we know how special states are laid out in our
    /// transition table, then we can determine what "kind" of state we're in
    /// just by comparing our current state ID with a particular value. In this
    /// way, we avoid doing extra memory lookups.
    ///
    /// Before shuffling begins, our states look something like this:
    ///
    ///   DEAD, FAIL, START, START, (MATCH | NON-MATCH)...
    ///
    /// So all we need to do is move all of the MATCH states so that they
    /// all appear before any NON-MATCH state, like so:
    ///
    ///   DEAD, FAIL, START, START, MATCH... NON-MATCH...
    ///
    /// Then it's just a simple matter of swapping the two START states with
    /// the last two MATCH states.
    ///
    /// Besides moving states, this also updates the special state IDs
    /// recorded on the NFA (the two start state IDs and `max_match_id`) to
    /// reflect the new layout.
    ///
    /// (This is the same technique used for fully compiled DFAs in
    /// regex-automata.)
    fn shuffle(&mut self) {
        let old_start_uid = self.nfa.special.start_unanchored_id;
        let old_start_aid = self.nfa.special.start_anchored_id;
        assert!(old_start_uid < old_start_aid);
        assert_eq!(
            3,
            old_start_aid.as_usize(),
            "anchored start state should be at index 3"
        );
        // We implement shuffling by a sequence of pairwise swaps of states.
        // Since we have a number of things referencing states via their
        // IDs and swapping them changes their IDs, we need to record every
        // swap we make so that we can remap IDs. The remapper handles this
        // book-keeping for us.
        let mut remapper = Remapper::new(&self.nfa, 0);
        // The way we proceed here is by moving all match states so that
        // they directly follow the start states. So it will go: DEAD, FAIL,
        // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ...
        //
        // To do that, we proceed forward through all states after
        // START-ANCHORED and swap match states so that they appear before all
        // non-match states.
        let mut next_avail = StateID::from(4u8);
        for i in next_avail.as_usize()..self.nfa.states.len() {
            let sid = StateID::new(i).unwrap();
            if !self.nfa.states[sid].is_match() {
                continue;
            }
            remapper.swap(&mut self.nfa, sid, next_avail);
            // The key invariant here is that only non-match states exist
            // between 'next_avail' and 'sid' (with them being potentially
            // equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed
            // to land on the leftmost non-match state. (Unless 'next_avail'
            // and 'sid' are equivalent, in which case, a swap will occur but
            // it is a no-op.)
            next_avail = StateID::new(next_avail.one_more()).unwrap();
        }
        // Now we'd like to move the start states to immediately following the
        // match states. (The start states may themselves be match states, but
        // we'll handle that later.) We arrange the states this way so that we
        // don't necessarily need to check whether a state is a start state or
        // not before checking whether a state is a match state. For example,
        // we'd like to be able to write this as our state machine loop:
        //
        //   sid = start()
        //   for byte in haystack:
        //       sid = next(sid, byte)
        //       if sid <= nfa.max_start_id:
        //           if sid <= nfa.max_dead_id:
        //               # search complete
        //           elif sid <= nfa.max_match_id:
        //               # found match
        //
        // The important context here is that we might not want to look for
        // start states at all. Namely, if a searcher doesn't have a prefilter,
        // then there is no reason to care about whether we're in a start state
        // or not. And indeed, if we did check for it, this very hot loop would
        // ping pong between the special state handling and the main state
        // transition logic. This in turn stalls the CPU by killing branch
        // prediction.
        //
        // So essentially, we really want to be able to "forget" that start
        // states even exist and this is why we put them at the end.
        let new_start_aid =
            StateID::new(next_avail.as_usize().checked_sub(1).unwrap())
                .unwrap();
        remapper.swap(&mut self.nfa, old_start_aid, new_start_aid);
        let new_start_uid =
            StateID::new(next_avail.as_usize().checked_sub(2).unwrap())
                .unwrap();
        remapper.swap(&mut self.nfa, old_start_uid, new_start_uid);
        // After moving both start states, the last "true" match state (if
        // any) sits just before the unanchored start state.
        let new_max_match_id =
            StateID::new(next_avail.as_usize().checked_sub(3).unwrap())
                .unwrap();
        self.nfa.special.max_match_id = new_max_match_id;
        self.nfa.special.start_unanchored_id = new_start_uid;
        self.nfa.special.start_anchored_id = new_start_aid;
        // If one start state is a match state, then they both are.
        if self.nfa.states[self.nfa.special.start_anchored_id].is_match() {
            self.nfa.special.max_match_id = self.nfa.special.start_anchored_id;
        }
        remapper.remap(&mut self.nfa);
    }
1482 | |
1483 | /// Attempts to convert the transition representation of a subset of states |
1484 | /// in this NFA from sparse to dense. This can greatly improve search |
1485 | /// performance since states with a higher number of transitions tend to |
1486 | /// correlate with very active states. |
1487 | /// |
1488 | /// We generally only densify states that are close to the start state. |
1489 | /// These tend to be the most active states and thus benefit from a dense |
1490 | /// representation more than other states. |
1491 | /// |
1492 | /// This tends to best balance between memory usage and performance. In |
1493 | /// particular, the *vast majority* of all states in a typical Aho-Corasick |
1494 | /// automaton have only 1 transition and are usually farther from the start |
1495 | /// state and thus don't get densified. |
1496 | /// |
1497 | /// Note that this doesn't remove the sparse representation of transitions |
1498 | /// for states that are densified. It could be done, but actually removing |
1499 | /// entries from `NFA::sparse` is likely more expensive than it's worth. |
1500 | fn densify(&mut self) -> Result<(), BuildError> { |
1501 | for i in 0..self.nfa.states.len() { |
1502 | let sid = StateID::new(i).unwrap(); |
1503 | // Don't bother densifying states that are only used as sentinels. |
1504 | if sid == NFA::DEAD || sid == NFA::FAIL { |
1505 | continue; |
1506 | } |
1507 | // Only densify states that are "close enough" to the start state. |
1508 | if self.nfa.states[sid].depth.as_usize() |
1509 | >= self.builder.dense_depth |
1510 | { |
1511 | continue; |
1512 | } |
1513 | let dense = self.nfa.alloc_dense_state()?; |
1514 | let mut prev_link = None; |
1515 | while let Some(link) = self.nfa.next_link(sid, prev_link) { |
1516 | prev_link = Some(link); |
1517 | let t = self.nfa.sparse[link]; |
1518 | |
1519 | let class = usize::from(self.nfa.byte_classes.get(t.byte)); |
1520 | let index = dense.as_usize() + class; |
1521 | self.nfa.dense[index] = t.next; |
1522 | } |
1523 | self.nfa.states[sid].dense = dense; |
1524 | } |
1525 | Ok(()) |
1526 | } |
1527 | |
1528 | /// Returns a set that tracked queued states. |
1529 | /// |
1530 | /// This is only necessary when ASCII case insensitivity is enabled, since |
1531 | /// it is the only way to visit the same state twice. Otherwise, this |
1532 | /// returns an inert set that nevers adds anything and always reports |
1533 | /// `false` for every member test. |
1534 | fn queued_set(&self) -> QueuedSet { |
1535 | if self.builder.ascii_case_insensitive { |
1536 | QueuedSet::active() |
1537 | } else { |
1538 | QueuedSet::inert() |
1539 | } |
1540 | } |
1541 | |
1542 | /// Initializes the unanchored start state by making it dense. This is |
1543 | /// achieved by explicitly setting every transition to the FAIL state. |
1544 | /// This isn't necessary for correctness, since any missing transition is |
1545 | /// automatically assumed to be mapped to the FAIL state. We do this to |
1546 | /// make the unanchored starting state dense, and thus in turn make |
1547 | /// transition lookups on it faster. (Which is worth doing because it's |
1548 | /// the most active state.) |
1549 | fn init_unanchored_start_state(&mut self) -> Result<(), BuildError> { |
1550 | let start_uid = self.nfa.special.start_unanchored_id; |
1551 | let start_aid = self.nfa.special.start_anchored_id; |
1552 | self.nfa.init_full_state(start_uid, NFA::FAIL)?; |
1553 | self.nfa.init_full_state(start_aid, NFA::FAIL)?; |
1554 | Ok(()) |
1555 | } |
1556 | |
1557 | /// Setup the anchored start state by copying all of the transitions and |
1558 | /// matches from the unanchored starting state with one change: the failure |
1559 | /// transition is changed to the DEAD state, so that for any undefined |
1560 | /// transitions, the search will stop. |
1561 | fn set_anchored_start_state(&mut self) -> Result<(), BuildError> { |
1562 | let start_uid = self.nfa.special.start_unanchored_id; |
1563 | let start_aid = self.nfa.special.start_anchored_id; |
1564 | let (mut uprev_link, mut aprev_link) = (None, None); |
1565 | loop { |
1566 | let unext = self.nfa.next_link(start_uid, uprev_link); |
1567 | let anext = self.nfa.next_link(start_aid, aprev_link); |
1568 | let (ulink, alink) = match (unext, anext) { |
1569 | (Some(ulink), Some(alink)) => (ulink, alink), |
1570 | (None, None) => break, |
1571 | _ => unreachable!(), |
1572 | }; |
1573 | uprev_link = Some(ulink); |
1574 | aprev_link = Some(alink); |
1575 | self.nfa.sparse[alink].next = self.nfa.sparse[ulink].next; |
1576 | } |
1577 | self.nfa.copy_matches(start_uid, start_aid)?; |
1578 | // This is the main difference between the unanchored and anchored |
1579 | // starting states. If a lookup on an anchored starting state fails, |
1580 | // then the search should stop. |
1581 | // |
1582 | // N.B. This assumes that the loop on the unanchored starting state |
1583 | // hasn't been created yet. |
1584 | self.nfa.states[start_aid].fail = NFA::DEAD; |
1585 | Ok(()) |
1586 | } |
1587 | |
1588 | /// Set the failure transitions on the start state to loop back to the |
1589 | /// start state. This effectively permits the Aho-Corasick automaton to |
1590 | /// match at any position. This is also required for finding the next |
1591 | /// state to terminate, namely, finding the next state should never return |
1592 | /// a fail_id. |
1593 | /// |
1594 | /// This must be done after building the initial trie, since trie |
1595 | /// construction depends on transitions to `fail_id` to determine whether a |
1596 | /// state already exists or not. |
1597 | fn add_unanchored_start_state_loop(&mut self) { |
1598 | let start_uid = self.nfa.special.start_unanchored_id; |
1599 | let mut prev_link = None; |
1600 | while let Some(link) = self.nfa.next_link(start_uid, prev_link) { |
1601 | prev_link = Some(link); |
1602 | if self.nfa.sparse[link].next() == NFA::FAIL { |
1603 | self.nfa.sparse[link].next = start_uid; |
1604 | } |
1605 | } |
1606 | } |
1607 | |
1608 | /// Remove the start state loop by rewriting any transitions on the start |
1609 | /// state back to the start state with transitions to the dead state. |
1610 | /// |
1611 | /// The loop is only closed when two conditions are met: the start state |
1612 | /// is a match state and the match kind is leftmost-first or |
1613 | /// leftmost-longest. |
1614 | /// |
1615 | /// The reason for this is that under leftmost semantics, a start state |
1616 | /// that is also a match implies that we should never restart the search |
1617 | /// process. We allow normal transitions out of the start state, but if |
1618 | /// none exist, we transition to the dead state, which signals that |
1619 | /// searching should stop. |
1620 | fn close_start_state_loop_for_leftmost(&mut self) { |
1621 | let start_uid = self.nfa.special.start_unanchored_id; |
1622 | let start = &mut self.nfa.states[start_uid]; |
1623 | let dense = start.dense; |
1624 | if self.builder.match_kind.is_leftmost() && start.is_match() { |
1625 | let mut prev_link = None; |
1626 | while let Some(link) = self.nfa.next_link(start_uid, prev_link) { |
1627 | prev_link = Some(link); |
1628 | if self.nfa.sparse[link].next() == start_uid { |
1629 | self.nfa.sparse[link].next = NFA::DEAD; |
1630 | if dense != StateID::ZERO { |
1631 | let b = self.nfa.sparse[link].byte; |
1632 | let class = usize::from(self.nfa.byte_classes.get(b)); |
1633 | self.nfa.dense[dense.as_usize() + class] = NFA::DEAD; |
1634 | } |
1635 | } |
1636 | } |
1637 | } |
1638 | } |
1639 | |
1640 | /// Sets all transitions on the dead state to point back to the dead state. |
1641 | /// Normally, missing transitions map back to the failure state, but the |
1642 | /// point of the dead state is to act as a sink that can never be escaped. |
1643 | fn add_dead_state_loop(&mut self) -> Result<(), BuildError> { |
1644 | self.nfa.init_full_state(NFA::DEAD, NFA::DEAD)?; |
1645 | Ok(()) |
1646 | } |
1647 | } |
1648 | |
1649 | /// A set of state identifiers used to avoid revisiting the same state multiple |
1650 | /// times when filling in failure transitions. |
1651 | /// |
1652 | /// This set has an "inert" and an "active" mode. When inert, the set never |
1653 | /// stores anything and always returns `false` for every member test. This is |
1654 | /// useful to avoid the performance and memory overhead of maintaining this |
1655 | /// set when it is not needed. |
1656 | #[derive (Debug)] |
1657 | struct QueuedSet { |
1658 | set: Option<BTreeSet<StateID>>, |
1659 | } |
1660 | |
1661 | impl QueuedSet { |
1662 | /// Return an inert set that returns `false` for every state ID membership |
1663 | /// test. |
1664 | fn inert() -> QueuedSet { |
1665 | QueuedSet { set: None } |
1666 | } |
1667 | |
1668 | /// Return an active set that tracks state ID membership. |
1669 | fn active() -> QueuedSet { |
1670 | QueuedSet { set: Some(BTreeSet::new()) } |
1671 | } |
1672 | |
1673 | /// Inserts the given state ID into this set. (If the set is inert, then |
1674 | /// this is a no-op.) |
1675 | fn insert(&mut self, state_id: StateID) { |
1676 | if let Some(ref mut set) = self.set { |
1677 | set.insert(state_id); |
1678 | } |
1679 | } |
1680 | |
1681 | /// Returns true if and only if the given state ID is in this set. If the |
1682 | /// set is inert, this always returns false. |
1683 | fn contains(&self, state_id: StateID) -> bool { |
1684 | match self.set { |
1685 | None => false, |
1686 | Some(ref set) => set.contains(&state_id), |
1687 | } |
1688 | } |
1689 | } |
1690 | |
// Renders the NFA in a human-readable multi-line form: one line per state
// (with its failure transition and byte-range transitions), followed by a
// summary of the automaton's configuration and size. The exact output
// format is load-bearing for debugging, so the write calls below should
// not be reformatted casually.
impl core::fmt::Debug for NFA {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        use crate::{
            automaton::{fmt_state_indicator, sparse_transitions},
            util::debug::DebugByte,
        };

        writeln!(f, "noncontiguous::NFA(" )?;
        for (sid, state) in self.states.iter().with_state_ids() {
            // The FAIL state doesn't actually have space for a state allocated
            // for it, so we have to treat it as a special case.
            if sid == NFA::FAIL {
                writeln!(f, "F {:06}:" , sid.as_usize())?;
                continue;
            }
            fmt_state_indicator(f, self, sid)?;
            // State ID followed by its failure transition's ID.
            write!(
                f,
                " {:06}( {:06}): " ,
                sid.as_usize(),
                state.fail.as_usize()
            )?;

            // 'sparse_transitions' coalesces consecutive bytes that map to
            // the same next state, so each item is a (start, end, next)
            // range rather than a single-byte transition.
            let it = sparse_transitions(
                self.iter_trans(sid).map(|t| (t.byte, t.next)),
            )
            .enumerate();
            for (i, (start, end, sid)) in it {
                if i > 0 {
                    write!(f, ", " )?;
                }
                if start == end {
                    // A range of length one prints as a single byte.
                    write!(
                        f,
                        " {:?} => {:?}" ,
                        DebugByte(start),
                        sid.as_usize()
                    )?;
                } else {
                    write!(
                        f,
                        " {:?}- {:?} => {:?}" ,
                        DebugByte(start),
                        DebugByte(end),
                        sid.as_usize()
                    )?;
                }
            }

            write!(f, " \n" )?;
            // For match states, also list the IDs of the patterns that
            // match when this state is entered.
            if self.is_match(sid) {
                write!(f, " matches: " )?;
                for (i, pid) in self.iter_matches(sid).enumerate() {
                    if i > 0 {
                        write!(f, ", " )?;
                    }
                    write!(f, " {}" , pid.as_usize())?;
                }
                write!(f, " \n" )?;
            }
        }
        // Trailing summary of the automaton's configuration and size.
        writeln!(f, "match kind: {:?}" , self.match_kind)?;
        writeln!(f, "prefilter: {:?}" , self.prefilter.is_some())?;
        writeln!(f, "state length: {:?}" , self.states.len())?;
        writeln!(f, "pattern length: {:?}" , self.patterns_len())?;
        writeln!(f, "shortest pattern length: {:?}" , self.min_pattern_len)?;
        writeln!(f, "longest pattern length: {:?}" , self.max_pattern_len)?;
        writeln!(f, "memory usage: {:?}" , self.memory_usage())?;
        writeln!(f, ")" )?;
        Ok(())
    }
}
1763 | |