1 | /*! |
2 | Provides a noncontiguous NFA implementation of Aho-Corasick. |
3 | |
4 | This is a low-level API that generally only needs to be used in niche |
5 | circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) |
6 | instead of a noncontiguous NFA directly. Using an `NFA` directly is typically |
7 | only necessary when one needs access to the [`Automaton`] trait implementation. |
8 | */ |
9 | |
10 | use alloc::{ |
11 | collections::{BTreeSet, VecDeque}, |
12 | vec, |
13 | vec::Vec, |
14 | }; |
15 | |
16 | use crate::{ |
17 | automaton::Automaton, |
18 | util::{ |
19 | alphabet::{ByteClassSet, ByteClasses}, |
20 | error::{BuildError, MatchError}, |
21 | prefilter::{self, opposite_ascii_case, Prefilter}, |
22 | primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, |
23 | remapper::Remapper, |
24 | search::{Anchored, MatchKind}, |
25 | special::Special, |
26 | }, |
27 | }; |
28 | |
/// A noncontiguous NFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using an `NFA` directly is typically only necessary
/// when one needs access to the [`Automaton`] trait implementation.
///
/// This NFA represents the "core" implementation of Aho-Corasick in this
/// crate. Namely, constructing this NFA involving building a trie and then
/// filling in the failure transitions between states, similar to what is
/// described in any standard textbook description of Aho-Corasick.
///
/// In order to minimize heap usage and to avoid additional construction costs,
/// this implementation represents the transitions of all states as distinct
/// sparse memory allocations. This is where it gets its name from. That is,
/// this NFA has no contiguous memory allocation for its transition table. Each
/// state gets its own allocation.
///
/// While the sparse representation keeps memory usage to somewhat reasonable
/// levels, it is still quite large and also results in somewhat mediocre
/// search performance. For this reason, it is almost always a good idea to
/// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is
/// marginally slower to build, but has higher throughput and can sometimes use
/// an order of magnitude less memory. The main reason to use a noncontiguous
/// NFA is when you need the fastest possible construction time, or when a
/// contiguous NFA does not have the desired capacity. (The total number of NFA
/// states it can have is fewer than a noncontiguous NFA.)
///
/// # Example
///
/// This example shows how to build an `NFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     nfa::noncontiguous::NFA,
///     Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let nfa = NFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     nfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive (Clone)]
pub struct NFA {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State>,
    /// The length, in bytes, of each pattern in this NFA. This slice is
    /// indexed by `PatternID`.
    ///
    /// The number of entries in this vector corresponds to the total number of
    /// patterns in this automaton.
    pattern_lens: Vec<SmallIndex>,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<Prefilter>,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// The length, in bytes, of the shortest pattern in this automaton. This
    /// information is useful for detecting whether an automaton matches the
    /// empty string or not.
    min_pattern_len: usize,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The information required to deduce which states are "special" in this
    /// NFA.
    ///
    /// Since the DEAD and FAIL states are always the first two states and
    /// there are only ever two start states (which follow all of the match
    /// states), it follows that we can determine whether a state is a fail,
    /// dead, match or start with just a few comparisons on the ID itself:
    ///
    ///    is_dead(sid): sid == NFA::DEAD
    ///    is_fail(sid): sid == NFA::FAIL
    ///   is_match(sid): NFA::FAIL < sid && sid <= max_match_id
    ///   is_start(sid): sid == start_unanchored_id || sid == start_anchored_id
    ///
    /// Note that this only applies to the NFA after it has been constructed.
    /// During construction, the start states are the first ones added and the
    /// match states are inter-leaved with non-match states. Once all of the
    /// states have been added, the states are shuffled such that the above
    /// predicates hold.
    special: Special,
    /// The number of bytes of heap used by this sparse NFA. (This does not
    /// include the prefilter's heap usage; that is accounted for separately
    /// in `Automaton::memory_usage`.)
    memory_usage: usize,
}
146 | |
147 | impl NFA { |
148 | /// Create a new Aho-Corasick noncontiguous NFA using the default |
149 | /// configuration. |
150 | /// |
151 | /// Use a [`Builder`] if you want to change the configuration. |
152 | pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError> |
153 | where |
154 | I: IntoIterator<Item = P>, |
155 | P: AsRef<[u8]>, |
156 | { |
157 | NFA::builder().build(patterns) |
158 | } |
159 | |
160 | /// A convenience method for returning a new Aho-Corasick noncontiguous NFA |
161 | /// builder. |
162 | /// |
163 | /// This usually permits one to just import the `NFA` type. |
164 | pub fn builder() -> Builder { |
165 | Builder::new() |
166 | } |
167 | } |
168 | |
169 | impl NFA { |
170 | /// The DEAD state is a sentinel state like the FAIL state. The DEAD state |
171 | /// instructs any search to stop and return any currently recorded match, |
172 | /// or no match otherwise. Generally speaking, it is impossible for an |
173 | /// unanchored standard search to enter a DEAD state. But an anchored |
174 | /// search can, and so to can a leftmost search. |
175 | /// |
176 | /// We put DEAD before FAIL so that DEAD is always 0. We repeat this |
177 | /// decision across the other Aho-Corasicm automata, so that DEAD |
178 | /// states there are always 0 too. It's not that we need all of the |
179 | /// implementations to agree, but rather, the contiguous NFA and the DFA |
180 | /// use a sort of "premultiplied" state identifier where the only state |
181 | /// whose ID is always known and constant is the first state. Subsequent |
182 | /// state IDs depend on how much space has already been used in the |
183 | /// transition table. |
184 | pub(crate) const DEAD: StateID = StateID::new_unchecked(0); |
185 | /// The FAIL state mostly just corresponds to the ID of any transition on a |
186 | /// state that isn't explicitly defined. When one transitions into the FAIL |
187 | /// state, one must follow the previous state's failure transition before |
188 | /// doing the next state lookup. In this way, FAIL is more of a sentinel |
189 | /// than a state that one actually transitions into. In particular, it is |
190 | /// never exposed in the `Automaton` interface. |
191 | pub(crate) const FAIL: StateID = StateID::new_unchecked(1); |
192 | |
193 | /// Returns the equivalence classes of bytes found while constructing |
194 | /// this NFA. |
195 | /// |
196 | /// Note that the NFA doesn't actually make use of these equivalence |
197 | /// classes. Instead, these are useful for building the DFA when desired. |
198 | pub(crate) fn byte_classes(&self) -> &ByteClasses { |
199 | &self.byte_classes |
200 | } |
201 | |
202 | /// Returns a slice containing the length of each pattern in this searcher. |
203 | /// It is indexed by `PatternID` and has length `NFA::patterns_len`. |
204 | /// |
205 | /// This is exposed for convenience when building a contiguous NFA. But it |
206 | /// can be reconstructed from the `Automaton` API if necessary. |
207 | pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] { |
208 | &self.pattern_lens |
209 | } |
210 | |
211 | /// Returns a slice of all states in this non-contiguous NFA. |
212 | pub(crate) fn states(&self) -> &[State] { |
213 | &self.states |
214 | } |
215 | |
216 | /// Returns the underlying "special" state information for this NFA. |
217 | pub(crate) fn special(&self) -> &Special { |
218 | &self.special |
219 | } |
220 | |
221 | /// Swaps the states at `id1` and `id2`. |
222 | /// |
223 | /// This does not update the transitions of any state to account for the |
224 | /// state swap. |
225 | pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) { |
226 | self.states.swap(id1.as_usize(), id2.as_usize()); |
227 | } |
228 | |
229 | /// Re-maps all state IDs in this NFA according to the `map` function |
230 | /// given. |
231 | pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) { |
232 | for state in self.states.iter_mut() { |
233 | state.fail = map(state.fail); |
234 | for (_, ref mut sid) in state.trans.iter_mut() { |
235 | *sid = map(*sid); |
236 | } |
237 | } |
238 | } |
239 | } |
240 | |
241 | // SAFETY: 'start_state' always returns a valid state ID, 'next_state' always |
242 | // returns a valid state ID given a valid state ID. We otherwise claim that |
243 | // all other methods are correct as well. |
244 | unsafe impl Automaton for NFA { |
245 | #[inline (always)] |
246 | fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { |
247 | match anchored { |
248 | Anchored::No => Ok(self.special.start_unanchored_id), |
249 | Anchored::Yes => Ok(self.special.start_anchored_id), |
250 | } |
251 | } |
252 | |
253 | #[inline (always)] |
254 | fn next_state( |
255 | &self, |
256 | anchored: Anchored, |
257 | mut sid: StateID, |
258 | byte: u8, |
259 | ) -> StateID { |
260 | // This terminates since: |
261 | // |
262 | // 1. state.fail never points to the FAIL state. |
263 | // 2. All state.fail values point to a state closer to the start state. |
264 | // 3. The start state has no transitions to the FAIL state. |
265 | loop { |
266 | let state = &self.states[sid]; |
267 | let next = state.next_state(byte); |
268 | if next != NFA::FAIL { |
269 | return next; |
270 | } |
271 | // For an anchored search, we never follow failure transitions |
272 | // because failure transitions lead us down a path to matching |
273 | // a *proper* suffix of the path we were on. Thus, it can only |
274 | // produce matches that appear after the beginning of the search. |
275 | if anchored.is_anchored() { |
276 | return NFA::DEAD; |
277 | } |
278 | sid = state.fail; |
279 | } |
280 | } |
281 | |
282 | #[inline (always)] |
283 | fn is_special(&self, sid: StateID) -> bool { |
284 | sid <= self.special.max_special_id |
285 | } |
286 | |
287 | #[inline (always)] |
288 | fn is_dead(&self, sid: StateID) -> bool { |
289 | sid == NFA::DEAD |
290 | } |
291 | |
292 | #[inline (always)] |
293 | fn is_match(&self, sid: StateID) -> bool { |
294 | // N.B. This returns true when sid==NFA::FAIL but that's okay because |
295 | // NFA::FAIL is not actually a valid state ID from the perspective of |
296 | // the Automaton trait. Namely, it is never returned by 'start_state' |
297 | // or by 'next_state'. So we don't need to care about it here. |
298 | !self.is_dead(sid) && sid <= self.special.max_match_id |
299 | } |
300 | |
301 | #[inline (always)] |
302 | fn is_start(&self, sid: StateID) -> bool { |
303 | sid == self.special.start_unanchored_id |
304 | || sid == self.special.start_anchored_id |
305 | } |
306 | |
307 | #[inline (always)] |
308 | fn match_kind(&self) -> MatchKind { |
309 | self.match_kind |
310 | } |
311 | |
312 | #[inline (always)] |
313 | fn patterns_len(&self) -> usize { |
314 | self.pattern_lens.len() |
315 | } |
316 | |
317 | #[inline (always)] |
318 | fn pattern_len(&self, pid: PatternID) -> usize { |
319 | self.pattern_lens[pid].as_usize() |
320 | } |
321 | |
322 | #[inline (always)] |
323 | fn min_pattern_len(&self) -> usize { |
324 | self.min_pattern_len |
325 | } |
326 | |
327 | #[inline (always)] |
328 | fn max_pattern_len(&self) -> usize { |
329 | self.max_pattern_len |
330 | } |
331 | |
332 | #[inline (always)] |
333 | fn match_len(&self, sid: StateID) -> usize { |
334 | self.states[sid].matches.len() |
335 | } |
336 | |
337 | #[inline (always)] |
338 | fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { |
339 | self.states[sid].matches[index] |
340 | } |
341 | |
342 | #[inline (always)] |
343 | fn memory_usage(&self) -> usize { |
344 | self.memory_usage |
345 | + self.prefilter.as_ref().map_or(0, |p| p.memory_usage()) |
346 | } |
347 | |
348 | #[inline (always)] |
349 | fn prefilter(&self) -> Option<&Prefilter> { |
350 | self.prefilter.as_ref() |
351 | } |
352 | } |
353 | |
/// A representation of a sparse NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte
/// and the matches implied by visiting this state (if any).
#[derive (Clone)]
pub(crate) struct State {
    /// The set of defined transitions for this state sorted by `u8`. In an
    /// unanchored search, if a byte is not in this set of transitions, then
    /// it should transition to `fail`. In an anchored search, it should
    /// transition to the special DEAD state.
    ///
    /// The sorted order is an invariant relied upon by `set_next_state`
    /// (which uses binary search) and by `next_state`'s dense fast path.
    pub(crate) trans: Vec<(u8, StateID)>,
    /// The patterns that match once this state is entered. Note that order
    /// is important in the leftmost case. For example, if one adds 'foo' and
    /// 'foo' (duplicate patterns are not disallowed), then in a leftmost-first
    /// search, only the first 'foo' will ever match.
    pub(crate) matches: Vec<PatternID>,
    /// The state that should be transitioned to if the current byte in the
    /// haystack does not have a corresponding transition defined in this
    /// state.
    pub(crate) fail: StateID,
    /// The depth of this state. Specifically, this is the distance from this
    /// state to the starting state. (For the special sentinel states DEAD and
    /// FAIL, their depth is always 0.) The depth of a starting state is 0.
    ///
    /// Note that depth is currently not used in this non-contiguous NFA. It
    /// may in the future, but it is used in the contiguous NFA. Namely, it
    /// permits an optimization where states near the starting state have their
    /// transitions stored in a dense fashion, but all other states have their
    /// transitions stored in a sparse fashion. (This non-contiguous NFA uses
    /// a sparse representation for all states unconditionally.) In any case,
    /// this is really the only convenient place to compute and store this
    /// information, which we need when building the contiguous NFA.
    pub(crate) depth: SmallIndex,
}
389 | |
390 | impl State { |
391 | /// Return the heap memory used by this state. Note that if `State` is |
392 | /// itself on the heap, then callers need to call this in addition to |
393 | /// `size_of::<State>()` to get the full heap memory used. |
394 | fn memory_usage(&self) -> usize { |
395 | use core::mem::size_of; |
396 | |
397 | (self.trans.len() * size_of::<(u8, StateID)>()) |
398 | + (self.matches.len() * size_of::<PatternID>()) |
399 | } |
400 | |
401 | /// Return true if and only if this state is a match state. |
402 | fn is_match(&self) -> bool { |
403 | !self.matches.is_empty() |
404 | } |
405 | |
406 | /// Return the next state by following the transition for the given byte. |
407 | /// If no transition for the given byte is defined, then the FAIL state ID |
408 | /// is returned. |
409 | #[inline (always)] |
410 | fn next_state(&self, byte: u8) -> StateID { |
411 | // This is a special case that targets the unanchored starting state. |
412 | // By construction, the unanchored starting state is actually a dense |
413 | // state, because every possible transition is defined on it. Any |
414 | // transitions that weren't added as part of initial trie construction |
415 | // get explicitly added as a self-transition back to itself. Thus, we |
416 | // can treat it as if it were dense and do a constant time lookup. |
417 | // |
418 | // This has *massive* benefit when executing searches because the |
419 | // unanchored starting state is by far the hottest state and is |
420 | // frequently visited. Moreover, the 'for' loop below that works |
421 | // decently on an actually sparse state is disastrous on a state that |
422 | // is nearly or completely dense. |
423 | // |
424 | // This optimization also works in general, including for non-starting |
425 | // states that happen to have every transition defined. Namely, it |
426 | // is impossible for 'self.trans' to have duplicate transitions (by |
427 | // construction) and transitions are always in sorted ascending order. |
428 | // So if a state has 256 transitions, it is, by construction, dense and |
429 | // amenable to constant time indexing. |
430 | if self.trans.len() == 256 { |
431 | self.trans[usize::from(byte)].1 |
432 | } else { |
433 | for &(b, id) in self.trans.iter() { |
434 | if b == byte { |
435 | return id; |
436 | } |
437 | } |
438 | NFA::FAIL |
439 | } |
440 | } |
441 | |
442 | /// Set the transition for the given byte to the state ID given. |
443 | /// |
444 | /// Note that one should not set transitions to the FAIL state. It is not |
445 | /// technically incorrect, but it wastes space. If a transition is not |
446 | /// defined, then it is automatically assumed to lead to the FAIL state. |
447 | fn set_next_state(&mut self, byte: u8, next: StateID) { |
448 | match self.trans.binary_search_by_key(&byte, |&(b, _)| b) { |
449 | Ok(i) => self.trans[i] = (byte, next), |
450 | Err(i) => self.trans.insert(i, (byte, next)), |
451 | } |
452 | } |
453 | } |
454 | |
455 | impl core::fmt::Debug for State { |
456 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
457 | use crate::{automaton::sparse_transitions, util::debug::DebugByte}; |
458 | |
459 | let it: impl Iterator = sparse_transitions(self.trans.iter().copied()).enumerate(); |
460 | for (i: usize, (start: u8, end: u8, sid: StateID)) in it { |
461 | if i > 0 { |
462 | write!(f, ", " )?; |
463 | } |
464 | if start == end { |
465 | write!(f, " {:?} => {:?}" , DebugByte(start), sid.as_usize())?; |
466 | } else { |
467 | write!( |
468 | f, |
469 | " {:?}- {:?} => {:?}" , |
470 | DebugByte(start), |
471 | DebugByte(end), |
472 | sid.as_usize() |
473 | )?; |
474 | } |
475 | } |
476 | Ok(()) |
477 | } |
478 | } |
479 | |
/// A builder for configuring an Aho-Corasick noncontiguous NFA.
///
/// This builder has a subset of the options available to a
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive (Clone, Debug)]
pub struct Builder {
    /// The match semantics to build into the NFA.
    match_kind: MatchKind,
    /// Whether to attempt building a prefilter from the patterns.
    prefilter: bool,
    /// Whether to add ASCII case-insensitive transitions while building.
    ascii_case_insensitive: bool,
}
491 | |
492 | impl Default for Builder { |
493 | fn default() -> Builder { |
494 | Builder { |
495 | match_kind: MatchKind::default(), |
496 | prefilter: true, |
497 | ascii_case_insensitive: false, |
498 | } |
499 | } |
500 | } |
501 | |
502 | impl Builder { |
503 | /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA. |
504 | pub fn new() -> Builder { |
505 | Builder::default() |
506 | } |
507 | |
508 | /// Build an Aho-Corasick noncontiguous NFA from the given iterator of |
509 | /// patterns. |
510 | /// |
511 | /// A builder may be reused to create more NFAs. |
512 | pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError> |
513 | where |
514 | I: IntoIterator<Item = P>, |
515 | P: AsRef<[u8]>, |
516 | { |
517 | debug!("building non-contiguous NFA" ); |
518 | let nfa = Compiler::new(self)?.compile(patterns)?; |
519 | debug!( |
520 | "non-contiguous NFA built, <states: {:?}, size: {:?}>" , |
521 | nfa.states.len(), |
522 | nfa.memory_usage() |
523 | ); |
524 | Ok(nfa) |
525 | } |
526 | |
527 | /// Set the desired match semantics. |
528 | /// |
529 | /// See |
530 | /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) |
531 | /// for more documentation and examples. |
532 | pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { |
533 | self.match_kind = kind; |
534 | self |
535 | } |
536 | |
537 | /// Enable ASCII-aware case insensitive matching. |
538 | /// |
539 | /// See |
540 | /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) |
541 | /// for more documentation and examples. |
542 | pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { |
543 | self.ascii_case_insensitive = yes; |
544 | self |
545 | } |
546 | |
547 | /// Enable heuristic prefilter optimizations. |
548 | /// |
549 | /// See |
550 | /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) |
551 | /// for more documentation and examples. |
552 | pub fn prefilter(&mut self, yes: bool) -> &mut Builder { |
553 | self.prefilter = yes; |
554 | self |
555 | } |
556 | } |
557 | |
/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks, with some tweaks to support leftmost
/// searching.
#[derive (Debug)]
struct Compiler<'a> {
    /// The configuration to compile with (match kind, case folding, whether
    /// to build a prefilter).
    builder: &'a Builder,
    /// Accumulates pattern literals for building an optional prefilter.
    prefilter: prefilter::Builder,
    /// The NFA under construction.
    nfa: NFA,
    /// The set of byte equivalence classes observed across all patterns.
    byteset: ByteClassSet,
}
569 | |
570 | impl<'a> Compiler<'a> { |
571 | fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> { |
572 | let prefilter = prefilter::Builder::new(builder.match_kind) |
573 | .ascii_case_insensitive(builder.ascii_case_insensitive); |
574 | Ok(Compiler { |
575 | builder, |
576 | prefilter, |
577 | nfa: NFA { |
578 | match_kind: builder.match_kind, |
579 | states: vec![], |
580 | pattern_lens: vec![], |
581 | prefilter: None, |
582 | byte_classes: ByteClasses::singletons(), |
583 | min_pattern_len: usize::MAX, |
584 | max_pattern_len: 0, |
585 | special: Special::zero(), |
586 | memory_usage: 0, |
587 | }, |
588 | byteset: ByteClassSet::empty(), |
589 | }) |
590 | } |
591 | |
    /// Runs the full construction pipeline over the given patterns and
    /// returns the finished NFA.
    ///
    /// The sequencing below matters: sentinel and start states are created
    /// first (so their IDs are fixed), the trie is built before failure
    /// transitions can be computed, and state shuffling happens after all
    /// states exist.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        // the dead state, only used for leftmost and fixed to id==0
        self.add_state(0)?;
        // the fail state, which is never entered and fixed to id==1
        self.add_state(0)?;
        // unanchored start state, initially fixed to id==2 but later shuffled
        // to appear after all non-start match states.
        self.nfa.special.start_unanchored_id = self.add_state(0)?;
        // anchored start state, initially fixed to id==3 but later shuffled
        // to appear after unanchored start state.
        self.nfa.special.start_anchored_id = self.add_state(0)?;
        // Initialize the unanchored starting state in order to make it dense,
        // and thus make transition lookups on this state faster.
        self.init_unanchored_start_state();
        // Build the base trie from the given patterns.
        self.build_trie(patterns)?;
        // Add transitions (and maybe matches) to the anchored starting state.
        // The anchored starting state is used for anchored searches. The only
        // mechanical difference between it and the unanchored start state is
        // that missing transitions map to the DEAD state instead of the FAIL
        // state.
        self.set_anchored_start_state();
        // Rewrite transitions to the FAIL state on the unanchored start state
        // as self-transitions. This keeps the start state active at all times.
        self.add_unanchored_start_state_loop();
        // Set all transitions on the DEAD state to point to itself. This way,
        // the DEAD state can never be escaped. It MUST be used as a sentinel
        // in any correct search.
        self.add_dead_state_loop();
        // The meat of the Aho-Corasick algorithm: compute and write failure
        // transitions. i.e., the state to move to when a transition isn't
        // defined in the current state. These are epsilon transitions and thus
        // make this formulation an NFA.
        self.fill_failure_transitions();
        // Handle a special case under leftmost semantics when at least one
        // of the patterns is the empty string.
        self.close_start_state_loop_for_leftmost();
        // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
        // NON-MATCH, ... This permits us to very quickly query the type of
        // the state we're currently in during a search.
        self.shuffle();
        // Turn our set of bytes into equivalent classes. This NFA
        // implementation doesn't use byte classes directly, but any
        // Aho-Corasick searcher built from this one might.
        self.nfa.byte_classes = self.byteset.byte_classes();
        self.nfa.prefilter = self.prefilter.build();
        self.calculate_memory_usage();
        // Store the maximum ID of all *relevant* special states. Start states
        // are only relevant when we have a prefilter, otherwise, there is zero
        // reason to care about whether a state is a start state or not during
        // a search. Indeed, without a prefilter, we are careful to explicitly
        // NOT care about start states, otherwise the search can ping pong
        // between the unrolled loop and the handling of special-status states
        // and destroy perf.
        self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
            // Why the anchored starting state? Because we always put it
            // after the unanchored starting state and it is therefore the
            // maximum. Why put unanchored followed by anchored? No particular
            // reason, but that's how the states are logically organized in the
            // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
            self.nfa.special.start_anchored_id
        } else {
            self.nfa.special.max_match_id
        };
        Ok(self.nfa)
    }
662 | |
    /// This sets up the initial prefix trie that makes up the Aho-Corasick
    /// automaton. Effectively, it creates the basic structure of the
    /// automaton, where every pattern given has a path from the start state to
    /// the end of the pattern.
    ///
    /// # Errors
    ///
    /// This returns an error if the number of patterns overflows `PatternID`
    /// or if any single pattern's length overflows `SmallIndex`.
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
            let pid = PatternID::new(i).map_err(|e| {
                BuildError::pattern_id_overflow(
                    PatternID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
            let pat = pat.as_ref();
            let patlen = SmallIndex::new(pat.len())
                .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
            self.nfa.min_pattern_len =
                core::cmp::min(self.nfa.min_pattern_len, pat.len());
            self.nfa.max_pattern_len =
                core::cmp::max(self.nfa.max_pattern_len, pat.len());
            assert_eq!(
                i,
                self.nfa.pattern_lens.len(),
                "expected number of patterns to match pattern ID"
            );
            self.nfa.pattern_lens.push(patlen);
            // We add the pattern to the prefilter here because the pattern
            // ID in the prefilter is determined with respect to the patterns
            // added to the prefilter. That is, it isn't the ID we have here,
            // but the one determined by its own accounting of patterns.
            // To ensure they line up, we add every pattern we see to the
            // prefilter, even if some patterns ultimately are impossible to
            // match (in leftmost-first semantics specifically).
            //
            // Another way of doing this would be to expose an API in the
            // prefilter to permit setting your own pattern IDs. Or to just use
            // our own map and go between them. But this case is sufficiently
            // rare that we don't bother and just make sure they're in sync.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }

            // Walk the trie from the (unanchored) root, extending it with
            // new states as needed for this pattern's bytes.
            let mut prev = self.nfa.special.start_unanchored_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.states[prev].is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. We don't use these
                // for NFA construction. These are instead used only if we're
                // building a DFA. They would technically be useful for the
                // NFA, but it would require a second pass over the patterns.
                self.byteset.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byteset.set_range(b, b);
                }

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.states[prev].next_state(b);
                if next != NFA::FAIL {
                    prev = next;
                } else {
                    let next = self.add_state(depth)?;
                    self.nfa.states[prev].set_next_state(b, next);
                    if self.builder.ascii_case_insensitive {
                        let b = opposite_ascii_case(b);
                        self.nfa.states[prev].set_next_state(b, next);
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.states[prev].matches.push(pid);
        }
        Ok(())
    }
762 | |
763 | /// This routine creates failure transitions according to the standard |
764 | /// textbook formulation of the Aho-Corasick algorithm, with a couple small |
765 | /// tweaks to support "leftmost" semantics. |
766 | /// |
767 | /// Building failure transitions is the most interesting part of building |
768 | /// the Aho-Corasick automaton, because they are what allow searches to |
769 | /// be performed in linear time. Specifically, a failure transition is |
770 | /// a single transition associated with each state that points back to |
771 | /// the longest proper suffix of the pattern being searched. The failure |
772 | /// transition is followed whenever there exists no transition on the |
773 | /// current state for the current input byte. If there is no other proper |
774 | /// suffix, then the failure transition points back to the starting state. |
775 | /// |
776 | /// For example, let's say we built an Aho-Corasick automaton with the |
777 | /// following patterns: 'abcd' and 'cef'. The trie looks like this: |
778 | /// |
779 | /// ```ignore |
780 | /// a - S1 - b - S2 - c - S3 - d - S4* |
781 | /// / |
782 | /// S0 - c - S5 - e - S6 - f - S7* |
783 | /// ``` |
784 | /// |
785 | /// At this point, it should be fairly straight-forward to see how this |
786 | /// trie can be used in a simplistic way. At any given position in the |
787 | /// text we're searching (called the "subject" string), all we need to do |
788 | /// is follow the transitions in the trie by consuming one transition for |
789 | /// each byte in the subject string. If we reach a match state, then we can |
790 | /// report that location as a match. |
791 | /// |
792 | /// The trick comes when searching a subject string like 'abcef'. We'll |
793 | /// initially follow the transition from S0 to S1 and wind up in S3 after |
/// observing the 'c' byte. At this point, the next byte is 'e' but state
795 | /// S3 has no transition for 'e', so the search fails. We then would need |
796 | /// to restart the search at the next position in 'abcef', which |
797 | /// corresponds to 'b'. The match would fail, but the next search starting |
798 | /// at 'c' would finally succeed. The problem with this approach is that |
799 | /// we wind up searching the subject string potentially many times. In |
800 | /// effect, this makes the algorithm have worst case `O(n * m)` complexity, |
801 | /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead |
802 | /// like to achieve a `O(n + m)` worst case complexity. |
803 | /// |
804 | /// This is where failure transitions come in. Instead of dying at S3 in |
805 | /// the first search, the automaton can instruct the search to move to |
806 | /// another part of the automaton that corresponds to a suffix of what |
807 | /// we've seen so far. Recall that we've seen 'abc' in the subject string, |
808 | /// and the automaton does indeed have a non-empty suffix, 'c', that could |
809 | /// potentially lead to another match. Thus, the actual Aho-Corasick |
810 | /// automaton for our patterns in this case looks like this: |
811 | /// |
812 | /// ```ignore |
813 | /// a - S1 - b - S2 - c - S3 - d - S4* |
814 | /// / / |
815 | /// / ---------------- |
816 | /// / / |
817 | /// S0 - c - S5 - e - S6 - f - S7* |
818 | /// ``` |
819 | /// |
820 | /// That is, we have a failure transition from S3 to S5, which is followed |
821 | /// exactly in cases when we are in state S3 but see any byte other than |
822 | /// 'd' (that is, we've "failed" to find a match in this portion of our |
823 | /// trie). We know we can transition back to S5 because we've already seen |
824 | /// a 'c' byte, so we don't need to re-scan it. We can then pick back up |
825 | /// with the search starting at S5 and complete our match. |
826 | /// |
827 | /// Adding failure transitions to a trie is fairly simple, but subtle. The |
/// key issue is that you might have multiple failure transitions that you
829 | /// need to follow. For example, look at the trie for the patterns |
830 | /// 'abcd', 'b', 'bcd' and 'cd': |
831 | /// |
832 | /// ```ignore |
833 | /// - a - S1 - b - S2* - c - S3 - d - S4* |
834 | /// / / / |
835 | /// / ------- ------- |
836 | /// / / / |
837 | /// S0 --- b - S5* - c - S6 - d - S7* |
838 | /// \ / |
839 | /// \ -------- |
840 | /// \ / |
841 | /// - c - S8 - d - S9* |
842 | /// ``` |
843 | /// |
844 | /// The failure transitions for this trie are defined from S2 to S5, |
845 | /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it |
846 | /// corresponds to a match, since its failure transition to S5 is itself |
847 | /// a match state. |
848 | /// |
/// Perhaps the simplest way to think about adding these failure transitions
850 | /// is recursively. That is, if you know the failure transitions for every |
851 | /// possible previous state that could be visited (e.g., when computing the |
852 | /// failure transition for S3, you already know the failure transitions |
853 | /// for S0, S1 and S2), then you can simply follow the failure transition |
854 | /// of the previous state and check whether the incoming transition is |
855 | /// defined after following the failure transition. |
856 | /// |
857 | /// For example, when determining the failure state for S3, by our |
858 | /// assumptions, we already know that there is a failure transition from |
859 | /// S2 (the previous state) to S5. So we follow that transition and check |
860 | /// whether the transition connecting S2 to S3 is defined. Indeed, it is, |
861 | /// as there is a transition from S5 to S6 for the byte 'c'. If no such |
862 | /// transition existed, we could keep following the failure transitions |
863 | /// until we reach the start state, which is the failure transition for |
864 | /// every state that has no corresponding proper suffix. |
865 | /// |
866 | /// We don't actually use recursion to implement this, but instead, use a |
867 | /// breadth first search of the automaton. Our base case is the start |
868 | /// state, whose failure transition is just a transition to itself. |
869 | /// |
870 | /// When building a leftmost automaton, we proceed as above, but only |
871 | /// include a subset of failure transitions. Namely, we omit any failure |
872 | /// transitions that appear after a match state in the trie. This is |
873 | /// because failure transitions always point back to a proper suffix of |
874 | /// what has been seen so far. Thus, following a failure transition after |
875 | /// a match implies looking for a match that starts after the one that has |
876 | /// already been seen, which is of course therefore not the leftmost match. |
877 | /// |
878 | /// N.B. I came up with this algorithm on my own, and after scouring all of |
879 | /// the other AC implementations I know of (Perl, Snort, many on GitHub). |
880 | /// I couldn't find any that implement leftmost semantics like this. |
881 | /// Perl of course needs leftmost-first semantics, but they implement it |
882 | /// with a seeming hack at *search* time instead of encoding it into the |
883 | /// automaton. There are also a couple Java libraries that support leftmost |
884 | /// longest semantics, but they do it by building a queue of matches at |
885 | /// search time, which is even worse than what Perl is doing. ---AG |
fn fill_failure_transitions(&mut self) {
    let is_leftmost = self.builder.match_kind.is_leftmost();
    let start_uid = self.nfa.special.start_unanchored_id;
    // Initialize the queue for breadth first search with all transitions
    // out of the start state. We handle the start state specially because
    // we only want to follow non-self transitions. If we followed self
    // transitions, then this would never terminate.
    let mut queue = VecDeque::new();
    let mut seen = self.queued_set();
    // Iterate by index rather than holding a borrow of 'trans', since the
    // loop body mutates other parts of 'self.nfa.states'.
    for i in 0..self.nfa.states[start_uid].trans.len() {
        let (_, next) = self.nfa.states[start_uid].trans[i];
        // Skip anything we've seen before and any self-transitions on the
        // start state.
        if next == start_uid || seen.contains(next) {
            continue;
        }
        queue.push_back(next);
        seen.insert(next);
        // Under leftmost semantics, if a state immediately following
        // the start state is a match state, then we never want to
        // follow its failure transition since the failure transition
        // necessarily leads back to the start state, which we never
        // want to do for leftmost matching after a match has been
        // found.
        //
        // We apply the same logic to non-start states below as well.
        if is_leftmost && self.nfa.states[next].is_match() {
            self.nfa.states[next].fail = NFA::DEAD;
        }
    }
    while let Some(id) = queue.pop_front() {
        for i in 0..self.nfa.states[id].trans.len() {
            let (b, next) = self.nfa.states[id].trans[i];
            if seen.contains(next) {
                // The only way to visit a duplicate state in a transition
                // list is when ASCII case insensitivity is enabled. In
                // this case, we want to skip it since it's redundant work.
                // But it would also end up duplicating matches, which
                // results in reporting duplicate matches in some cases.
                // See the 'acasei010' regression test.
                continue;
            }
            queue.push_back(next);
            seen.insert(next);

            // As above for start states, under leftmost semantics, once
            // we see a match all subsequent states should have no failure
            // transitions because failure transitions always imply looking
            // for a match that is a suffix of what has been seen so far
            // (where "seen so far" corresponds to the string formed by
            // following the transitions from the start state to the
            // current state). Under leftmost semantics, we specifically do
            // not want to allow this to happen because we always want to
            // report the match found at the leftmost position.
            //
            // The difference between leftmost-first and leftmost-longest
            // occurs previously while we build the trie. For
            // leftmost-first, we simply omit any entries that would
            // otherwise require passing through a match state.
            //
            // Note that for correctness, the failure transition has to be
            // set to the dead state for ALL states following a match, not
            // just the match state itself. However, by setting the failure
            // transition to the dead state on all match states, the dead
            // state will automatically propagate to all subsequent states
            // via the failure state computation below.
            if is_leftmost && self.nfa.states[next].is_match() {
                self.nfa.states[next].fail = NFA::DEAD;
                continue;
            }
            // Standard BFS failure computation: walk up the parent's
            // ('id') failure chain until we find a state that defines a
            // transition on 'b'. This bottoms out because the unanchored
            // start state has a transition for every byte by this point
            // (see 'add_unanchored_start_state_loop', whose docs note that
            // next-state lookups must never return the fail id).
            let mut fail = self.nfa.states[id].fail;
            while self.nfa.states[fail].next_state(b) == NFA::FAIL {
                fail = self.nfa.states[fail].fail;
            }
            fail = self.nfa.states[fail].next_state(b);
            self.nfa.states[next].fail = fail;
            // Matches reachable via 'next's failure transition must also
            // be reported at 'next' itself.
            self.copy_matches(fail, next);
        }
        // If the start state is a match state, then this automaton can
        // match the empty string. This implies all states are match states
        // since every position matches the empty string, so copy the
        // matches from the start state to every state. Strictly speaking,
        // this is only necessary for overlapping matches since each
        // non-empty non-start match state needs to report empty matches
        // in addition to its own. For the non-overlapping case, such
        // states only report the first match, which is never empty since
        // it isn't a start state.
        if !is_leftmost {
            self.copy_matches(self.nfa.special.start_unanchored_id, id);
        }
    }
}
978 | |
979 | /// Shuffle the states so that they appear in this sequence: |
980 | /// |
981 | /// DEAD, FAIL, MATCH..., START, START, NON-MATCH... |
982 | /// |
983 | /// The idea here is that if we know how special states are laid out in our |
984 | /// transition table, then we can determine what "kind" of state we're in |
985 | /// just by comparing our current state ID with a particular value. In this |
986 | /// way, we avoid doing extra memory lookups. |
987 | /// |
988 | /// Before shuffling begins, our states look something like this: |
989 | /// |
990 | /// DEAD, FAIL, START, START, (MATCH | NON-MATCH)... |
991 | /// |
992 | /// So all we need to do is move all of the MATCH states so that they |
993 | /// all appear before any NON-MATCH state, like so: |
994 | /// |
995 | /// DEAD, FAIL, START, START, MATCH... NON-MATCH... |
996 | /// |
997 | /// Then it's just a simple matter of swapping the two START states with |
998 | /// the last two MATCH states. |
999 | /// |
1000 | /// (This is the same technique used for fully compiled DFAs in |
1001 | /// regex-automata.) |
fn shuffle(&mut self) {
    let old_start_uid = self.nfa.special.start_unanchored_id;
    let old_start_aid = self.nfa.special.start_anchored_id;
    assert!(old_start_uid < old_start_aid);
    assert_eq!(
        3,
        old_start_aid.as_usize(),
        "anchored start state should be at index 3"
    );
    // We implement shuffling by a sequence of pairwise swaps of states.
    // Since we have a number of things referencing states via their
    // IDs and swapping them changes their IDs, we need to record every
    // swap we make so that we can remap IDs. The remapper handles this
    // book-keeping for us.
    let mut remapper = Remapper::new(&self.nfa, 0);
    // The way we proceed here is by moving all match states so that
    // they directly follow the start states. So it will go: DEAD, FAIL,
    // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ...
    //
    // To do that, we proceed forward through all states after
    // START-ANCHORED and swap match states so that they appear before all
    // non-match states.
    //
    // Indices 0-3 hold DEAD, FAIL and the two start states (as checked by
    // the assertion above), so index 4 is the first slot a match state
    // can be moved into.
    let mut next_avail = StateID::from(4u8);
    for i in next_avail.as_usize()..self.nfa.states.len() {
        let sid = StateID::new(i).unwrap();
        if !self.nfa.states[sid].is_match() {
            continue;
        }
        remapper.swap(&mut self.nfa, sid, next_avail);
        // The key invariant here is that only non-match states exist
        // between 'next_avail' and 'sid' (with them being potentially
        // equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed
        // to land on the leftmost non-match state. (Unless 'next_avail'
        // and 'sid' are equivalent, in which case, a swap will occur but
        // it is a no-op.)
        next_avail = StateID::new(next_avail.one_more()).unwrap();
    }
    // Now we'd like to move the start states to immediately following the
    // match states. (The start states may themselves be match states, but
    // we'll handle that later.) We arrange the states this way so that we
    // don't necessarily need to check whether a state is a start state or
    // not before checking whether a state is a match state. For example,
    // we'd like to be able to write this as our state machine loop:
    //
    // sid = start()
    // for byte in haystack:
    //     sid = next(sid, byte)
    //     if sid <= nfa.max_start_id:
    //         if sid <= nfa.max_dead_id:
    //             # search complete
    //         elif sid <= nfa.max_match_id:
    //             # found match
    //
    // The important context here is that we might not want to look for
    // start states at all. Namely, if a searcher doesn't have a prefilter,
    // then there is no reason to care about whether we're in a start state
    // or not. And indeed, if we did check for it, this very hot loop would
    // ping pong between the special state handling and the main state
    // transition logic. This in turn stalls the CPU by killing branch
    // prediction.
    //
    // So essentially, we really want to be able to "forget" that start
    // states even exist and this is why we put them at the end.
    //
    // At this point 'next_avail' is one past the last match state, so the
    // two slots just below it become the new start state positions and
    // the slot below those becomes the highest-numbered match state.
    let new_start_aid =
        StateID::new(next_avail.as_usize().checked_sub(1).unwrap())
            .unwrap();
    remapper.swap(&mut self.nfa, old_start_aid, new_start_aid);
    let new_start_uid =
        StateID::new(next_avail.as_usize().checked_sub(2).unwrap())
            .unwrap();
    remapper.swap(&mut self.nfa, old_start_uid, new_start_uid);
    let new_max_match_id =
        StateID::new(next_avail.as_usize().checked_sub(3).unwrap())
            .unwrap();
    self.nfa.special.max_match_id = new_max_match_id;
    self.nfa.special.start_unanchored_id = new_start_uid;
    self.nfa.special.start_anchored_id = new_start_aid;
    // If one start state is a match state, then they both are.
    if self.nfa.states[self.nfa.special.start_anchored_id].is_match() {
        self.nfa.special.max_match_id = self.nfa.special.start_anchored_id;
    }
    // Apply the accumulated swaps to every state ID stored in the NFA.
    remapper.remap(&mut self.nfa);
}
1085 | |
1086 | /// Returns a set that tracked queued states. |
1087 | /// |
1088 | /// This is only necessary when ASCII case insensitivity is enabled, since |
1089 | /// it is the only way to visit the same state twice. Otherwise, this |
1090 | /// returns an inert set that nevers adds anything and always reports |
1091 | /// `false` for every member test. |
1092 | fn queued_set(&self) -> QueuedSet { |
1093 | if self.builder.ascii_case_insensitive { |
1094 | QueuedSet::active() |
1095 | } else { |
1096 | QueuedSet::inert() |
1097 | } |
1098 | } |
1099 | |
1100 | /// Initializes the unanchored start state by making it dense. This is |
1101 | /// achieved by explicitly setting every transition to the FAIL state. |
1102 | /// This isn't necessary for correctness, since any missing transition is |
1103 | /// automatically assumed to be mapped to the FAIL state. We do this to |
1104 | /// make the unanchored starting state dense, and thus in turn make |
1105 | /// transition lookups on it faster. (Which is worth doing because it's |
1106 | /// the most active state.) |
1107 | fn init_unanchored_start_state(&mut self) { |
1108 | let start_uid = self.nfa.special.start_unanchored_id; |
1109 | for byte in 0..=255 { |
1110 | self.nfa.states[start_uid].set_next_state(byte, NFA::FAIL); |
1111 | } |
1112 | } |
1113 | |
1114 | /// Setup the anchored start state by copying all of the transitions and |
1115 | /// matches from the unanchored starting state with one change: the failure |
1116 | /// transition is changed to the DEAD state, so that for any undefined |
1117 | /// transitions, the search will stop. |
1118 | fn set_anchored_start_state(&mut self) { |
1119 | let start_uid = self.nfa.special.start_unanchored_id; |
1120 | let start_aid = self.nfa.special.start_anchored_id; |
1121 | self.nfa.states[start_aid].trans = |
1122 | self.nfa.states[start_uid].trans.clone(); |
1123 | self.copy_matches(start_uid, start_aid); |
1124 | // This is the main difference between the unanchored and anchored |
1125 | // starting states. If a lookup on an anchored starting state fails, |
1126 | // then the search should stop. |
1127 | // |
1128 | // N.B. This assumes that the loop on the unanchored starting state |
1129 | // hasn't been created yet. |
1130 | self.nfa.states[start_aid].fail = NFA::DEAD; |
1131 | } |
1132 | |
1133 | /// Set the failure transitions on the start state to loop back to the |
1134 | /// start state. This effectively permits the Aho-Corasick automaton to |
1135 | /// match at any position. This is also required for finding the next |
1136 | /// state to terminate, namely, finding the next state should never return |
1137 | /// a fail_id. |
1138 | /// |
1139 | /// This must be done after building the initial trie, since trie |
1140 | /// construction depends on transitions to `fail_id` to determine whether a |
1141 | /// state already exists or not. |
1142 | fn add_unanchored_start_state_loop(&mut self) { |
1143 | let start_uid = self.nfa.special.start_unanchored_id; |
1144 | let start = &mut self.nfa.states[start_uid]; |
1145 | for b in 0..=255 { |
1146 | if start.next_state(b) == NFA::FAIL { |
1147 | start.set_next_state(b, start_uid); |
1148 | } |
1149 | } |
1150 | } |
1151 | |
1152 | /// Remove the start state loop by rewriting any transitions on the start |
1153 | /// state back to the start state with transitions to the dead state. |
1154 | /// |
1155 | /// The loop is only closed when two conditions are met: the start state |
1156 | /// is a match state and the match kind is leftmost-first or |
1157 | /// leftmost-longest. |
1158 | /// |
1159 | /// The reason for this is that under leftmost semantics, a start state |
1160 | /// that is also a match implies that we should never restart the search |
1161 | /// process. We allow normal transitions out of the start state, but if |
1162 | /// none exist, we transition to the dead state, which signals that |
1163 | /// searching should stop. |
1164 | fn close_start_state_loop_for_leftmost(&mut self) { |
1165 | let start_uid = self.nfa.special.start_unanchored_id; |
1166 | let start = &mut self.nfa.states[start_uid]; |
1167 | if self.builder.match_kind.is_leftmost() && start.is_match() { |
1168 | for b in 0..=255 { |
1169 | if start.next_state(b) == start_uid { |
1170 | start.set_next_state(b, NFA::DEAD); |
1171 | } |
1172 | } |
1173 | } |
1174 | } |
1175 | |
1176 | /// Sets all transitions on the dead state to point back to the dead state. |
1177 | /// Normally, missing transitions map back to the failure state, but the |
1178 | /// point of the dead state is to act as a sink that can never be escaped. |
1179 | fn add_dead_state_loop(&mut self) { |
1180 | let dead = &mut self.nfa.states[NFA::DEAD]; |
1181 | for b in 0..=255 { |
1182 | dead.set_next_state(b, NFA::DEAD); |
1183 | } |
1184 | } |
1185 | |
1186 | /// Copy matches from the `src` state to the `dst` state. This is useful |
1187 | /// when a match state can be reached via a failure transition. In which |
1188 | /// case, you'll want to copy the matches (if any) from the state reached |
1189 | /// by the failure transition to the original state you were at. |
1190 | fn copy_matches(&mut self, src: StateID, dst: StateID) { |
1191 | let (src, dst) = |
1192 | get_two_mut(&mut self.nfa.states, src.as_usize(), dst.as_usize()); |
1193 | dst.matches.extend_from_slice(&src.matches); |
1194 | } |
1195 | |
1196 | /// Allocate and add a fresh state to the underlying NFA and return its |
1197 | /// ID (guaranteed to be one more than the ID of the previously allocated |
1198 | /// state). If the ID would overflow `StateID`, then this returns an error. |
1199 | fn add_state(&mut self, depth: usize) -> Result<StateID, BuildError> { |
1200 | // This is OK because we error when building the trie if we see a |
1201 | // pattern whose length cannot fit into a 'SmallIndex', and the longest |
1202 | // possible depth corresponds to the length of the longest pattern. |
1203 | let depth = SmallIndex::new(depth) |
1204 | .expect("patterns longer than SmallIndex::MAX are not allowed" ); |
1205 | let id = StateID::new(self.nfa.states.len()).map_err(|e| { |
1206 | BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted()) |
1207 | })?; |
1208 | self.nfa.states.push(State { |
1209 | trans: vec![], |
1210 | matches: vec![], |
1211 | fail: self.nfa.special.start_unanchored_id, |
1212 | depth, |
1213 | }); |
1214 | Ok(id) |
1215 | } |
1216 | |
1217 | /// Computes the total amount of heap used by this NFA in bytes. |
1218 | fn calculate_memory_usage(&mut self) { |
1219 | use core::mem::size_of; |
1220 | |
1221 | for state in self.nfa.states.iter() { |
1222 | self.nfa.memory_usage += size_of::<State>() + state.memory_usage(); |
1223 | } |
1224 | } |
1225 | } |
1226 | |
/// A set of state identifiers used to avoid revisiting the same state multiple
/// times when filling in failure transitions.
///
/// This set has an "inert" and an "active" mode. When inert, the set never
/// stores anything and always returns `false` for every member test. This is
/// useful to avoid the performance and memory overhead of maintaining this
/// set when it is not needed.
#[derive(Debug)]
struct QueuedSet {
    /// `None` corresponds to the inert mode: inserts are dropped and
    /// membership tests always answer `false`. `Some` holds the actual
    /// set of queued state IDs for the active mode.
    set: Option<BTreeSet<StateID>>,
}
1238 | |
1239 | impl QueuedSet { |
1240 | /// Return an inert set that returns `false` for every state ID membership |
1241 | /// test. |
1242 | fn inert() -> QueuedSet { |
1243 | QueuedSet { set: None } |
1244 | } |
1245 | |
1246 | /// Return an active set that tracks state ID membership. |
1247 | fn active() -> QueuedSet { |
1248 | QueuedSet { set: Some(BTreeSet::new()) } |
1249 | } |
1250 | |
1251 | /// Inserts the given state ID into this set. (If the set is inert, then |
1252 | /// this is a no-op.) |
1253 | fn insert(&mut self, state_id: StateID) { |
1254 | if let Some(ref mut set) = self.set { |
1255 | set.insert(state_id); |
1256 | } |
1257 | } |
1258 | |
1259 | /// Returns true if and only if the given state ID is in this set. If the |
1260 | /// set is inert, this always returns false. |
1261 | fn contains(&self, state_id: StateID) -> bool { |
1262 | match self.set { |
1263 | None => false, |
1264 | Some(ref set) => set.contains(&state_id), |
1265 | } |
1266 | } |
1267 | } |
1268 | |
impl core::fmt::Debug for NFA {
    // Renders the full automaton: one line per state (indicator, state
    // ID, failure-transition ID, transitions), the pattern IDs matched at
    // each match state, and then a summary of automaton-wide properties.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        use crate::automaton::fmt_state_indicator;

        writeln!(f, "noncontiguous::NFA(" )?;
        for (sid, state) in self.states.iter().with_state_ids() {
            // The FAIL state doesn't actually have space for a state allocated
            // for it, so we have to treat it as a special case.
            if sid == NFA::FAIL {
                writeln!(f, "F {:06}:" , sid.as_usize())?;
                continue;
            }
            fmt_state_indicator(f, self, sid)?;
            // State ID followed by its failure-transition target ID.
            write!(
                f,
                " {:06}( {:06}): " ,
                sid.as_usize(),
                state.fail.as_usize()
            )?;
            state.fmt(f)?;
            write!(f, " \n" )?;
            if self.is_match(sid) {
                // Comma-separated list of the pattern IDs this state matches.
                write!(f, " matches: " )?;
                for (i, pid) in state.matches.iter().enumerate() {
                    if i > 0 {
                        write!(f, ", " )?;
                    }
                    write!(f, " {}" , pid.as_usize())?;
                }
                write!(f, " \n" )?;
            }
        }
        // Automaton-wide summary, printed after the per-state listing.
        writeln!(f, "match kind: {:?}" , self.match_kind)?;
        writeln!(f, "prefilter: {:?}" , self.prefilter.is_some())?;
        writeln!(f, "state length: {:?}" , self.states.len())?;
        writeln!(f, "pattern length: {:?}" , self.patterns_len())?;
        writeln!(f, "shortest pattern length: {:?}" , self.min_pattern_len)?;
        writeln!(f, "longest pattern length: {:?}" , self.max_pattern_len)?;
        writeln!(f, "memory usage: {:?}" , self.memory_usage())?;
        writeln!(f, ")" )?;
        Ok(())
    }
}
1312 | |
1313 | /// Safely return two mutable borrows to two different locations in the given |
1314 | /// slice. |
1315 | /// |
1316 | /// This panics if i == j. |
// Fix: the previous text contained IDE inlay-hint residue baked into the
// source — pattern "type ascriptions" (`before: &mut [T]`) and a named
// argument (`mid:j`) — neither of which is valid Rust, so the function
// could not compile. Restored to plain tuple destructuring and a normal
// positional call to `split_at_mut`.
fn get_two_mut<T>(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) {
    assert!(i != j, "{} must not be equal to {}", i, j);
    if i < j {
        // Split just before 'j' so the two halves contain 'i' and 'j'
        // respectively; disjoint halves permit two mutable borrows.
        let (before, after) = xs.split_at_mut(j);
        (&mut before[i], &mut after[0])
    } else {
        let (before, after) = xs.split_at_mut(i);
        (&mut after[0], &mut before[j])
    }
}
1327 | |