| 1 | /*! | 
| 2 | Provides a noncontiguous NFA implementation of Aho-Corasick. | 
|---|
| 3 |  | 
|---|
| 4 | This is a low-level API that generally only needs to be used in niche | 
|---|
| 5 | circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) | 
|---|
| 6 | instead of a noncontiguous NFA directly. Using an `NFA` directly is typically | 
|---|
| 7 | only necessary when one needs access to the [`Automaton`] trait implementation. | 
|---|
| 8 | */ | 
|---|
| 9 |  | 
|---|
| 10 | use alloc::{ | 
|---|
| 11 | collections::{BTreeSet, VecDeque}, | 
|---|
| 12 | vec, | 
|---|
| 13 | vec::Vec, | 
|---|
| 14 | }; | 
|---|
| 15 |  | 
|---|
| 16 | use crate::{ | 
|---|
| 17 | automaton::Automaton, | 
|---|
| 18 | util::{ | 
|---|
| 19 | alphabet::{ByteClassSet, ByteClasses}, | 
|---|
| 20 | error::{BuildError, MatchError}, | 
|---|
| 21 | prefilter::{self, opposite_ascii_case, Prefilter}, | 
|---|
| 22 | primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, | 
|---|
| 23 | remapper::Remapper, | 
|---|
| 24 | search::{Anchored, MatchKind}, | 
|---|
| 25 | special::Special, | 
|---|
| 26 | }, | 
|---|
| 27 | }; | 
|---|
| 28 |  | 
|---|
/// A noncontiguous NFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using an `NFA` directly is typically only necessary
/// when one needs access to the [`Automaton`] trait implementation.
///
/// This NFA represents the "core" implementation of Aho-Corasick in this
/// crate. Namely, constructing this NFA involves building a trie and then
/// filling in the failure transitions between states, similar to what is
/// described in any standard textbook description of Aho-Corasick.
///
/// In order to minimize heap usage and to avoid additional construction costs,
/// this implementation represents the transitions of all states as distinct
/// sparse memory allocations. This is where it gets its name from. That is,
/// this NFA has no contiguous memory allocation for its transition table. Each
/// state gets its own allocation.
///
/// While the sparse representation keeps memory usage to somewhat reasonable
/// levels, it is still quite large and also results in somewhat mediocre
/// search performance. For this reason, it is almost always a good idea to
/// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is
/// marginally slower to build, but has higher throughput and can sometimes use
/// an order of magnitude less memory. The main reason to use a noncontiguous
/// NFA is when you need the fastest possible construction time, or when a
/// contiguous NFA does not have the desired capacity. (The total number of NFA
/// states it can have is fewer than a noncontiguous NFA.)
///
/// # Example
///
/// This example shows how to build an `NFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     nfa::noncontiguous::NFA,
///     Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let nfa = NFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     nfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive(Clone)]
pub struct NFA {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State>,
    /// Transitions stored in a sparse representation via a linked list.
    ///
    /// Each transition contains three pieces of information: the byte it
    /// is defined for, the state it transitions to and a link to the next
    /// transition in the same state (or `StateID::ZERO` if it is the last
    /// transition).
    ///
    /// The first transition for each state is determined by `State::sparse`.
    ///
    /// Note that this contains a complete set of all transitions in this NFA,
    /// including states that have a dense representation for transitions.
    /// (Adding dense transitions for a state doesn't remove its sparse
    /// transitions, since deleting transitions from this particular sparse
    /// representation would be fairly expensive.)
    sparse: Vec<Transition>,
    /// Transitions stored in a dense representation.
    ///
    /// A state has a row in this table if and only if `State::dense` is
    /// not equal to `StateID::ZERO`. When not zero, there are precisely
    /// `NFA::byte_classes::alphabet_len()` entries beginning at `State::dense`
    /// in this table.
    ///
    /// Generally a very small minority of states have a dense representation
    /// since it uses so much memory.
    dense: Vec<StateID>,
    /// Matches stored in linked list for each state.
    ///
    /// Like sparse transitions, each match has a link to the next match in the
    /// state.
    ///
    /// The first match for each state is determined by `State::matches`.
    matches: Vec<Match>,
    /// The length, in bytes, of each pattern in this NFA. This slice is
    /// indexed by `PatternID`.
    ///
    /// The number of entries in this vector corresponds to the total number of
    /// patterns in this automaton.
    pattern_lens: Vec<SmallIndex>,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<Prefilter>,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// The length, in bytes, of the shortest pattern in this automaton. This
    /// information is useful for detecting whether an automaton matches the
    /// empty string or not.
    min_pattern_len: usize,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The information required to deduce which states are "special" in this
    /// NFA.
    ///
    /// Since the DEAD and FAIL states are always the first two states and
    /// there are only ever two start states (which follow all of the match
    /// states), it follows that we can determine whether a state is a fail,
    /// dead, match or start with just a few comparisons on the ID itself:
    ///
    ///    is_dead(sid): sid == NFA::DEAD
    ///    is_fail(sid): sid == NFA::FAIL
    ///   is_match(sid): NFA::FAIL < sid && sid <= max_match_id
    ///   is_start(sid): sid == start_unanchored_id || sid == start_anchored_id
    ///
    /// Note that this only applies to the NFA after it has been constructed.
    /// During construction, the start states are the first ones added and the
    /// match states are inter-leaved with non-match states. Once all of the
    /// states have been added, the states are shuffled such that the above
    /// predicates hold.
    special: Special,
}
|---|
| 176 |  | 
|---|
| 177 | impl NFA { | 
|---|
| 178 | /// Create a new Aho-Corasick noncontiguous NFA using the default | 
|---|
| 179 | /// configuration. | 
|---|
| 180 | /// | 
|---|
| 181 | /// Use a [`Builder`] if you want to change the configuration. | 
|---|
| 182 | pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError> | 
|---|
| 183 | where | 
|---|
| 184 | I: IntoIterator<Item = P>, | 
|---|
| 185 | P: AsRef<[u8]>, | 
|---|
| 186 | { | 
|---|
| 187 | NFA::builder().build(patterns) | 
|---|
| 188 | } | 
|---|
| 189 |  | 
|---|
| 190 | /// A convenience method for returning a new Aho-Corasick noncontiguous NFA | 
|---|
| 191 | /// builder. | 
|---|
| 192 | /// | 
|---|
| 193 | /// This usually permits one to just import the `NFA` type. | 
|---|
| 194 | pub fn builder() -> Builder { | 
|---|
| 195 | Builder::new() | 
|---|
| 196 | } | 
|---|
| 197 | } | 
|---|
| 198 |  | 
|---|
| 199 | impl NFA { | 
|---|
    /// The DEAD state is a sentinel state like the FAIL state. The DEAD state
    /// instructs any search to stop and return any currently recorded match,
    /// or no match otherwise. Generally speaking, it is impossible for an
    /// unanchored standard search to enter a DEAD state. But an anchored
    /// search can, and so too can a leftmost search.
    ///
    /// We put DEAD before FAIL so that DEAD is always 0. We repeat this
    /// decision across the other Aho-Corasick automata, so that DEAD
    /// states there are always 0 too. It's not that we need all of the
    /// implementations to agree, but rather, the contiguous NFA and the DFA
    /// use a sort of "premultiplied" state identifier where the only state
    /// whose ID is always known and constant is the first state. Subsequent
    /// state IDs depend on how much space has already been used in the
    /// transition table.
    pub(crate) const DEAD: StateID = StateID::new_unchecked(0);
    /// The FAIL state mostly just corresponds to the ID of any transition on a
    /// state that isn't explicitly defined. When one transitions into the FAIL
    /// state, one must follow the previous state's failure transition before
    /// doing the next state lookup. In this way, FAIL is more of a sentinel
    /// than a state that one actually transitions into. In particular, it is
    /// never exposed in the `Automaton` interface.
    pub(crate) const FAIL: StateID = StateID::new_unchecked(1);
|---|
| 222 |  | 
|---|
    /// Returns the equivalence classes of bytes found while constructing
    /// this NFA.
    ///
    /// Note that the NFA doesn't actually make use of these equivalence
    /// classes. Instead, these are useful for building the DFA when desired.
    pub(crate) fn byte_classes(&self) -> &ByteClasses {
        &self.byte_classes
    }

    /// Returns a slice containing the length of each pattern in this searcher.
    /// It is indexed by `PatternID` and has length `NFA::patterns_len`.
    ///
    /// This is exposed for convenience when building a contiguous NFA. But it
    /// can be reconstructed from the `Automaton` API if necessary.
    pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] {
        &self.pattern_lens
    }

    /// Returns a slice of all states in this non-contiguous NFA.
    pub(crate) fn states(&self) -> &[State] {
        &self.states
    }

    /// Returns the underlying "special" state information for this NFA.
    pub(crate) fn special(&self) -> &Special {
        &self.special
    }
|---|
| 250 |  | 
|---|
    /// Swaps the states at `id1` and `id2`.
    ///
    /// This does not update the transitions of any state to account for the
    /// state swap. Callers are expected to fix up any IDs referring to the
    /// swapped states afterward (e.g., via `NFA::remap`).
    pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
        self.states.swap(id1.as_usize(), id2.as_usize());
    }
|---|
| 258 |  | 
|---|
    /// Re-maps all state IDs in this NFA according to the `map` function
    /// given.
    ///
    /// This rewrites every place a state ID appears as a *target*: each
    /// state's failure transition, the `next` field of every sparse
    /// transition and every entry in a state's dense row (when present).
    pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
        let alphabet_len = self.byte_classes.alphabet_len();
        for state in self.states.iter_mut() {
            state.fail = map(state.fail);
            // Walk this state's sparse transition chain and remap each
            // transition's target. The `link` values themselves are indices
            // into `self.sparse` (not state IDs), so they are left untouched.
            let mut link = state.sparse;
            while link != StateID::ZERO {
                let t = &mut self.sparse[link];
                t.next = map(t.next);
                link = t.link;
            }
            // If this state has been densified, remap its dense row too. The
            // row occupies `alphabet_len` entries starting at `state.dense`.
            if state.dense != StateID::ZERO {
                let start = state.dense.as_usize();
                for next in self.dense[start..][..alphabet_len].iter_mut() {
                    *next = map(*next);
                }
            }
        }
    }
|---|
| 279 |  | 
|---|
| 280 | /// Iterate over all of the transitions for the given state ID. | 
|---|
| 281 | pub(crate) fn iter_trans( | 
|---|
| 282 | &self, | 
|---|
| 283 | sid: StateID, | 
|---|
| 284 | ) -> impl Iterator<Item = Transition> + '_ { | 
|---|
| 285 | let mut link = self.states[sid].sparse; | 
|---|
| 286 | core::iter::from_fn(move || { | 
|---|
| 287 | if link == StateID::ZERO { | 
|---|
| 288 | return None; | 
|---|
| 289 | } | 
|---|
| 290 | let t = self.sparse[link]; | 
|---|
| 291 | link = t.link; | 
|---|
| 292 | Some(t) | 
|---|
| 293 | }) | 
|---|
| 294 | } | 
|---|
| 295 |  | 
|---|
| 296 | /// Iterate over all of the matches for the given state ID. | 
|---|
| 297 | pub(crate) fn iter_matches( | 
|---|
| 298 | &self, | 
|---|
| 299 | sid: StateID, | 
|---|
| 300 | ) -> impl Iterator<Item = PatternID> + '_ { | 
|---|
| 301 | let mut link = self.states[sid].matches; | 
|---|
| 302 | core::iter::from_fn(move || { | 
|---|
| 303 | if link == StateID::ZERO { | 
|---|
| 304 | return None; | 
|---|
| 305 | } | 
|---|
| 306 | let m = self.matches[link]; | 
|---|
| 307 | link = m.link; | 
|---|
| 308 | Some(m.pid) | 
|---|
| 309 | }) | 
|---|
| 310 | } | 
|---|
| 311 |  | 
|---|
| 312 | /// Return the link following the one given. If the one given is the last | 
|---|
| 313 | /// link for the given state, then return `None`. | 
|---|
| 314 | /// | 
|---|
| 315 | /// If no previous link is given, then this returns the first link in the | 
|---|
| 316 | /// state, if one exists. | 
|---|
| 317 | /// | 
|---|
| 318 | /// This is useful for manually iterating over the transitions in a single | 
|---|
| 319 | /// state without borrowing the NFA. This permits mutating other parts of | 
|---|
| 320 | /// the NFA during iteration. Namely, one can access the transition pointed | 
|---|
| 321 | /// to by the link via `self.sparse[link]`. | 
|---|
| 322 | fn next_link( | 
|---|
| 323 | &self, | 
|---|
| 324 | sid: StateID, | 
|---|
| 325 | prev: Option<StateID>, | 
|---|
| 326 | ) -> Option<StateID> { | 
|---|
| 327 | let link = | 
|---|
| 328 | prev.map_or(self.states[sid].sparse, |p| self.sparse[p].link); | 
|---|
| 329 | if link == StateID::ZERO { | 
|---|
| 330 | None | 
|---|
| 331 | } else { | 
|---|
| 332 | Some(link) | 
|---|
| 333 | } | 
|---|
| 334 | } | 
|---|
| 335 |  | 
|---|
| 336 | /// Follow the transition for the given byte in the given state. If no such | 
|---|
| 337 | /// transition exists, then the FAIL state ID is returned. | 
|---|
| 338 | #[ inline(always)] | 
|---|
| 339 | fn follow_transition(&self, sid: StateID, byte: u8) -> StateID { | 
|---|
| 340 | let s = &self.states[sid]; | 
|---|
| 341 | // This is a special case that targets starting states and states | 
|---|
| 342 | // near a start state. Namely, after the initial trie is constructed, | 
|---|
| 343 | // we look for states close to the start state to convert to a dense | 
|---|
| 344 | // representation for their transitions. This winds up using a lot more | 
|---|
| 345 | // memory per state in exchange for faster transition lookups. But | 
|---|
| 346 | // since we only do this for a small number of states (by default), the | 
|---|
| 347 | // memory usage is usually minimal. | 
|---|
| 348 | // | 
|---|
| 349 | // This has *massive* benefit when executing searches because the | 
|---|
| 350 | // unanchored starting state is by far the hottest state and is | 
|---|
| 351 | // frequently visited. Moreover, the 'for' loop below that works | 
|---|
| 352 | // decently on an actually sparse state is disastrous on a state that | 
|---|
| 353 | // is nearly or completely dense. | 
|---|
| 354 | if s.dense == StateID::ZERO { | 
|---|
| 355 | self.follow_transition_sparse(sid, byte) | 
|---|
| 356 | } else { | 
|---|
| 357 | let class = usize::from(self.byte_classes.get(byte)); | 
|---|
| 358 | self.dense[s.dense.as_usize() + class] | 
|---|
| 359 | } | 
|---|
| 360 | } | 
|---|
| 361 |  | 
|---|
| 362 | /// Like `follow_transition`, but always uses the sparse representation. | 
|---|
| 363 | #[ inline(always)] | 
|---|
| 364 | fn follow_transition_sparse(&self, sid: StateID, byte: u8) -> StateID { | 
|---|
| 365 | for t in self.iter_trans(sid) { | 
|---|
| 366 | if byte <= t.byte { | 
|---|
| 367 | if byte == t.byte { | 
|---|
| 368 | return t.next; | 
|---|
| 369 | } | 
|---|
| 370 | break; | 
|---|
| 371 | } | 
|---|
| 372 | } | 
|---|
| 373 | NFA::FAIL | 
|---|
| 374 | } | 
|---|
| 375 |  | 
|---|
    /// Set the transition for the given byte to the state ID given.
    ///
    /// Note that one should not set transitions to the FAIL state. It is not
    /// technically incorrect, but it wastes space. If a transition is not
    /// defined, then it is automatically assumed to lead to the FAIL state.
    fn add_transition(
        &mut self,
        prev: StateID,
        byte: u8,
        next: StateID,
    ) -> Result<(), BuildError> {
        // If the state has a dense row, keep it in sync. The sparse chain
        // below is updated regardless, since it remains the canonical set of
        // transitions for every state.
        if self.states[prev].dense != StateID::ZERO {
            let dense = self.states[prev].dense;
            let class = usize::from(self.byte_classes.get(byte));
            self.dense[dense.as_usize() + class] = next;
        }

        // The sparse chain is kept sorted in ascending order by byte. First
        // handle the cases where the head of the chain itself must change:
        // an empty chain, a new byte sorting before the current head, or an
        // exact match on the head's byte (overwrite its target).
        let head = self.states[prev].sparse;
        if head == StateID::ZERO || byte < self.sparse[head].byte {
            let new_link = self.alloc_transition()?;
            self.sparse[new_link] = Transition { byte, next, link: head };
            self.states[prev].sparse = new_link;
            return Ok(());
        } else if byte == self.sparse[head].byte {
            self.sparse[head].next = next;
            return Ok(());
        }

        // We handled the only cases where the beginning of the transition
        // chain needs to change. At this point, we now know that there is
        // at least one entry in the transition chain and the byte for that
        // transition is less than the byte for the transition we're adding.
        // Scan for the insertion point.
        let (mut link_prev, mut link_next) = (head, self.sparse[head].link);
        while link_next != StateID::ZERO && byte > self.sparse[link_next].byte
        {
            link_prev = link_next;
            link_next = self.sparse[link_next].link;
        }
        if link_next == StateID::ZERO || byte < self.sparse[link_next].byte {
            // No existing transition for this byte: splice a fresh one in
            // between `link_prev` and `link_next`.
            let link = self.alloc_transition()?;
            self.sparse[link] = Transition { byte, next, link: link_next };
            self.sparse[link_prev].link = link;
        } else {
            // A transition for this byte already exists: overwrite its
            // target in place.
            assert_eq!(byte, self.sparse[link_next].byte);
            self.sparse[link_next].next = next;
        }
        Ok(())
    }
|---|
| 424 |  | 
|---|
    /// This sets every possible transition (all 256 of them) for the given
    /// state to the same `next` value.
    ///
    /// This is useful for efficiently initializing start/dead states.
    ///
    /// # Panics
    ///
    /// This requires that the state has no transitions added to it already.
    /// If it has any transitions, then this panics. It will also panic if
    /// the state has been densified prior to calling this.
    fn init_full_state(
        &mut self,
        prev: StateID,
        next: StateID,
    ) -> Result<(), BuildError> {
        assert_eq!(
            StateID::ZERO,
            self.states[prev].dense,
            "state must not be dense yet"
        );
        assert_eq!(
            StateID::ZERO,
            self.states[prev].sparse,
            "state must have zero transitions"
        );
        // Build the chain in ascending byte order by appending each new
        // transition at the tail, tracked via `prev_link`.
        let mut prev_link = StateID::ZERO;
        for byte in 0..=255 {
            let new_link = self.alloc_transition()?;
            self.sparse[new_link] =
                Transition { byte, next, link: StateID::ZERO };
            if prev_link == StateID::ZERO {
                // First transition: it becomes the head of the chain.
                self.states[prev].sparse = new_link;
            } else {
                self.sparse[prev_link].link = new_link;
            }
            prev_link = new_link;
        }
        Ok(())
    }
|---|
| 464 |  | 
|---|
    /// Add a match for the given pattern ID to the state for the given ID.
    fn add_match(
        &mut self,
        sid: StateID,
        pid: PatternID,
    ) -> Result<(), BuildError> {
        // Find the tail of this state's match chain. When the state has no
        // matches yet, `head` is ZERO and the loop reads `self.matches[0]`;
        // NOTE(review): this presumably relies on index 0 of `self.matches`
        // being a sentinel entry whose link is ZERO — confirm it is seeded
        // during NFA construction (not visible in this chunk).
        let head = self.states[sid].matches;
        let mut link = head;
        while self.matches[link].link != StateID::ZERO {
            link = self.matches[link].link;
        }
        let new_match_link = self.alloc_match()?;
        self.matches[new_match_link].pid = pid;
        // Either start the chain or append to its tail.
        if link == StateID::ZERO {
            self.states[sid].matches = new_match_link;
        } else {
            self.matches[link].link = new_match_link;
        }
        Ok(())
    }
|---|
| 485 |  | 
|---|
    /// Copy matches from the `src` state to the `dst` state. This is useful
    /// when a match state can be reached via a failure transition. In which
    /// case, you'll want to copy the matches (if any) from the state reached
    /// by the failure transition to the original state you were at.
    fn copy_matches(
        &mut self,
        src: StateID,
        dst: StateID,
    ) -> Result<(), BuildError> {
        // Find the tail of dst's match chain so the copies can be appended.
        let head_dst = self.states[dst].matches;
        let mut link_dst = head_dst;
        while self.matches[link_dst].link != StateID::ZERO {
            link_dst = self.matches[link_dst].link;
        }
        // Walk src's chain and append a copy of each match entry to dst.
        // The allocation is inlined here (rather than calling `alloc_match`)
        // so the copied pattern ID can be written in the same push.
        let mut link_src = self.states[src].matches;
        while link_src != StateID::ZERO {
            let new_match_link =
                StateID::new(self.matches.len()).map_err(|e| {
                    BuildError::state_id_overflow(
                        StateID::MAX.as_u64(),
                        e.attempted(),
                    )
                })?;
            self.matches.push(Match {
                pid: self.matches[link_src].pid,
                link: StateID::ZERO,
            });
            // Either start dst's chain or extend its tail.
            if link_dst == StateID::ZERO {
                self.states[dst].matches = new_match_link;
            } else {
                self.matches[link_dst].link = new_match_link;
            }

            link_dst = new_match_link;
            link_src = self.matches[link_src].link;
        }
        Ok(())
    }
|---|
| 524 |  | 
|---|
    /// Create a new entry in `NFA::sparse`, if there's room, and return that
    /// entry's ID. If there's no room, then an error is returned.
    fn alloc_transition(&mut self) -> Result<StateID, BuildError> {
        // The new entry's ID is the current length of the table, i.e., the
        // index of the slot pushed below.
        let id = StateID::new(self.sparse.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.sparse.push(Transition::default());
        Ok(id)
    }

    /// Create a new entry in `NFA::matches`, if there's room, and return that
    /// entry's ID. If there's no room, then an error is returned.
    fn alloc_match(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.matches.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.matches.push(Match::default());
        Ok(id)
    }
|---|
| 544 |  | 
|---|
    /// Create a new set of `N` transitions in this NFA's dense transition
    /// table. The ID return corresponds to the index at which the `N`
    /// transitions begin. So `id+0` is the first transition and `id+(N-1)` is
    /// the last.
    ///
    /// `N` is determined via `NFA::byte_classes::alphabet_len`.
    fn alloc_dense_state(&mut self) -> Result<StateID, BuildError> {
        // The new row starts at the current end of the dense table.
        let id = StateID::new(self.dense.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        // We use FAIL because it's the correct default. If a state doesn't
        // have a transition defined for every possible byte value, then the
        // transition function should return NFA::FAIL.
        self.dense.extend(
            core::iter::repeat(NFA::FAIL)
                .take(self.byte_classes.alphabet_len()),
        );
        Ok(id)
    }
|---|
| 564 |  | 
|---|
    /// Allocate and add a fresh state to the underlying NFA and return its
    /// ID (guaranteed to be one more than the ID of the previously allocated
    /// state). If the ID would overflow `StateID`, then this returns an error.
    fn alloc_state(&mut self, depth: usize) -> Result<StateID, BuildError> {
        // This is OK because we error when building the trie if we see a
        // pattern whose length cannot fit into a 'SmallIndex', and the longest
        // possible depth corresponds to the length of the longest pattern.
        let depth = SmallIndex::new(depth)
            .expect("patterns longer than SmallIndex::MAX are not allowed");
        let id = StateID::new(self.states.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        // A fresh state starts with no transitions, no dense row and no
        // matches; its failure transition defaults to the unanchored start
        // state.
        self.states.push(State {
            sparse: StateID::ZERO,
            dense: StateID::ZERO,
            matches: StateID::ZERO,
            fail: self.special.start_unanchored_id,
            depth,
        });
        Ok(id)
    }
|---|
| 586 | } | 
|---|
| 587 |  | 
|---|
| 588 | // SAFETY: 'start_state' always returns a valid state ID, 'next_state' always | 
|---|
| 589 | // returns a valid state ID given a valid state ID. We otherwise claim that | 
|---|
| 590 | // all other methods are correct as well. | 
|---|
| 591 | unsafe impl Automaton for NFA { | 
|---|
| 592 | #[ inline(always)] | 
|---|
| 593 | fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { | 
|---|
| 594 | match anchored { | 
|---|
| 595 | Anchored::No => Ok(self.special.start_unanchored_id), | 
|---|
| 596 | Anchored::Yes => Ok(self.special.start_anchored_id), | 
|---|
| 597 | } | 
|---|
| 598 | } | 
|---|
| 599 |  | 
|---|
    #[inline(always)]
    fn next_state(
        &self,
        anchored: Anchored,
        mut sid: StateID,
        byte: u8,
    ) -> StateID {
        // This terminates since:
        //
        // 1. state.fail never points to the FAIL state.
        // 2. All state.fail values point to a state closer to the start state.
        // 3. The start state has no transitions to the FAIL state.
        loop {
            let next = self.follow_transition(sid, byte);
            if next != NFA::FAIL {
                return next;
            }
            // For an anchored search, we never follow failure transitions
            // because failure transitions lead us down a path to matching
            // a *proper* suffix of the path we were on. Thus, it can only
            // produce matches that appear after the beginning of the search.
            if anchored.is_anchored() {
                return NFA::DEAD;
            }
            // NOTE(review): `remap` accesses `fail` as a field; this uses a
            // `fail()` accessor — confirm `State` defines it (its definition
            // is outside this chunk).
            sid = self.states[sid].fail();
        }
    }
|---|
| 627 |  | 
|---|
| 628 | #[ inline(always)] | 
|---|
| 629 | fn is_special(&self, sid: StateID) -> bool { | 
|---|
| 630 | sid <= self.special.max_special_id | 
|---|
| 631 | } | 
|---|
| 632 |  | 
|---|
| 633 | #[ inline(always)] | 
|---|
| 634 | fn is_dead(&self, sid: StateID) -> bool { | 
|---|
| 635 | sid == NFA::DEAD | 
|---|
| 636 | } | 
|---|
| 637 |  | 
|---|
| 638 | #[ inline(always)] | 
|---|
| 639 | fn is_match(&self, sid: StateID) -> bool { | 
|---|
| 640 | // N.B. This returns true when sid==NFA::FAIL but that's okay because | 
|---|
| 641 | // NFA::FAIL is not actually a valid state ID from the perspective of | 
|---|
| 642 | // the Automaton trait. Namely, it is never returned by 'start_state' | 
|---|
| 643 | // or by 'next_state'. So we don't need to care about it here. | 
|---|
| 644 | !self.is_dead(sid) && sid <= self.special.max_match_id | 
|---|
| 645 | } | 
|---|
| 646 |  | 
|---|
| 647 | #[ inline(always)] | 
|---|
| 648 | fn is_start(&self, sid: StateID) -> bool { | 
|---|
| 649 | sid == self.special.start_unanchored_id | 
|---|
| 650 | || sid == self.special.start_anchored_id | 
|---|
| 651 | } | 
|---|
| 652 |  | 
|---|
| 653 | #[ inline(always)] | 
|---|
| 654 | fn match_kind(&self) -> MatchKind { | 
|---|
| 655 | self.match_kind | 
|---|
| 656 | } | 
|---|
| 657 |  | 
|---|
| 658 | #[ inline(always)] | 
|---|
| 659 | fn patterns_len(&self) -> usize { | 
|---|
| 660 | self.pattern_lens.len() | 
|---|
| 661 | } | 
|---|
| 662 |  | 
|---|
| 663 | #[ inline(always)] | 
|---|
| 664 | fn pattern_len(&self, pid: PatternID) -> usize { | 
|---|
| 665 | self.pattern_lens[pid].as_usize() | 
|---|
| 666 | } | 
|---|
| 667 |  | 
|---|
| 668 | #[ inline(always)] | 
|---|
| 669 | fn min_pattern_len(&self) -> usize { | 
|---|
| 670 | self.min_pattern_len | 
|---|
| 671 | } | 
|---|
| 672 |  | 
|---|
| 673 | #[ inline(always)] | 
|---|
| 674 | fn max_pattern_len(&self) -> usize { | 
|---|
| 675 | self.max_pattern_len | 
|---|
| 676 | } | 
|---|
| 677 |  | 
|---|
| 678 | #[ inline(always)] | 
|---|
| 679 | fn match_len(&self, sid: StateID) -> usize { | 
|---|
| 680 | self.iter_matches(sid).count() | 
|---|
| 681 | } | 
|---|
| 682 |  | 
|---|
| 683 | #[ inline(always)] | 
|---|
| 684 | fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { | 
|---|
| 685 | self.iter_matches(sid).nth(index).unwrap() | 
|---|
| 686 | } | 
|---|
| 687 |  | 
|---|
| 688 | #[ inline(always)] | 
|---|
| 689 | fn memory_usage(&self) -> usize { | 
|---|
| 690 | self.states.len() * core::mem::size_of::<State>() | 
|---|
| 691 | + self.sparse.len() * core::mem::size_of::<Transition>() | 
|---|
| 692 | + self.matches.len() * core::mem::size_of::<Match>() | 
|---|
| 693 | + self.dense.len() * StateID::SIZE | 
|---|
| 694 | + self.pattern_lens.len() * SmallIndex::SIZE | 
|---|
| 695 | + self.prefilter.as_ref().map_or(0, |p| p.memory_usage()) | 
|---|
| 696 | } | 
|---|
| 697 |  | 
|---|
| 698 | #[ inline(always)] | 
|---|
| 699 | fn prefilter(&self) -> Option<&Prefilter> { | 
|---|
| 700 | self.prefilter.as_ref() | 
|---|
| 701 | } | 
|---|
| 702 | } | 
|---|
| 703 |  | 
|---|
/// A representation of a sparse NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte
/// and the matches implied by visiting this state (if any).
#[derive(Clone, Debug)]
pub(crate) struct State {
    /// A pointer to `NFA::sparse` corresponding to the head of a linked list
    /// containing all of the transitions for this state.
    ///
    /// This is `StateID::ZERO` if and only if this state has zero transitions.
    sparse: StateID,
    /// A pointer to a row of `N` transitions in `NFA::dense`. These
    /// transitions correspond precisely to what is obtained by traversing
    /// `sparse`, but permits constant time lookup.
    ///
    /// When this is zero (which is true for most states in the default
    /// configuration), then this state has no dense representation.
    ///
    /// Note that `N` is equal to `NFA::byte_classes::alphabet_len()`. This is
    /// typically much less than 256 (the maximum value).
    dense: StateID,
    /// A pointer to `NFA::matches` corresponding to the head of a linked list
    /// containing all of the matches for this state.
    ///
    /// This is `StateID::ZERO` if and only if this state is not a match state.
    matches: StateID,
    /// The state that should be transitioned to if the current byte in the
    /// haystack does not have a corresponding transition defined in this
    /// state.
    fail: StateID,
    /// The depth of this state. Specifically, this is the distance from this
    /// state to the starting state. (For the special sentinel states DEAD and
    /// FAIL, their depth is always 0.) The depth of a starting state is 0.
    ///
    /// Note that depth is currently not used in this non-contiguous NFA. It
    /// may in the future, but it is used in the contiguous NFA. Namely, it
    /// permits an optimization where states near the starting state have their
    /// transitions stored in a dense fashion, but all other states have their
    /// transitions stored in a sparse fashion. (This non-contiguous NFA uses
    /// a sparse representation for all states unconditionally.) In any case,
    /// this is really the only convenient place to compute and store this
    /// information, which we need when building the contiguous NFA.
    depth: SmallIndex,
}
|---|
| 749 |  | 
|---|
| 750 | impl State { | 
|---|
| 751 | /// Return true if and only if this state is a match state. | 
|---|
| 752 | pub(crate) fn is_match(&self) -> bool { | 
|---|
| 753 | self.matches != StateID::ZERO | 
|---|
| 754 | } | 
|---|
| 755 |  | 
|---|
| 756 | /// Returns the failure transition for this state. | 
|---|
| 757 | pub(crate) fn fail(&self) -> StateID { | 
|---|
| 758 | self.fail | 
|---|
| 759 | } | 
|---|
| 760 |  | 
|---|
| 761 | /// Returns the depth of this state. That is, the number of transitions | 
|---|
| 762 | /// this state is from the start state of the NFA. | 
|---|
| 763 | pub(crate) fn depth(&self) -> SmallIndex { | 
|---|
| 764 | self.depth | 
|---|
| 765 | } | 
|---|
| 766 | } | 
|---|
| 767 |  | 
|---|
/// A single transition in a non-contiguous NFA.
///
/// The transitions of a single state form a singly linked list threaded
/// through `NFA::sparse` via the `link` field.
#[derive(Clone, Copy, Default)]
#[repr(packed)]
pub(crate) struct Transition {
    /// The haystack byte for which this transition is defined.
    byte: u8,
    /// The ID of the state this transition points to.
    next: StateID,
    /// The index into `NFA::sparse` of the next transition in this state's
    /// linked list. `StateID::ZERO` marks the end of the list, which works
    /// because index 0 holds a dummy transition that no valid link refers to.
    link: StateID,
}
|---|
| 776 |  | 
|---|
| 777 | impl Transition { | 
|---|
| 778 | /// Return the byte for which this transition is defined. | 
|---|
| 779 | pub(crate) fn byte(&self) -> u8 { | 
|---|
| 780 | self.byte | 
|---|
| 781 | } | 
|---|
| 782 |  | 
|---|
| 783 | /// Return the ID of the state that this transition points to. | 
|---|
| 784 | pub(crate) fn next(&self) -> StateID { | 
|---|
| 785 | self.next | 
|---|
| 786 | } | 
|---|
| 787 |  | 
|---|
| 788 | /// Return the ID of the next transition. | 
|---|
| 789 | fn link(&self) -> StateID { | 
|---|
| 790 | self.link | 
|---|
| 791 | } | 
|---|
| 792 | } | 
|---|
| 793 |  | 
|---|
| 794 | impl core::fmt::Debug for Transition { | 
|---|
| 795 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { | 
|---|
| 796 | write!( | 
|---|
| 797 | f, | 
|---|
| 798 | "Transition(byte: {:X?} , next: {:?} , link: {:?} )", | 
|---|
| 799 | self.byte, | 
|---|
| 800 | self.next().as_usize(), | 
|---|
| 801 | self.link().as_usize() | 
|---|
| 802 | ) | 
|---|
| 803 | } | 
|---|
| 804 | } | 
|---|
| 805 |  | 
|---|
/// A single match in a non-contiguous NFA.
///
/// The matches of a single state form a singly linked list threaded through
/// `NFA::matches` via the `link` field.
#[derive(Clone, Copy, Default)]
struct Match {
    /// The ID of the pattern that matches when the owning state is visited.
    pid: PatternID,
    /// The index into `NFA::matches` of the next match in this state's
    /// linked list. `StateID::ZERO` marks the end of the list, which works
    /// because index 0 holds a dummy match that no valid link refers to.
    link: StateID,
}
|---|
| 812 |  | 
|---|
| 813 | impl Match { | 
|---|
| 814 | /// Return the pattern ID for this match. | 
|---|
| 815 | pub(crate) fn pattern(&self) -> PatternID { | 
|---|
| 816 | self.pid | 
|---|
| 817 | } | 
|---|
| 818 |  | 
|---|
| 819 | /// Return the ID of the next match. | 
|---|
| 820 | fn link(&self) -> StateID { | 
|---|
| 821 | self.link | 
|---|
| 822 | } | 
|---|
| 823 | } | 
|---|
| 824 |  | 
|---|
| 825 | impl core::fmt::Debug for Match { | 
|---|
| 826 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { | 
|---|
| 827 | write!( | 
|---|
| 828 | f, | 
|---|
| 829 | "Match(pid: {:?} , link: {:?} )", | 
|---|
| 830 | self.pattern().as_usize(), | 
|---|
| 831 | self.link().as_usize() | 
|---|
| 832 | ) | 
|---|
| 833 | } | 
|---|
| 834 | } | 
|---|
| 835 |  | 
|---|
/// A builder for configuring an Aho-Corasick noncontiguous NFA.
///
/// This builder has a subset of the options available to a
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive(Clone, Debug)]
pub struct Builder {
    /// The match semantics to use, e.g., standard or leftmost-first.
    match_kind: MatchKind,
    /// Whether to attempt to build a prefilter from the patterns.
    prefilter: bool,
    /// Whether to add patterns in an ASCII case insensitive fashion.
    ascii_case_insensitive: bool,
    /// States whose depth is less than this limit get a dense representation
    /// of their transitions (in addition to the sparse one).
    dense_depth: usize,
}
|---|
| 848 |  | 
|---|
| 849 | impl Default for Builder { | 
|---|
| 850 | fn default() -> Builder { | 
|---|
| 851 | Builder { | 
|---|
| 852 | match_kind: MatchKind::default(), | 
|---|
| 853 | prefilter: true, | 
|---|
| 854 | ascii_case_insensitive: false, | 
|---|
| 855 | dense_depth: 3, | 
|---|
| 856 | } | 
|---|
| 857 | } | 
|---|
| 858 | } | 
|---|
| 859 |  | 
|---|
| 860 | impl Builder { | 
|---|
| 861 | /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA. | 
|---|
| 862 | pub fn new() -> Builder { | 
|---|
| 863 | Builder::default() | 
|---|
| 864 | } | 
|---|
| 865 |  | 
|---|
| 866 | /// Build an Aho-Corasick noncontiguous NFA from the given iterator of | 
|---|
| 867 | /// patterns. | 
|---|
| 868 | /// | 
|---|
| 869 | /// A builder may be reused to create more NFAs. | 
|---|
| 870 | pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError> | 
|---|
| 871 | where | 
|---|
| 872 | I: IntoIterator<Item = P>, | 
|---|
| 873 | P: AsRef<[u8]>, | 
|---|
| 874 | { | 
|---|
| 875 | debug!( "building non-contiguous NFA"); | 
|---|
| 876 | let nfa = Compiler::new(self)?.compile(patterns)?; | 
|---|
| 877 | debug!( | 
|---|
| 878 | "non-contiguous NFA built, <states: {:?}, size: {:?}>", | 
|---|
| 879 | nfa.states.len(), | 
|---|
| 880 | nfa.memory_usage() | 
|---|
| 881 | ); | 
|---|
| 882 | Ok(nfa) | 
|---|
| 883 | } | 
|---|
| 884 |  | 
|---|
| 885 | /// Set the desired match semantics. | 
|---|
| 886 | /// | 
|---|
| 887 | /// See | 
|---|
| 888 | /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) | 
|---|
| 889 | /// for more documentation and examples. | 
|---|
| 890 | pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { | 
|---|
| 891 | self.match_kind = kind; | 
|---|
| 892 | self | 
|---|
| 893 | } | 
|---|
| 894 |  | 
|---|
| 895 | /// Enable ASCII-aware case insensitive matching. | 
|---|
| 896 | /// | 
|---|
| 897 | /// See | 
|---|
| 898 | /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) | 
|---|
| 899 | /// for more documentation and examples. | 
|---|
| 900 | pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { | 
|---|
| 901 | self.ascii_case_insensitive = yes; | 
|---|
| 902 | self | 
|---|
| 903 | } | 
|---|
| 904 |  | 
|---|
| 905 | /// Set the limit on how many states use a dense representation for their | 
|---|
| 906 | /// transitions. Other states will generally use a sparse representation. | 
|---|
| 907 | /// | 
|---|
| 908 | /// See | 
|---|
| 909 | /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth) | 
|---|
| 910 | /// for more documentation and examples. | 
|---|
| 911 | pub fn dense_depth(&mut self, depth: usize) -> &mut Builder { | 
|---|
| 912 | self.dense_depth = depth; | 
|---|
| 913 | self | 
|---|
| 914 | } | 
|---|
| 915 |  | 
|---|
| 916 | /// Enable heuristic prefilter optimizations. | 
|---|
| 917 | /// | 
|---|
| 918 | /// See | 
|---|
| 919 | /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) | 
|---|
| 920 | /// for more documentation and examples. | 
|---|
| 921 | pub fn prefilter(&mut self, yes: bool) -> &mut Builder { | 
|---|
| 922 | self.prefilter = yes; | 
|---|
| 923 | self | 
|---|
| 924 | } | 
|---|
| 925 | } | 
|---|
| 926 |  | 
|---|
/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks, with some tweaks to support leftmost
/// searching.
#[derive(Debug)]
struct Compiler<'a> {
    /// The builder whose configuration (match semantics, case insensitivity,
    /// dense depth and prefilter toggle) drives compilation.
    builder: &'a Builder,
    /// A builder for the optional prefilter. Every pattern is added to it as
    /// the trie is built, so its pattern IDs stay in sync with ours.
    prefilter: prefilter::Builder,
    /// The NFA under construction. This is returned once compilation is
    /// complete.
    nfa: NFA,
    /// The set of bytes that occur in the patterns, used to compute byte
    /// equivalence classes after the trie has been built.
    byteset: ByteClassSet,
}
|---|
| 938 |  | 
|---|
| 939 | impl<'a> Compiler<'a> { | 
|---|
| 940 | fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> { | 
|---|
| 941 | let prefilter = prefilter::Builder::new(builder.match_kind) | 
|---|
| 942 | .ascii_case_insensitive(builder.ascii_case_insensitive); | 
|---|
| 943 | Ok(Compiler { | 
|---|
| 944 | builder, | 
|---|
| 945 | prefilter, | 
|---|
| 946 | nfa: NFA { | 
|---|
| 947 | match_kind: builder.match_kind, | 
|---|
| 948 | states: vec![], | 
|---|
| 949 | sparse: vec![], | 
|---|
| 950 | dense: vec![], | 
|---|
| 951 | matches: vec![], | 
|---|
| 952 | pattern_lens: vec![], | 
|---|
| 953 | prefilter: None, | 
|---|
| 954 | byte_classes: ByteClasses::singletons(), | 
|---|
| 955 | min_pattern_len: usize::MAX, | 
|---|
| 956 | max_pattern_len: 0, | 
|---|
| 957 | special: Special::zero(), | 
|---|
| 958 | }, | 
|---|
| 959 | byteset: ByteClassSet::empty(), | 
|---|
| 960 | }) | 
|---|
| 961 | } | 
|---|
| 962 |  | 
|---|
    /// Compile the given patterns into a complete NFA, consuming this
    /// compiler.
    ///
    /// This drives every phase of construction in order: sentinel setup,
    /// trie building, byte class computation, densification, failure
    /// transition filling and the final state shuffle. Note that several of
    /// the phases below depend on running in precisely this order.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        // Add dummy transition/match links, so that no valid link will point
        // to another link at index 0.
        self.nfa.sparse.push(Transition::default());
        self.nfa.matches.push(Match::default());
        // Add a dummy dense transition so that no states can have dense==0
        // represent a valid pointer to dense transitions. This permits
        // dense==0 to be a sentinel indicating "no dense transitions."
        self.nfa.dense.push(NFA::DEAD);
        // the dead state, only used for leftmost and fixed to id==0
        self.nfa.alloc_state(0)?;
        // the fail state, which is never entered and fixed to id==1
        self.nfa.alloc_state(0)?;
        // unanchored start state, initially fixed to id==2 but later shuffled
        // to appear after all non-start match states.
        self.nfa.special.start_unanchored_id = self.nfa.alloc_state(0)?;
        // anchored start state, initially fixed to id==3 but later shuffled
        // to appear after unanchored start state.
        self.nfa.special.start_anchored_id = self.nfa.alloc_state(0)?;
        // Initialize the unanchored starting state in order to make it dense,
        // and thus make transition lookups on this state faster.
        self.init_unanchored_start_state()?;
        // Set all transitions on the DEAD state to point to itself. This way,
        // the DEAD state can never be escaped. It MUST be used as a sentinel
        // in any correct search.
        self.add_dead_state_loop()?;
        // Build the base trie from the given patterns.
        self.build_trie(patterns)?;
        self.nfa.states.shrink_to_fit();
        // Turn our set of bytes into equivalent classes. This NFA
        // implementation uses byte classes only for states that use a dense
        // representation of transitions. (And that's why this comes before
        // `self.densify()`, as the byte classes need to be set first.)
        self.nfa.byte_classes = self.byteset.byte_classes();
        // Add transitions (and maybe matches) to the anchored starting state.
        // The anchored starting state is used for anchored searches. The only
        // mechanical difference between it and the unanchored start state is
        // that missing transitions map to the DEAD state instead of the FAIL
        // state.
        self.set_anchored_start_state()?;
        // Rewrite transitions to the FAIL state on the unanchored start state
        // as self-transitions. This keeps the start state active at all times.
        self.add_unanchored_start_state_loop();
        // Make some (possibly zero) states use a dense representation for
        // transitions. It's important to do this right after the states
        // and non-failure transitions are solidified. That way, subsequent
        // accesses (particularly `fill_failure_transitions`) will benefit from
        // the faster transition lookup in densified states.
        self.densify()?;
        // The meat of the Aho-Corasick algorithm: compute and write failure
        // transitions. i.e., the state to move to when a transition isn't
        // defined in the current state. These are epsilon transitions and thus
        // make this formulation an NFA.
        self.fill_failure_transitions()?;
        // Handle a special case under leftmost semantics when at least one
        // of the patterns is the empty string.
        self.close_start_state_loop_for_leftmost();
        // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
        // NON-MATCH, ... This permits us to very quickly query the type of
        // the state we're currently in during a search.
        self.shuffle();
        self.nfa.prefilter = self.prefilter.build();
        // Store the maximum ID of all *relevant* special states. Start states
        // are only relevant when we have a prefilter, otherwise, there is zero
        // reason to care about whether a state is a start state or not during
        // a search. Indeed, without a prefilter, we are careful to explicitly
        // NOT care about start states, otherwise the search can ping pong
        // between the unrolled loop and the handling of special-status states
        // and destroy perf.
        self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
            // Why the anchored starting state? Because we always put it
            // after the unanchored starting state and it is therefore the
            // maximum. Why put unanchored followed by anchored? No particular
            // reason, but that's how the states are logically organized in the
            // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
            self.nfa.special.start_anchored_id
        } else {
            self.nfa.special.max_match_id
        };
        // Release any excess capacity accumulated while building.
        self.nfa.sparse.shrink_to_fit();
        self.nfa.dense.shrink_to_fit();
        self.nfa.matches.shrink_to_fit();
        self.nfa.pattern_lens.shrink_to_fit();
        Ok(self.nfa)
    }
|---|
| 1052 |  | 
|---|
    /// This sets up the initial prefix trie that makes up the Aho-Corasick
    /// automaton. Effectively, it creates the basic structure of the
    /// automaton, where every pattern given has a path from the start state to
    /// the end of the pattern.
    ///
    /// Pattern IDs are assigned from the iteration order of `patterns`, and
    /// per-pattern metadata (lengths, min/max bounds) is recorded along the
    /// way. Under leftmost-first semantics, a pattern with a previously added
    /// pattern as a prefix is deliberately truncated (see below).
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
            // Pattern IDs come from iteration order, so adding more patterns
            // than PatternID can represent is a build error.
            let pid = PatternID::new(i).map_err(|e| {
                BuildError::pattern_id_overflow(
                    PatternID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
            let pat = pat.as_ref();
            // Pattern lengths are stored as SmallIndex, so overly long
            // patterns are rejected here too.
            let patlen = SmallIndex::new(pat.len())
                .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
            self.nfa.min_pattern_len =
                core::cmp::min(self.nfa.min_pattern_len, pat.len());
            self.nfa.max_pattern_len =
                core::cmp::max(self.nfa.max_pattern_len, pat.len());
            assert_eq!(
                i,
                self.nfa.pattern_lens.len(),
                "expected number of patterns to match pattern ID"
            );
            self.nfa.pattern_lens.push(patlen);
            // We add the pattern to the prefilter here because the pattern
            // ID in the prefilter is determined with respect to the patterns
            // added to the prefilter. That is, it isn't the ID we have here,
            // but the one determined by its own accounting of patterns.
            // To ensure they line up, we add every pattern we see to the
            // prefilter, even if some patterns ultimately are impossible to
            // match (in leftmost-first semantics specifically).
            //
            // Another way of doing this would be to expose an API in the
            // prefilter to permit setting your own pattern IDs. Or to just use
            // our own map and go between them. But this case is sufficiently
            // rare that we don't bother and just make sure they're in sync.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }

            // Walk the trie from the unanchored start state, creating new
            // states for bytes that have no existing transition.
            let mut prev = self.nfa.special.start_unanchored_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.states[prev].is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. These don't
                // get used while building the trie, but other Aho-Corasick
                // implementations may use them.
                self.byteset.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byteset.set_range(b, b);
                }

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.follow_transition(prev, b);
                if next != NFA::FAIL {
                    prev = next;
                } else {
                    let next = self.nfa.alloc_state(depth)?;
                    self.nfa.add_transition(prev, b, next)?;
                    if self.builder.ascii_case_insensitive {
                        let b = opposite_ascii_case(b);
                        self.nfa.add_transition(prev, b, next)?;
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.add_match(prev, pid)?;
        }
        Ok(())
    }
|---|
| 1151 |  | 
|---|
| 1152 | /// This routine creates failure transitions according to the standard | 
|---|
| 1153 | /// textbook formulation of the Aho-Corasick algorithm, with a couple small | 
|---|
| 1154 | /// tweaks to support "leftmost" semantics. | 
|---|
| 1155 | /// | 
|---|
| 1156 | /// Building failure transitions is the most interesting part of building | 
|---|
| 1157 | /// the Aho-Corasick automaton, because they are what allow searches to | 
|---|
| 1158 | /// be performed in linear time. Specifically, a failure transition is | 
|---|
| 1159 | /// a single transition associated with each state that points back to | 
|---|
| 1160 | /// the longest proper suffix of the pattern being searched. The failure | 
|---|
| 1161 | /// transition is followed whenever there exists no transition on the | 
|---|
| 1162 | /// current state for the current input byte. If there is no other proper | 
|---|
| 1163 | /// suffix, then the failure transition points back to the starting state. | 
|---|
| 1164 | /// | 
|---|
| 1165 | /// For example, let's say we built an Aho-Corasick automaton with the | 
|---|
| 1166 | /// following patterns: 'abcd' and 'cef'. The trie looks like this: | 
|---|
| 1167 | /// | 
|---|
| 1168 | /// ```ignore | 
|---|
| 1169 | ///          a - S1 - b - S2 - c - S3 - d - S4* | 
|---|
| 1170 | ///         / | 
|---|
| 1171 | ///     S0 - c - S5 - e - S6 - f - S7* | 
|---|
| 1172 | /// ``` | 
|---|
| 1173 | /// | 
|---|
| 1174 | /// At this point, it should be fairly straight-forward to see how this | 
|---|
| 1175 | /// trie can be used in a simplistic way. At any given position in the | 
|---|
| 1176 | /// text we're searching (called the "subject" string), all we need to do | 
|---|
| 1177 | /// is follow the transitions in the trie by consuming one transition for | 
|---|
| 1178 | /// each byte in the subject string. If we reach a match state, then we can | 
|---|
| 1179 | /// report that location as a match. | 
|---|
| 1180 | /// | 
|---|
| 1181 | /// The trick comes when searching a subject string like 'abcef'. We'll | 
|---|
| 1182 | /// initially follow the transition from S0 to S1 and wind up in S3 after | 
|---|
    /// observing the 'c' byte. At this point, the next byte is 'e' but state
|---|
| 1184 | /// S3 has no transition for 'e', so the search fails. We then would need | 
|---|
| 1185 | /// to restart the search at the next position in 'abcef', which | 
|---|
| 1186 | /// corresponds to 'b'. The match would fail, but the next search starting | 
|---|
| 1187 | /// at 'c' would finally succeed. The problem with this approach is that | 
|---|
| 1188 | /// we wind up searching the subject string potentially many times. In | 
|---|
| 1189 | /// effect, this makes the algorithm have worst case `O(n * m)` complexity, | 
|---|
| 1190 | /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead | 
|---|
| 1191 | /// like to achieve a `O(n + m)` worst case complexity. | 
|---|
| 1192 | /// | 
|---|
| 1193 | /// This is where failure transitions come in. Instead of dying at S3 in | 
|---|
| 1194 | /// the first search, the automaton can instruct the search to move to | 
|---|
| 1195 | /// another part of the automaton that corresponds to a suffix of what | 
|---|
| 1196 | /// we've seen so far. Recall that we've seen 'abc' in the subject string, | 
|---|
| 1197 | /// and the automaton does indeed have a non-empty suffix, 'c', that could | 
|---|
| 1198 | /// potentially lead to another match. Thus, the actual Aho-Corasick | 
|---|
| 1199 | /// automaton for our patterns in this case looks like this: | 
|---|
| 1200 | /// | 
|---|
| 1201 | /// ```ignore | 
|---|
| 1202 | ///          a - S1 - b - S2 - c - S3 - d - S4* | 
|---|
| 1203 | ///         /                      / | 
|---|
| 1204 | ///        /       ---------------- | 
|---|
| 1205 | ///       /       / | 
|---|
| 1206 | ///     S0 - c - S5 - e - S6 - f - S7* | 
|---|
| 1207 | /// ``` | 
|---|
| 1208 | /// | 
|---|
| 1209 | /// That is, we have a failure transition from S3 to S5, which is followed | 
|---|
| 1210 | /// exactly in cases when we are in state S3 but see any byte other than | 
|---|
| 1211 | /// 'd' (that is, we've "failed" to find a match in this portion of our | 
|---|
| 1212 | /// trie). We know we can transition back to S5 because we've already seen | 
|---|
| 1213 | /// a 'c' byte, so we don't need to re-scan it. We can then pick back up | 
|---|
| 1214 | /// with the search starting at S5 and complete our match. | 
|---|
| 1215 | /// | 
|---|
| 1216 | /// Adding failure transitions to a trie is fairly simple, but subtle. The | 
|---|
    /// key issue is that you might have multiple failure transitions that you
|---|
| 1218 | /// need to follow. For example, look at the trie for the patterns | 
|---|
| 1219 | /// 'abcd', 'b', 'bcd' and 'cd': | 
|---|
| 1220 | /// | 
|---|
| 1221 | /// ```ignore | 
|---|
| 1222 | ///          - a - S1 - b - S2* - c - S3 - d - S4* | 
|---|
| 1223 | ///         /               /         / | 
|---|
| 1224 | ///        /         -------   ------- | 
|---|
| 1225 | ///       /         /         / | 
|---|
| 1226 | ///     S0 --- b - S5* - c - S6 - d - S7* | 
|---|
| 1227 | ///       \                  / | 
|---|
| 1228 | ///        \         -------- | 
|---|
| 1229 | ///         \       / | 
|---|
| 1230 | ///          - c - S8 - d - S9* | 
|---|
| 1231 | /// ``` | 
|---|
| 1232 | /// | 
|---|
| 1233 | /// The failure transitions for this trie are defined from S2 to S5, | 
|---|
| 1234 | /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it | 
|---|
| 1235 | /// corresponds to a match, since its failure transition to S5 is itself | 
|---|
| 1236 | /// a match state. | 
|---|
| 1237 | /// | 
|---|
| 1238 | /// Perhaps simplest way to think about adding these failure transitions | 
|---|
| 1239 | /// is recursively. That is, if you know the failure transitions for every | 
|---|
| 1240 | /// possible previous state that could be visited (e.g., when computing the | 
|---|
| 1241 | /// failure transition for S3, you already know the failure transitions | 
|---|
| 1242 | /// for S0, S1 and S2), then you can simply follow the failure transition | 
|---|
| 1243 | /// of the previous state and check whether the incoming transition is | 
|---|
| 1244 | /// defined after following the failure transition. | 
|---|
| 1245 | /// | 
|---|
| 1246 | /// For example, when determining the failure state for S3, by our | 
|---|
| 1247 | /// assumptions, we already know that there is a failure transition from | 
|---|
| 1248 | /// S2 (the previous state) to S5. So we follow that transition and check | 
|---|
| 1249 | /// whether the transition connecting S2 to S3 is defined. Indeed, it is, | 
|---|
| 1250 | /// as there is a transition from S5 to S6 for the byte 'c'. If no such | 
|---|
| 1251 | /// transition existed, we could keep following the failure transitions | 
|---|
| 1252 | /// until we reach the start state, which is the failure transition for | 
|---|
| 1253 | /// every state that has no corresponding proper suffix. | 
|---|
| 1254 | /// | 
|---|
| 1255 | /// We don't actually use recursion to implement this, but instead, use a | 
|---|
| 1256 | /// breadth first search of the automaton. Our base case is the start | 
|---|
| 1257 | /// state, whose failure transition is just a transition to itself. | 
|---|
| 1258 | /// | 
|---|
| 1259 | /// When building a leftmost automaton, we proceed as above, but only | 
|---|
| 1260 | /// include a subset of failure transitions. Namely, we omit any failure | 
|---|
| 1261 | /// transitions that appear after a match state in the trie. This is | 
|---|
| 1262 | /// because failure transitions always point back to a proper suffix of | 
|---|
| 1263 | /// what has been seen so far. Thus, following a failure transition after | 
|---|
| 1264 | /// a match implies looking for a match that starts after the one that has | 
|---|
| 1265 | /// already been seen, which is of course therefore not the leftmost match. | 
|---|
| 1266 | /// | 
|---|
| 1267 | /// N.B. I came up with this algorithm on my own, and after scouring all of | 
|---|
| 1268 | /// the other AC implementations I know of (Perl, Snort, many on GitHub). | 
|---|
| 1269 | /// I couldn't find any that implement leftmost semantics like this. | 
|---|
| 1270 | /// Perl of course needs leftmost-first semantics, but they implement it | 
|---|
| 1271 | /// with a seeming hack at *search* time instead of encoding it into the | 
|---|
| 1272 | /// automaton. There are also a couple Java libraries that support leftmost | 
|---|
| 1273 | /// longest semantics, but they do it by building a queue of matches at | 
|---|
| 1274 | /// search time, which is even worse than what Perl is doing. ---AG | 
|---|
    fn fill_failure_transitions(&mut self) -> Result<(), BuildError> {
        let is_leftmost = self.builder.match_kind.is_leftmost();
        let start_uid = self.nfa.special.start_unanchored_id;
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        let mut prev_link = None;
        while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
            prev_link = Some(link);
            let t = self.nfa.sparse[link];

            // Skip anything we've seen before and any self-transitions on the
            // start state.
            if start_uid == t.next() || seen.contains(t.next) {
                continue;
            }
            queue.push_back(t.next);
            seen.insert(t.next);
            // Under leftmost semantics, if a state immediately following
            // the start state is a match state, then we never want to
            // follow its failure transition since the failure transition
            // necessarily leads back to the start state, which we never
            // want to do for leftmost matching after a match has been
            // found.
            //
            // We apply the same logic to non-start states below as well.
            if is_leftmost && self.nfa.states[t.next].is_match() {
                self.nfa.states[t.next].fail = NFA::DEAD;
            }
        }
        while let Some(id) = queue.pop_front() {
            let mut prev_link = None;
            while let Some(link) = self.nfa.next_link(id, prev_link) {
                prev_link = Some(link);
                let t = self.nfa.sparse[link];

                if seen.contains(t.next) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(t.next);
                seen.insert(t.next);

                // As above for start states, under leftmost semantics, once
                // we see a match all subsequent states should have no failure
                // transitions because failure transitions always imply looking
                // for a match that is a suffix of what has been seen so far
                // (where "seen so far" corresponds to the string formed by
                // following the transitions from the start state to the
                // current state). Under leftmost semantics, we specifically do
                // not want to allow this to happen because we always want to
                // report the match found at the leftmost position.
                //
                // The difference between leftmost-first and leftmost-longest
                // occurs previously while we build the trie. For
                // leftmost-first, we simply omit any entries that would
                // otherwise require passing through a match state.
                //
                // Note that for correctness, the failure transition has to be
                // set to the dead state for ALL states following a match, not
                // just the match state itself. However, by setting the failure
                // transition to the dead state on all match states, the dead
                // state will automatically propagate to all subsequent states
                // via the failure state computation below.
                if is_leftmost && self.nfa.states[t.next].is_match() {
                    self.nfa.states[t.next].fail = NFA::DEAD;
                    continue;
                }
                // Walk up the failure chain of the previous state until we
                // find a state with a transition defined for 't.byte'. The
                // start state always has such a transition (it loops back to
                // itself implicitly via its own failure transition), so this
                // walk terminates.
                let mut fail = self.nfa.states[id].fail;
                while self.nfa.follow_transition(fail, t.byte) == NFA::FAIL {
                    fail = self.nfa.states[fail].fail;
                }
                fail = self.nfa.follow_transition(fail, t.byte);
                self.nfa.states[t.next].fail = fail;
                self.nfa.copy_matches(fail, t.next)?;
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            if !is_leftmost {
                self.nfa
                    .copy_matches(self.nfa.special.start_unanchored_id, id)?;
            }
        }
        Ok(())
    }
|---|
| 1375 |  | 
|---|
    /// Shuffle the states so that they appear in this sequence:
    ///
    ///   DEAD, FAIL, MATCH..., START, START, NON-MATCH...
    ///
    /// The idea here is that if we know how special states are laid out in our
    /// transition table, then we can determine what "kind" of state we're in
    /// just by comparing our current state ID with a particular value. In this
    /// way, we avoid doing extra memory lookups.
    ///
    /// Before shuffling begins, our states look something like this:
    ///
    ///   DEAD, FAIL, START, START, (MATCH | NON-MATCH)...
    ///
    /// So all we need to do is move all of the MATCH states so that they
    /// all appear before any NON-MATCH state, like so:
    ///
    ///   DEAD, FAIL, START, START, MATCH... NON-MATCH...
    ///
    /// Then it's just a simple matter of swapping the two START states with
    /// the last two MATCH states.
    ///
    /// (This is the same technique used for fully compiled DFAs in
    /// regex-automata.)
    fn shuffle(&mut self) {
        let old_start_uid = self.nfa.special.start_unanchored_id;
        let old_start_aid = self.nfa.special.start_anchored_id;
        assert!(old_start_uid < old_start_aid);
        assert_eq!(
            3,
            old_start_aid.as_usize(),
            "anchored start state should be at index 3"
        );
        // We implement shuffling by a sequence of pairwise swaps of states.
        // Since we have a number of things referencing states via their
        // IDs and swapping them changes their IDs, we need to record every
        // swap we make so that we can remap IDs. The remapper handles this
        // book-keeping for us.
        let mut remapper = Remapper::new(&self.nfa, 0);
        // The way we proceed here is by moving all match states so that
        // they directly follow the start states. So it will go: DEAD, FAIL,
        // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ...
        //
        // To do that, we proceed forward through all states after
        // START-ANCHORED and swap match states so that they appear before all
        // non-match states.
        let mut next_avail = StateID::from(4u8);
        for i in next_avail.as_usize()..self.nfa.states.len() {
            let sid = StateID::new(i).unwrap();
            if !self.nfa.states[sid].is_match() {
                continue;
            }
            remapper.swap(&mut self.nfa, sid, next_avail);
            // The key invariant here is that only non-match states exist
            // between 'next_avail' and 'sid' (with them being potentially
            // equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed
            // to land on the leftmost non-match state. (Unless 'next_avail'
            // and 'sid' are equivalent, in which case, a swap will occur but
            // it is a no-op.)
            next_avail = StateID::new(next_avail.one_more()).unwrap();
        }
        // Now we'd like to move the start states to immediately following the
        // match states. (The start states may themselves be match states, but
        // we'll handle that later.) We arrange the states this way so that we
        // don't necessarily need to check whether a state is a start state or
        // not before checking whether a state is a match state. For example,
        // we'd like to be able to write this as our state machine loop:
        //
        //   sid = start()
        //   for byte in haystack:
        //     sid = next(sid, byte)
        //     if sid <= nfa.max_start_id:
        //       if sid <= nfa.max_dead_id:
        //         # search complete
        //       elif sid <= nfa.max_match_id:
        //         # found match
        //
        // The important context here is that we might not want to look for
        // start states at all. Namely, if a searcher doesn't have a prefilter,
        // then there is no reason to care about whether we're in a start state
        // or not. And indeed, if we did check for it, this very hot loop would
        // ping pong between the special state handling and the main state
        // transition logic. This in turn stalls the CPU by killing branch
        // prediction.
        //
        // So essentially, we really want to be able to "forget" that start
        // states even exist and this is why we put them at the end.
        let new_start_aid =
            StateID::new(next_avail.as_usize().checked_sub(1).unwrap())
                .unwrap();
        remapper.swap(&mut self.nfa, old_start_aid, new_start_aid);
        let new_start_uid =
            StateID::new(next_avail.as_usize().checked_sub(2).unwrap())
                .unwrap();
        remapper.swap(&mut self.nfa, old_start_uid, new_start_uid);
        // 'next_avail' points one past the last match state (post-swap, the
        // last two "match slots" now hold the start states), so the maximal
        // match state ID sits three positions back.
        let new_max_match_id =
            StateID::new(next_avail.as_usize().checked_sub(3).unwrap())
                .unwrap();
        self.nfa.special.max_match_id = new_max_match_id;
        self.nfa.special.start_unanchored_id = new_start_uid;
        self.nfa.special.start_anchored_id = new_start_aid;
        // If one start state is a match state, then they both are.
        if self.nfa.states[self.nfa.special.start_anchored_id].is_match() {
            self.nfa.special.max_match_id = self.nfa.special.start_anchored_id;
        }
        remapper.remap(&mut self.nfa);
    }
|---|
| 1482 |  | 
|---|
| 1483 | /// Attempts to convert the transition representation of a subset of states | 
|---|
| 1484 | /// in this NFA from sparse to dense. This can greatly improve search | 
|---|
| 1485 | /// performance since states with a higher number of transitions tend to | 
|---|
| 1486 | /// correlate with very active states. | 
|---|
| 1487 | /// | 
|---|
| 1488 | /// We generally only densify states that are close to the start state. | 
|---|
| 1489 | /// These tend to be the most active states and thus benefit from a dense | 
|---|
| 1490 | /// representation more than other states. | 
|---|
| 1491 | /// | 
|---|
| 1492 | /// This tends to best balance between memory usage and performance. In | 
|---|
| 1493 | /// particular, the *vast majority* of all states in a typical Aho-Corasick | 
|---|
| 1494 | /// automaton have only 1 transition and are usually farther from the start | 
|---|
| 1495 | /// state and thus don't get densified. | 
|---|
| 1496 | /// | 
|---|
| 1497 | /// Note that this doesn't remove the sparse representation of transitions | 
|---|
| 1498 | /// for states that are densified. It could be done, but actually removing | 
|---|
| 1499 | /// entries from `NFA::sparse` is likely more expensive than it's worth. | 
|---|
| 1500 | fn densify(&mut self) -> Result<(), BuildError> { | 
|---|
| 1501 | for i in 0..self.nfa.states.len() { | 
|---|
| 1502 | let sid = StateID::new(i).unwrap(); | 
|---|
| 1503 | // Don't bother densifying states that are only used as sentinels. | 
|---|
| 1504 | if sid == NFA::DEAD || sid == NFA::FAIL { | 
|---|
| 1505 | continue; | 
|---|
| 1506 | } | 
|---|
| 1507 | // Only densify states that are "close enough" to the start state. | 
|---|
| 1508 | if self.nfa.states[sid].depth.as_usize() | 
|---|
| 1509 | >= self.builder.dense_depth | 
|---|
| 1510 | { | 
|---|
| 1511 | continue; | 
|---|
| 1512 | } | 
|---|
| 1513 | let dense = self.nfa.alloc_dense_state()?; | 
|---|
| 1514 | let mut prev_link = None; | 
|---|
| 1515 | while let Some(link) = self.nfa.next_link(sid, prev_link) { | 
|---|
| 1516 | prev_link = Some(link); | 
|---|
| 1517 | let t = self.nfa.sparse[link]; | 
|---|
| 1518 |  | 
|---|
| 1519 | let class = usize::from(self.nfa.byte_classes.get(t.byte)); | 
|---|
| 1520 | let index = dense.as_usize() + class; | 
|---|
| 1521 | self.nfa.dense[index] = t.next; | 
|---|
| 1522 | } | 
|---|
| 1523 | self.nfa.states[sid].dense = dense; | 
|---|
| 1524 | } | 
|---|
| 1525 | Ok(()) | 
|---|
| 1526 | } | 
|---|
| 1527 |  | 
|---|
| 1528 | /// Returns a set that tracked queued states. | 
|---|
| 1529 | /// | 
|---|
| 1530 | /// This is only necessary when ASCII case insensitivity is enabled, since | 
|---|
| 1531 | /// it is the only way to visit the same state twice. Otherwise, this | 
|---|
| 1532 | /// returns an inert set that nevers adds anything and always reports | 
|---|
| 1533 | /// `false` for every member test. | 
|---|
| 1534 | fn queued_set(&self) -> QueuedSet { | 
|---|
| 1535 | if self.builder.ascii_case_insensitive { | 
|---|
| 1536 | QueuedSet::active() | 
|---|
| 1537 | } else { | 
|---|
| 1538 | QueuedSet::inert() | 
|---|
| 1539 | } | 
|---|
| 1540 | } | 
|---|
| 1541 |  | 
|---|
| 1542 | /// Initializes the unanchored start state by making it dense. This is | 
|---|
| 1543 | /// achieved by explicitly setting every transition to the FAIL state. | 
|---|
| 1544 | /// This isn't necessary for correctness, since any missing transition is | 
|---|
| 1545 | /// automatically assumed to be mapped to the FAIL state. We do this to | 
|---|
| 1546 | /// make the unanchored starting state dense, and thus in turn make | 
|---|
| 1547 | /// transition lookups on it faster. (Which is worth doing because it's | 
|---|
| 1548 | /// the most active state.) | 
|---|
| 1549 | fn init_unanchored_start_state(&mut self) -> Result<(), BuildError> { | 
|---|
| 1550 | let start_uid = self.nfa.special.start_unanchored_id; | 
|---|
| 1551 | let start_aid = self.nfa.special.start_anchored_id; | 
|---|
| 1552 | self.nfa.init_full_state(start_uid, NFA::FAIL)?; | 
|---|
| 1553 | self.nfa.init_full_state(start_aid, NFA::FAIL)?; | 
|---|
| 1554 | Ok(()) | 
|---|
| 1555 | } | 
|---|
| 1556 |  | 
|---|
    /// Setup the anchored start state by copying all of the transitions and
    /// matches from the unanchored starting state with one change: the failure
    /// transition is changed to the DEAD state, so that for any undefined
    /// transitions, the search will stop.
    fn set_anchored_start_state(&mut self) -> Result<(), BuildError> {
        let start_uid = self.nfa.special.start_unanchored_id;
        let start_aid = self.nfa.special.start_anchored_id;
        let (mut uprev_link, mut aprev_link) = (None, None);
        loop {
            // Walk both transition lists in lock-step. Both start states
            // were initialized as full states (see
            // 'init_unanchored_start_state'), so the two lists have the same
            // length and must be exhausted at the same time; any other
            // combination indicates a broken invariant.
            let unext = self.nfa.next_link(start_uid, uprev_link);
            let anext = self.nfa.next_link(start_aid, aprev_link);
            let (ulink, alink) = match (unext, anext) {
                (Some(ulink), Some(alink)) => (ulink, alink),
                (None, None) => break,
                _ => unreachable!(),
            };
            uprev_link = Some(ulink);
            aprev_link = Some(alink);
            // Copy the transition target for this byte from the unanchored
            // start state to the anchored start state.
            self.nfa.sparse[alink].next = self.nfa.sparse[ulink].next;
        }
        self.nfa.copy_matches(start_uid, start_aid)?;
        // This is the main difference between the unanchored and anchored
        // starting states. If a lookup on an anchored starting state fails,
        // then the search should stop.
        //
        // N.B. This assumes that the loop on the unanchored starting state
        // hasn't been created yet.
        self.nfa.states[start_aid].fail = NFA::DEAD;
        Ok(())
    }
|---|
| 1587 |  | 
|---|
| 1588 | /// Set the failure transitions on the start state to loop back to the | 
|---|
| 1589 | /// start state. This effectively permits the Aho-Corasick automaton to | 
|---|
| 1590 | /// match at any position. This is also required for finding the next | 
|---|
| 1591 | /// state to terminate, namely, finding the next state should never return | 
|---|
| 1592 | /// a fail_id. | 
|---|
| 1593 | /// | 
|---|
| 1594 | /// This must be done after building the initial trie, since trie | 
|---|
| 1595 | /// construction depends on transitions to `fail_id` to determine whether a | 
|---|
| 1596 | /// state already exists or not. | 
|---|
| 1597 | fn add_unanchored_start_state_loop(&mut self) { | 
|---|
| 1598 | let start_uid = self.nfa.special.start_unanchored_id; | 
|---|
| 1599 | let mut prev_link = None; | 
|---|
| 1600 | while let Some(link) = self.nfa.next_link(start_uid, prev_link) { | 
|---|
| 1601 | prev_link = Some(link); | 
|---|
| 1602 | if self.nfa.sparse[link].next() == NFA::FAIL { | 
|---|
| 1603 | self.nfa.sparse[link].next = start_uid; | 
|---|
| 1604 | } | 
|---|
| 1605 | } | 
|---|
| 1606 | } | 
|---|
| 1607 |  | 
|---|
| 1608 | /// Remove the start state loop by rewriting any transitions on the start | 
|---|
| 1609 | /// state back to the start state with transitions to the dead state. | 
|---|
| 1610 | /// | 
|---|
| 1611 | /// The loop is only closed when two conditions are met: the start state | 
|---|
| 1612 | /// is a match state and the match kind is leftmost-first or | 
|---|
| 1613 | /// leftmost-longest. | 
|---|
| 1614 | /// | 
|---|
| 1615 | /// The reason for this is that under leftmost semantics, a start state | 
|---|
| 1616 | /// that is also a match implies that we should never restart the search | 
|---|
| 1617 | /// process. We allow normal transitions out of the start state, but if | 
|---|
| 1618 | /// none exist, we transition to the dead state, which signals that | 
|---|
| 1619 | /// searching should stop. | 
|---|
| 1620 | fn close_start_state_loop_for_leftmost(&mut self) { | 
|---|
| 1621 | let start_uid = self.nfa.special.start_unanchored_id; | 
|---|
| 1622 | let start = &mut self.nfa.states[start_uid]; | 
|---|
| 1623 | let dense = start.dense; | 
|---|
| 1624 | if self.builder.match_kind.is_leftmost() && start.is_match() { | 
|---|
| 1625 | let mut prev_link = None; | 
|---|
| 1626 | while let Some(link) = self.nfa.next_link(start_uid, prev_link) { | 
|---|
| 1627 | prev_link = Some(link); | 
|---|
| 1628 | if self.nfa.sparse[link].next() == start_uid { | 
|---|
| 1629 | self.nfa.sparse[link].next = NFA::DEAD; | 
|---|
| 1630 | if dense != StateID::ZERO { | 
|---|
| 1631 | let b = self.nfa.sparse[link].byte; | 
|---|
| 1632 | let class = usize::from(self.nfa.byte_classes.get(b)); | 
|---|
| 1633 | self.nfa.dense[dense.as_usize() + class] = NFA::DEAD; | 
|---|
| 1634 | } | 
|---|
| 1635 | } | 
|---|
| 1636 | } | 
|---|
| 1637 | } | 
|---|
| 1638 | } | 
|---|
| 1639 |  | 
|---|
| 1640 | /// Sets all transitions on the dead state to point back to the dead state. | 
|---|
| 1641 | /// Normally, missing transitions map back to the failure state, but the | 
|---|
| 1642 | /// point of the dead state is to act as a sink that can never be escaped. | 
|---|
| 1643 | fn add_dead_state_loop(&mut self) -> Result<(), BuildError> { | 
|---|
| 1644 | self.nfa.init_full_state(NFA::DEAD, NFA::DEAD)?; | 
|---|
| 1645 | Ok(()) | 
|---|
| 1646 | } | 
|---|
| 1647 | } | 
|---|
| 1648 |  | 
|---|
/// A set of state identifiers used to avoid revisiting the same state multiple
/// times when filling in failure transitions.
///
/// This set has an "inert" and an "active" mode. When inert, the set never
/// stores anything and always returns `false` for every member test. This is
/// useful to avoid the performance and memory overhead of maintaining this
/// set when it is not needed.
#[derive(Debug)]
struct QueuedSet {
    // 'None' means the set is inert; 'Some' means it actively tracks
    // membership.
    set: Option<BTreeSet<StateID>>,
}
|---|
| 1660 |  | 
|---|
| 1661 | impl QueuedSet { | 
|---|
| 1662 | /// Return an inert set that returns `false` for every state ID membership | 
|---|
| 1663 | /// test. | 
|---|
| 1664 | fn inert() -> QueuedSet { | 
|---|
| 1665 | QueuedSet { set: None } | 
|---|
| 1666 | } | 
|---|
| 1667 |  | 
|---|
| 1668 | /// Return an active set that tracks state ID membership. | 
|---|
| 1669 | fn active() -> QueuedSet { | 
|---|
| 1670 | QueuedSet { set: Some(BTreeSet::new()) } | 
|---|
| 1671 | } | 
|---|
| 1672 |  | 
|---|
| 1673 | /// Inserts the given state ID into this set. (If the set is inert, then | 
|---|
| 1674 | /// this is a no-op.) | 
|---|
| 1675 | fn insert(&mut self, state_id: StateID) { | 
|---|
| 1676 | if let Some(ref mut set) = self.set { | 
|---|
| 1677 | set.insert(state_id); | 
|---|
| 1678 | } | 
|---|
| 1679 | } | 
|---|
| 1680 |  | 
|---|
| 1681 | /// Returns true if and only if the given state ID is in this set. If the | 
|---|
| 1682 | /// set is inert, this always returns false. | 
|---|
| 1683 | fn contains(&self, state_id: StateID) -> bool { | 
|---|
| 1684 | match self.set { | 
|---|
| 1685 | None => false, | 
|---|
| 1686 | Some(ref set) => set.contains(&state_id), | 
|---|
| 1687 | } | 
|---|
| 1688 | } | 
|---|
| 1689 | } | 
|---|
| 1690 |  | 
|---|
impl core::fmt::Debug for NFA {
    // Renders the full automaton: one line per state (ID, failure target,
    // transition ranges), the pattern IDs for match states, and a trailing
    // summary of configuration and memory usage.
    //
    // NOTE(review): several format strings below contain doubled or trailing
    // spaces (e.g. "{:06} ({:06} ): "). They are preserved as-is here, but
    // look like they may be an artifact — confirm against the intended debug
    // output.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        use crate::{
            automaton::{fmt_state_indicator, sparse_transitions},
            util::debug::DebugByte,
        };

        writeln!(f, "noncontiguous::NFA(")?;
        for (sid, state) in self.states.iter().with_state_ids() {
            // The FAIL state doesn't actually have space for a state allocated
            // for it, so we have to treat it as a special case.
            if sid == NFA::FAIL {
                writeln!(f, "F {:06} :", sid.as_usize())?;
                continue;
            }
            fmt_state_indicator(f, self, sid)?;
            write!(
                f,
                "{:06} ({:06} ): ",
                sid.as_usize(),
                state.fail.as_usize()
            )?;

            // Collapse runs of adjacent bytes that share a target state into
            // ranges so the output stays compact.
            let it = sparse_transitions(
                self.iter_trans(sid).map(|t| (t.byte, t.next)),
            )
            .enumerate();
            for (i, (start, end, sid)) in it {
                if i > 0 {
                    write!(f, ", ")?;
                }
                if start == end {
                    // Single-byte range: print just the one byte.
                    write!(
                        f,
                        "{:?}  => {:?} ",
                        DebugByte(start),
                        sid.as_usize()
                    )?;
                } else {
                    write!(
                        f,
                        "{:?} -{:?}  => {:?} ",
                        DebugByte(start),
                        DebugByte(end),
                        sid.as_usize()
                    )?;
                }
            }

            write!(f, "\n ")?;
            // For match states, list the IDs of the patterns that match here.
            if self.is_match(sid) {
                write!(f, "         matches: ")?;
                for (i, pid) in self.iter_matches(sid).enumerate() {
                    if i > 0 {
                        write!(f, ", ")?;
                    }
                    write!(f, "{} ", pid.as_usize())?;
                }
                write!(f, "\n ")?;
            }
        }
        writeln!(f, "match kind: {:?} ", self.match_kind)?;
        writeln!(f, "prefilter: {:?} ", self.prefilter.is_some())?;
        writeln!(f, "state length: {:?} ", self.states.len())?;
        writeln!(f, "pattern length: {:?} ", self.patterns_len())?;
        writeln!(f, "shortest pattern length: {:?} ", self.min_pattern_len)?;
        writeln!(f, "longest pattern length: {:?} ", self.max_pattern_len)?;
        writeln!(f, "memory usage: {:?} ", self.memory_usage())?;
        writeln!(f, ")")?;
        Ok(())
    }
}
|---|
| 1763 |  | 
|---|