| 1 | /*! |
| 2 | Provides direct access to a DFA implementation of Aho-Corasick. |
| 3 | |
| 4 | This is a low-level API that generally only needs to be used in niche |
| 5 | circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) |
instead of a DFA directly. Using a `DFA` directly is typically only necessary
| 7 | when one needs access to the [`Automaton`] trait implementation. |
| 8 | */ |
| 9 | |
| 10 | use alloc::{vec, vec::Vec}; |
| 11 | |
| 12 | use crate::{ |
| 13 | automaton::Automaton, |
| 14 | nfa::noncontiguous, |
| 15 | util::{ |
| 16 | alphabet::ByteClasses, |
| 17 | error::{BuildError, MatchError}, |
| 18 | int::{Usize, U32}, |
| 19 | prefilter::Prefilter, |
| 20 | primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, |
| 21 | search::{Anchored, MatchKind, StartKind}, |
| 22 | special::Special, |
| 23 | }, |
| 24 | }; |
| 25 | |
/// A DFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using a `DFA` directly is typically only necessary when
/// one needs access to the [`Automaton`] trait implementation.
///
/// This DFA can only be built by first constructing a [`noncontiguous::NFA`].
/// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but
/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
///
/// A DFA provides the best possible search performance (in this crate) via two
/// mechanisms:
///
/// * All states use a dense representation for their transitions.
/// * All failure transitions are pre-computed such that they are never
/// explicitly handled at search time.
///
/// These two facts combined mean that every state transition is performed
/// using a constant number of instructions. However, this comes at
/// great cost. The memory usage of a DFA can be quite exorbitant.
/// It is potentially multiple orders of magnitude greater than a
/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange,
/// a DFA will typically have better search speed than a `contiguous::NFA`, but
/// not by orders of magnitude.
///
/// Unless you have a small number of patterns or memory usage is not a concern
/// and search performance is critical, a DFA is usually not the best choice.
///
/// Moreover, unlike the NFAs in this crate, it is costly for a DFA to
/// support for anchored and unanchored search configurations. Namely,
/// since failure transitions are pre-computed, supporting both anchored
/// and unanchored searches requires a duplication of the transition table,
/// making the memory usage of such a DFA ever bigger. (The NFAs in this crate
/// unconditionally support both anchored and unanchored searches because there
/// is essentially no added cost for doing so.) It is for this reason that
/// a DFA's support for anchored and unanchored searches can be configured
/// via [`Builder::start_kind`]. By default, a DFA only supports unanchored
/// searches.
///
/// # Example
///
/// This example shows how to build a `DFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     dfa::DFA,
///     Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let dfa = DFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     dfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive(Clone)]
pub struct DFA {
    /// The DFA transition table. IDs in this table are pre-multiplied. So
    /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride,
    /// 2*stride, 3*stride, ...
    trans: Vec<StateID>,
    /// The matches for every match state in this DFA. This is first indexed by
    /// state index (so that's `sid >> stride2`) and then by order in which the
    /// matches are meant to occur.
    matches: Vec<Vec<PatternID>>,
    /// The amount of heap memory used, in bytes, by the inner Vecs of
    /// 'matches'. Tracked separately because it cannot be cheaply recomputed
    /// from the outer Vec alone.
    matches_memory_usage: usize,
    /// The length of each pattern. This is used to compute the start offset
    /// of a match.
    pattern_lens: Vec<SmallIndex>,
    /// A prefilter for accelerating searches, if one exists.
    prefilter: Option<Prefilter>,
    /// The match semantics built into this DFA.
    match_kind: MatchKind,
    /// The total number of states in this DFA.
    state_len: usize,
    /// The alphabet size, or total number of equivalence classes, for this
    /// DFA. Note that the actual number of transitions in each state is
    /// stride=2^stride2, where stride is the smallest power of 2 greater than
    /// or equal to alphabet_len. We do things this way so that we can use
    /// bitshifting to go from a state ID to an index into 'matches'.
    alphabet_len: usize,
    /// The exponent with a base 2, such that stride=2^stride2. Given a state
    /// index 'i', its state identifier is 'i << stride2'. Given a state
    /// identifier 'sid', its state index is 'sid >> stride2'.
    stride2: usize,
    /// The equivalence classes for this DFA. All transitions are defined on
    /// equivalence classes and not on the 256 distinct byte values.
    byte_classes: ByteClasses,
    /// The length of the shortest pattern in this automaton.
    min_pattern_len: usize,
    /// The length of the longest pattern in this automaton.
    max_pattern_len: usize,
    /// The information required to deduce which states are "special" in this
    /// DFA.
    special: Special,
}
| 133 | |
| 134 | impl DFA { |
| 135 | /// Create a new Aho-Corasick DFA using the default configuration. |
| 136 | /// |
| 137 | /// Use a [`Builder`] if you want to change the configuration. |
| 138 | pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError> |
| 139 | where |
| 140 | I: IntoIterator<Item = P>, |
| 141 | P: AsRef<[u8]>, |
| 142 | { |
| 143 | DFA::builder().build(patterns) |
| 144 | } |
| 145 | |
| 146 | /// A convenience method for returning a new Aho-Corasick DFA builder. |
| 147 | /// |
| 148 | /// This usually permits one to just import the `DFA` type. |
| 149 | pub fn builder() -> Builder { |
| 150 | Builder::new() |
| 151 | } |
| 152 | } |
| 153 | |
| 154 | impl DFA { |
| 155 | /// A sentinel state ID indicating that a search should stop once it has |
| 156 | /// entered this state. When a search stops, it returns a match if one has |
| 157 | /// been found, otherwise no match. A DFA always has an actual dead state |
| 158 | /// at this ID. |
| 159 | /// |
| 160 | /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state. |
| 161 | /// Namely, the whole point of a DFA is that the FAIL state is completely |
| 162 | /// compiled away. That is, DFA construction involves pre-computing the |
| 163 | /// failure transitions everywhere, such that failure transitions are no |
| 164 | /// longer used at search time. This, combined with its uniformly dense |
| 165 | /// representation, are the two most important factors in why it's faster |
| 166 | /// than the NFAs in this crate. |
| 167 | const DEAD: StateID = StateID::new_unchecked(0); |
| 168 | |
| 169 | /// Adds the given pattern IDs as matches to the given state and also |
| 170 | /// records the added memory usage. |
| 171 | fn set_matches( |
| 172 | &mut self, |
| 173 | sid: StateID, |
| 174 | pids: impl Iterator<Item = PatternID>, |
| 175 | ) { |
| 176 | let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap(); |
| 177 | let mut at_least_one = false; |
| 178 | for pid in pids { |
| 179 | self.matches[index].push(pid); |
| 180 | self.matches_memory_usage += PatternID::SIZE; |
| 181 | at_least_one = true; |
| 182 | } |
| 183 | assert!(at_least_one, "match state must have non-empty pids" ); |
| 184 | } |
| 185 | } |
| 186 | |
| 187 | // SAFETY: 'start_state' always returns a valid state ID, 'next_state' always |
| 188 | // returns a valid state ID given a valid state ID. We otherwise claim that |
| 189 | // all other methods are correct as well. |
| 190 | unsafe impl Automaton for DFA { |
| 191 | #[inline (always)] |
| 192 | fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { |
| 193 | // Either of the start state IDs can be DEAD, in which case, support |
| 194 | // for that type of search is not provided by this DFA. Which start |
| 195 | // state IDs are inactive depends on the 'StartKind' configuration at |
| 196 | // DFA construction time. |
| 197 | match anchored { |
| 198 | Anchored::No => { |
| 199 | let start = self.special.start_unanchored_id; |
| 200 | if start == DFA::DEAD { |
| 201 | Err(MatchError::invalid_input_unanchored()) |
| 202 | } else { |
| 203 | Ok(start) |
| 204 | } |
| 205 | } |
| 206 | Anchored::Yes => { |
| 207 | let start = self.special.start_anchored_id; |
| 208 | if start == DFA::DEAD { |
| 209 | Err(MatchError::invalid_input_anchored()) |
| 210 | } else { |
| 211 | Ok(start) |
| 212 | } |
| 213 | } |
| 214 | } |
| 215 | } |
| 216 | |
| 217 | #[inline (always)] |
| 218 | fn next_state( |
| 219 | &self, |
| 220 | _anchored: Anchored, |
| 221 | sid: StateID, |
| 222 | byte: u8, |
| 223 | ) -> StateID { |
| 224 | let class = self.byte_classes.get(byte); |
| 225 | self.trans[(sid.as_u32() + u32::from(class)).as_usize()] |
| 226 | } |
| 227 | |
| 228 | #[inline (always)] |
| 229 | fn is_special(&self, sid: StateID) -> bool { |
| 230 | sid <= self.special.max_special_id |
| 231 | } |
| 232 | |
| 233 | #[inline (always)] |
| 234 | fn is_dead(&self, sid: StateID) -> bool { |
| 235 | sid == DFA::DEAD |
| 236 | } |
| 237 | |
| 238 | #[inline (always)] |
| 239 | fn is_match(&self, sid: StateID) -> bool { |
| 240 | !self.is_dead(sid) && sid <= self.special.max_match_id |
| 241 | } |
| 242 | |
| 243 | #[inline (always)] |
| 244 | fn is_start(&self, sid: StateID) -> bool { |
| 245 | sid == self.special.start_unanchored_id |
| 246 | || sid == self.special.start_anchored_id |
| 247 | } |
| 248 | |
| 249 | #[inline (always)] |
| 250 | fn match_kind(&self) -> MatchKind { |
| 251 | self.match_kind |
| 252 | } |
| 253 | |
| 254 | #[inline (always)] |
| 255 | fn patterns_len(&self) -> usize { |
| 256 | self.pattern_lens.len() |
| 257 | } |
| 258 | |
| 259 | #[inline (always)] |
| 260 | fn pattern_len(&self, pid: PatternID) -> usize { |
| 261 | self.pattern_lens[pid].as_usize() |
| 262 | } |
| 263 | |
| 264 | #[inline (always)] |
| 265 | fn min_pattern_len(&self) -> usize { |
| 266 | self.min_pattern_len |
| 267 | } |
| 268 | |
| 269 | #[inline (always)] |
| 270 | fn max_pattern_len(&self) -> usize { |
| 271 | self.max_pattern_len |
| 272 | } |
| 273 | |
| 274 | #[inline (always)] |
| 275 | fn match_len(&self, sid: StateID) -> usize { |
| 276 | debug_assert!(self.is_match(sid)); |
| 277 | let offset = (sid.as_usize() >> self.stride2) - 2; |
| 278 | self.matches[offset].len() |
| 279 | } |
| 280 | |
| 281 | #[inline (always)] |
| 282 | fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { |
| 283 | debug_assert!(self.is_match(sid)); |
| 284 | let offset = (sid.as_usize() >> self.stride2) - 2; |
| 285 | self.matches[offset][index] |
| 286 | } |
| 287 | |
| 288 | #[inline (always)] |
| 289 | fn memory_usage(&self) -> usize { |
| 290 | use core::mem::size_of; |
| 291 | |
| 292 | (self.trans.len() * size_of::<u32>()) |
| 293 | + (self.matches.len() * size_of::<Vec<PatternID>>()) |
| 294 | + self.matches_memory_usage |
| 295 | + (self.pattern_lens.len() * size_of::<SmallIndex>()) |
| 296 | + self.prefilter.as_ref().map_or(0, |p| p.memory_usage()) |
| 297 | } |
| 298 | |
| 299 | #[inline (always)] |
| 300 | fn prefilter(&self) -> Option<&Prefilter> { |
| 301 | self.prefilter.as_ref() |
| 302 | } |
| 303 | } |
| 304 | |
impl core::fmt::Debug for DFA {
    // Renders a human-readable, multi-line dump of the automaton: one line
    // per state (with transitions coalesced into sparse byte ranges), the
    // pattern IDs for each match state, and trailing summary metadata.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        use crate::{
            automaton::{fmt_state_indicator, sparse_transitions},
            util::debug::DebugByte,
        };

        writeln!(f, "dfa::DFA(")?;
        for index in 0..self.state_len {
            // State IDs are premultiplied by the stride, so the ID of the
            // state at index 'i' is 'i << stride2'.
            let sid = StateID::new_unchecked(index << self.stride2);
            // While we do currently include the FAIL state in the transition
            // table (to simplify construction), it is never actually used. It
            // poses problems with the code below because it gets treated as
            // a match state incidentally when it is, of course, not. So we
            // special case it. The fail state is always the first state after
            // the dead state.
            //
            // If the construction is changed to remove the fail state (it
            // probably should be), then this special case should be updated.
            if index == 1 {
                writeln!(f, "F {:06}:", sid.as_usize())?;
                continue;
            }
            fmt_state_indicator(f, self, sid)?;
            write!(f, " {:06}: ", sid.as_usize())?;

            // Pair every equivalence class with its target state so that
            // 'sparse_transitions' can coalesce runs of identical targets
            // into '[start-end] => next' ranges.
            let it = (0..self.byte_classes.alphabet_len()).map(|class| {
                (class.as_u8(), self.trans[sid.as_usize() + class])
            });
            for (i, (start, end, next)) in sparse_transitions(it).enumerate() {
                if i > 0 {
                    write!(f, ", ")?;
                }
                if start == end {
                    // Single-byte range: print the one byte only.
                    write!(
                        f,
                        " {:?} => {:?}",
                        DebugByte(start),
                        next.as_usize()
                    )?;
                } else {
                    write!(
                        f,
                        " {:?}- {:?} => {:?}",
                        DebugByte(start),
                        DebugByte(end),
                        next.as_usize()
                    )?;
                }
            }
            write!(f, " \n")?;
            // For match states, also list the pattern IDs that match here.
            if self.is_match(sid) {
                write!(f, " matches: ")?;
                for i in 0..self.match_len(sid) {
                    if i > 0 {
                        write!(f, ", ")?;
                    }
                    let pid = self.match_pattern(sid, i);
                    write!(f, " {}", pid.as_usize())?;
                }
                write!(f, " \n")?;
            }
        }
        // Summary metadata for the whole automaton.
        writeln!(f, "match kind: {:?}", self.match_kind)?;
        writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
        writeln!(f, "state length: {:?}", self.state_len)?;
        writeln!(f, "pattern length: {:?}", self.patterns_len())?;
        writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
        writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
        writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
        writeln!(f, "stride: {:?}", 1 << self.stride2)?;
        writeln!(f, "byte classes: {:?}", self.byte_classes)?;
        writeln!(f, "memory usage: {:?}", self.memory_usage())?;
        writeln!(f, ")")?;
        Ok(())
    }
}
| 382 | |
/// A builder for configuring an Aho-Corasick DFA.
///
/// This builder has a subset of the options available to a
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive(Clone, Debug)]
pub struct Builder {
    /// Builder for the noncontiguous NFA that the DFA is built from.
    noncontiguous: noncontiguous::Builder,
    /// Which start-state configurations (unanchored, anchored or both) the
    /// resulting DFA should support.
    start_kind: StartKind,
    /// Whether to shrink the alphabet via byte equivalence classes.
    byte_classes: bool,
}
| 394 | |
| 395 | impl Default for Builder { |
| 396 | fn default() -> Builder { |
| 397 | Builder { |
| 398 | noncontiguous: noncontiguous::Builder::new(), |
| 399 | start_kind: StartKind::Unanchored, |
| 400 | byte_classes: true, |
| 401 | } |
| 402 | } |
| 403 | } |
| 404 | |
| 405 | impl Builder { |
| 406 | /// Create a new builder for configuring an Aho-Corasick DFA. |
| 407 | pub fn new() -> Builder { |
| 408 | Builder::default() |
| 409 | } |
| 410 | |
| 411 | /// Build an Aho-Corasick DFA from the given iterator of patterns. |
| 412 | /// |
| 413 | /// A builder may be reused to create more DFAs. |
| 414 | pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError> |
| 415 | where |
| 416 | I: IntoIterator<Item = P>, |
| 417 | P: AsRef<[u8]>, |
| 418 | { |
| 419 | let nnfa = self.noncontiguous.build(patterns)?; |
| 420 | self.build_from_noncontiguous(&nnfa) |
| 421 | } |
| 422 | |
    /// Build an Aho-Corasick DFA from the given noncontiguous NFA.
    ///
    /// Note that when this method is used, only the `start_kind` and
    /// `byte_classes` settings on this builder are respected. The other
    /// settings only apply to the initial construction of the Aho-Corasick
    /// automaton. Since using this method requires that initial construction
    /// has already completed, all settings impacting only initial construction
    /// are no longer relevant.
    pub fn build_from_noncontiguous(
        &self,
        nnfa: &noncontiguous::NFA,
    ) -> Result<DFA, BuildError> {
        debug!("building DFA");
        // Honor the 'byte_classes' knob: reuse the NFA's equivalence classes,
        // or fall back to one class per byte value (useful for debugging).
        let byte_classes = if self.byte_classes {
            nnfa.byte_classes().clone()
        } else {
            ByteClasses::singletons()
        };
        let state_len = match self.start_kind {
            StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(),
            StartKind::Both => {
                // These unwraps are OK because we know that the number of
                // NFA states is < StateID::LIMIT which is in turn less than
                // i32::MAX. Thus, there is always room to multiply by 2.
                // Finally, the number of states is always at least 4 in the
                // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the
                // subtraction of 4 is okay.
                //
                // Note that we subtract 4 because the "anchored" part of
                // the DFA duplicates the unanchored part (without failure
                // transitions), but reuses the DEAD, FAIL and START states.
                nnfa.states()
                    .len()
                    .checked_mul(2)
                    .unwrap()
                    .checked_sub(4)
                    .unwrap()
            }
        };
        // Every state occupies 'stride = 2^stride2' slots in the transition
        // table, so the table length is 'state_len << stride2'. Overflow here
        // means the DFA is too big to represent at all.
        let trans_len =
            match state_len.checked_shl(byte_classes.stride2().as_u32()) {
                Some(trans_len) => trans_len,
                None => {
                    return Err(BuildError::state_id_overflow(
                        StateID::MAX.as_u64(),
                        usize::MAX.as_u64(),
                    ))
                }
            };
        // Verify up-front that the largest premultiplied state ID (the first
        // slot of the last state, i.e. trans_len - stride) is representable
        // as a StateID. This is what permits the unchecked ID constructions
        // in the 'finish_build_*' helpers below.
        StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap())
            .map_err(|e| {
                BuildError::state_id_overflow(
                    StateID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
        // Match states occupy a contiguous range starting at index 2 (after
        // DEAD and FAIL), so 'max_match_id - 1' counts them. With 'Both',
        // each match state has an unanchored and an anchored copy.
        let num_match_states = match self.start_kind {
            StartKind::Unanchored | StartKind::Anchored => {
                nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap()
            }
            StartKind::Both => nnfa
                .special()
                .max_match_id
                .as_usize()
                .checked_sub(1)
                .unwrap()
                .checked_mul(2)
                .unwrap(),
        };
        // Start with an all-DEAD transition table; the helpers below fill
        // in real transitions and the special state IDs.
        let mut dfa = DFA {
            trans: vec![DFA::DEAD; trans_len],
            matches: vec![vec![]; num_match_states],
            matches_memory_usage: 0,
            pattern_lens: nnfa.pattern_lens_raw().to_vec(),
            prefilter: nnfa.prefilter().map(|p| p.clone()),
            match_kind: nnfa.match_kind(),
            state_len,
            alphabet_len: byte_classes.alphabet_len(),
            stride2: byte_classes.stride2(),
            byte_classes,
            min_pattern_len: nnfa.min_pattern_len(),
            max_pattern_len: nnfa.max_pattern_len(),
            // The special state IDs are set later.
            special: Special::zero(),
        };
        match self.start_kind {
            StartKind::Both => {
                self.finish_build_both_starts(nnfa, &mut dfa);
            }
            StartKind::Unanchored => {
                self.finish_build_one_start(Anchored::No, nnfa, &mut dfa);
            }
            StartKind::Anchored => {
                self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa)
            }
        }
        debug!(
            "DFA built, <states: {:?}, size: {:?}, \
             alphabet len: {:?}, stride: {:?}>",
            dfa.state_len,
            dfa.memory_usage(),
            dfa.byte_classes.alphabet_len(),
            dfa.byte_classes.stride(),
        );
        // The vectors can grow ~twice as big during construction because a
        // Vec amortizes growth. But here, let's shrink things back down to
        // what we actually need since we're never going to add more to it.
        dfa.trans.shrink_to_fit();
        dfa.pattern_lens.shrink_to_fit();
        dfa.matches.shrink_to_fit();
        // TODO: We might also want to shrink each Vec inside of `dfa.matches`,
        // or even better, convert it to one contiguous allocation. But I think
        // I went with nested allocs for good reason (can't remember), so this
        // may be tricky to do. I decided not to shrink them here because it
        // might require a fair bit of work to do. It's unclear whether it's
        // worth it.
        Ok(dfa)
    }
| 541 | |
    /// Finishes building a DFA for either unanchored or anchored searches,
    /// but NOT both.
    fn finish_build_one_start(
        &self,
        anchored: Anchored,
        nnfa: &noncontiguous::NFA,
        dfa: &mut DFA,
    ) {
        // This function always succeeds because we check above that all of the
        // states in the NFA can be mapped to DFA state IDs.
        let stride2 = dfa.stride2;
        // NFA state IDs are sequential (0, 1, 2, ...) while DFA IDs are
        // premultiplied by the stride, so remapping is a left shift.
        let old2new = |oldsid: StateID| {
            StateID::new_unchecked(oldsid.as_usize() << stride2)
        };
        for (oldsid, state) in nnfa.states().iter().with_state_ids() {
            let newsid = old2new(oldsid);
            if state.is_match() {
                dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
            }
            // Densify this state: visit every equivalence class once, with
            // FAIL reported for classes lacking an explicit transition.
            sparse_iter(
                nnfa,
                oldsid,
                &dfa.byte_classes,
                |byte, class, mut oldnextsid| {
                    if oldnextsid == noncontiguous::NFA::FAIL {
                        if anchored.is_anchored() {
                            // Anchored searches never fall back through
                            // failure transitions: a missing transition is a
                            // definitive non-match.
                            oldnextsid = noncontiguous::NFA::DEAD;
                        } else if state.fail() == noncontiguous::NFA::DEAD {
                            // This is a special case that avoids following
                            // DEAD transitions in a non-contiguous NFA.
                            // Following these transitions is pretty slow
                            // because the non-contiguous NFA will always use
                            // a sparse representation for it (because the
                            // DEAD state is usually treated as a sentinel).
                            // The *vast* majority of failure states are DEAD
                            // states, so this winds up being pretty slow if
                            // we go through the non-contiguous NFA state
                            // transition logic. Instead, just do it ourselves.
                            oldnextsid = noncontiguous::NFA::DEAD;
                        } else {
                            // Pre-compute the failure transition now. This is
                            // precisely what makes the result a DFA: failure
                            // transitions never need to be chased at search
                            // time.
                            oldnextsid = nnfa.next_state(
                                Anchored::No,
                                state.fail(),
                                byte,
                            );
                        }
                    }
                    dfa.trans[newsid.as_usize() + usize::from(class)] =
                        old2new(oldnextsid);
                },
            );
        }
        // Now that we've remapped all the IDs in our states, all that's left
        // is remapping the special state IDs.
        let old = nnfa.special();
        let new = &mut dfa.special;
        new.max_special_id = old2new(old.max_special_id);
        new.max_match_id = old2new(old.max_match_id);
        if anchored.is_anchored() {
            // The unsupported start state is marked DEAD so that
            // 'start_state' reports an error for that configuration.
            new.start_unanchored_id = DFA::DEAD;
            new.start_anchored_id = old2new(old.start_anchored_id);
        } else {
            new.start_unanchored_id = old2new(old.start_unanchored_id);
            new.start_anchored_id = DFA::DEAD;
        }
    }
| 608 | |
    /// Finishes building a DFA that supports BOTH unanchored and anchored
    /// searches. It works by inter-leaving unanchored states with anchored
    /// states in the same transition table. This way, we avoid needing to
    /// re-shuffle states afterward to ensure that our states still look like
    /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ...
    ///
    /// Honestly this is pretty inscrutable... Simplifications are most
    /// welcome.
    fn finish_build_both_starts(
        &self,
        nnfa: &noncontiguous::NFA,
        dfa: &mut DFA,
    ) {
        let stride2 = dfa.stride2;
        let stride = 1 << stride2;
        // Maps from old NFA state ID to the DFA ID of its unanchored copy
        // and its anchored copy, respectively. States not present in a given
        // half map to DEAD.
        let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()];
        let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()];
        // Records, per DFA state index, whether that state belongs to the
        // anchored half. Used by the remapping pass at the end.
        let mut is_anchored = vec![false; dfa.state_len];
        // The next DFA state ID to hand out; advances by one stride at a
        // time because DFA IDs are premultiplied.
        let mut newsid = DFA::DEAD;
        let next_dfa_id =
            |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride);
        // First pass: lay out DFA states (interleaving unanchored/anchored
        // copies) and write transitions, still using OLD NFA state IDs. They
        // are remapped to DFA IDs in the second pass below.
        for (oldsid, state) in nnfa.states().iter().with_state_ids() {
            if oldsid == noncontiguous::NFA::DEAD
                || oldsid == noncontiguous::NFA::FAIL
            {
                // DEAD and FAIL are shared between both halves; they get a
                // single copy each.
                remap_unanchored[oldsid] = newsid;
                remap_anchored[oldsid] = newsid;
                newsid = next_dfa_id(newsid);
            } else if oldsid == nnfa.special().start_unanchored_id
                || oldsid == nnfa.special().start_anchored_id
            {
                // Each start state belongs to exactly one half; the other
                // half maps it to DEAD.
                if oldsid == nnfa.special().start_unanchored_id {
                    remap_unanchored[oldsid] = newsid;
                    remap_anchored[oldsid] = DFA::DEAD;
                } else {
                    remap_unanchored[oldsid] = DFA::DEAD;
                    remap_anchored[oldsid] = newsid;
                    is_anchored[newsid.as_usize() >> stride2] = true;
                }
                if state.is_match() {
                    dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
                }
                sparse_iter(
                    nnfa,
                    oldsid,
                    &dfa.byte_classes,
                    |_, class, oldnextsid| {
                        let class = usize::from(class);
                        if oldnextsid == noncontiguous::NFA::FAIL {
                            // Start states have no failure fallback: a
                            // missing transition is DEAD.
                            dfa.trans[newsid.as_usize() + class] = DFA::DEAD;
                        } else {
                            dfa.trans[newsid.as_usize() + class] = oldnextsid;
                        }
                    },
                );
                newsid = next_dfa_id(newsid);
            } else {
                // Ordinary state: allocate two adjacent DFA states, the
                // unanchored copy first and the anchored copy second.
                let unewsid = newsid;
                newsid = next_dfa_id(newsid);
                let anewsid = newsid;
                newsid = next_dfa_id(newsid);

                remap_unanchored[oldsid] = unewsid;
                remap_anchored[oldsid] = anewsid;
                is_anchored[anewsid.as_usize() >> stride2] = true;
                if state.is_match() {
                    dfa.set_matches(unewsid, nnfa.iter_matches(oldsid));
                    dfa.set_matches(anewsid, nnfa.iter_matches(oldsid));
                }
                sparse_iter(
                    nnfa,
                    oldsid,
                    &dfa.byte_classes,
                    |byte, class, oldnextsid| {
                        let class = usize::from(class);
                        if oldnextsid == noncontiguous::NFA::FAIL {
                            // Only the unanchored copy gets pre-computed
                            // failure transitions; the anchored copy keeps
                            // DEAD (its initialized value) for this class.
                            let oldnextsid =
                                if state.fail() == noncontiguous::NFA::DEAD {
                                    noncontiguous::NFA::DEAD
                                } else {
                                    nnfa.next_state(
                                        Anchored::No,
                                        state.fail(),
                                        byte,
                                    )
                                };
                            dfa.trans[unewsid.as_usize() + class] = oldnextsid;
                        } else {
                            dfa.trans[unewsid.as_usize() + class] = oldnextsid;
                            dfa.trans[anewsid.as_usize() + class] = oldnextsid;
                        }
                    },
                );
            }
        }
        // Second pass: rewrite every transition from an old NFA ID to the
        // DFA ID of the correct half, so the two halves never cross over.
        for i in 0..dfa.state_len {
            let sid = i << stride2;
            if is_anchored[i] {
                for next in dfa.trans[sid..][..stride].iter_mut() {
                    *next = remap_anchored[*next];
                }
            } else {
                for next in dfa.trans[sid..][..stride].iter_mut() {
                    *next = remap_unanchored[*next];
                }
            }
        }
        // Now that we've remapped all the IDs in our states, all that's left
        // is remapping the special state IDs.
        let old = nnfa.special();
        let new = &mut dfa.special;
        new.max_special_id = remap_anchored[old.max_special_id];
        new.max_match_id = remap_anchored[old.max_match_id];
        new.start_unanchored_id = remap_unanchored[old.start_unanchored_id];
        new.start_anchored_id = remap_anchored[old.start_anchored_id];
    }
| 725 | |
| 726 | /// Set the desired match semantics. |
| 727 | /// |
| 728 | /// This only applies when using [`Builder::build`] and not |
| 729 | /// [`Builder::build_from_noncontiguous`]. |
| 730 | /// |
| 731 | /// See |
| 732 | /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) |
| 733 | /// for more documentation and examples. |
| 734 | pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { |
| 735 | self.noncontiguous.match_kind(kind); |
| 736 | self |
| 737 | } |
| 738 | |
| 739 | /// Enable ASCII-aware case insensitive matching. |
| 740 | /// |
| 741 | /// This only applies when using [`Builder::build`] and not |
| 742 | /// [`Builder::build_from_noncontiguous`]. |
| 743 | /// |
| 744 | /// See |
| 745 | /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) |
| 746 | /// for more documentation and examples. |
| 747 | pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { |
| 748 | self.noncontiguous.ascii_case_insensitive(yes); |
| 749 | self |
| 750 | } |
| 751 | |
| 752 | /// Enable heuristic prefilter optimizations. |
| 753 | /// |
| 754 | /// This only applies when using [`Builder::build`] and not |
| 755 | /// [`Builder::build_from_noncontiguous`]. |
| 756 | /// |
| 757 | /// See |
| 758 | /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) |
| 759 | /// for more documentation and examples. |
| 760 | pub fn prefilter(&mut self, yes: bool) -> &mut Builder { |
| 761 | self.noncontiguous.prefilter(yes); |
| 762 | self |
| 763 | } |
| 764 | |
| 765 | /// Sets the starting state configuration for the automaton. |
| 766 | /// |
| 767 | /// See |
| 768 | /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) |
| 769 | /// for more documentation and examples. |
| 770 | pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder { |
| 771 | self.start_kind = kind; |
| 772 | self |
| 773 | } |
| 774 | |
| 775 | /// A debug setting for whether to attempt to shrink the size of the |
| 776 | /// automaton's alphabet or not. |
| 777 | /// |
| 778 | /// This should never be enabled unless you're debugging an automaton. |
| 779 | /// Namely, disabling byte classes makes transitions easier to reason |
| 780 | /// about, since they use the actual bytes instead of equivalence classes. |
| 781 | /// Disabling this confers no performance benefit at search time. |
| 782 | /// |
| 783 | /// See |
| 784 | /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes) |
| 785 | /// for more documentation and examples. |
| 786 | pub fn byte_classes(&mut self, yes: bool) -> &mut Builder { |
| 787 | self.byte_classes = yes; |
| 788 | self |
| 789 | } |
| 790 | } |
| 791 | |
/// Iterate over all possible equivalence class transitions in this state.
/// The closure is called for all transitions with a distinct equivalence
/// class, even those not explicitly represented in this sparse state. For
/// any implicitly defined transitions, the given closure is called with
/// the fail state ID.
///
/// The closure is guaranteed to be called precisely
/// `byte_classes.alphabet_len()` times, once for every possible class in
/// ascending order.
fn sparse_iter<F: FnMut(u8, u8, StateID)>(
    nnfa: &noncontiguous::NFA,
    oldsid: StateID,
    classes: &ByteClasses,
    mut f: F,
) {
    // Tracks the last class reported to 'f', so that each class is reported
    // exactly once even though many byte values map to the same class. This
    // works because classes are contiguous over ascending byte values.
    let mut prev_class = None;
    // The next byte value to account for; walks 0..=255 in order.
    let mut byte = 0usize;
    for t in nnfa.iter_trans(oldsid) {
        // Report FAIL for every byte below this explicit transition that
        // hasn't had its class reported yet.
        while byte < usize::from(t.byte()) {
            let rep = byte.as_u8();
            let class = classes.get(rep);
            byte += 1;
            if prev_class != Some(class) {
                f(rep, class, noncontiguous::NFA::FAIL);
                prev_class = Some(class);
            }
        }
        // Report the explicit transition itself (once per class).
        let rep = t.byte();
        let class = classes.get(rep);
        byte += 1;
        if prev_class != Some(class) {
            f(rep, class, t.next());
            prev_class = Some(class);
        }
    }
    // Everything past the last explicit transition is implicitly FAIL.
    for b in byte..=255 {
        let rep = b.as_u8();
        let class = classes.get(rep);
        if prev_class != Some(class) {
            f(rep, class, noncontiguous::NFA::FAIL);
            prev_class = Some(class);
        }
    }
}
| 836 | |