| 1 | /*! |
| 2 | Provides [`Automaton`] trait for abstracting over Aho-Corasick automata. |
| 3 | |
| 4 | The `Automaton` trait provides a way to write generic code over any |
| 5 | Aho-Corasick automaton. It also provides access to lower level APIs that |
| 6 | permit walking the state transitions of an Aho-Corasick automaton manually. |
| 7 | */ |
| 8 | |
| 9 | use alloc::{string::String, vec::Vec}; |
| 10 | |
| 11 | use crate::util::{ |
| 12 | error::MatchError, |
| 13 | primitives::PatternID, |
| 14 | search::{Anchored, Input, Match, MatchKind, Span}, |
| 15 | }; |
| 16 | |
| 17 | pub use crate::util::{ |
| 18 | prefilter::{Candidate, Prefilter}, |
| 19 | primitives::{StateID, StateIDError}, |
| 20 | }; |
| 21 | |
| 22 | /// We seal the `Automaton` trait for now. It's a big trait, and it's |
| 23 | /// conceivable that I might want to add new required methods, and sealing the |
| 24 | /// trait permits doing that in a backwards compatible fashion. On other the |
| 25 | /// hand, if you have a solid use case for implementing the trait yourself, |
| 26 | /// please file an issue and we can discuss it. This was *mostly* done as a |
| 27 | /// conservative step. |
| 28 | pub(crate) mod private { |
| 29 | pub trait Sealed {} |
| 30 | } |
| 31 | impl private::Sealed for crate::nfa::noncontiguous::NFA {} |
| 32 | impl private::Sealed for crate::nfa::contiguous::NFA {} |
| 33 | impl private::Sealed for crate::dfa::DFA {} |
| 34 | |
| 35 | impl<'a, T: private::Sealed + ?Sized> private::Sealed for &'a T {} |
| 36 | |
| 37 | /// A trait that abstracts over Aho-Corasick automata. |
| 38 | /// |
| 39 | /// This trait primarily exists for niche use cases such as: |
| 40 | /// |
| 41 | /// * Using an NFA or DFA directly, bypassing the top-level |
| 42 | /// [`AhoCorasick`](crate::AhoCorasick) searcher. Currently, these include |
| 43 | /// [`noncontiguous::NFA`](crate::nfa::noncontiguous::NFA), |
| 44 | /// [`contiguous::NFA`](crate::nfa::contiguous::NFA) and |
| 45 | /// [`dfa::DFA`](crate::dfa::DFA). |
| 46 | /// * Implementing your own custom search routine by walking the automaton |
| 47 | /// yourself. This might be useful for implementing search on non-contiguous |
| 48 | /// strings or streams. |
| 49 | /// |
| 50 | /// For most use cases, it is not expected that users will need |
| 51 | /// to use or even know about this trait. Indeed, the top level |
| 52 | /// [`AhoCorasick`](crate::AhoCorasick) searcher does not expose any details |
| 53 | /// about this trait, nor does it implement it itself. |
| 54 | /// |
| 55 | /// Note that this trait defines a number of default methods, such as |
| 56 | /// [`Automaton::try_find`] and [`Automaton::try_find_iter`], which implement |
| 57 | /// higher level search routines in terms of the lower level automata API. |
| 58 | /// |
| 59 | /// # Sealed |
| 60 | /// |
| 61 | /// Currently, this trait is sealed. That means users of this crate can write |
| 62 | /// generic routines over this trait but cannot implement it themselves. This |
| 63 | /// restriction may be lifted in the future, but sealing the trait permits |
| 64 | /// adding new required methods in a backwards compatible fashion. |
| 65 | /// |
| 66 | /// # Special states |
| 67 | /// |
| 68 | /// This trait encodes a notion of "special" states in an automaton. Namely, |
| 69 | /// a state is treated as special if it is a dead, match or start state: |
| 70 | /// |
| 71 | /// * A dead state is a state that cannot be left once entered. All transitions |
| 72 | /// on a dead state lead back to itself. The dead state is meant to be treated |
| 73 | /// as a sentinel indicating that the search should stop and return a match if |
| 74 | /// one has been found, and nothing otherwise. |
| 75 | /// * A match state is a state that indicates one or more patterns have |
| 76 | /// matched. Depending on the [`MatchKind`] of the automaton, a search may |
| 77 | /// stop once a match is seen, or it may continue looking for matches until |
| 78 | /// it enters a dead state or sees the end of the haystack. |
| 79 | /// * A start state is a state that a search begins in. It is useful to know |
| 80 | /// when a search enters a start state because it may mean that a prefilter can |
| 81 | /// be used to skip ahead and quickly look for candidate matches. Unlike dead |
| 82 | /// and match states, it is never necessary to explicitly handle start states |
| 83 | /// for correctness. Indeed, in this crate, implementations of `Automaton` |
| 84 | /// will only treat start states as "special" when a prefilter is enabled and |
| 85 | /// active. Otherwise, treating it as special has no purpose and winds up |
| 86 | /// slowing down the overall search because it results in ping-ponging between |
| 87 | /// the main state transition and the "special" state logic. |
| 88 | /// |
| 89 | /// Since checking whether a state is special by doing three different |
| 90 | /// checks would be too expensive inside a fast search loop, the |
| 91 | /// [`Automaton::is_special`] method is provided for quickly checking whether |
| 92 | /// the state is special. The `Automaton::is_dead`, `Automaton::is_match` and |
| 93 | /// `Automaton::is_start` predicates can then be used to determine which kind |
| 94 | /// of special state it is. |
| 95 | /// |
| 96 | /// # Panics |
| 97 | /// |
| 98 | /// Most of the APIs on this trait should panic or give incorrect results |
| 99 | /// if invalid inputs are given to it. For example, `Automaton::next_state` |
| 100 | /// has unspecified behavior if the state ID given to it is not a valid |
| 101 | /// state ID for the underlying automaton. Valid state IDs can only be |
| 102 | /// retrieved in one of two ways: calling `Automaton::start_state` or calling |
| 103 | /// `Automaton::next_state` with a valid state ID. |
| 104 | /// |
| 105 | /// # Safety |
| 106 | /// |
| 107 | /// This trait is not safe to implement so that code may rely on the |
| 108 | /// correctness of implementations of this trait to avoid undefined behavior. |
| 109 | /// The primary correctness guarantees are: |
| 110 | /// |
| 111 | /// * `Automaton::start_state` always returns a valid state ID or an error or |
| 112 | /// panics. |
| 113 | /// * `Automaton::next_state`, when given a valid state ID, always returns |
| 114 | /// a valid state ID for all values of `anchored` and `byte`, or otherwise |
| 115 | /// panics. |
| 116 | /// |
| 117 | /// In general, the rest of the methods on `Automaton` need to uphold their |
| 118 | /// contracts as well. For example, `Automaton::is_dead` should only returns |
| 119 | /// true if the given state ID is actually a dead state. |
| 120 | /// |
| 121 | /// Note that currently this crate does not rely on the safety property defined |
| 122 | /// here to avoid undefined behavior. Instead, this was done to make it |
| 123 | /// _possible_ to do in the future. |
| 124 | /// |
| 125 | /// # Example |
| 126 | /// |
| 127 | /// This example shows how one might implement a basic but correct search |
| 128 | /// routine. We keep things simple by not using prefilters or worrying about |
| 129 | /// anchored searches, but do make sure our search is correct for all possible |
| 130 | /// [`MatchKind`] semantics. (The comments in the code below note the parts |
| 131 | /// that are needed to support certain `MatchKind` semantics.) |
| 132 | /// |
| 133 | /// ``` |
| 134 | /// use aho_corasick::{ |
| 135 | /// automaton::Automaton, |
| 136 | /// nfa::noncontiguous::NFA, |
| 137 | /// Anchored, Match, MatchError, MatchKind, |
| 138 | /// }; |
| 139 | /// |
| 140 | /// // Run an unanchored search for 'aut' in 'haystack'. Return the first match |
| 141 | /// // seen according to the automaton's match semantics. This returns an error |
| 142 | /// // if the given automaton does not support unanchored searches. |
| 143 | /// fn find<A: Automaton>( |
| 144 | /// aut: A, |
| 145 | /// haystack: &[u8], |
| 146 | /// ) -> Result<Option<Match>, MatchError> { |
| 147 | /// let mut sid = aut.start_state(Anchored::No)?; |
| 148 | /// let mut at = 0; |
| 149 | /// let mut mat = None; |
| 150 | /// let get_match = |sid, at| { |
| 151 | /// let pid = aut.match_pattern(sid, 0); |
| 152 | /// let len = aut.pattern_len(pid); |
| 153 | /// Match::new(pid, (at - len)..at) |
| 154 | /// }; |
| 155 | /// // Start states can be match states! |
| 156 | /// if aut.is_match(sid) { |
| 157 | /// mat = Some(get_match(sid, at)); |
| 158 | /// // Standard semantics require matches to be reported as soon as |
| 159 | /// // they're seen. Otherwise, we continue until we see a dead state |
| 160 | /// // or the end of the haystack. |
| 161 | /// if matches!(aut.match_kind(), MatchKind::Standard) { |
| 162 | /// return Ok(mat); |
| 163 | /// } |
| 164 | /// } |
| 165 | /// while at < haystack.len() { |
| 166 | /// sid = aut.next_state(Anchored::No, sid, haystack[at]); |
| 167 | /// if aut.is_special(sid) { |
| 168 | /// if aut.is_dead(sid) { |
| 169 | /// return Ok(mat); |
| 170 | /// } else if aut.is_match(sid) { |
| 171 | /// mat = Some(get_match(sid, at + 1)); |
| 172 | /// // As above, standard semantics require that we return |
| 173 | /// // immediately once a match is found. |
| 174 | /// if matches!(aut.match_kind(), MatchKind::Standard) { |
| 175 | /// return Ok(mat); |
| 176 | /// } |
| 177 | /// } |
| 178 | /// } |
| 179 | /// at += 1; |
| 180 | /// } |
| 181 | /// Ok(mat) |
| 182 | /// } |
| 183 | /// |
| 184 | /// // Show that it works for standard searches. |
| 185 | /// let nfa = NFA::new(&["samwise" , "sam" ]).unwrap(); |
| 186 | /// assert_eq!(Some(Match::must(1, 0..3)), find(&nfa, b"samwise" )?); |
| 187 | /// |
| 188 | /// // But also works when using leftmost-first. Notice how the match result |
| 189 | /// // has changed! |
| 190 | /// let nfa = NFA::builder() |
| 191 | /// .match_kind(MatchKind::LeftmostFirst) |
| 192 | /// .build(&["samwise" , "sam" ]) |
| 193 | /// .unwrap(); |
| 194 | /// assert_eq!(Some(Match::must(0, 0..7)), find(&nfa, b"samwise" )?); |
| 195 | /// |
| 196 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 197 | /// ``` |
| 198 | pub unsafe trait Automaton: private::Sealed { |
| 199 | /// Returns the starting state for the given anchor mode. |
| 200 | /// |
| 201 | /// Upon success, the state ID returned is guaranteed to be valid for |
| 202 | /// this automaton. |
| 203 | /// |
| 204 | /// # Errors |
| 205 | /// |
| 206 | /// This returns an error when the given search configuration is not |
| 207 | /// supported by the underlying automaton. For example, if the underlying |
| 208 | /// automaton only supports unanchored searches but the given configuration |
| 209 | /// was set to an anchored search, then this must return an error. |
| 210 | fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError>; |
| 211 | |
| 212 | /// Performs a state transition from `sid` for `byte` and returns the next |
| 213 | /// state. |
| 214 | /// |
| 215 | /// `anchored` should be [`Anchored::Yes`] when executing an anchored |
| 216 | /// search and [`Anchored::No`] otherwise. For some implementations of |
| 217 | /// `Automaton`, it is required to know whether the search is anchored |
| 218 | /// or not in order to avoid following failure transitions. Other |
| 219 | /// implementations may ignore `anchored` altogether and depend on |
| 220 | /// `Automaton::start_state` returning a state that walks a different path |
| 221 | /// through the automaton depending on whether the search is anchored or |
| 222 | /// not. |
| 223 | /// |
| 224 | /// # Panics |
| 225 | /// |
| 226 | /// This routine may panic or return incorrect results when the given state |
| 227 | /// ID is invalid. A state ID is valid if and only if: |
| 228 | /// |
| 229 | /// 1. It came from a call to `Automaton::start_state`, or |
| 230 | /// 2. It came from a previous call to `Automaton::next_state` with a |
| 231 | /// valid state ID. |
| 232 | /// |
| 233 | /// Implementations must treat all possible values of `byte` as valid. |
| 234 | /// |
| 235 | /// Implementations may panic on unsupported values of `anchored`, but are |
| 236 | /// not required to do so. |
| 237 | fn next_state( |
| 238 | &self, |
| 239 | anchored: Anchored, |
| 240 | sid: StateID, |
| 241 | byte: u8, |
| 242 | ) -> StateID; |
| 243 | |
| 244 | /// Returns true if the given ID represents a "special" state. A special |
| 245 | /// state is a dead, match or start state. |
| 246 | /// |
| 247 | /// Note that implementations may choose to return false when the given ID |
| 248 | /// corresponds to a start state. Namely, it always correct to treat start |
| 249 | /// states as non-special. Implementations must return true for states that |
| 250 | /// are dead or contain matches. |
| 251 | /// |
| 252 | /// This has unspecified behavior when given an invalid state ID. |
| 253 | fn is_special(&self, sid: StateID) -> bool; |
| 254 | |
| 255 | /// Returns true if the given ID represents a dead state. |
| 256 | /// |
| 257 | /// A dead state is a type of "sink" in a finite state machine. It |
| 258 | /// corresponds to a state whose transitions all loop back to itself. That |
| 259 | /// is, once entered, it can never be left. In practice, it serves as a |
| 260 | /// sentinel indicating that the search should terminate. |
| 261 | /// |
| 262 | /// This has unspecified behavior when given an invalid state ID. |
| 263 | fn is_dead(&self, sid: StateID) -> bool; |
| 264 | |
| 265 | /// Returns true if the given ID represents a match state. |
| 266 | /// |
| 267 | /// A match state is always associated with one or more pattern IDs that |
| 268 | /// matched at the position in the haystack when the match state was |
| 269 | /// entered. When a match state is entered, the match semantics dictate |
| 270 | /// whether it should be returned immediately (for `MatchKind::Standard`) |
| 271 | /// or if the search should continue (for `MatchKind::LeftmostFirst` and |
| 272 | /// `MatchKind::LeftmostLongest`) until a dead state is seen or the end of |
| 273 | /// the haystack has been reached. |
| 274 | /// |
| 275 | /// This has unspecified behavior when given an invalid state ID. |
| 276 | fn is_match(&self, sid: StateID) -> bool; |
| 277 | |
| 278 | /// Returns true if the given ID represents a start state. |
| 279 | /// |
| 280 | /// While it is never incorrect to ignore start states during a search |
| 281 | /// (except for the start of the search of course), knowing whether one has |
| 282 | /// entered a start state can be useful for certain classes of performance |
| 283 | /// optimizations. For example, if one is in a start state, it may be legal |
| 284 | /// to try to skip ahead and look for match candidates more quickly than |
| 285 | /// would otherwise be accomplished by walking the automaton. |
| 286 | /// |
| 287 | /// Implementations of `Automaton` in this crate "unspecialize" start |
| 288 | /// states when a prefilter is not active or enabled. In this case, it |
| 289 | /// is possible for `Automaton::is_special(sid)` to return false while |
| 290 | /// `Automaton::is_start(sid)` returns true. |
| 291 | /// |
| 292 | /// This has unspecified behavior when given an invalid state ID. |
| 293 | fn is_start(&self, sid: StateID) -> bool; |
| 294 | |
| 295 | /// Returns the match semantics that this automaton was built with. |
| 296 | fn match_kind(&self) -> MatchKind; |
| 297 | |
| 298 | /// Returns the total number of matches for the given state ID. |
| 299 | /// |
| 300 | /// This has unspecified behavior if the given ID does not refer to a match |
| 301 | /// state. |
| 302 | fn match_len(&self, sid: StateID) -> usize; |
| 303 | |
| 304 | /// Returns the pattern ID for the match state given by `sid` at the |
| 305 | /// `index` given. |
| 306 | /// |
| 307 | /// Typically, `index` is only ever greater than `0` when implementing an |
| 308 | /// overlapping search. Otherwise, it's likely that your search only cares |
| 309 | /// about reporting the first pattern ID in a match state. |
| 310 | /// |
| 311 | /// This has unspecified behavior if the given ID does not refer to a match |
| 312 | /// state, or if the index is greater than or equal to the total number of |
| 313 | /// matches in this match state. |
| 314 | fn match_pattern(&self, sid: StateID, index: usize) -> PatternID; |
| 315 | |
| 316 | /// Returns the total number of patterns compiled into this automaton. |
| 317 | fn patterns_len(&self) -> usize; |
| 318 | |
| 319 | /// Returns the length of the pattern for the given ID. |
| 320 | /// |
| 321 | /// This has unspecified behavior when given an invalid pattern |
| 322 | /// ID. A pattern ID is valid if and only if it is less than |
| 323 | /// `Automaton::patterns_len`. |
| 324 | fn pattern_len(&self, pid: PatternID) -> usize; |
| 325 | |
| 326 | /// Returns the length, in bytes, of the shortest pattern in this |
| 327 | /// automaton. |
| 328 | fn min_pattern_len(&self) -> usize; |
| 329 | |
| 330 | /// Returns the length, in bytes, of the longest pattern in this automaton. |
| 331 | fn max_pattern_len(&self) -> usize; |
| 332 | |
| 333 | /// Returns the heap memory usage, in bytes, used by this automaton. |
| 334 | fn memory_usage(&self) -> usize; |
| 335 | |
| 336 | /// Returns a prefilter, if available, that can be used to accelerate |
| 337 | /// searches for this automaton. |
| 338 | /// |
| 339 | /// The typical way this is used is when the start state is entered during |
| 340 | /// a search. When that happens, one can use a prefilter to skip ahead and |
| 341 | /// look for candidate matches without having to walk the automaton on the |
| 342 | /// bytes between candidates. |
| 343 | /// |
| 344 | /// Typically a prefilter is only available when there are a small (<100) |
| 345 | /// number of patterns built into the automaton. |
| 346 | fn prefilter(&self) -> Option<&Prefilter>; |
| 347 | |
| 348 | /// Executes a non-overlapping search with this automaton using the given |
| 349 | /// configuration. |
| 350 | /// |
| 351 | /// See |
| 352 | /// [`AhoCorasick::try_find`](crate::AhoCorasick::try_find) |
| 353 | /// for more documentation and examples. |
| 354 | fn try_find( |
| 355 | &self, |
| 356 | input: &Input<'_>, |
| 357 | ) -> Result<Option<Match>, MatchError> { |
| 358 | try_find_fwd(&self, input) |
| 359 | } |
| 360 | |
| 361 | /// Executes a overlapping search with this automaton using the given |
| 362 | /// configuration. |
| 363 | /// |
| 364 | /// See |
| 365 | /// [`AhoCorasick::try_find_overlapping`](crate::AhoCorasick::try_find_overlapping) |
| 366 | /// for more documentation and examples. |
| 367 | fn try_find_overlapping( |
| 368 | &self, |
| 369 | input: &Input<'_>, |
| 370 | state: &mut OverlappingState, |
| 371 | ) -> Result<(), MatchError> { |
| 372 | try_find_overlapping_fwd(&self, input, state) |
| 373 | } |
| 374 | |
| 375 | /// Returns an iterator of non-overlapping matches with this automaton |
| 376 | /// using the given configuration. |
| 377 | /// |
| 378 | /// See |
| 379 | /// [`AhoCorasick::try_find_iter`](crate::AhoCorasick::try_find_iter) |
| 380 | /// for more documentation and examples. |
| 381 | fn try_find_iter<'a, 'h>( |
| 382 | &'a self, |
| 383 | input: Input<'h>, |
| 384 | ) -> Result<FindIter<'a, 'h, Self>, MatchError> |
| 385 | where |
| 386 | Self: Sized, |
| 387 | { |
| 388 | FindIter::new(self, input) |
| 389 | } |
| 390 | |
| 391 | /// Returns an iterator of overlapping matches with this automaton |
| 392 | /// using the given configuration. |
| 393 | /// |
| 394 | /// See |
| 395 | /// [`AhoCorasick::try_find_overlapping_iter`](crate::AhoCorasick::try_find_overlapping_iter) |
| 396 | /// for more documentation and examples. |
| 397 | fn try_find_overlapping_iter<'a, 'h>( |
| 398 | &'a self, |
| 399 | input: Input<'h>, |
| 400 | ) -> Result<FindOverlappingIter<'a, 'h, Self>, MatchError> |
| 401 | where |
| 402 | Self: Sized, |
| 403 | { |
| 404 | if !self.match_kind().is_standard() { |
| 405 | return Err(MatchError::unsupported_overlapping( |
| 406 | self.match_kind(), |
| 407 | )); |
| 408 | } |
| 409 | // We might consider lifting this restriction. The reason why I added |
| 410 | // it was to ban the combination of "anchored search" and "overlapping |
| 411 | // iteration." The match semantics aren't totally clear in that case. |
| 412 | // Should we allow *any* matches that are adjacent to *any* previous |
| 413 | // match? Or only following the most recent one? Or only matches |
| 414 | // that start at the beginning of the search? We might also elect to |
| 415 | // just keep this restriction in place, as callers should be able to |
| 416 | // implement it themselves if they want to. |
| 417 | if input.get_anchored().is_anchored() { |
| 418 | return Err(MatchError::invalid_input_anchored()); |
| 419 | } |
| 420 | let _ = self.start_state(input.get_anchored())?; |
| 421 | let state = OverlappingState::start(); |
| 422 | Ok(FindOverlappingIter { aut: self, input, state }) |
| 423 | } |
| 424 | |
| 425 | /// Replaces all non-overlapping matches in `haystack` with |
| 426 | /// strings from `replace_with` depending on the pattern that |
| 427 | /// matched. The `replace_with` slice must have length equal to |
| 428 | /// `Automaton::patterns_len`. |
| 429 | /// |
| 430 | /// See |
| 431 | /// [`AhoCorasick::try_replace_all`](crate::AhoCorasick::try_replace_all) |
| 432 | /// for more documentation and examples. |
| 433 | fn try_replace_all<B>( |
| 434 | &self, |
| 435 | haystack: &str, |
| 436 | replace_with: &[B], |
| 437 | ) -> Result<String, MatchError> |
| 438 | where |
| 439 | Self: Sized, |
| 440 | B: AsRef<str>, |
| 441 | { |
| 442 | assert_eq!( |
| 443 | replace_with.len(), |
| 444 | self.patterns_len(), |
| 445 | "replace_all requires a replacement for every pattern \ |
| 446 | in the automaton" |
| 447 | ); |
| 448 | let mut dst = String::with_capacity(haystack.len()); |
| 449 | self.try_replace_all_with(haystack, &mut dst, |mat, _, dst| { |
| 450 | dst.push_str(replace_with[mat.pattern()].as_ref()); |
| 451 | true |
| 452 | })?; |
| 453 | Ok(dst) |
| 454 | } |
| 455 | |
| 456 | /// Replaces all non-overlapping matches in `haystack` with |
| 457 | /// strings from `replace_with` depending on the pattern that |
| 458 | /// matched. The `replace_with` slice must have length equal to |
| 459 | /// `Automaton::patterns_len`. |
| 460 | /// |
| 461 | /// See |
| 462 | /// [`AhoCorasick::try_replace_all_bytes`](crate::AhoCorasick::try_replace_all_bytes) |
| 463 | /// for more documentation and examples. |
| 464 | fn try_replace_all_bytes<B>( |
| 465 | &self, |
| 466 | haystack: &[u8], |
| 467 | replace_with: &[B], |
| 468 | ) -> Result<Vec<u8>, MatchError> |
| 469 | where |
| 470 | Self: Sized, |
| 471 | B: AsRef<[u8]>, |
| 472 | { |
| 473 | assert_eq!( |
| 474 | replace_with.len(), |
| 475 | self.patterns_len(), |
| 476 | "replace_all requires a replacement for every pattern \ |
| 477 | in the automaton" |
| 478 | ); |
| 479 | let mut dst = Vec::with_capacity(haystack.len()); |
| 480 | self.try_replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| { |
| 481 | dst.extend(replace_with[mat.pattern()].as_ref()); |
| 482 | true |
| 483 | })?; |
| 484 | Ok(dst) |
| 485 | } |
| 486 | |
| 487 | /// Replaces all non-overlapping matches in `haystack` by calling the |
| 488 | /// `replace_with` closure given. |
| 489 | /// |
| 490 | /// See |
| 491 | /// [`AhoCorasick::try_replace_all_with`](crate::AhoCorasick::try_replace_all_with) |
| 492 | /// for more documentation and examples. |
| 493 | fn try_replace_all_with<F>( |
| 494 | &self, |
| 495 | haystack: &str, |
| 496 | dst: &mut String, |
| 497 | mut replace_with: F, |
| 498 | ) -> Result<(), MatchError> |
| 499 | where |
| 500 | Self: Sized, |
| 501 | F: FnMut(&Match, &str, &mut String) -> bool, |
| 502 | { |
| 503 | let mut last_match = 0; |
| 504 | for m in self.try_find_iter(Input::new(haystack))? { |
| 505 | // Since there are no restrictions on what kinds of patterns are |
| 506 | // in an Aho-Corasick automaton, we might get matches that split |
| 507 | // a codepoint, or even matches of a partial codepoint. When that |
| 508 | // happens, we just skip the match. |
| 509 | if !haystack.is_char_boundary(m.start()) |
| 510 | || !haystack.is_char_boundary(m.end()) |
| 511 | { |
| 512 | continue; |
| 513 | } |
| 514 | dst.push_str(&haystack[last_match..m.start()]); |
| 515 | last_match = m.end(); |
| 516 | if !replace_with(&m, &haystack[m.start()..m.end()], dst) { |
| 517 | break; |
| 518 | }; |
| 519 | } |
| 520 | dst.push_str(&haystack[last_match..]); |
| 521 | Ok(()) |
| 522 | } |
| 523 | |
| 524 | /// Replaces all non-overlapping matches in `haystack` by calling the |
| 525 | /// `replace_with` closure given. |
| 526 | /// |
| 527 | /// See |
| 528 | /// [`AhoCorasick::try_replace_all_with_bytes`](crate::AhoCorasick::try_replace_all_with_bytes) |
| 529 | /// for more documentation and examples. |
| 530 | fn try_replace_all_with_bytes<F>( |
| 531 | &self, |
| 532 | haystack: &[u8], |
| 533 | dst: &mut Vec<u8>, |
| 534 | mut replace_with: F, |
| 535 | ) -> Result<(), MatchError> |
| 536 | where |
| 537 | Self: Sized, |
| 538 | F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool, |
| 539 | { |
| 540 | let mut last_match = 0; |
| 541 | for m in self.try_find_iter(Input::new(haystack))? { |
| 542 | dst.extend(&haystack[last_match..m.start()]); |
| 543 | last_match = m.end(); |
| 544 | if !replace_with(&m, &haystack[m.start()..m.end()], dst) { |
| 545 | break; |
| 546 | }; |
| 547 | } |
| 548 | dst.extend(&haystack[last_match..]); |
| 549 | Ok(()) |
| 550 | } |
| 551 | |
| 552 | /// Returns an iterator of non-overlapping matches with this automaton |
| 553 | /// from the stream given. |
| 554 | /// |
| 555 | /// See |
| 556 | /// [`AhoCorasick::try_stream_find_iter`](crate::AhoCorasick::try_stream_find_iter) |
| 557 | /// for more documentation and examples. |
| 558 | #[cfg (feature = "std" )] |
| 559 | fn try_stream_find_iter<'a, R: std::io::Read>( |
| 560 | &'a self, |
| 561 | rdr: R, |
| 562 | ) -> Result<StreamFindIter<'a, Self, R>, MatchError> |
| 563 | where |
| 564 | Self: Sized, |
| 565 | { |
| 566 | Ok(StreamFindIter { it: StreamChunkIter::new(self, rdr)? }) |
| 567 | } |
| 568 | |
| 569 | /// Replaces all non-overlapping matches in `rdr` with strings from |
| 570 | /// `replace_with` depending on the pattern that matched, and writes the |
| 571 | /// result to `wtr`. The `replace_with` slice must have length equal to |
| 572 | /// `Automaton::patterns_len`. |
| 573 | /// |
| 574 | /// See |
| 575 | /// [`AhoCorasick::try_stream_replace_all`](crate::AhoCorasick::try_stream_replace_all) |
| 576 | /// for more documentation and examples. |
| 577 | #[cfg (feature = "std" )] |
| 578 | fn try_stream_replace_all<R, W, B>( |
| 579 | &self, |
| 580 | rdr: R, |
| 581 | wtr: W, |
| 582 | replace_with: &[B], |
| 583 | ) -> std::io::Result<()> |
| 584 | where |
| 585 | Self: Sized, |
| 586 | R: std::io::Read, |
| 587 | W: std::io::Write, |
| 588 | B: AsRef<[u8]>, |
| 589 | { |
| 590 | assert_eq!( |
| 591 | replace_with.len(), |
| 592 | self.patterns_len(), |
| 593 | "streaming replace_all requires a replacement for every pattern \ |
| 594 | in the automaton" , |
| 595 | ); |
| 596 | self.try_stream_replace_all_with(rdr, wtr, |mat, _, wtr| { |
| 597 | wtr.write_all(replace_with[mat.pattern()].as_ref()) |
| 598 | }) |
| 599 | } |
| 600 | |
| 601 | /// Replaces all non-overlapping matches in `rdr` by calling the |
| 602 | /// `replace_with` closure given and writing the result to `wtr`. |
| 603 | /// |
| 604 | /// See |
| 605 | /// [`AhoCorasick::try_stream_replace_all_with`](crate::AhoCorasick::try_stream_replace_all_with) |
| 606 | /// for more documentation and examples. |
| 607 | #[cfg (feature = "std" )] |
| 608 | fn try_stream_replace_all_with<R, W, F>( |
| 609 | &self, |
| 610 | rdr: R, |
| 611 | mut wtr: W, |
| 612 | mut replace_with: F, |
| 613 | ) -> std::io::Result<()> |
| 614 | where |
| 615 | Self: Sized, |
| 616 | R: std::io::Read, |
| 617 | W: std::io::Write, |
| 618 | F: FnMut(&Match, &[u8], &mut W) -> std::io::Result<()>, |
| 619 | { |
| 620 | let mut it = StreamChunkIter::new(self, rdr).map_err(|e| { |
| 621 | let kind = std::io::ErrorKind::Other; |
| 622 | std::io::Error::new(kind, e) |
| 623 | })?; |
| 624 | while let Some(result) = it.next() { |
| 625 | let chunk = result?; |
| 626 | match chunk { |
| 627 | StreamChunk::NonMatch { bytes, .. } => { |
| 628 | wtr.write_all(bytes)?; |
| 629 | } |
| 630 | StreamChunk::Match { bytes, mat } => { |
| 631 | replace_with(&mat, bytes, &mut wtr)?; |
| 632 | } |
| 633 | } |
| 634 | } |
| 635 | Ok(()) |
| 636 | } |
| 637 | } |
| 638 | |
| 639 | // SAFETY: This just defers to the underlying 'AcAutomaton' and thus inherits |
| 640 | // its safety properties. |
| 641 | unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { |
| 642 | #[inline (always)] |
| 643 | fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { |
| 644 | (**self).start_state(anchored) |
| 645 | } |
| 646 | |
| 647 | #[inline (always)] |
| 648 | fn next_state( |
| 649 | &self, |
| 650 | anchored: Anchored, |
| 651 | sid: StateID, |
| 652 | byte: u8, |
| 653 | ) -> StateID { |
| 654 | (**self).next_state(anchored, sid, byte) |
| 655 | } |
| 656 | |
| 657 | #[inline (always)] |
| 658 | fn is_special(&self, sid: StateID) -> bool { |
| 659 | (**self).is_special(sid) |
| 660 | } |
| 661 | |
| 662 | #[inline (always)] |
| 663 | fn is_dead(&self, sid: StateID) -> bool { |
| 664 | (**self).is_dead(sid) |
| 665 | } |
| 666 | |
| 667 | #[inline (always)] |
| 668 | fn is_match(&self, sid: StateID) -> bool { |
| 669 | (**self).is_match(sid) |
| 670 | } |
| 671 | |
| 672 | #[inline (always)] |
| 673 | fn is_start(&self, sid: StateID) -> bool { |
| 674 | (**self).is_start(sid) |
| 675 | } |
| 676 | |
| 677 | #[inline (always)] |
| 678 | fn match_kind(&self) -> MatchKind { |
| 679 | (**self).match_kind() |
| 680 | } |
| 681 | |
| 682 | #[inline (always)] |
| 683 | fn match_len(&self, sid: StateID) -> usize { |
| 684 | (**self).match_len(sid) |
| 685 | } |
| 686 | |
| 687 | #[inline (always)] |
| 688 | fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { |
| 689 | (**self).match_pattern(sid, index) |
| 690 | } |
| 691 | |
| 692 | #[inline (always)] |
| 693 | fn patterns_len(&self) -> usize { |
| 694 | (**self).patterns_len() |
| 695 | } |
| 696 | |
| 697 | #[inline (always)] |
| 698 | fn pattern_len(&self, pid: PatternID) -> usize { |
| 699 | (**self).pattern_len(pid) |
| 700 | } |
| 701 | |
| 702 | #[inline (always)] |
| 703 | fn min_pattern_len(&self) -> usize { |
| 704 | (**self).min_pattern_len() |
| 705 | } |
| 706 | |
| 707 | #[inline (always)] |
| 708 | fn max_pattern_len(&self) -> usize { |
| 709 | (**self).max_pattern_len() |
| 710 | } |
| 711 | |
| 712 | #[inline (always)] |
| 713 | fn memory_usage(&self) -> usize { |
| 714 | (**self).memory_usage() |
| 715 | } |
| 716 | |
| 717 | #[inline (always)] |
| 718 | fn prefilter(&self) -> Option<&Prefilter> { |
| 719 | (**self).prefilter() |
| 720 | } |
| 721 | } |
| 722 | |
| 723 | /// Represents the current state of an overlapping search. |
| 724 | /// |
| 725 | /// This is used for overlapping searches since they need to know something |
| 726 | /// about the previous search. For example, when multiple patterns match at the |
| 727 | /// same position, this state tracks the last reported pattern so that the next |
| 728 | /// search knows whether to report another matching pattern or continue with |
| 729 | /// the search at the next position. Additionally, it also tracks which state |
| 730 | /// the last search call terminated in and the current offset of the search |
| 731 | /// in the haystack. |
| 732 | /// |
| 733 | /// This type provides limited introspection capabilities. The only thing a |
| 734 | /// caller can do is construct it and pass it around to permit search routines |
| 735 | /// to use it to track state, and to ask whether a match has been found. |
| 736 | /// |
| 737 | /// Callers should always provide a fresh state constructed via |
| 738 | /// [`OverlappingState::start`] when starting a new search. That same state |
| 739 | /// should be reused for subsequent searches on the same `Input`. The state |
| 740 | /// given will advance through the haystack itself. Callers can detect the end |
| 741 | /// of a search when neither an error nor a match is returned. |
| 742 | /// |
| 743 | /// # Example |
| 744 | /// |
| 745 | /// This example shows how to manually iterate over all overlapping matches. If |
| 746 | /// you need this, you might consider using |
| 747 | /// [`AhoCorasick::find_overlapping_iter`](crate::AhoCorasick::find_overlapping_iter) |
| 748 | /// instead, but this shows how to correctly use an `OverlappingState`. |
| 749 | /// |
| 750 | /// ``` |
| 751 | /// use aho_corasick::{ |
| 752 | /// automaton::OverlappingState, |
| 753 | /// AhoCorasick, Input, Match, |
| 754 | /// }; |
| 755 | /// |
| 756 | /// let patterns = &["append" , "appendage" , "app" ]; |
| 757 | /// let haystack = "append the app to the appendage" ; |
| 758 | /// |
| 759 | /// let ac = AhoCorasick::new(patterns).unwrap(); |
| 760 | /// let mut state = OverlappingState::start(); |
| 761 | /// let mut matches = vec![]; |
| 762 | /// |
| 763 | /// loop { |
| 764 | /// ac.find_overlapping(haystack, &mut state); |
| 765 | /// let mat = match state.get_match() { |
| 766 | /// None => break, |
| 767 | /// Some(mat) => mat, |
| 768 | /// }; |
| 769 | /// matches.push(mat); |
| 770 | /// } |
| 771 | /// let expected = vec![ |
| 772 | /// Match::must(2, 0..3), |
| 773 | /// Match::must(0, 0..6), |
| 774 | /// Match::must(2, 11..14), |
| 775 | /// Match::must(2, 22..25), |
| 776 | /// Match::must(0, 22..28), |
| 777 | /// Match::must(1, 22..31), |
| 778 | /// ]; |
| 779 | /// assert_eq!(expected, matches); |
| 780 | /// ``` |
| 781 | #[derive (Clone, Debug)] |
| 782 | pub struct OverlappingState { |
| 783 | /// The match reported by the most recent overlapping search to use this |
| 784 | /// state. |
| 785 | /// |
| 786 | /// If a search does not find any matches, then it is expected to clear |
| 787 | /// this value. |
| 788 | mat: Option<Match>, |
| 789 | /// The state ID of the state at which the search was in when the call |
| 790 | /// terminated. When this is a match state, `last_match` must be set to a |
| 791 | /// non-None value. |
| 792 | /// |
| 793 | /// A `None` value indicates the start state of the corresponding |
| 794 | /// automaton. We cannot use the actual ID, since any one automaton may |
| 795 | /// have many start states, and which one is in use depends on search-time |
| 796 | /// factors (such as whether the search is anchored or not). |
| 797 | id: Option<StateID>, |
| 798 | /// The position of the search. |
| 799 | /// |
| 800 | /// When `id` is None (i.e., we are starting a search), this is set to |
| 801 | /// the beginning of the search as given by the caller regardless of its |
| 802 | /// current value. Subsequent calls to an overlapping search pick up at |
| 803 | /// this offset. |
| 804 | at: usize, |
| 805 | /// The index into the matching patterns of the next match to report if the |
| 806 | /// current state is a match state. Note that this may be 1 greater than |
| 807 | /// the total number of matches to report for the current match state. (In |
| 808 | /// which case, no more matches should be reported at the current position |
| 809 | /// and the search should advance to the next position.) |
| 810 | next_match_index: Option<usize>, |
| 811 | } |
| 812 | |
| 813 | impl OverlappingState { |
| 814 | /// Create a new overlapping state that begins at the start state. |
| 815 | pub fn start() -> OverlappingState { |
| 816 | OverlappingState { mat: None, id: None, at: 0, next_match_index: None } |
| 817 | } |
| 818 | |
| 819 | /// Return the match result of the most recent search to execute with this |
| 820 | /// state. |
| 821 | /// |
| 822 | /// Every search will clear this result automatically, such that if no |
| 823 | /// match is found, this will always correctly report `None`. |
| 824 | pub fn get_match(&self) -> Option<Match> { |
| 825 | self.mat |
| 826 | } |
| 827 | } |
| 828 | |
| 829 | /// An iterator of non-overlapping matches in a particular haystack. |
| 830 | /// |
| 831 | /// This iterator yields matches according to the [`MatchKind`] used by this |
| 832 | /// automaton. |
| 833 | /// |
| 834 | /// This iterator is constructed via the [`Automaton::try_find_iter`] method. |
| 835 | /// |
| 836 | /// The type variable `A` refers to the implementation of the [`Automaton`] |
| 837 | /// trait used to execute the search. |
| 838 | /// |
| 839 | /// The lifetime `'a` refers to the lifetime of the [`Automaton`] |
| 840 | /// implementation. |
| 841 | /// |
| 842 | /// The lifetime `'h` refers to the lifetime of the haystack being searched. |
| 843 | #[derive (Debug)] |
| 844 | pub struct FindIter<'a, 'h, A> { |
| 845 | /// The automaton used to drive the search. |
| 846 | aut: &'a A, |
| 847 | /// The input parameters to give to each search call. |
| 848 | /// |
| 849 | /// The start position of the search is mutated during iteration. |
| 850 | input: Input<'h>, |
| 851 | /// Records the end offset of the most recent match. This is necessary to |
| 852 | /// handle a corner case for preventing empty matches from overlapping with |
| 853 | /// the ending bounds of a prior match. |
| 854 | last_match_end: Option<usize>, |
| 855 | } |
| 856 | |
| 857 | impl<'a, 'h, A: Automaton> FindIter<'a, 'h, A> { |
| 858 | /// Creates a new non-overlapping iterator. If the given automaton would |
| 859 | /// return an error on a search with the given input configuration, then |
| 860 | /// that error is returned here. |
| 861 | fn new( |
| 862 | aut: &'a A, |
| 863 | input: Input<'h>, |
| 864 | ) -> Result<FindIter<'a, 'h, A>, MatchError> { |
| 865 | // The only way this search can fail is if we cannot retrieve the start |
| 866 | // state. e.g., Asking for an anchored search when only unanchored |
| 867 | // searches are supported. |
| 868 | let _ = aut.start_state(input.get_anchored())?; |
| 869 | Ok(FindIter { aut, input, last_match_end: None }) |
| 870 | } |
| 871 | |
| 872 | /// Executes a search and returns a match if one is found. |
| 873 | /// |
| 874 | /// This does not advance the input forward. It just executes a search |
| 875 | /// based on the current configuration/offsets. |
| 876 | fn search(&self) -> Option<Match> { |
| 877 | // The unwrap is OK here because we check at iterator construction time |
| 878 | // that no subsequent search call (using the same configuration) will |
| 879 | // ever return an error. |
| 880 | self.aut |
| 881 | .try_find(&self.input) |
| 882 | .expect("already checked that no match error can occur" ) |
| 883 | } |
| 884 | |
| 885 | /// Handles the special case of an empty match by ensuring that 1) the |
| 886 | /// iterator always advances and 2) empty matches never overlap with other |
| 887 | /// matches. |
| 888 | /// |
| 889 | /// (1) is necessary because we principally make progress by setting the |
| 890 | /// starting location of the next search to the ending location of the last |
| 891 | /// match. But if a match is empty, then this results in a search that does |
| 892 | /// not advance and thus does not terminate. |
| 893 | /// |
| 894 | /// (2) is not strictly necessary, but makes intuitive sense and matches |
| 895 | /// the presiding behavior of most general purpose regex engines. |
| 896 | /// (Obviously this crate isn't a regex engine, but we choose to match |
| 897 | /// their semantics.) The "intuitive sense" here is that we want to report |
| 898 | /// NON-overlapping matches. So for example, given the patterns 'a' and |
| 899 | /// '' (an empty string) against the haystack 'a', without the special |
| 900 | /// handling, you'd get the matches [0, 1) and [1, 1), where the latter |
| 901 | /// overlaps with the end bounds of the former. |
| 902 | /// |
| 903 | /// Note that we mark this cold and forcefully prevent inlining because |
| 904 | /// handling empty matches like this is extremely rare and does require |
| 905 | /// quite a bit of code, comparatively. Keeping this code out of the main |
| 906 | /// iterator function keeps it smaller and more amenable to inlining |
| 907 | /// itself. |
| 908 | #[cold ] |
| 909 | #[inline (never)] |
| 910 | fn handle_overlapping_empty_match( |
| 911 | &mut self, |
| 912 | mut m: Match, |
| 913 | ) -> Option<Match> { |
| 914 | assert!(m.is_empty()); |
| 915 | if Some(m.end()) == self.last_match_end { |
| 916 | self.input.set_start(self.input.start().checked_add(1).unwrap()); |
| 917 | m = self.search()?; |
| 918 | } |
| 919 | Some(m) |
| 920 | } |
| 921 | } |
| 922 | |
| 923 | impl<'a, 'h, A: Automaton> Iterator for FindIter<'a, 'h, A> { |
| 924 | type Item = Match; |
| 925 | |
| 926 | #[inline (always)] |
| 927 | fn next(&mut self) -> Option<Match> { |
| 928 | let mut m: Match = self.search()?; |
| 929 | if m.is_empty() { |
| 930 | m = self.handle_overlapping_empty_match(m)?; |
| 931 | } |
| 932 | self.input.set_start(m.end()); |
| 933 | self.last_match_end = Some(m.end()); |
| 934 | Some(m) |
| 935 | } |
| 936 | } |
| 937 | |
| 938 | /// An iterator of overlapping matches in a particular haystack. |
| 939 | /// |
| 940 | /// This iterator will report all possible matches in a particular haystack, |
| 941 | /// even when the matches overlap. |
| 942 | /// |
| 943 | /// This iterator is constructed via the |
| 944 | /// [`Automaton::try_find_overlapping_iter`] method. |
| 945 | /// |
| 946 | /// The type variable `A` refers to the implementation of the [`Automaton`] |
| 947 | /// trait used to execute the search. |
| 948 | /// |
| 949 | /// The lifetime `'a` refers to the lifetime of the [`Automaton`] |
| 950 | /// implementation. |
| 951 | /// |
| 952 | /// The lifetime `'h` refers to the lifetime of the haystack being searched. |
| 953 | #[derive (Debug)] |
| 954 | pub struct FindOverlappingIter<'a, 'h, A> { |
| 955 | aut: &'a A, |
| 956 | input: Input<'h>, |
| 957 | state: OverlappingState, |
| 958 | } |
| 959 | |
| 960 | impl<'a, 'h, A: Automaton> Iterator for FindOverlappingIter<'a, 'h, A> { |
| 961 | type Item = Match; |
| 962 | |
| 963 | #[inline (always)] |
| 964 | fn next(&mut self) -> Option<Match> { |
| 965 | self.aut |
| 966 | .try_find_overlapping(&self.input, &mut self.state) |
| 967 | .expect(msg:"already checked that no match error can occur here" ); |
| 968 | self.state.get_match() |
| 969 | } |
| 970 | } |
| 971 | |
| 972 | /// An iterator that reports matches in a stream. |
| 973 | /// |
| 974 | /// This iterator yields elements of type `io::Result<Match>`, where an error |
| 975 | /// is reported if there was a problem reading from the underlying stream. |
| 976 | /// The iterator terminates only when the underlying stream reaches `EOF`. |
| 977 | /// |
| 978 | /// This iterator is constructed via the [`Automaton::try_stream_find_iter`] |
| 979 | /// method. |
| 980 | /// |
| 981 | /// The type variable `A` refers to the implementation of the [`Automaton`] |
| 982 | /// trait used to execute the search. |
| 983 | /// |
| 984 | /// The type variable `R` refers to the `io::Read` stream that is being read |
| 985 | /// from. |
| 986 | /// |
| 987 | /// The lifetime `'a` refers to the lifetime of the [`Automaton`] |
| 988 | /// implementation. |
| 989 | #[cfg (feature = "std" )] |
| 990 | #[derive (Debug)] |
| 991 | pub struct StreamFindIter<'a, A, R> { |
| 992 | it: StreamChunkIter<'a, A, R>, |
| 993 | } |
| 994 | |
| 995 | #[cfg (feature = "std" )] |
| 996 | impl<'a, A: Automaton, R: std::io::Read> Iterator |
| 997 | for StreamFindIter<'a, A, R> |
| 998 | { |
| 999 | type Item = std::io::Result<Match>; |
| 1000 | |
| 1001 | fn next(&mut self) -> Option<std::io::Result<Match>> { |
| 1002 | loop { |
| 1003 | match self.it.next() { |
| 1004 | None => return None, |
| 1005 | Some(Err(err: Error)) => return Some(Err(err)), |
| 1006 | Some(Ok(StreamChunk::NonMatch { .. })) => {} |
| 1007 | Some(Ok(StreamChunk::Match { mat: Match, .. })) => { |
| 1008 | return Some(Ok(mat)); |
| 1009 | } |
| 1010 | } |
| 1011 | } |
| 1012 | } |
| 1013 | } |
| 1014 | |
| 1015 | /// An iterator that reports matches in a stream. |
| 1016 | /// |
| 1017 | /// (This doesn't actually implement the `Iterator` trait because it returns |
| 1018 | /// something with a lifetime attached to a buffer it owns, but that's OK. It |
| 1019 | /// still has a `next` method and is iterator-like enough to be fine.) |
| 1020 | /// |
| 1021 | /// This iterator yields elements of type `io::Result<StreamChunk>`, where |
| 1022 | /// an error is reported if there was a problem reading from the underlying |
| 1023 | /// stream. The iterator terminates only when the underlying stream reaches |
| 1024 | /// `EOF`. |
| 1025 | /// |
| 1026 | /// The idea here is that each chunk represents either a match or a non-match, |
| 1027 | /// and if you concatenated all of the chunks together, you'd reproduce the |
| 1028 | /// entire contents of the stream, byte-for-byte. |
| 1029 | /// |
| 1030 | /// This chunk machinery is a bit complicated and it isn't strictly required |
| 1031 | /// for a stream searcher that just reports matches. But we do need something |
| 1032 | /// like this to deal with the "replacement" API, which needs to know which |
| 1033 | /// chunks it can copy and which it needs to replace. |
| 1034 | #[cfg (feature = "std" )] |
| 1035 | #[derive (Debug)] |
| 1036 | struct StreamChunkIter<'a, A, R> { |
| 1037 | /// The underlying automaton to do the search. |
| 1038 | aut: &'a A, |
| 1039 | /// The source of bytes we read from. |
| 1040 | rdr: R, |
| 1041 | /// A roll buffer for managing bytes from `rdr`. Basically, this is used |
| 1042 | /// to handle the case of a match that is split by two different |
| 1043 | /// calls to `rdr.read()`. This isn't strictly needed if all we needed to |
| 1044 | /// do was report matches, but here we are reporting chunks of non-matches |
| 1045 | /// and matches and in order to do that, we really just cannot treat our |
| 1046 | /// stream as non-overlapping blocks of bytes. We need to permit some |
| 1047 | /// overlap while we retain bytes from a previous `read` call in memory. |
| 1048 | buf: crate::util::buffer::Buffer, |
| 1049 | /// The unanchored starting state of this automaton. |
| 1050 | start: StateID, |
| 1051 | /// The state of the automaton. |
| 1052 | sid: StateID, |
| 1053 | /// The absolute position over the entire stream. |
| 1054 | absolute_pos: usize, |
| 1055 | /// The position we're currently at within `buf`. |
| 1056 | buffer_pos: usize, |
| 1057 | /// The buffer position of the end of the bytes that we last returned |
| 1058 | /// to the caller. Basically, whenever we find a match, we look to see if |
| 1059 | /// there is a difference between where the match started and the position |
| 1060 | /// of the last byte we returned to the caller. If there's a difference, |
| 1061 | /// then we need to return a 'NonMatch' chunk. |
| 1062 | buffer_reported_pos: usize, |
| 1063 | } |
| 1064 | |
| 1065 | #[cfg (feature = "std" )] |
| 1066 | impl<'a, A: Automaton, R: std::io::Read> StreamChunkIter<'a, A, R> { |
| 1067 | fn new( |
| 1068 | aut: &'a A, |
| 1069 | rdr: R, |
| 1070 | ) -> Result<StreamChunkIter<'a, A, R>, MatchError> { |
| 1071 | // This restriction is a carry-over from older versions of this crate. |
| 1072 | // I didn't have the bandwidth to think through how to handle, say, |
| 1073 | // leftmost-first or leftmost-longest matching, but... it should be |
| 1074 | // possible? The main problem is that once you see a match state in |
| 1075 | // leftmost-first semantics, you can't just stop at that point and |
| 1076 | // report a match. You have to keep going until you either hit a dead |
| 1077 | // state or EOF. So how do you know when you'll hit a dead state? Well, |
| 1078 | // you don't. With Aho-Corasick, I believe you can put a bound on it |
| 1079 | // and say, "once a match has been seen, you'll need to scan forward at |
| 1080 | // most N bytes" where N=aut.max_pattern_len(). |
| 1081 | // |
| 1082 | // Which is fine, but it does mean that state about whether we're still |
| 1083 | // looking for a dead state or not needs to persist across buffer |
| 1084 | // refills. Which this code doesn't really handle. It does preserve |
| 1085 | // *some* state across buffer refills, basically ensuring that a match |
| 1086 | // span is always in memory. |
| 1087 | if !aut.match_kind().is_standard() { |
| 1088 | return Err(MatchError::unsupported_stream(aut.match_kind())); |
| 1089 | } |
| 1090 | // This is kind of a cop-out, but empty matches are SUPER annoying. |
| 1091 | // If we know they can't happen (which is what we enforce here), then |
| 1092 | // it makes a lot of logic much simpler. With that said, I'm open to |
| 1093 | // supporting this case, but we need to define proper semantics for it |
| 1094 | // first. It wasn't totally clear to me what it should do at the time |
| 1095 | // of writing, so I decided to just be conservative. |
| 1096 | // |
| 1097 | // It also seems like a very weird case to support anyway. Why search a |
| 1098 | // stream if you're just going to get a match at every position? |
| 1099 | // |
| 1100 | // ¯\_(ツ)_/¯ |
| 1101 | if aut.min_pattern_len() == 0 { |
| 1102 | return Err(MatchError::unsupported_empty()); |
| 1103 | } |
| 1104 | let start = aut.start_state(Anchored::No)?; |
| 1105 | Ok(StreamChunkIter { |
| 1106 | aut, |
| 1107 | rdr, |
| 1108 | buf: crate::util::buffer::Buffer::new(aut.max_pattern_len()), |
| 1109 | start, |
| 1110 | sid: start, |
| 1111 | absolute_pos: 0, |
| 1112 | buffer_pos: 0, |
| 1113 | buffer_reported_pos: 0, |
| 1114 | }) |
| 1115 | } |
| 1116 | |
| 1117 | fn next(&mut self) -> Option<std::io::Result<StreamChunk>> { |
| 1118 | // This code is pretty gnarly. It IS simpler than the equivalent code |
| 1119 | // in the previous aho-corasick release, in part because we inline |
| 1120 | // automaton traversal here and also in part because we have abdicated |
| 1121 | // support for automatons that contain an empty pattern. |
| 1122 | // |
| 1123 | // I suspect this code could be made a bit simpler by designing a |
| 1124 | // better buffer abstraction. |
| 1125 | // |
| 1126 | // But in general, this code is basically write-only. So you'll need |
| 1127 | // to go through it step-by-step to grok it. One of the key bits of |
| 1128 | // complexity is tracking a few different offsets. 'buffer_pos' is |
| 1129 | // where we are in the buffer for search. 'buffer_reported_pos' is the |
| 1130 | // position immediately following the last byte in the buffer that |
| 1131 | // we've returned to the caller. And 'absolute_pos' is the overall |
| 1132 | // current absolute position of the search in the entire stream, and |
| 1133 | // this is what match spans are reported in terms of. |
| 1134 | loop { |
| 1135 | if self.aut.is_match(self.sid) { |
| 1136 | let mat = self.get_match(); |
| 1137 | if let Some(r) = self.get_non_match_chunk(mat) { |
| 1138 | self.buffer_reported_pos += r.len(); |
| 1139 | let bytes = &self.buf.buffer()[r]; |
| 1140 | return Some(Ok(StreamChunk::NonMatch { bytes })); |
| 1141 | } |
| 1142 | self.sid = self.start; |
| 1143 | let r = self.get_match_chunk(mat); |
| 1144 | self.buffer_reported_pos += r.len(); |
| 1145 | let bytes = &self.buf.buffer()[r]; |
| 1146 | return Some(Ok(StreamChunk::Match { bytes, mat })); |
| 1147 | } |
| 1148 | if self.buffer_pos >= self.buf.buffer().len() { |
| 1149 | if let Some(r) = self.get_pre_roll_non_match_chunk() { |
| 1150 | self.buffer_reported_pos += r.len(); |
| 1151 | let bytes = &self.buf.buffer()[r]; |
| 1152 | return Some(Ok(StreamChunk::NonMatch { bytes })); |
| 1153 | } |
| 1154 | if self.buf.buffer().len() >= self.buf.min_buffer_len() { |
| 1155 | self.buffer_pos = self.buf.min_buffer_len(); |
| 1156 | self.buffer_reported_pos -= |
| 1157 | self.buf.buffer().len() - self.buf.min_buffer_len(); |
| 1158 | self.buf.roll(); |
| 1159 | } |
| 1160 | match self.buf.fill(&mut self.rdr) { |
| 1161 | Err(err) => return Some(Err(err)), |
| 1162 | Ok(true) => {} |
| 1163 | Ok(false) => { |
| 1164 | // We've hit EOF, but if there are still some |
| 1165 | // unreported bytes remaining, return them now. |
| 1166 | if let Some(r) = self.get_eof_non_match_chunk() { |
| 1167 | self.buffer_reported_pos += r.len(); |
| 1168 | let bytes = &self.buf.buffer()[r]; |
| 1169 | return Some(Ok(StreamChunk::NonMatch { bytes })); |
| 1170 | } |
| 1171 | // We've reported everything! |
| 1172 | return None; |
| 1173 | } |
| 1174 | } |
| 1175 | } |
| 1176 | let start = self.absolute_pos; |
| 1177 | for &byte in self.buf.buffer()[self.buffer_pos..].iter() { |
| 1178 | self.sid = self.aut.next_state(Anchored::No, self.sid, byte); |
| 1179 | self.absolute_pos += 1; |
| 1180 | if self.aut.is_match(self.sid) { |
| 1181 | break; |
| 1182 | } |
| 1183 | } |
| 1184 | self.buffer_pos += self.absolute_pos - start; |
| 1185 | } |
| 1186 | } |
| 1187 | |
| 1188 | /// Return a match chunk for the given match. It is assumed that the match |
| 1189 | /// ends at the current `buffer_pos`. |
| 1190 | fn get_match_chunk(&self, mat: Match) -> core::ops::Range<usize> { |
| 1191 | let start = self.buffer_pos - mat.len(); |
| 1192 | let end = self.buffer_pos; |
| 1193 | start..end |
| 1194 | } |
| 1195 | |
| 1196 | /// Return a non-match chunk, if necessary, just before reporting a match. |
| 1197 | /// This returns `None` if there is nothing to report. Otherwise, this |
| 1198 | /// assumes that the given match ends at the current `buffer_pos`. |
| 1199 | fn get_non_match_chunk( |
| 1200 | &self, |
| 1201 | mat: Match, |
| 1202 | ) -> Option<core::ops::Range<usize>> { |
| 1203 | let buffer_mat_start = self.buffer_pos - mat.len(); |
| 1204 | if buffer_mat_start > self.buffer_reported_pos { |
| 1205 | let start = self.buffer_reported_pos; |
| 1206 | let end = buffer_mat_start; |
| 1207 | return Some(start..end); |
| 1208 | } |
| 1209 | None |
| 1210 | } |
| 1211 | |
| 1212 | /// Look for any bytes that should be reported as a non-match just before |
| 1213 | /// rolling the buffer. |
| 1214 | /// |
| 1215 | /// Note that this only reports bytes up to `buffer.len() - |
| 1216 | /// min_buffer_len`, as it's not possible to know whether the bytes |
| 1217 | /// following that will participate in a match or not. |
| 1218 | fn get_pre_roll_non_match_chunk(&self) -> Option<core::ops::Range<usize>> { |
| 1219 | let end = |
| 1220 | self.buf.buffer().len().saturating_sub(self.buf.min_buffer_len()); |
| 1221 | if self.buffer_reported_pos < end { |
| 1222 | return Some(self.buffer_reported_pos..end); |
| 1223 | } |
| 1224 | None |
| 1225 | } |
| 1226 | |
| 1227 | /// Return any unreported bytes as a non-match up to the end of the buffer. |
| 1228 | /// |
| 1229 | /// This should only be called when the entire contents of the buffer have |
| 1230 | /// been searched and EOF has been hit when trying to fill the buffer. |
| 1231 | fn get_eof_non_match_chunk(&self) -> Option<core::ops::Range<usize>> { |
| 1232 | if self.buffer_reported_pos < self.buf.buffer().len() { |
| 1233 | return Some(self.buffer_reported_pos..self.buf.buffer().len()); |
| 1234 | } |
| 1235 | None |
| 1236 | } |
| 1237 | |
| 1238 | /// Return the match at the current position for the current state. |
| 1239 | /// |
| 1240 | /// This panics if `self.aut.is_match(self.sid)` isn't true. |
| 1241 | fn get_match(&self) -> Match { |
| 1242 | get_match(self.aut, self.sid, 0, self.absolute_pos) |
| 1243 | } |
| 1244 | } |
| 1245 | |
| 1246 | /// A single chunk yielded by the stream chunk iterator. |
| 1247 | /// |
| 1248 | /// The `'r` lifetime refers to the lifetime of the stream chunk iterator. |
| 1249 | #[cfg (feature = "std" )] |
| 1250 | #[derive (Debug)] |
| 1251 | enum StreamChunk<'r> { |
| 1252 | /// A chunk that does not contain any matches. |
| 1253 | NonMatch { bytes: &'r [u8] }, |
| 1254 | /// A chunk that precisely contains a match. |
| 1255 | Match { bytes: &'r [u8], mat: Match }, |
| 1256 | } |
| 1257 | |
| 1258 | #[inline (never)] |
| 1259 | pub(crate) fn try_find_fwd<A: Automaton + ?Sized>( |
| 1260 | aut: &A, |
| 1261 | input: &Input<'_>, |
| 1262 | ) -> Result<Option<Match>, MatchError> { |
| 1263 | if input.is_done() { |
| 1264 | return Ok(None); |
| 1265 | } |
| 1266 | let earliest: bool = aut.match_kind().is_standard() || input.get_earliest(); |
| 1267 | if input.get_anchored().is_anchored() { |
| 1268 | try_find_fwd_imp(aut, input, pre:None, Anchored::Yes, earliest) |
| 1269 | } else if let Some(pre: &Prefilter) = aut.prefilter() { |
| 1270 | if earliest { |
| 1271 | try_find_fwd_imp(aut, input, pre:Some(pre), Anchored::No, earliest:true) |
| 1272 | } else { |
| 1273 | try_find_fwd_imp(aut, input, pre:Some(pre), Anchored::No, earliest:false) |
| 1274 | } |
| 1275 | } else { |
| 1276 | if earliest { |
| 1277 | try_find_fwd_imp(aut, input, pre:None, Anchored::No, earliest:true) |
| 1278 | } else { |
| 1279 | try_find_fwd_imp(aut, input, pre:None, Anchored::No, earliest:false) |
| 1280 | } |
| 1281 | } |
| 1282 | } |
| 1283 | |
| 1284 | #[inline (always)] |
| 1285 | fn try_find_fwd_imp<A: Automaton + ?Sized>( |
| 1286 | aut: &A, |
| 1287 | input: &Input<'_>, |
| 1288 | pre: Option<&Prefilter>, |
| 1289 | anchored: Anchored, |
| 1290 | earliest: bool, |
| 1291 | ) -> Result<Option<Match>, MatchError> { |
| 1292 | let mut sid = aut.start_state(input.get_anchored())?; |
| 1293 | let mut at = input.start(); |
| 1294 | let mut mat = None; |
| 1295 | if aut.is_match(sid) { |
| 1296 | mat = Some(get_match(aut, sid, 0, at)); |
| 1297 | if earliest { |
| 1298 | return Ok(mat); |
| 1299 | } |
| 1300 | } |
| 1301 | if let Some(pre) = pre { |
| 1302 | match pre.find_in(input.haystack(), input.get_span()) { |
| 1303 | Candidate::None => return Ok(None), |
| 1304 | Candidate::Match(m) => return Ok(Some(m)), |
| 1305 | Candidate::PossibleStartOfMatch(i) => { |
| 1306 | at = i; |
| 1307 | } |
| 1308 | } |
| 1309 | } |
| 1310 | while at < input.end() { |
| 1311 | // I've tried unrolling this loop and eliding bounds checks, but no |
| 1312 | // matter what I did, I could not observe a consistent improvement on |
| 1313 | // any benchmark I could devise. (If someone wants to re-litigate this, |
| 1314 | // the way to do it is to add an 'next_state_unchecked' method to the |
| 1315 | // 'Automaton' trait with a default impl that uses 'next_state'. Then |
| 1316 | // use 'aut.next_state_unchecked' here and implement it on DFA using |
| 1317 | // unchecked slice index acces.) |
| 1318 | sid = aut.next_state(anchored, sid, input.haystack()[at]); |
| 1319 | if aut.is_special(sid) { |
| 1320 | if aut.is_dead(sid) { |
| 1321 | return Ok(mat); |
| 1322 | } else if aut.is_match(sid) { |
| 1323 | // We use 'at + 1' here because the match state is entered |
| 1324 | // at the last byte of the pattern. Since we use half-open |
| 1325 | // intervals, the end of the range of the match is one past the |
| 1326 | // last byte. |
| 1327 | let m = get_match(aut, sid, 0, at + 1); |
| 1328 | // For the automata in this crate, we make a size trade off |
| 1329 | // where we reuse the same automaton for both anchored and |
| 1330 | // unanchored searches. We achieve this, principally, by simply |
| 1331 | // not following failure transitions while computing the next |
| 1332 | // state. Instead, if we fail to find the next state, we return |
| 1333 | // a dead state, which instructs the search to stop. (This |
| 1334 | // is why 'next_state' needs to know whether the search is |
| 1335 | // anchored or not.) In addition, we have different start |
| 1336 | // states for anchored and unanchored searches. The latter has |
| 1337 | // a self-loop where as the former does not. |
| 1338 | // |
| 1339 | // In this way, we can use the same trie to execute both |
| 1340 | // anchored and unanchored searches. There is a catch though. |
| 1341 | // When building an Aho-Corasick automaton for unanchored |
| 1342 | // searches, we copy matches from match states to other states |
| 1343 | // (which would otherwise not be match states) if they are |
| 1344 | // reachable via a failure transition. In the case of an |
| 1345 | // anchored search, we *specifically* do not want to report |
| 1346 | // these matches because they represent matches that start past |
| 1347 | // the beginning of the search. |
| 1348 | // |
| 1349 | // Now we could tweak the automaton somehow to differentiate |
| 1350 | // anchored from unanchored match states, but this would make |
| 1351 | // 'aut.is_match' and potentially 'aut.is_special' slower. And |
| 1352 | // also make the automaton itself more complex. |
| 1353 | // |
| 1354 | // Instead, we insert a special hack: if the search is |
| 1355 | // anchored, we simply ignore matches that don't begin at |
| 1356 | // the start of the search. This is not quite ideal, but we |
| 1357 | // do specialize this function in such a way that unanchored |
| 1358 | // searches don't pay for this additional branch. While this |
| 1359 | // might cause a search to continue on for more than it |
| 1360 | // otherwise optimally would, it will be no more than the |
| 1361 | // longest pattern in the automaton. The reason for this is |
| 1362 | // that we ensure we don't follow failure transitions during |
| 1363 | // an anchored search. Combined with using a different anchored |
| 1364 | // starting state with no self-loop, we guarantee that we'll |
| 1365 | // at worst move through a number of transitions equal to the |
| 1366 | // longest pattern. |
| 1367 | // |
| 1368 | // Now for DFAs, the whole point of them is to eliminate |
| 1369 | // failure transitions entirely. So there is no way to say "if |
| 1370 | // it's an anchored search don't follow failure transitions." |
| 1371 | // Instead, we actually have to build two entirely separate |
| 1372 | // automatons into the transition table. One with failure |
| 1373 | // transitions built into it and another that is effectively |
| 1374 | // just an encoding of the base trie into a transition table. |
| 1375 | // DFAs still need this check though, because the match states |
| 1376 | // still carry matches only reachable via a failure transition. |
| 1377 | // Why? Because removing them seems difficult, although I |
| 1378 | // haven't given it a lot of thought. |
| 1379 | if !(anchored.is_anchored() && m.start() > input.start()) { |
| 1380 | mat = Some(m); |
| 1381 | if earliest { |
| 1382 | return Ok(mat); |
| 1383 | } |
| 1384 | } |
| 1385 | } else if let Some(pre) = pre { |
| 1386 | // If we're here, we know it's a special state that is not a |
| 1387 | // dead or a match state AND that a prefilter is active. Thus, |
| 1388 | // it must be a start state. |
| 1389 | debug_assert!(aut.is_start(sid)); |
| 1390 | // We don't care about 'Candidate::Match' here because if such |
| 1391 | // a match were possible, it would have been returned above |
| 1392 | // when we run the prefilter before walking the automaton. |
| 1393 | let span = Span::from(at..input.end()); |
| 1394 | match pre.find_in(input.haystack(), span).into_option() { |
| 1395 | None => return Ok(None), |
| 1396 | Some(i) => { |
| 1397 | if i > at { |
| 1398 | at = i; |
| 1399 | continue; |
| 1400 | } |
| 1401 | } |
| 1402 | } |
| 1403 | } else { |
| 1404 | // When pre.is_none(), then starting states should not be |
| 1405 | // treated as special. That is, without a prefilter, is_special |
| 1406 | // should only return true when the state is a dead or a match |
| 1407 | // state. |
| 1408 | // |
| 1409 | // It is possible to execute a search without a prefilter even |
| 1410 | // when the underlying searcher has one: an anchored search. |
| 1411 | // But in this case, the automaton makes it impossible to move |
| 1412 | // back to the start state by construction, and thus, we should |
| 1413 | // never reach this branch. |
| 1414 | debug_assert!(false, "unreachable" ); |
| 1415 | } |
| 1416 | } |
| 1417 | at += 1; |
| 1418 | } |
| 1419 | Ok(mat) |
| 1420 | } |
| 1421 | |
| 1422 | #[inline (never)] |
| 1423 | fn try_find_overlapping_fwd<A: Automaton + ?Sized>( |
| 1424 | aut: &A, |
| 1425 | input: &Input<'_>, |
| 1426 | state: &mut OverlappingState, |
| 1427 | ) -> Result<(), MatchError> { |
| 1428 | state.mat = None; |
| 1429 | if input.is_done() { |
| 1430 | return Ok(()); |
| 1431 | } |
| 1432 | // Searching with a pattern ID is always anchored, so we should only ever |
| 1433 | // use a prefilter when no pattern ID is given. |
| 1434 | if aut.prefilter().is_some() && !input.get_anchored().is_anchored() { |
| 1435 | let pre: &Prefilter = aut.prefilter().unwrap(); |
| 1436 | try_find_overlapping_fwd_imp(aut, input, pre:Some(pre), state) |
| 1437 | } else { |
| 1438 | try_find_overlapping_fwd_imp(aut, input, pre:None, state) |
| 1439 | } |
| 1440 | } |
| 1441 | |
| 1442 | #[inline (always)] |
| 1443 | fn try_find_overlapping_fwd_imp<A: Automaton + ?Sized>( |
| 1444 | aut: &A, |
| 1445 | input: &Input<'_>, |
| 1446 | pre: Option<&Prefilter>, |
| 1447 | state: &mut OverlappingState, |
| 1448 | ) -> Result<(), MatchError> { |
| 1449 | let mut sid = match state.id { |
| 1450 | None => { |
| 1451 | let sid = aut.start_state(input.get_anchored())?; |
| 1452 | // Handle the case where the start state is a match state. That is, |
| 1453 | // the empty string is in our automaton. We report every match we |
| 1454 | // can here before moving on and updating 'state.at' and 'state.id' |
| 1455 | // to find more matches in other parts of the haystack. |
| 1456 | if aut.is_match(sid) { |
| 1457 | let i = state.next_match_index.unwrap_or(0); |
| 1458 | let len = aut.match_len(sid); |
| 1459 | if i < len { |
| 1460 | state.next_match_index = Some(i + 1); |
| 1461 | state.mat = Some(get_match(aut, sid, i, input.start())); |
| 1462 | return Ok(()); |
| 1463 | } |
| 1464 | } |
| 1465 | state.at = input.start(); |
| 1466 | state.id = Some(sid); |
| 1467 | state.next_match_index = None; |
| 1468 | state.mat = None; |
| 1469 | sid |
| 1470 | } |
| 1471 | Some(sid) => { |
| 1472 | // If we still have matches left to report in this state then |
| 1473 | // report them until we've exhausted them. Only after that do we |
| 1474 | // advance to the next offset in the haystack. |
| 1475 | if let Some(i) = state.next_match_index { |
| 1476 | let len = aut.match_len(sid); |
| 1477 | if i < len { |
| 1478 | state.next_match_index = Some(i + 1); |
| 1479 | state.mat = Some(get_match(aut, sid, i, state.at + 1)); |
| 1480 | return Ok(()); |
| 1481 | } |
| 1482 | // Once we've reported all matches at a given position, we need |
| 1483 | // to advance the search to the next position. |
| 1484 | state.at += 1; |
| 1485 | state.next_match_index = None; |
| 1486 | state.mat = None; |
| 1487 | } |
| 1488 | sid |
| 1489 | } |
| 1490 | }; |
| 1491 | while state.at < input.end() { |
| 1492 | sid = aut.next_state( |
| 1493 | input.get_anchored(), |
| 1494 | sid, |
| 1495 | input.haystack()[state.at], |
| 1496 | ); |
| 1497 | if aut.is_special(sid) { |
| 1498 | state.id = Some(sid); |
| 1499 | if aut.is_dead(sid) { |
| 1500 | return Ok(()); |
| 1501 | } else if aut.is_match(sid) { |
| 1502 | state.next_match_index = Some(1); |
| 1503 | state.mat = Some(get_match(aut, sid, 0, state.at + 1)); |
| 1504 | return Ok(()); |
| 1505 | } else if let Some(pre) = pre { |
| 1506 | // If we're here, we know it's a special state that is not a |
| 1507 | // dead or a match state AND that a prefilter is active. Thus, |
| 1508 | // it must be a start state. |
| 1509 | debug_assert!(aut.is_start(sid)); |
| 1510 | let span = Span::from(state.at..input.end()); |
| 1511 | match pre.find_in(input.haystack(), span).into_option() { |
| 1512 | None => return Ok(()), |
| 1513 | Some(i) => { |
| 1514 | if i > state.at { |
| 1515 | state.at = i; |
| 1516 | continue; |
| 1517 | } |
| 1518 | } |
| 1519 | } |
| 1520 | } else { |
| 1521 | // When pre.is_none(), then starting states should not be |
| 1522 | // treated as special. That is, without a prefilter, is_special |
| 1523 | // should only return true when the state is a dead or a match |
| 1524 | // state. |
| 1525 | // |
| 1526 | // ... except for one special case: in stream searching, we |
| 1527 | // currently call overlapping search with a 'None' prefilter, |
| 1528 | // regardless of whether one exists or not, because stream |
| 1529 | // searching can't currently deal with prefilters correctly in |
| 1530 | // all cases. |
| 1531 | } |
| 1532 | } |
| 1533 | state.at += 1; |
| 1534 | } |
| 1535 | state.id = Some(sid); |
| 1536 | Ok(()) |
| 1537 | } |
| 1538 | |
| 1539 | #[inline (always)] |
| 1540 | fn get_match<A: Automaton + ?Sized>( |
| 1541 | aut: &A, |
| 1542 | sid: StateID, |
| 1543 | index: usize, |
| 1544 | at: usize, |
| 1545 | ) -> Match { |
| 1546 | let pid: PatternID = aut.match_pattern(sid, index); |
| 1547 | let len: usize = aut.pattern_len(pid); |
| 1548 | Match::new(pattern:pid, (at - len)..at) |
| 1549 | } |
| 1550 | |
| 1551 | /// Write a prefix "state" indicator for fmt::Debug impls. It always writes |
| 1552 | /// exactly two printable bytes to the given formatter. |
| 1553 | /// |
| 1554 | /// Specifically, this tries to succinctly distinguish the different types of |
| 1555 | /// states: dead states, start states and match states. It even accounts for |
| 1556 | /// the possible overlappings of different state types. (The only possible |
| 1557 | /// overlapping is that of match and start states.) |
| 1558 | pub(crate) fn fmt_state_indicator<A: Automaton>( |
| 1559 | f: &mut core::fmt::Formatter<'_>, |
| 1560 | aut: A, |
| 1561 | id: StateID, |
| 1562 | ) -> core::fmt::Result { |
| 1563 | if aut.is_dead(sid:id) { |
| 1564 | write!(f, "D " )?; |
| 1565 | } else if aut.is_match(sid:id) { |
| 1566 | if aut.is_start(sid:id) { |
| 1567 | write!(f, "*>" )?; |
| 1568 | } else { |
| 1569 | write!(f, "* " )?; |
| 1570 | } |
| 1571 | } else if aut.is_start(sid:id) { |
| 1572 | write!(f, " >" )?; |
| 1573 | } else { |
| 1574 | write!(f, " " )?; |
| 1575 | } |
| 1576 | Ok(()) |
| 1577 | } |
| 1578 | |
| 1579 | /// Return an iterator of transitions in a sparse format given an iterator |
| 1580 | /// of all explicitly defined transitions. The iterator yields ranges of |
| 1581 | /// transitions, such that any adjacent transitions mapped to the same |
| 1582 | /// state are combined into a single range. |
| 1583 | pub(crate) fn sparse_transitions<'a>( |
| 1584 | mut it: impl Iterator<Item = (u8, StateID)> + 'a, |
| 1585 | ) -> impl Iterator<Item = (u8, u8, StateID)> + 'a { |
| 1586 | let mut cur: Option<(u8, u8, StateID)> = None; |
| 1587 | core::iter::from_fn(move || { |
| 1588 | while let Some((class: u8, next: StateID)) = it.next() { |
| 1589 | let (prev_start: u8, prev_end: u8, prev_next: StateID) = match cur { |
| 1590 | Some(x: (u8, u8, StateID)) => x, |
| 1591 | None => { |
| 1592 | cur = Some((class, class, next)); |
| 1593 | continue; |
| 1594 | } |
| 1595 | }; |
| 1596 | if prev_next == next { |
| 1597 | cur = Some((prev_start, class, prev_next)); |
| 1598 | } else { |
| 1599 | cur = Some((class, class, next)); |
| 1600 | return Some((prev_start, prev_end, prev_next)); |
| 1601 | } |
| 1602 | } |
| 1603 | if let Some((start: u8, end: u8, next: StateID)) = cur.take() { |
| 1604 | return Some((start, end, next)); |
| 1605 | } |
| 1606 | None |
| 1607 | }) |
| 1608 | } |
| 1609 | |