1/*!
2Provides [`Automaton`] trait for abstracting over Aho-Corasick automata.
3
4The `Automaton` trait provides a way to write generic code over any
5Aho-Corasick automaton. It also provides access to lower level APIs that
6permit walking the state transitions of an Aho-Corasick automaton manually.
7*/
8
9use alloc::{string::String, vec::Vec};
10
11use crate::util::{
12 error::MatchError,
13 primitives::PatternID,
14 search::{Anchored, Input, Match, MatchKind, Span},
15};
16
17pub use crate::util::{
18 prefilter::{Candidate, Prefilter},
19 primitives::{StateID, StateIDError},
20};
21
22/// We seal the `Automaton` trait for now. It's a big trait, and it's
23/// conceivable that I might want to add new required methods, and sealing the
24/// trait permits doing that in a backwards compatible fashion. On other the
25/// hand, if you have a solid use case for implementing the trait yourself,
26/// please file an issue and we can discuss it. This was *mostly* done as a
27/// conservative step.
28pub(crate) mod private {
29 pub trait Sealed {}
30}
31impl private::Sealed for crate::nfa::noncontiguous::NFA {}
32impl private::Sealed for crate::nfa::contiguous::NFA {}
33impl private::Sealed for crate::dfa::DFA {}
34
35impl<'a, T: private::Sealed + ?Sized> private::Sealed for &'a T {}
36
37/// A trait that abstracts over Aho-Corasick automata.
38///
39/// This trait primarily exists for niche use cases such as:
40///
41/// * Using an NFA or DFA directly, bypassing the top-level
42/// [`AhoCorasick`](crate::AhoCorasick) searcher. Currently, these include
43/// [`noncontiguous::NFA`](crate::nfa::noncontiguous::NFA),
44/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) and
45/// [`dfa::DFA`](crate::dfa::DFA).
46/// * Implementing your own custom search routine by walking the automaton
47/// yourself. This might be useful for implementing search on non-contiguous
48/// strings or streams.
49///
50/// For most use cases, it is not expected that users will need
51/// to use or even know about this trait. Indeed, the top level
52/// [`AhoCorasick`](crate::AhoCorasick) searcher does not expose any details
53/// about this trait, nor does it implement it itself.
54///
55/// Note that this trait defines a number of default methods, such as
56/// [`Automaton::try_find`] and [`Automaton::try_find_iter`], which implement
57/// higher level search routines in terms of the lower level automata API.
58///
59/// # Sealed
60///
61/// Currently, this trait is sealed. That means users of this crate can write
62/// generic routines over this trait but cannot implement it themselves. This
63/// restriction may be lifted in the future, but sealing the trait permits
64/// adding new required methods in a backwards compatible fashion.
65///
66/// # Special states
67///
68/// This trait encodes a notion of "special" states in an automaton. Namely,
69/// a state is treated as special if it is a dead, match or start state:
70///
71/// * A dead state is a state that cannot be left once entered. All transitions
72/// on a dead state lead back to itself. The dead state is meant to be treated
73/// as a sentinel indicating that the search should stop and return a match if
74/// one has been found, and nothing otherwise.
75/// * A match state is a state that indicates one or more patterns have
76/// matched. Depending on the [`MatchKind`] of the automaton, a search may
77/// stop once a match is seen, or it may continue looking for matches until
78/// it enters a dead state or sees the end of the haystack.
79/// * A start state is a state that a search begins in. It is useful to know
80/// when a search enters a start state because it may mean that a prefilter can
81/// be used to skip ahead and quickly look for candidate matches. Unlike dead
82/// and match states, it is never necessary to explicitly handle start states
83/// for correctness. Indeed, in this crate, implementations of `Automaton`
84/// will only treat start states as "special" when a prefilter is enabled and
85/// active. Otherwise, treating it as special has no purpose and winds up
86/// slowing down the overall search because it results in ping-ponging between
87/// the main state transition and the "special" state logic.
88///
89/// Since checking whether a state is special by doing three different
90/// checks would be too expensive inside a fast search loop, the
91/// [`Automaton::is_special`] method is provided for quickly checking whether
92/// the state is special. The `Automaton::is_dead`, `Automaton::is_match` and
93/// `Automaton::is_start` predicates can then be used to determine which kind
94/// of special state it is.
95///
96/// # Panics
97///
98/// Most of the APIs on this trait should panic or give incorrect results
99/// if invalid inputs are given to it. For example, `Automaton::next_state`
100/// has unspecified behavior if the state ID given to it is not a valid
101/// state ID for the underlying automaton. Valid state IDs can only be
102/// retrieved in one of two ways: calling `Automaton::start_state` or calling
103/// `Automaton::next_state` with a valid state ID.
104///
105/// # Safety
106///
107/// This trait is not safe to implement so that code may rely on the
108/// correctness of implementations of this trait to avoid undefined behavior.
109/// The primary correctness guarantees are:
110///
111/// * `Automaton::start_state` always returns a valid state ID or an error or
112/// panics.
113/// * `Automaton::next_state`, when given a valid state ID, always returns
114/// a valid state ID for all values of `anchored` and `byte`, or otherwise
115/// panics.
116///
117/// In general, the rest of the methods on `Automaton` need to uphold their
118/// contracts as well. For example, `Automaton::is_dead` should only returns
119/// true if the given state ID is actually a dead state.
120///
121/// Note that currently this crate does not rely on the safety property defined
122/// here to avoid undefined behavior. Instead, this was done to make it
123/// _possible_ to do in the future.
124///
125/// # Example
126///
127/// This example shows how one might implement a basic but correct search
128/// routine. We keep things simple by not using prefilters or worrying about
129/// anchored searches, but do make sure our search is correct for all possible
130/// [`MatchKind`] semantics. (The comments in the code below note the parts
131/// that are needed to support certain `MatchKind` semantics.)
132///
133/// ```
134/// use aho_corasick::{
135/// automaton::Automaton,
136/// nfa::noncontiguous::NFA,
137/// Anchored, Match, MatchError, MatchKind,
138/// };
139///
140/// // Run an unanchored search for 'aut' in 'haystack'. Return the first match
141/// // seen according to the automaton's match semantics. This returns an error
142/// // if the given automaton does not support unanchored searches.
143/// fn find<A: Automaton>(
144/// aut: A,
145/// haystack: &[u8],
146/// ) -> Result<Option<Match>, MatchError> {
147/// let mut sid = aut.start_state(Anchored::No)?;
148/// let mut at = 0;
149/// let mut mat = None;
150/// let get_match = |sid, at| {
151/// let pid = aut.match_pattern(sid, 0);
152/// let len = aut.pattern_len(pid);
153/// Match::new(pid, (at - len)..at)
154/// };
155/// // Start states can be match states!
156/// if aut.is_match(sid) {
157/// mat = Some(get_match(sid, at));
158/// // Standard semantics require matches to be reported as soon as
159/// // they're seen. Otherwise, we continue until we see a dead state
160/// // or the end of the haystack.
161/// if matches!(aut.match_kind(), MatchKind::Standard) {
162/// return Ok(mat);
163/// }
164/// }
165/// while at < haystack.len() {
166/// sid = aut.next_state(Anchored::No, sid, haystack[at]);
167/// if aut.is_special(sid) {
168/// if aut.is_dead(sid) {
169/// return Ok(mat);
170/// } else if aut.is_match(sid) {
171/// mat = Some(get_match(sid, at + 1));
172/// // As above, standard semantics require that we return
173/// // immediately once a match is found.
174/// if matches!(aut.match_kind(), MatchKind::Standard) {
175/// return Ok(mat);
176/// }
177/// }
178/// }
179/// at += 1;
180/// }
181/// Ok(mat)
182/// }
183///
184/// // Show that it works for standard searches.
185/// let nfa = NFA::new(&["samwise", "sam"]).unwrap();
186/// assert_eq!(Some(Match::must(1, 0..3)), find(&nfa, b"samwise")?);
187///
188/// // But also works when using leftmost-first. Notice how the match result
189/// // has changed!
190/// let nfa = NFA::builder()
191/// .match_kind(MatchKind::LeftmostFirst)
192/// .build(&["samwise", "sam"])
193/// .unwrap();
194/// assert_eq!(Some(Match::must(0, 0..7)), find(&nfa, b"samwise")?);
195///
196/// # Ok::<(), Box<dyn std::error::Error>>(())
197/// ```
198pub unsafe trait Automaton: private::Sealed {
199 /// Returns the starting state for the given anchor mode.
200 ///
201 /// Upon success, the state ID returned is guaranteed to be valid for
202 /// this automaton.
203 ///
204 /// # Errors
205 ///
206 /// This returns an error when the given search configuration is not
207 /// supported by the underlying automaton. For example, if the underlying
208 /// automaton only supports unanchored searches but the given configuration
209 /// was set to an anchored search, then this must return an error.
210 fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError>;
211
212 /// Performs a state transition from `sid` for `byte` and returns the next
213 /// state.
214 ///
215 /// `anchored` should be [`Anchored::Yes`] when executing an anchored
216 /// search and [`Anchored::No`] otherwise. For some implementations of
217 /// `Automaton`, it is required to know whether the search is anchored
218 /// or not in order to avoid following failure transitions. Other
219 /// implementations may ignore `anchored` altogether and depend on
220 /// `Automaton::start_state` returning a state that walks a different path
221 /// through the automaton depending on whether the search is anchored or
222 /// not.
223 ///
224 /// # Panics
225 ///
226 /// This routine may panic or return incorrect results when the given state
227 /// ID is invalid. A state ID is valid if and only if:
228 ///
229 /// 1. It came from a call to `Automaton::start_state`, or
230 /// 2. It came from a previous call to `Automaton::next_state` with a
231 /// valid state ID.
232 ///
233 /// Implementations must treat all possible values of `byte` as valid.
234 ///
235 /// Implementations may panic on unsupported values of `anchored`, but are
236 /// not required to do so.
237 fn next_state(
238 &self,
239 anchored: Anchored,
240 sid: StateID,
241 byte: u8,
242 ) -> StateID;
243
244 /// Returns true if the given ID represents a "special" state. A special
245 /// state is a dead, match or start state.
246 ///
247 /// Note that implementations may choose to return false when the given ID
248 /// corresponds to a start state. Namely, it always correct to treat start
249 /// states as non-special. Implementations must return true for states that
250 /// are dead or contain matches.
251 ///
252 /// This has unspecified behavior when given an invalid state ID.
253 fn is_special(&self, sid: StateID) -> bool;
254
255 /// Returns true if the given ID represents a dead state.
256 ///
257 /// A dead state is a type of "sink" in a finite state machine. It
258 /// corresponds to a state whose transitions all loop back to itself. That
259 /// is, once entered, it can never be left. In practice, it serves as a
260 /// sentinel indicating that the search should terminate.
261 ///
262 /// This has unspecified behavior when given an invalid state ID.
263 fn is_dead(&self, sid: StateID) -> bool;
264
265 /// Returns true if the given ID represents a match state.
266 ///
267 /// A match state is always associated with one or more pattern IDs that
268 /// matched at the position in the haystack when the match state was
269 /// entered. When a match state is entered, the match semantics dictate
270 /// whether it should be returned immediately (for `MatchKind::Standard`)
271 /// or if the search should continue (for `MatchKind::LeftmostFirst` and
272 /// `MatchKind::LeftmostLongest`) until a dead state is seen or the end of
273 /// the haystack has been reached.
274 ///
275 /// This has unspecified behavior when given an invalid state ID.
276 fn is_match(&self, sid: StateID) -> bool;
277
278 /// Returns true if the given ID represents a start state.
279 ///
280 /// While it is never incorrect to ignore start states during a search
281 /// (except for the start of the search of course), knowing whether one has
282 /// entered a start state can be useful for certain classes of performance
283 /// optimizations. For example, if one is in a start state, it may be legal
284 /// to try to skip ahead and look for match candidates more quickly than
285 /// would otherwise be accomplished by walking the automaton.
286 ///
287 /// Implementations of `Automaton` in this crate "unspecialize" start
288 /// states when a prefilter is not active or enabled. In this case, it
289 /// is possible for `Automaton::is_special(sid)` to return false while
290 /// `Automaton::is_start(sid)` returns true.
291 ///
292 /// This has unspecified behavior when given an invalid state ID.
293 fn is_start(&self, sid: StateID) -> bool;
294
295 /// Returns the match semantics that this automaton was built with.
296 fn match_kind(&self) -> MatchKind;
297
298 /// Returns the total number of matches for the given state ID.
299 ///
300 /// This has unspecified behavior if the given ID does not refer to a match
301 /// state.
302 fn match_len(&self, sid: StateID) -> usize;
303
304 /// Returns the pattern ID for the match state given by `sid` at the
305 /// `index` given.
306 ///
307 /// Typically, `index` is only ever greater than `0` when implementing an
308 /// overlapping search. Otherwise, it's likely that your search only cares
309 /// about reporting the first pattern ID in a match state.
310 ///
311 /// This has unspecified behavior if the given ID does not refer to a match
312 /// state, or if the index is greater than or equal to the total number of
313 /// matches in this match state.
314 fn match_pattern(&self, sid: StateID, index: usize) -> PatternID;
315
316 /// Returns the total number of patterns compiled into this automaton.
317 fn patterns_len(&self) -> usize;
318
319 /// Returns the length of the pattern for the given ID.
320 ///
321 /// This has unspecified behavior when given an invalid pattern
322 /// ID. A pattern ID is valid if and only if it is less than
323 /// `Automaton::patterns_len`.
324 fn pattern_len(&self, pid: PatternID) -> usize;
325
326 /// Returns the length, in bytes, of the shortest pattern in this
327 /// automaton.
328 fn min_pattern_len(&self) -> usize;
329
330 /// Returns the length, in bytes, of the longest pattern in this automaton.
331 fn max_pattern_len(&self) -> usize;
332
333 /// Returns the heap memory usage, in bytes, used by this automaton.
334 fn memory_usage(&self) -> usize;
335
336 /// Returns a prefilter, if available, that can be used to accelerate
337 /// searches for this automaton.
338 ///
339 /// The typical way this is used is when the start state is entered during
340 /// a search. When that happens, one can use a prefilter to skip ahead and
341 /// look for candidate matches without having to walk the automaton on the
342 /// bytes between candidates.
343 ///
344 /// Typically a prefilter is only available when there are a small (<100)
345 /// number of patterns built into the automaton.
346 fn prefilter(&self) -> Option<&Prefilter>;
347
348 /// Executes a non-overlapping search with this automaton using the given
349 /// configuration.
350 ///
351 /// See
352 /// [`AhoCorasick::try_find`](crate::AhoCorasick::try_find)
353 /// for more documentation and examples.
354 fn try_find(
355 &self,
356 input: &Input<'_>,
357 ) -> Result<Option<Match>, MatchError> {
358 try_find_fwd(&self, input)
359 }
360
361 /// Executes a overlapping search with this automaton using the given
362 /// configuration.
363 ///
364 /// See
365 /// [`AhoCorasick::try_find_overlapping`](crate::AhoCorasick::try_find_overlapping)
366 /// for more documentation and examples.
367 fn try_find_overlapping(
368 &self,
369 input: &Input<'_>,
370 state: &mut OverlappingState,
371 ) -> Result<(), MatchError> {
372 try_find_overlapping_fwd(&self, input, state)
373 }
374
375 /// Returns an iterator of non-overlapping matches with this automaton
376 /// using the given configuration.
377 ///
378 /// See
379 /// [`AhoCorasick::try_find_iter`](crate::AhoCorasick::try_find_iter)
380 /// for more documentation and examples.
381 fn try_find_iter<'a, 'h>(
382 &'a self,
383 input: Input<'h>,
384 ) -> Result<FindIter<'a, 'h, Self>, MatchError>
385 where
386 Self: Sized,
387 {
388 FindIter::new(self, input)
389 }
390
391 /// Returns an iterator of overlapping matches with this automaton
392 /// using the given configuration.
393 ///
394 /// See
395 /// [`AhoCorasick::try_find_overlapping_iter`](crate::AhoCorasick::try_find_overlapping_iter)
396 /// for more documentation and examples.
397 fn try_find_overlapping_iter<'a, 'h>(
398 &'a self,
399 input: Input<'h>,
400 ) -> Result<FindOverlappingIter<'a, 'h, Self>, MatchError>
401 where
402 Self: Sized,
403 {
404 if !self.match_kind().is_standard() {
405 return Err(MatchError::unsupported_overlapping(
406 self.match_kind(),
407 ));
408 }
409 // We might consider lifting this restriction. The reason why I added
410 // it was to ban the combination of "anchored search" and "overlapping
411 // iteration." The match semantics aren't totally clear in that case.
412 // Should we allow *any* matches that are adjacent to *any* previous
413 // match? Or only following the most recent one? Or only matches
414 // that start at the beginning of the search? We might also elect to
415 // just keep this restriction in place, as callers should be able to
416 // implement it themselves if they want to.
417 if input.get_anchored().is_anchored() {
418 return Err(MatchError::invalid_input_anchored());
419 }
420 let _ = self.start_state(input.get_anchored())?;
421 let state = OverlappingState::start();
422 Ok(FindOverlappingIter { aut: self, input, state })
423 }
424
425 /// Replaces all non-overlapping matches in `haystack` with
426 /// strings from `replace_with` depending on the pattern that
427 /// matched. The `replace_with` slice must have length equal to
428 /// `Automaton::patterns_len`.
429 ///
430 /// See
431 /// [`AhoCorasick::try_replace_all`](crate::AhoCorasick::try_replace_all)
432 /// for more documentation and examples.
433 fn try_replace_all<B>(
434 &self,
435 haystack: &str,
436 replace_with: &[B],
437 ) -> Result<String, MatchError>
438 where
439 Self: Sized,
440 B: AsRef<str>,
441 {
442 assert_eq!(
443 replace_with.len(),
444 self.patterns_len(),
445 "replace_all requires a replacement for every pattern \
446 in the automaton"
447 );
448 let mut dst = String::with_capacity(haystack.len());
449 self.try_replace_all_with(haystack, &mut dst, |mat, _, dst| {
450 dst.push_str(replace_with[mat.pattern()].as_ref());
451 true
452 })?;
453 Ok(dst)
454 }
455
456 /// Replaces all non-overlapping matches in `haystack` with
457 /// strings from `replace_with` depending on the pattern that
458 /// matched. The `replace_with` slice must have length equal to
459 /// `Automaton::patterns_len`.
460 ///
461 /// See
462 /// [`AhoCorasick::try_replace_all_bytes`](crate::AhoCorasick::try_replace_all_bytes)
463 /// for more documentation and examples.
464 fn try_replace_all_bytes<B>(
465 &self,
466 haystack: &[u8],
467 replace_with: &[B],
468 ) -> Result<Vec<u8>, MatchError>
469 where
470 Self: Sized,
471 B: AsRef<[u8]>,
472 {
473 assert_eq!(
474 replace_with.len(),
475 self.patterns_len(),
476 "replace_all requires a replacement for every pattern \
477 in the automaton"
478 );
479 let mut dst = Vec::with_capacity(haystack.len());
480 self.try_replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| {
481 dst.extend(replace_with[mat.pattern()].as_ref());
482 true
483 })?;
484 Ok(dst)
485 }
486
487 /// Replaces all non-overlapping matches in `haystack` by calling the
488 /// `replace_with` closure given.
489 ///
490 /// See
491 /// [`AhoCorasick::try_replace_all_with`](crate::AhoCorasick::try_replace_all_with)
492 /// for more documentation and examples.
493 fn try_replace_all_with<F>(
494 &self,
495 haystack: &str,
496 dst: &mut String,
497 mut replace_with: F,
498 ) -> Result<(), MatchError>
499 where
500 Self: Sized,
501 F: FnMut(&Match, &str, &mut String) -> bool,
502 {
503 let mut last_match = 0;
504 for m in self.try_find_iter(Input::new(haystack))? {
505 // Since there are no restrictions on what kinds of patterns are
506 // in an Aho-Corasick automaton, we might get matches that split
507 // a codepoint, or even matches of a partial codepoint. When that
508 // happens, we just skip the match.
509 if !haystack.is_char_boundary(m.start())
510 || !haystack.is_char_boundary(m.end())
511 {
512 continue;
513 }
514 dst.push_str(&haystack[last_match..m.start()]);
515 last_match = m.end();
516 if !replace_with(&m, &haystack[m.start()..m.end()], dst) {
517 break;
518 };
519 }
520 dst.push_str(&haystack[last_match..]);
521 Ok(())
522 }
523
524 /// Replaces all non-overlapping matches in `haystack` by calling the
525 /// `replace_with` closure given.
526 ///
527 /// See
528 /// [`AhoCorasick::try_replace_all_with_bytes`](crate::AhoCorasick::try_replace_all_with_bytes)
529 /// for more documentation and examples.
530 fn try_replace_all_with_bytes<F>(
531 &self,
532 haystack: &[u8],
533 dst: &mut Vec<u8>,
534 mut replace_with: F,
535 ) -> Result<(), MatchError>
536 where
537 Self: Sized,
538 F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
539 {
540 let mut last_match = 0;
541 for m in self.try_find_iter(Input::new(haystack))? {
542 dst.extend(&haystack[last_match..m.start()]);
543 last_match = m.end();
544 if !replace_with(&m, &haystack[m.start()..m.end()], dst) {
545 break;
546 };
547 }
548 dst.extend(&haystack[last_match..]);
549 Ok(())
550 }
551
552 /// Returns an iterator of non-overlapping matches with this automaton
553 /// from the stream given.
554 ///
555 /// See
556 /// [`AhoCorasick::try_stream_find_iter`](crate::AhoCorasick::try_stream_find_iter)
557 /// for more documentation and examples.
558 #[cfg(feature = "std")]
559 fn try_stream_find_iter<'a, R: std::io::Read>(
560 &'a self,
561 rdr: R,
562 ) -> Result<StreamFindIter<'a, Self, R>, MatchError>
563 where
564 Self: Sized,
565 {
566 Ok(StreamFindIter { it: StreamChunkIter::new(self, rdr)? })
567 }
568
569 /// Replaces all non-overlapping matches in `rdr` with strings from
570 /// `replace_with` depending on the pattern that matched, and writes the
571 /// result to `wtr`. The `replace_with` slice must have length equal to
572 /// `Automaton::patterns_len`.
573 ///
574 /// See
575 /// [`AhoCorasick::try_stream_replace_all`](crate::AhoCorasick::try_stream_replace_all)
576 /// for more documentation and examples.
577 #[cfg(feature = "std")]
578 fn try_stream_replace_all<R, W, B>(
579 &self,
580 rdr: R,
581 wtr: W,
582 replace_with: &[B],
583 ) -> std::io::Result<()>
584 where
585 Self: Sized,
586 R: std::io::Read,
587 W: std::io::Write,
588 B: AsRef<[u8]>,
589 {
590 assert_eq!(
591 replace_with.len(),
592 self.patterns_len(),
593 "streaming replace_all requires a replacement for every pattern \
594 in the automaton",
595 );
596 self.try_stream_replace_all_with(rdr, wtr, |mat, _, wtr| {
597 wtr.write_all(replace_with[mat.pattern()].as_ref())
598 })
599 }
600
601 /// Replaces all non-overlapping matches in `rdr` by calling the
602 /// `replace_with` closure given and writing the result to `wtr`.
603 ///
604 /// See
605 /// [`AhoCorasick::try_stream_replace_all_with`](crate::AhoCorasick::try_stream_replace_all_with)
606 /// for more documentation and examples.
607 #[cfg(feature = "std")]
608 fn try_stream_replace_all_with<R, W, F>(
609 &self,
610 rdr: R,
611 mut wtr: W,
612 mut replace_with: F,
613 ) -> std::io::Result<()>
614 where
615 Self: Sized,
616 R: std::io::Read,
617 W: std::io::Write,
618 F: FnMut(&Match, &[u8], &mut W) -> std::io::Result<()>,
619 {
620 let mut it = StreamChunkIter::new(self, rdr).map_err(|e| {
621 let kind = std::io::ErrorKind::Other;
622 std::io::Error::new(kind, e)
623 })?;
624 while let Some(result) = it.next() {
625 let chunk = result?;
626 match chunk {
627 StreamChunk::NonMatch { bytes, .. } => {
628 wtr.write_all(bytes)?;
629 }
630 StreamChunk::Match { bytes, mat } => {
631 replace_with(&mat, bytes, &mut wtr)?;
632 }
633 }
634 }
635 Ok(())
636 }
637}
638
639// SAFETY: This just defers to the underlying 'AcAutomaton' and thus inherits
640// its safety properties.
641unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
642 #[inline(always)]
643 fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
644 (**self).start_state(anchored)
645 }
646
647 #[inline(always)]
648 fn next_state(
649 &self,
650 anchored: Anchored,
651 sid: StateID,
652 byte: u8,
653 ) -> StateID {
654 (**self).next_state(anchored, sid, byte)
655 }
656
657 #[inline(always)]
658 fn is_special(&self, sid: StateID) -> bool {
659 (**self).is_special(sid)
660 }
661
662 #[inline(always)]
663 fn is_dead(&self, sid: StateID) -> bool {
664 (**self).is_dead(sid)
665 }
666
667 #[inline(always)]
668 fn is_match(&self, sid: StateID) -> bool {
669 (**self).is_match(sid)
670 }
671
672 #[inline(always)]
673 fn is_start(&self, sid: StateID) -> bool {
674 (**self).is_start(sid)
675 }
676
677 #[inline(always)]
678 fn match_kind(&self) -> MatchKind {
679 (**self).match_kind()
680 }
681
682 #[inline(always)]
683 fn match_len(&self, sid: StateID) -> usize {
684 (**self).match_len(sid)
685 }
686
687 #[inline(always)]
688 fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
689 (**self).match_pattern(sid, index)
690 }
691
692 #[inline(always)]
693 fn patterns_len(&self) -> usize {
694 (**self).patterns_len()
695 }
696
697 #[inline(always)]
698 fn pattern_len(&self, pid: PatternID) -> usize {
699 (**self).pattern_len(pid)
700 }
701
702 #[inline(always)]
703 fn min_pattern_len(&self) -> usize {
704 (**self).min_pattern_len()
705 }
706
707 #[inline(always)]
708 fn max_pattern_len(&self) -> usize {
709 (**self).max_pattern_len()
710 }
711
712 #[inline(always)]
713 fn memory_usage(&self) -> usize {
714 (**self).memory_usage()
715 }
716
717 #[inline(always)]
718 fn prefilter(&self) -> Option<&Prefilter> {
719 (**self).prefilter()
720 }
721}
722
723/// Represents the current state of an overlapping search.
724///
725/// This is used for overlapping searches since they need to know something
726/// about the previous search. For example, when multiple patterns match at the
727/// same position, this state tracks the last reported pattern so that the next
728/// search knows whether to report another matching pattern or continue with
729/// the search at the next position. Additionally, it also tracks which state
730/// the last search call terminated in and the current offset of the search
731/// in the haystack.
732///
733/// This type provides limited introspection capabilities. The only thing a
734/// caller can do is construct it and pass it around to permit search routines
735/// to use it to track state, and to ask whether a match has been found.
736///
737/// Callers should always provide a fresh state constructed via
738/// [`OverlappingState::start`] when starting a new search. That same state
739/// should be reused for subsequent searches on the same `Input`. The state
740/// given will advance through the haystack itself. Callers can detect the end
741/// of a search when neither an error nor a match is returned.
742///
743/// # Example
744///
745/// This example shows how to manually iterate over all overlapping matches. If
746/// you need this, you might consider using
747/// [`AhoCorasick::find_overlapping_iter`](crate::AhoCorasick::find_overlapping_iter)
748/// instead, but this shows how to correctly use an `OverlappingState`.
749///
750/// ```
751/// use aho_corasick::{
752/// automaton::OverlappingState,
753/// AhoCorasick, Input, Match,
754/// };
755///
756/// let patterns = &["append", "appendage", "app"];
757/// let haystack = "append the app to the appendage";
758///
759/// let ac = AhoCorasick::new(patterns).unwrap();
760/// let mut state = OverlappingState::start();
761/// let mut matches = vec![];
762///
763/// loop {
764/// ac.find_overlapping(haystack, &mut state);
765/// let mat = match state.get_match() {
766/// None => break,
767/// Some(mat) => mat,
768/// };
769/// matches.push(mat);
770/// }
771/// let expected = vec![
772/// Match::must(2, 0..3),
773/// Match::must(0, 0..6),
774/// Match::must(2, 11..14),
775/// Match::must(2, 22..25),
776/// Match::must(0, 22..28),
777/// Match::must(1, 22..31),
778/// ];
779/// assert_eq!(expected, matches);
780/// ```
781#[derive(Clone, Debug)]
782pub struct OverlappingState {
783 /// The match reported by the most recent overlapping search to use this
784 /// state.
785 ///
786 /// If a search does not find any matches, then it is expected to clear
787 /// this value.
788 mat: Option<Match>,
789 /// The state ID of the state at which the search was in when the call
790 /// terminated. When this is a match state, `last_match` must be set to a
791 /// non-None value.
792 ///
793 /// A `None` value indicates the start state of the corresponding
794 /// automaton. We cannot use the actual ID, since any one automaton may
795 /// have many start states, and which one is in use depends on search-time
796 /// factors (such as whether the search is anchored or not).
797 id: Option<StateID>,
798 /// The position of the search.
799 ///
800 /// When `id` is None (i.e., we are starting a search), this is set to
801 /// the beginning of the search as given by the caller regardless of its
802 /// current value. Subsequent calls to an overlapping search pick up at
803 /// this offset.
804 at: usize,
805 /// The index into the matching patterns of the next match to report if the
806 /// current state is a match state. Note that this may be 1 greater than
807 /// the total number of matches to report for the current match state. (In
808 /// which case, no more matches should be reported at the current position
809 /// and the search should advance to the next position.)
810 next_match_index: Option<usize>,
811}
812
813impl OverlappingState {
814 /// Create a new overlapping state that begins at the start state.
815 pub fn start() -> OverlappingState {
816 OverlappingState { mat: None, id: None, at: 0, next_match_index: None }
817 }
818
819 /// Return the match result of the most recent search to execute with this
820 /// state.
821 ///
822 /// Every search will clear this result automatically, such that if no
823 /// match is found, this will always correctly report `None`.
824 pub fn get_match(&self) -> Option<Match> {
825 self.mat
826 }
827}
828
829/// An iterator of non-overlapping matches in a particular haystack.
830///
831/// This iterator yields matches according to the [`MatchKind`] used by this
832/// automaton.
833///
834/// This iterator is constructed via the [`Automaton::try_find_iter`] method.
835///
836/// The type variable `A` refers to the implementation of the [`Automaton`]
837/// trait used to execute the search.
838///
839/// The lifetime `'a` refers to the lifetime of the [`Automaton`]
840/// implementation.
841///
842/// The lifetime `'h` refers to the lifetime of the haystack being searched.
843#[derive(Debug)]
844pub struct FindIter<'a, 'h, A> {
845 /// The automaton used to drive the search.
846 aut: &'a A,
847 /// The input parameters to give to each search call.
848 ///
849 /// The start position of the search is mutated during iteration.
850 input: Input<'h>,
851 /// Records the end offset of the most recent match. This is necessary to
852 /// handle a corner case for preventing empty matches from overlapping with
853 /// the ending bounds of a prior match.
854 last_match_end: Option<usize>,
855}
856
857impl<'a, 'h, A: Automaton> FindIter<'a, 'h, A> {
858 /// Creates a new non-overlapping iterator. If the given automaton would
859 /// return an error on a search with the given input configuration, then
860 /// that error is returned here.
861 fn new(
862 aut: &'a A,
863 input: Input<'h>,
864 ) -> Result<FindIter<'a, 'h, A>, MatchError> {
865 // The only way this search can fail is if we cannot retrieve the start
866 // state. e.g., Asking for an anchored search when only unanchored
867 // searches are supported.
868 let _ = aut.start_state(input.get_anchored())?;
869 Ok(FindIter { aut, input, last_match_end: None })
870 }
871
872 /// Executes a search and returns a match if one is found.
873 ///
874 /// This does not advance the input forward. It just executes a search
875 /// based on the current configuration/offsets.
876 fn search(&self) -> Option<Match> {
877 // The unwrap is OK here because we check at iterator construction time
878 // that no subsequent search call (using the same configuration) will
879 // ever return an error.
880 self.aut
881 .try_find(&self.input)
882 .expect("already checked that no match error can occur")
883 }
884
885 /// Handles the special case of an empty match by ensuring that 1) the
886 /// iterator always advances and 2) empty matches never overlap with other
887 /// matches.
888 ///
889 /// (1) is necessary because we principally make progress by setting the
890 /// starting location of the next search to the ending location of the last
891 /// match. But if a match is empty, then this results in a search that does
892 /// not advance and thus does not terminate.
893 ///
894 /// (2) is not strictly necessary, but makes intuitive sense and matches
895 /// the presiding behavior of most general purpose regex engines.
896 /// (Obviously this crate isn't a regex engine, but we choose to match
897 /// their semantics.) The "intuitive sense" here is that we want to report
898 /// NON-overlapping matches. So for example, given the patterns 'a' and
899 /// '' (an empty string) against the haystack 'a', without the special
900 /// handling, you'd get the matches [0, 1) and [1, 1), where the latter
901 /// overlaps with the end bounds of the former.
902 ///
903 /// Note that we mark this cold and forcefully prevent inlining because
904 /// handling empty matches like this is extremely rare and does require
905 /// quite a bit of code, comparatively. Keeping this code out of the main
906 /// iterator function keeps it smaller and more amenable to inlining
907 /// itself.
908 #[cold]
909 #[inline(never)]
910 fn handle_overlapping_empty_match(
911 &mut self,
912 mut m: Match,
913 ) -> Option<Match> {
914 assert!(m.is_empty());
915 if Some(m.end()) == self.last_match_end {
916 self.input.set_start(self.input.start().checked_add(1).unwrap());
917 m = self.search()?;
918 }
919 Some(m)
920 }
921}
922
923impl<'a, 'h, A: Automaton> Iterator for FindIter<'a, 'h, A> {
924 type Item = Match;
925
926 #[inline(always)]
927 fn next(&mut self) -> Option<Match> {
928 let mut m = self.search()?;
929 if m.is_empty() {
930 m = self.handle_overlapping_empty_match(m)?;
931 }
932 self.input.set_start(m.end());
933 self.last_match_end = Some(m.end());
934 Some(m)
935 }
936}
937
938/// An iterator of overlapping matches in a particular haystack.
939///
940/// This iterator will report all possible matches in a particular haystack,
941/// even when the matches overlap.
942///
943/// This iterator is constructed via the
944/// [`Automaton::try_find_overlapping_iter`] method.
945///
946/// The type variable `A` refers to the implementation of the [`Automaton`]
947/// trait used to execute the search.
948///
949/// The lifetime `'a` refers to the lifetime of the [`Automaton`]
950/// implementation.
951///
952/// The lifetime `'h` refers to the lifetime of the haystack being searched.
953#[derive(Debug)]
954pub struct FindOverlappingIter<'a, 'h, A> {
955 aut: &'a A,
956 input: Input<'h>,
957 state: OverlappingState,
958}
959
960impl<'a, 'h, A: Automaton> Iterator for FindOverlappingIter<'a, 'h, A> {
961 type Item = Match;
962
963 #[inline(always)]
964 fn next(&mut self) -> Option<Match> {
965 self.aut
966 .try_find_overlapping(&self.input, &mut self.state)
967 .expect("already checked that no match error can occur here");
968 self.state.get_match()
969 }
970}
971
972/// An iterator that reports matches in a stream.
973///
974/// This iterator yields elements of type `io::Result<Match>`, where an error
975/// is reported if there was a problem reading from the underlying stream.
976/// The iterator terminates only when the underlying stream reaches `EOF`.
977///
978/// This iterator is constructed via the [`Automaton::try_stream_find_iter`]
979/// method.
980///
981/// The type variable `A` refers to the implementation of the [`Automaton`]
982/// trait used to execute the search.
983///
984/// The type variable `R` refers to the `io::Read` stream that is being read
985/// from.
986///
987/// The lifetime `'a` refers to the lifetime of the [`Automaton`]
988/// implementation.
989#[cfg(feature = "std")]
990#[derive(Debug)]
991pub struct StreamFindIter<'a, A, R> {
992 it: StreamChunkIter<'a, A, R>,
993}
994
995#[cfg(feature = "std")]
996impl<'a, A: Automaton, R: std::io::Read> Iterator
997 for StreamFindIter<'a, A, R>
998{
999 type Item = std::io::Result<Match>;
1000
1001 fn next(&mut self) -> Option<std::io::Result<Match>> {
1002 loop {
1003 match self.it.next() {
1004 None => return None,
1005 Some(Err(err)) => return Some(Err(err)),
1006 Some(Ok(StreamChunk::NonMatch { .. })) => {}
1007 Some(Ok(StreamChunk::Match { mat, .. })) => {
1008 return Some(Ok(mat));
1009 }
1010 }
1011 }
1012 }
1013}
1014
1015/// An iterator that reports matches in a stream.
1016///
1017/// (This doesn't actually implement the `Iterator` trait because it returns
1018/// something with a lifetime attached to a buffer it owns, but that's OK. It
1019/// still has a `next` method and is iterator-like enough to be fine.)
1020///
1021/// This iterator yields elements of type `io::Result<StreamChunk>`, where
1022/// an error is reported if there was a problem reading from the underlying
1023/// stream. The iterator terminates only when the underlying stream reaches
1024/// `EOF`.
1025///
1026/// The idea here is that each chunk represents either a match or a non-match,
1027/// and if you concatenated all of the chunks together, you'd reproduce the
1028/// entire contents of the stream, byte-for-byte.
1029///
1030/// This chunk machinery is a bit complicated and it isn't strictly required
1031/// for a stream searcher that just reports matches. But we do need something
1032/// like this to deal with the "replacement" API, which needs to know which
1033/// chunks it can copy and which it needs to replace.
1034#[cfg(feature = "std")]
1035#[derive(Debug)]
1036struct StreamChunkIter<'a, A, R> {
1037 /// The underlying automaton to do the search.
1038 aut: &'a A,
1039 /// The source of bytes we read from.
1040 rdr: R,
1041 /// A roll buffer for managing bytes from `rdr`. Basically, this is used
1042 /// to handle the case of a match that is split by two different
1043 /// calls to `rdr.read()`. This isn't strictly needed if all we needed to
1044 /// do was report matches, but here we are reporting chunks of non-matches
1045 /// and matches and in order to do that, we really just cannot treat our
1046 /// stream as non-overlapping blocks of bytes. We need to permit some
1047 /// overlap while we retain bytes from a previous `read` call in memory.
1048 buf: crate::util::buffer::Buffer,
1049 /// The unanchored starting state of this automaton.
1050 start: StateID,
1051 /// The state of the automaton.
1052 sid: StateID,
1053 /// The absolute position over the entire stream.
1054 absolute_pos: usize,
1055 /// The position we're currently at within `buf`.
1056 buffer_pos: usize,
1057 /// The buffer position of the end of the bytes that we last returned
1058 /// to the caller. Basically, whenever we find a match, we look to see if
1059 /// there is a difference between where the match started and the position
1060 /// of the last byte we returned to the caller. If there's a difference,
1061 /// then we need to return a 'NonMatch' chunk.
1062 buffer_reported_pos: usize,
1063}
1064
1065#[cfg(feature = "std")]
1066impl<'a, A: Automaton, R: std::io::Read> StreamChunkIter<'a, A, R> {
1067 fn new(
1068 aut: &'a A,
1069 rdr: R,
1070 ) -> Result<StreamChunkIter<'a, A, R>, MatchError> {
1071 // This restriction is a carry-over from older versions of this crate.
1072 // I didn't have the bandwidth to think through how to handle, say,
1073 // leftmost-first or leftmost-longest matching, but... it should be
1074 // possible? The main problem is that once you see a match state in
1075 // leftmost-first semantics, you can't just stop at that point and
1076 // report a match. You have to keep going until you either hit a dead
1077 // state or EOF. So how do you know when you'll hit a dead state? Well,
1078 // you don't. With Aho-Corasick, I believe you can put a bound on it
1079 // and say, "once a match has been seen, you'll need to scan forward at
1080 // most N bytes" where N=aut.max_pattern_len().
1081 //
1082 // Which is fine, but it does mean that state about whether we're still
1083 // looking for a dead state or not needs to persist across buffer
1084 // refills. Which this code doesn't really handle. It does preserve
1085 // *some* state across buffer refills, basically ensuring that a match
1086 // span is always in memory.
1087 if !aut.match_kind().is_standard() {
1088 return Err(MatchError::unsupported_stream(aut.match_kind()));
1089 }
1090 // This is kind of a cop-out, but empty matches are SUPER annoying.
1091 // If we know they can't happen (which is what we enforce here), then
1092 // it makes a lot of logic much simpler. With that said, I'm open to
1093 // supporting this case, but we need to define proper semantics for it
1094 // first. It wasn't totally clear to me what it should do at the time
1095 // of writing, so I decided to just be conservative.
1096 //
1097 // It also seems like a very weird case to support anyway. Why search a
1098 // stream if you're just going to get a match at every position?
1099 //
1100 // ¯\_(ツ)_/¯
1101 if aut.min_pattern_len() == 0 {
1102 return Err(MatchError::unsupported_empty());
1103 }
1104 let start = aut.start_state(Anchored::No)?;
1105 Ok(StreamChunkIter {
1106 aut,
1107 rdr,
1108 buf: crate::util::buffer::Buffer::new(aut.max_pattern_len()),
1109 start,
1110 sid: start,
1111 absolute_pos: 0,
1112 buffer_pos: 0,
1113 buffer_reported_pos: 0,
1114 })
1115 }
1116
1117 fn next(&mut self) -> Option<std::io::Result<StreamChunk>> {
1118 // This code is pretty gnarly. It IS simpler than the equivalent code
1119 // in the previous aho-corasick release, in part because we inline
1120 // automaton traversal here and also in part because we have abdicated
1121 // support for automatons that contain an empty pattern.
1122 //
1123 // I suspect this code could be made a bit simpler by designing a
1124 // better buffer abstraction.
1125 //
1126 // But in general, this code is basically write-only. So you'll need
1127 // to go through it step-by-step to grok it. One of the key bits of
1128 // complexity is tracking a few different offsets. 'buffer_pos' is
1129 // where we are in the buffer for search. 'buffer_reported_pos' is the
1130 // position immediately following the last byte in the buffer that
1131 // we've returned to the caller. And 'absolute_pos' is the overall
1132 // current absolute position of the search in the entire stream, and
1133 // this is what match spans are reported in terms of.
1134 loop {
1135 if self.aut.is_match(self.sid) {
1136 let mat = self.get_match();
1137 if let Some(r) = self.get_non_match_chunk(mat) {
1138 self.buffer_reported_pos += r.len();
1139 let bytes = &self.buf.buffer()[r];
1140 return Some(Ok(StreamChunk::NonMatch { bytes }));
1141 }
1142 self.sid = self.start;
1143 let r = self.get_match_chunk(mat);
1144 self.buffer_reported_pos += r.len();
1145 let bytes = &self.buf.buffer()[r];
1146 return Some(Ok(StreamChunk::Match { bytes, mat }));
1147 }
1148 if self.buffer_pos >= self.buf.buffer().len() {
1149 if let Some(r) = self.get_pre_roll_non_match_chunk() {
1150 self.buffer_reported_pos += r.len();
1151 let bytes = &self.buf.buffer()[r];
1152 return Some(Ok(StreamChunk::NonMatch { bytes }));
1153 }
1154 if self.buf.buffer().len() >= self.buf.min_buffer_len() {
1155 self.buffer_pos = self.buf.min_buffer_len();
1156 self.buffer_reported_pos -=
1157 self.buf.buffer().len() - self.buf.min_buffer_len();
1158 self.buf.roll();
1159 }
1160 match self.buf.fill(&mut self.rdr) {
1161 Err(err) => return Some(Err(err)),
1162 Ok(true) => {}
1163 Ok(false) => {
1164 // We've hit EOF, but if there are still some
1165 // unreported bytes remaining, return them now.
1166 if let Some(r) = self.get_eof_non_match_chunk() {
1167 self.buffer_reported_pos += r.len();
1168 let bytes = &self.buf.buffer()[r];
1169 return Some(Ok(StreamChunk::NonMatch { bytes }));
1170 }
1171 // We've reported everything!
1172 return None;
1173 }
1174 }
1175 }
1176 let start = self.absolute_pos;
1177 for &byte in self.buf.buffer()[self.buffer_pos..].iter() {
1178 self.sid = self.aut.next_state(Anchored::No, self.sid, byte);
1179 self.absolute_pos += 1;
1180 if self.aut.is_match(self.sid) {
1181 break;
1182 }
1183 }
1184 self.buffer_pos += self.absolute_pos - start;
1185 }
1186 }
1187
1188 /// Return a match chunk for the given match. It is assumed that the match
1189 /// ends at the current `buffer_pos`.
1190 fn get_match_chunk(&self, mat: Match) -> core::ops::Range<usize> {
1191 let start = self.buffer_pos - mat.len();
1192 let end = self.buffer_pos;
1193 start..end
1194 }
1195
1196 /// Return a non-match chunk, if necessary, just before reporting a match.
1197 /// This returns `None` if there is nothing to report. Otherwise, this
1198 /// assumes that the given match ends at the current `buffer_pos`.
1199 fn get_non_match_chunk(
1200 &self,
1201 mat: Match,
1202 ) -> Option<core::ops::Range<usize>> {
1203 let buffer_mat_start = self.buffer_pos - mat.len();
1204 if buffer_mat_start > self.buffer_reported_pos {
1205 let start = self.buffer_reported_pos;
1206 let end = buffer_mat_start;
1207 return Some(start..end);
1208 }
1209 None
1210 }
1211
1212 /// Look for any bytes that should be reported as a non-match just before
1213 /// rolling the buffer.
1214 ///
1215 /// Note that this only reports bytes up to `buffer.len() -
1216 /// min_buffer_len`, as it's not possible to know whether the bytes
1217 /// following that will participate in a match or not.
1218 fn get_pre_roll_non_match_chunk(&self) -> Option<core::ops::Range<usize>> {
1219 let end =
1220 self.buf.buffer().len().saturating_sub(self.buf.min_buffer_len());
1221 if self.buffer_reported_pos < end {
1222 return Some(self.buffer_reported_pos..end);
1223 }
1224 None
1225 }
1226
1227 /// Return any unreported bytes as a non-match up to the end of the buffer.
1228 ///
1229 /// This should only be called when the entire contents of the buffer have
1230 /// been searched and EOF has been hit when trying to fill the buffer.
1231 fn get_eof_non_match_chunk(&self) -> Option<core::ops::Range<usize>> {
1232 if self.buffer_reported_pos < self.buf.buffer().len() {
1233 return Some(self.buffer_reported_pos..self.buf.buffer().len());
1234 }
1235 None
1236 }
1237
1238 /// Return the match at the current position for the current state.
1239 ///
1240 /// This panics if `self.aut.is_match(self.sid)` isn't true.
1241 fn get_match(&self) -> Match {
1242 get_match(self.aut, self.sid, 0, self.absolute_pos)
1243 }
1244}
1245
1246/// A single chunk yielded by the stream chunk iterator.
1247///
1248/// The `'r` lifetime refers to the lifetime of the stream chunk iterator.
1249#[cfg(feature = "std")]
1250#[derive(Debug)]
1251enum StreamChunk<'r> {
1252 /// A chunk that does not contain any matches.
1253 NonMatch { bytes: &'r [u8] },
1254 /// A chunk that precisely contains a match.
1255 Match { bytes: &'r [u8], mat: Match },
1256}
1257
1258#[inline(never)]
1259pub(crate) fn try_find_fwd<A: Automaton + ?Sized>(
1260 aut: &A,
1261 input: &Input<'_>,
1262) -> Result<Option<Match>, MatchError> {
1263 if input.is_done() {
1264 return Ok(None);
1265 }
1266 let earliest = aut.match_kind().is_standard() || input.get_earliest();
1267 if input.get_anchored().is_anchored() {
1268 try_find_fwd_imp(aut, input, None, Anchored::Yes, earliest)
1269 } else if let Some(pre) = aut.prefilter() {
1270 if earliest {
1271 try_find_fwd_imp(aut, input, Some(pre), Anchored::No, true)
1272 } else {
1273 try_find_fwd_imp(aut, input, Some(pre), Anchored::No, false)
1274 }
1275 } else {
1276 if earliest {
1277 try_find_fwd_imp(aut, input, None, Anchored::No, true)
1278 } else {
1279 try_find_fwd_imp(aut, input, None, Anchored::No, false)
1280 }
1281 }
1282}
1283
1284#[inline(always)]
1285fn try_find_fwd_imp<A: Automaton + ?Sized>(
1286 aut: &A,
1287 input: &Input<'_>,
1288 pre: Option<&Prefilter>,
1289 anchored: Anchored,
1290 earliest: bool,
1291) -> Result<Option<Match>, MatchError> {
1292 let mut sid = aut.start_state(input.get_anchored())?;
1293 let mut at = input.start();
1294 let mut mat = None;
1295 if aut.is_match(sid) {
1296 mat = Some(get_match(aut, sid, 0, at));
1297 if earliest {
1298 return Ok(mat);
1299 }
1300 }
1301 if let Some(pre) = pre {
1302 match pre.find_in(input.haystack(), input.get_span()) {
1303 Candidate::None => return Ok(None),
1304 Candidate::Match(m) => return Ok(Some(m)),
1305 Candidate::PossibleStartOfMatch(i) => {
1306 at = i;
1307 }
1308 }
1309 }
1310 while at < input.end() {
1311 // I've tried unrolling this loop and eliding bounds checks, but no
1312 // matter what I did, I could not observe a consistent improvement on
1313 // any benchmark I could devise. (If someone wants to re-litigate this,
1314 // the way to do it is to add an 'next_state_unchecked' method to the
1315 // 'Automaton' trait with a default impl that uses 'next_state'. Then
1316 // use 'aut.next_state_unchecked' here and implement it on DFA using
1317 // unchecked slice index acces.)
1318 sid = aut.next_state(anchored, sid, input.haystack()[at]);
1319 if aut.is_special(sid) {
1320 if aut.is_dead(sid) {
1321 return Ok(mat);
1322 } else if aut.is_match(sid) {
1323 // We use 'at + 1' here because the match state is entered
1324 // at the last byte of the pattern. Since we use half-open
1325 // intervals, the end of the range of the match is one past the
1326 // last byte.
1327 let m = get_match(aut, sid, 0, at + 1);
1328 // For the automata in this crate, we make a size trade off
1329 // where we reuse the same automaton for both anchored and
1330 // unanchored searches. We achieve this, principally, by simply
1331 // not following failure transitions while computing the next
1332 // state. Instead, if we fail to find the next state, we return
1333 // a dead state, which instructs the search to stop. (This
1334 // is why 'next_state' needs to know whether the search is
1335 // anchored or not.) In addition, we have different start
1336 // states for anchored and unanchored searches. The latter has
1337 // a self-loop where as the former does not.
1338 //
1339 // In this way, we can use the same trie to execute both
1340 // anchored and unanchored searches. There is a catch though.
1341 // When building an Aho-Corasick automaton for unanchored
1342 // searches, we copy matches from match states to other states
1343 // (which would otherwise not be match states) if they are
1344 // reachable via a failure transition. In the case of an
1345 // anchored search, we *specifically* do not want to report
1346 // these matches because they represent matches that start past
1347 // the beginning of the search.
1348 //
1349 // Now we could tweak the automaton somehow to differentiate
1350 // anchored from unanchored match states, but this would make
1351 // 'aut.is_match' and potentially 'aut.is_special' slower. And
1352 // also make the automaton itself more complex.
1353 //
1354 // Instead, we insert a special hack: if the search is
1355 // anchored, we simply ignore matches that don't begin at
1356 // the start of the search. This is not quite ideal, but we
1357 // do specialize this function in such a way that unanchored
1358 // searches don't pay for this additional branch. While this
1359 // might cause a search to continue on for more than it
1360 // otherwise optimally would, it will be no more than the
1361 // longest pattern in the automaton. The reason for this is
1362 // that we ensure we don't follow failure transitions during
1363 // an anchored search. Combined with using a different anchored
1364 // starting state with no self-loop, we guarantee that we'll
1365 // at worst move through a number of transitions equal to the
1366 // longest pattern.
1367 //
1368 // Now for DFAs, the whole point of them is to eliminate
1369 // failure transitions entirely. So there is no way to say "if
1370 // it's an anchored search don't follow failure transitions."
1371 // Instead, we actually have to build two entirely separate
1372 // automatons into the transition table. One with failure
1373 // transitions built into it and another that is effectively
1374 // just an encoding of the base trie into a transition table.
1375 // DFAs still need this check though, because the match states
1376 // still carry matches only reachable via a failure transition.
1377 // Why? Because removing them seems difficult, although I
1378 // haven't given it a lot of thought.
1379 if !(anchored.is_anchored() && m.start() > input.start()) {
1380 mat = Some(m);
1381 if earliest {
1382 return Ok(mat);
1383 }
1384 }
1385 } else if let Some(pre) = pre {
1386 // If we're here, we know it's a special state that is not a
1387 // dead or a match state AND that a prefilter is active. Thus,
1388 // it must be a start state.
1389 debug_assert!(aut.is_start(sid));
1390 // We don't care about 'Candidate::Match' here because if such
1391 // a match were possible, it would have been returned above
1392 // when we run the prefilter before walking the automaton.
1393 let span = Span::from(at..input.end());
1394 match pre.find_in(input.haystack(), span).into_option() {
1395 None => return Ok(None),
1396 Some(i) => {
1397 if i > at {
1398 at = i;
1399 continue;
1400 }
1401 }
1402 }
1403 } else {
1404 // When pre.is_none(), then starting states should not be
1405 // treated as special. That is, without a prefilter, is_special
1406 // should only return true when the state is a dead or a match
1407 // state.
1408 //
1409 // It is possible to execute a search without a prefilter even
1410 // when the underlying searcher has one: an anchored search.
1411 // But in this case, the automaton makes it impossible to move
1412 // back to the start state by construction, and thus, we should
1413 // never reach this branch.
1414 debug_assert!(false, "unreachable");
1415 }
1416 }
1417 at += 1;
1418 }
1419 Ok(mat)
1420}
1421
1422#[inline(never)]
1423fn try_find_overlapping_fwd<A: Automaton + ?Sized>(
1424 aut: &A,
1425 input: &Input<'_>,
1426 state: &mut OverlappingState,
1427) -> Result<(), MatchError> {
1428 state.mat = None;
1429 if input.is_done() {
1430 return Ok(());
1431 }
1432 // Searching with a pattern ID is always anchored, so we should only ever
1433 // use a prefilter when no pattern ID is given.
1434 if aut.prefilter().is_some() && !input.get_anchored().is_anchored() {
1435 let pre = aut.prefilter().unwrap();
1436 try_find_overlapping_fwd_imp(aut, input, Some(pre), state)
1437 } else {
1438 try_find_overlapping_fwd_imp(aut, input, None, state)
1439 }
1440}
1441
1442#[inline(always)]
1443fn try_find_overlapping_fwd_imp<A: Automaton + ?Sized>(
1444 aut: &A,
1445 input: &Input<'_>,
1446 pre: Option<&Prefilter>,
1447 state: &mut OverlappingState,
1448) -> Result<(), MatchError> {
1449 let mut sid = match state.id {
1450 None => {
1451 let sid = aut.start_state(input.get_anchored())?;
1452 // Handle the case where the start state is a match state. That is,
1453 // the empty string is in our automaton. We report every match we
1454 // can here before moving on and updating 'state.at' and 'state.id'
1455 // to find more matches in other parts of the haystack.
1456 if aut.is_match(sid) {
1457 let i = state.next_match_index.unwrap_or(0);
1458 let len = aut.match_len(sid);
1459 if i < len {
1460 state.next_match_index = Some(i + 1);
1461 state.mat = Some(get_match(aut, sid, i, input.start()));
1462 return Ok(());
1463 }
1464 }
1465 state.at = input.start();
1466 state.id = Some(sid);
1467 state.next_match_index = None;
1468 state.mat = None;
1469 sid
1470 }
1471 Some(sid) => {
1472 // If we still have matches left to report in this state then
1473 // report them until we've exhausted them. Only after that do we
1474 // advance to the next offset in the haystack.
1475 if let Some(i) = state.next_match_index {
1476 let len = aut.match_len(sid);
1477 if i < len {
1478 state.next_match_index = Some(i + 1);
1479 state.mat = Some(get_match(aut, sid, i, state.at + 1));
1480 return Ok(());
1481 }
1482 // Once we've reported all matches at a given position, we need
1483 // to advance the search to the next position.
1484 state.at += 1;
1485 state.next_match_index = None;
1486 state.mat = None;
1487 }
1488 sid
1489 }
1490 };
1491 while state.at < input.end() {
1492 sid = aut.next_state(
1493 input.get_anchored(),
1494 sid,
1495 input.haystack()[state.at],
1496 );
1497 if aut.is_special(sid) {
1498 state.id = Some(sid);
1499 if aut.is_dead(sid) {
1500 return Ok(());
1501 } else if aut.is_match(sid) {
1502 state.next_match_index = Some(1);
1503 state.mat = Some(get_match(aut, sid, 0, state.at + 1));
1504 return Ok(());
1505 } else if let Some(pre) = pre {
1506 // If we're here, we know it's a special state that is not a
1507 // dead or a match state AND that a prefilter is active. Thus,
1508 // it must be a start state.
1509 debug_assert!(aut.is_start(sid));
1510 let span = Span::from(state.at..input.end());
1511 match pre.find_in(input.haystack(), span).into_option() {
1512 None => return Ok(()),
1513 Some(i) => {
1514 if i > state.at {
1515 state.at = i;
1516 continue;
1517 }
1518 }
1519 }
1520 } else {
1521 // When pre.is_none(), then starting states should not be
1522 // treated as special. That is, without a prefilter, is_special
1523 // should only return true when the state is a dead or a match
1524 // state.
1525 //
1526 // ... except for one special case: in stream searching, we
1527 // currently call overlapping search with a 'None' prefilter,
1528 // regardless of whether one exists or not, because stream
1529 // searching can't currently deal with prefilters correctly in
1530 // all cases.
1531 }
1532 }
1533 state.at += 1;
1534 }
1535 state.id = Some(sid);
1536 Ok(())
1537}
1538
1539#[inline(always)]
1540fn get_match<A: Automaton + ?Sized>(
1541 aut: &A,
1542 sid: StateID,
1543 index: usize,
1544 at: usize,
1545) -> Match {
1546 let pid = aut.match_pattern(sid, index);
1547 let len = aut.pattern_len(pid);
1548 Match::new(pid, (at - len)..at)
1549}
1550
1551/// Write a prefix "state" indicator for fmt::Debug impls. It always writes
1552/// exactly two printable bytes to the given formatter.
1553///
1554/// Specifically, this tries to succinctly distinguish the different types of
1555/// states: dead states, start states and match states. It even accounts for
1556/// the possible overlappings of different state types. (The only possible
1557/// overlapping is that of match and start states.)
1558pub(crate) fn fmt_state_indicator<A: Automaton>(
1559 f: &mut core::fmt::Formatter<'_>,
1560 aut: A,
1561 id: StateID,
1562) -> core::fmt::Result {
1563 if aut.is_dead(id) {
1564 write!(f, "D ")?;
1565 } else if aut.is_match(id) {
1566 if aut.is_start(id) {
1567 write!(f, "*>")?;
1568 } else {
1569 write!(f, "* ")?;
1570 }
1571 } else if aut.is_start(id) {
1572 write!(f, " >")?;
1573 } else {
1574 write!(f, " ")?;
1575 }
1576 Ok(())
1577}
1578
1579/// Return an iterator of transitions in a sparse format given an iterator
1580/// of all explicitly defined transitions. The iterator yields ranges of
1581/// transitions, such that any adjacent transitions mapped to the same
1582/// state are combined into a single range.
1583pub(crate) fn sparse_transitions<'a>(
1584 mut it: impl Iterator<Item = (u8, StateID)> + 'a,
1585) -> impl Iterator<Item = (u8, u8, StateID)> + 'a {
1586 let mut cur: Option<(u8, u8, StateID)> = None;
1587 core::iter::from_fn(move || {
1588 while let Some((class, next)) = it.next() {
1589 let (prev_start, prev_end, prev_next) = match cur {
1590 Some(x) => x,
1591 None => {
1592 cur = Some((class, class, next));
1593 continue;
1594 }
1595 };
1596 if prev_next == next {
1597 cur = Some((prev_start, class, prev_next));
1598 } else {
1599 cur = Some((class, class, next));
1600 return Some((prev_start, prev_end, prev_next));
1601 }
1602 }
1603 if let Some((start, end, next)) = cur.take() {
1604 return Some((start, end, next));
1605 }
1606 None
1607 })
1608}
1609