1use core::{
2 fmt::Debug,
3 panic::{RefUnwindSafe, UnwindSafe},
4};
5
6use alloc::{string::String, sync::Arc, vec::Vec};
7
8use crate::{
9 automaton::{self, Automaton, OverlappingState},
10 dfa,
11 nfa::{contiguous, noncontiguous},
12 util::{
13 error::{BuildError, MatchError},
14 prefilter::Prefilter,
15 primitives::{PatternID, StateID},
16 search::{Anchored, Input, Match, MatchKind, StartKind},
17 },
18};
19
20/// An automaton for searching multiple strings in linear time.
21///
22/// The `AhoCorasick` type supports a few basic ways of constructing an
23/// automaton, with the default being [`AhoCorasick::new`]. However, there
24/// are a fair number of configurable options that can be set by using
25/// [`AhoCorasickBuilder`] instead. Such options include, but are not limited
26/// to, how matches are determined, simple case insensitivity, whether to use a
27/// DFA or not and various knobs for controlling the space-vs-time trade offs
28/// taken when building the automaton.
29///
30/// # Resource usage
31///
32/// Aho-Corasick automatons are always constructed in `O(p)` time, where
33/// `p` is the combined length of all patterns being searched. With that
34/// said, building an automaton can be fairly costly because of high constant
35/// factors, particularly when enabling the [DFA](AhoCorasickKind::DFA) option
36/// with [`AhoCorasickBuilder::kind`]. For this reason, it's generally a good
37/// idea to build an automaton once and reuse it as much as possible.
38///
39/// Aho-Corasick automatons can also use a fair bit of memory. To get
40/// a concrete idea of how much memory is being used, try using the
41/// [`AhoCorasick::memory_usage`] method.
42///
43/// To give a quick idea of the differences between Aho-Corasick
44/// implementations and their resource usage, here's a sample of construction
45/// times and heap memory used after building an automaton from 100,000
46/// randomly selected titles from Wikipedia:
47///
48/// * 99MB for a [`noncontiguous::NFA`] in 240ms.
49/// * 21MB for a [`contiguous::NFA`] in 275ms.
50/// * 1.6GB for a [`dfa::DFA`] in 1.88s.
51///
52/// (Note that the memory usage above reflects the size of each automaton and
53/// not peak memory usage. For example, building a contiguous NFA requires
54/// first building a noncontiguous NFA. Once the contiguous NFA is built, the
55/// noncontiguous NFA is freed.)
56///
57/// This experiment very strongly argues that a contiguous NFA is often the
58/// best balance in terms of resource usage. It takes a little longer to build,
59/// but its memory usage is quite small. Its search speed (not listed) is
60/// also often faster than a noncontiguous NFA, but a little slower than a
61/// DFA. Indeed, when no specific [`AhoCorasickKind`] is used (which is the
62/// default), a contiguous NFA is used in most cases.
63///
64/// The only "catch" to using a contiguous NFA is that, because of its variety
65/// of compression tricks, it may not be able to support automatons as large as
66/// what the noncontiguous NFA supports. In which case, building a contiguous
67/// NFA will fail and (by default) `AhoCorasick` will automatically fall
68/// back to a noncontiguous NFA. (This typically only happens when building
69/// automatons from millions of patterns.) Otherwise, the small additional time
70/// for building a contiguous NFA is almost certainly worth it.
71///
72/// # Cloning
73///
74/// The `AhoCorasick` type uses thread safe reference counting internally. It
75/// is guaranteed that it is cheap to clone.
76///
77/// # Search configuration
78///
79/// Most of the search routines accept anything that can be cheaply converted
80/// to an [`Input`]. This includes `&[u8]`, `&str` and `Input` itself.
81///
82/// # Construction failure
83///
84/// It is generally possible for building an Aho-Corasick automaton to fail.
85/// Construction can fail in generally one way: when the inputs provided are
86/// too big. Whether that's a pattern that is too long, too many patterns
87/// or some combination of both. A first approximation for the scale at which
88/// construction can fail is somewhere around "millions of patterns."
89///
90/// For that reason, if you're building an Aho-Corasick automaton from
91/// untrusted input (or input that doesn't have any reasonable bounds on its
92/// size), then it is strongly recommended to handle the possibility of an
93/// error.
94///
95/// If you're constructing an Aho-Corasick automaton from static or trusted
96/// data, then it is likely acceptable to panic (by calling `unwrap()` or
97/// `expect()`) if construction fails.
98///
99/// # Fallibility
100///
101/// The `AhoCorasick` type provides a number of methods for searching, as one
102/// might expect. Depending on how the Aho-Corasick automaton was built and
103/// depending on the search configuration, it is possible for a search to
104/// return an error. Since an error is _never_ dependent on the actual contents
105/// of the haystack, this type provides both infallible and fallible methods
106/// for searching. The infallible methods panic if an error occurs, and can be
107/// used for convenience and when you know the search will never return an
108/// error.
109///
110/// For example, the [`AhoCorasick::find_iter`] method is the infallible
111/// version of the [`AhoCorasick::try_find_iter`] method.
112///
113/// Examples of errors that can occur:
114///
115/// * Running a search that requires [`MatchKind::Standard`] semantics (such
116/// as a stream or overlapping search) with an automaton that was built with
117/// [`MatchKind::LeftmostFirst`] or [`MatchKind::LeftmostLongest`] semantics.
118/// * Running an anchored search with an automaton that only supports
119/// unanchored searches. (By default, `AhoCorasick` only supports unanchored
120/// searches. But this can be toggled with [`AhoCorasickBuilder::start_kind`].)
121/// * Running an unanchored search with an automaton that only supports
122/// anchored searches.
123///
124/// The common thread between the different types of errors is that they are
125/// all rooted in the automaton construction and search configurations. If
126/// those configurations are a static property of your program, then it is
127/// reasonable to call infallible routines since you know an error will never
128/// occur. And if one _does_ occur, then it's a bug in your program.
129///
130/// To re-iterate, if the patterns, build or search configuration come from
131/// user or untrusted data, then you should handle errors at build or search
132/// time. If only the haystack comes from user or untrusted data, then there
133/// should be no need to handle errors anywhere and it is generally encouraged
134/// to `unwrap()` (or `expect()`) both build and search time calls.
135///
136/// # Examples
137///
138/// This example shows how to search for occurrences of multiple patterns
139/// simultaneously in a case insensitive fashion. Each match includes the
140/// pattern that matched along with the byte offsets of the match.
141///
142/// ```
143/// use aho_corasick::{AhoCorasick, PatternID};
144///
145/// let patterns = &["apple", "maple", "snapple"];
146/// let haystack = "Nobody likes maple in their apple flavored Snapple.";
147///
148/// let ac = AhoCorasick::builder()
149/// .ascii_case_insensitive(true)
150/// .build(patterns)
151/// .unwrap();
152/// let mut matches = vec![];
153/// for mat in ac.find_iter(haystack) {
154/// matches.push((mat.pattern(), mat.start(), mat.end()));
155/// }
156/// assert_eq!(matches, vec![
157/// (PatternID::must(1), 13, 18),
158/// (PatternID::must(0), 28, 33),
159/// (PatternID::must(2), 43, 50),
160/// ]);
161/// ```
162///
163/// This example shows how to replace matches with some other string:
164///
165/// ```
166/// use aho_corasick::AhoCorasick;
167///
168/// let patterns = &["fox", "brown", "quick"];
169/// let haystack = "The quick brown fox.";
170/// let replace_with = &["sloth", "grey", "slow"];
171///
172/// let ac = AhoCorasick::new(patterns).unwrap();
173/// let result = ac.replace_all(haystack, replace_with);
174/// assert_eq!(result, "The slow grey sloth.");
175/// ```
176#[derive(Clone)]
177pub struct AhoCorasick {
178 /// The underlying Aho-Corasick automaton. It's one of
179 /// nfa::noncontiguous::NFA, nfa::contiguous::NFA or dfa::DFA.
180 aut: Arc<dyn AcAutomaton>,
181 /// The specific Aho-Corasick kind chosen. This makes it possible to
182 /// inspect any `AhoCorasick` and know what kind of search strategy it
183 /// uses.
184 kind: AhoCorasickKind,
185 /// The start kind of this automaton as configured by the caller.
186 ///
187 /// We don't really *need* to put this here, since the underlying automaton
188 /// will correctly return errors if the caller requests an unsupported
189 /// search type. But we do keep this here for API behavior consistency.
190 /// Namely, the NFAs in this crate support both unanchored and anchored
191 /// searches unconditionally. There's no way to disable one or the other.
192 /// They always both work. But the DFA in this crate specifically only
193 /// supports both unanchored and anchored searches if it's configured to
194 /// do so. Why? Because for the DFA, supporting both essentially requires
195 /// two copies of the transition table: one generated by following failure
196 /// transitions from the original NFA and one generated by not following
197 /// those failure transitions.
198 ///
199 /// So why record the start kind here? Well, consider what happens
200 /// when no specific 'AhoCorasickKind' is selected by the caller and
201 /// 'StartKind::Unanchored' is used (both are the default). It *might*
202 /// result in using a DFA or it might pick an NFA. If it picks an NFA, the
203 /// caller would then be able to run anchored searches, even though the
204 /// caller only asked for support for unanchored searches. Maybe that's
205 /// fine, but what if the DFA was chosen instead? Oops, the caller would
206 /// get an error.
207 ///
208 /// Basically, it seems bad to return an error or not based on some
209 /// internal implementation choice. So we smooth things out and ensure
210 /// anchored searches *always* report an error when only unanchored support
211 /// was asked for (and vice versa), even if the underlying automaton
212 /// supports it.
213 start_kind: StartKind,
214}
215
216/// Convenience constructors for an Aho-Corasick searcher. To configure the
217/// searcher, use an [`AhoCorasickBuilder`] instead.
218impl AhoCorasick {
219 /// Create a new Aho-Corasick automaton using the default configuration.
220 ///
221 /// The default configuration optimizes for less space usage, but at the
222 /// expense of longer search times. To change the configuration, use
223 /// [`AhoCorasickBuilder`].
224 ///
225 /// This uses the default [`MatchKind::Standard`] match semantics, which
226 /// reports a match as soon as it is found. This corresponds to the
227 /// standard match semantics supported by textbook descriptions of the
228 /// Aho-Corasick algorithm.
229 ///
230 /// # Examples
231 ///
232 /// Basic usage:
233 ///
234 /// ```
235 /// use aho_corasick::{AhoCorasick, PatternID};
236 ///
237 /// let ac = AhoCorasick::new(&["foo", "bar", "baz"]).unwrap();
238 /// assert_eq!(
239 /// Some(PatternID::must(1)),
240 /// ac.find("xxx bar xxx").map(|m| m.pattern()),
241 /// );
242 /// ```
243 pub fn new<I, P>(patterns: I) -> Result<AhoCorasick, BuildError>
244 where
245 I: IntoIterator<Item = P>,
246 P: AsRef<[u8]>,
247 {
248 AhoCorasickBuilder::new().build(patterns)
249 }
250
251 /// A convenience method for returning a new Aho-Corasick builder.
252 ///
253 /// This usually permits one to just import the `AhoCorasick` type.
254 ///
255 /// # Examples
256 ///
257 /// Basic usage:
258 ///
259 /// ```
260 /// use aho_corasick::{AhoCorasick, Match, MatchKind};
261 ///
262 /// let ac = AhoCorasick::builder()
263 /// .match_kind(MatchKind::LeftmostFirst)
264 /// .build(&["samwise", "sam"])
265 /// .unwrap();
266 /// assert_eq!(Some(Match::must(0, 0..7)), ac.find("samwise"));
267 /// ```
268 pub fn builder() -> AhoCorasickBuilder {
269 AhoCorasickBuilder::new()
270 }
271}
272
273/// Infallible search routines. These APIs panic when the underlying search
274/// would otherwise fail. Infallible routines are useful because the errors are
275/// a result of both search-time configuration and what configuration is used
276/// to build the Aho-Corasick searcher. Both of these things are not usually
277/// the result of user input, and thus, an error is typically indicative of a
278/// programmer error. In cases where callers want errors instead of panics, use
279/// the corresponding `try` method in the section below.
280impl AhoCorasick {
281 /// Returns true if and only if this automaton matches the haystack at any
282 /// position.
283 ///
284 /// `input` may be any type that is cheaply convertible to an `Input`. This
285 /// includes, but is not limited to, `&str` and `&[u8]`.
286 ///
287 /// Aside from convenience, when `AhoCorasick` was built with
288 /// leftmost-first or leftmost-longest semantics, this might result in a
289 /// search that visits less of the haystack than [`AhoCorasick::find`]
290 /// would otherwise. (For standard semantics, matches are always
291 /// immediately returned once they are seen, so there is no way for this to
292 /// do less work in that case.)
293 ///
294 /// Note that there is no corresponding fallible routine for this method.
295 /// If you need a fallible version of this, then [`AhoCorasick::try_find`]
296 /// can be used with [`Input::earliest`] enabled.
297 ///
298 /// # Examples
299 ///
300 /// Basic usage:
301 ///
302 /// ```
303 /// use aho_corasick::AhoCorasick;
304 ///
305 /// let ac = AhoCorasick::new(&[
306 /// "foo", "bar", "quux", "baz",
307 /// ]).unwrap();
308 /// assert!(ac.is_match("xxx bar xxx"));
309 /// assert!(!ac.is_match("xxx qux xxx"));
310 /// ```
311 pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
312 self.aut
313 .try_find(&input.into().earliest(true))
314 .expect("AhoCorasick::try_find is not expected to fail")
315 .is_some()
316 }
317
318 /// Returns the location of the first match according to the match
319 /// semantics that this automaton was constructed with.
320 ///
321 /// `input` may be any type that is cheaply convertible to an `Input`. This
322 /// includes, but is not limited to, `&str` and `&[u8]`.
323 ///
324 /// This is the infallible version of [`AhoCorasick::try_find`].
325 ///
326 /// # Panics
327 ///
328 /// This panics when [`AhoCorasick::try_find`] would return an error.
329 ///
330 /// # Examples
331 ///
332 /// Basic usage, with standard semantics:
333 ///
334 /// ```
335 /// use aho_corasick::{AhoCorasick, MatchKind};
336 ///
337 /// let patterns = &["b", "abc", "abcd"];
338 /// let haystack = "abcd";
339 ///
340 /// let ac = AhoCorasick::builder()
341 /// .match_kind(MatchKind::Standard) // default, not necessary
342 /// .build(patterns)
343 /// .unwrap();
344 /// let mat = ac.find(haystack).expect("should have a match");
345 /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
346 /// ```
347 ///
348 /// Now with leftmost-first semantics:
349 ///
350 /// ```
351 /// use aho_corasick::{AhoCorasick, MatchKind};
352 ///
353 /// let patterns = &["b", "abc", "abcd"];
354 /// let haystack = "abcd";
355 ///
356 /// let ac = AhoCorasick::builder()
357 /// .match_kind(MatchKind::LeftmostFirst)
358 /// .build(patterns)
359 /// .unwrap();
360 /// let mat = ac.find(haystack).expect("should have a match");
361 /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
362 /// ```
363 ///
364 /// And finally, leftmost-longest semantics:
365 ///
366 /// ```
367 /// use aho_corasick::{AhoCorasick, MatchKind};
368 ///
369 /// let patterns = &["b", "abc", "abcd"];
370 /// let haystack = "abcd";
371 ///
372 /// let ac = AhoCorasick::builder()
373 /// .match_kind(MatchKind::LeftmostLongest)
374 /// .build(patterns)
375 /// .unwrap();
376 /// let mat = ac.find(haystack).expect("should have a match");
377 /// ```
378 ///
379 /// # Example: configuring a search
380 ///
381 /// Because this method accepts anything that can be turned into an
382 /// [`Input`], it's possible to provide an `Input` directly in order to
383 /// configure the search. In this example, we show how to use the
384 /// `earliest` option to force the search to return as soon as it knows
385 /// a match has occurred.
386 ///
387 /// ```
388 /// use aho_corasick::{AhoCorasick, Input, MatchKind};
389 ///
390 /// let patterns = &["b", "abc", "abcd"];
391 /// let haystack = "abcd";
392 ///
393 /// let ac = AhoCorasick::builder()
394 /// .match_kind(MatchKind::LeftmostLongest)
395 /// .build(patterns)
396 /// .unwrap();
397 /// let mat = ac.find(Input::new(haystack).earliest(true))
398 /// .expect("should have a match");
399 /// // The correct leftmost-longest match here is 'abcd', but since we
400 /// // told the search to quit as soon as it knows a match has occurred,
401 /// // we get a different match back.
402 /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
403 /// ```
404 pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
405 self.try_find(input)
406 .expect("AhoCorasick::try_find is not expected to fail")
407 }
408
409 /// Returns the location of the first overlapping match in the given
410 /// input with respect to the current state of the underlying searcher.
411 ///
412 /// `input` may be any type that is cheaply convertible to an `Input`. This
413 /// includes, but is not limited to, `&str` and `&[u8]`.
414 ///
415 /// Overlapping searches do not report matches in their return value.
416 /// Instead, matches can be accessed via [`OverlappingState::get_match`]
417 /// after a search call.
418 ///
419 /// This is the infallible version of
420 /// [`AhoCorasick::try_find_overlapping`].
421 ///
422 /// # Panics
423 ///
424 /// This panics when [`AhoCorasick::try_find_overlapping`] would
425 /// return an error. For example, when the Aho-Corasick searcher
426 /// doesn't support overlapping searches. (Only searchers built with
427 /// [`MatchKind::Standard`] semantics support overlapping searches.)
428 ///
429 /// # Example
430 ///
431 /// This shows how we can repeatedly call an overlapping search without
432 /// ever needing to explicitly re-slice the haystack. Overlapping search
433 /// works this way because searches depend on state saved during the
434 /// previous search.
435 ///
436 /// ```
437 /// use aho_corasick::{
438 /// automaton::OverlappingState,
439 /// AhoCorasick, Input, Match,
440 /// };
441 ///
442 /// let patterns = &["append", "appendage", "app"];
443 /// let haystack = "append the app to the appendage";
444 ///
445 /// let ac = AhoCorasick::new(patterns).unwrap();
446 /// let mut state = OverlappingState::start();
447 ///
448 /// ac.find_overlapping(haystack, &mut state);
449 /// assert_eq!(Some(Match::must(2, 0..3)), state.get_match());
450 ///
451 /// ac.find_overlapping(haystack, &mut state);
452 /// assert_eq!(Some(Match::must(0, 0..6)), state.get_match());
453 ///
454 /// ac.find_overlapping(haystack, &mut state);
455 /// assert_eq!(Some(Match::must(2, 11..14)), state.get_match());
456 ///
457 /// ac.find_overlapping(haystack, &mut state);
458 /// assert_eq!(Some(Match::must(2, 22..25)), state.get_match());
459 ///
460 /// ac.find_overlapping(haystack, &mut state);
461 /// assert_eq!(Some(Match::must(0, 22..28)), state.get_match());
462 ///
463 /// ac.find_overlapping(haystack, &mut state);
464 /// assert_eq!(Some(Match::must(1, 22..31)), state.get_match());
465 ///
466 /// // No more match matches to be found.
467 /// ac.find_overlapping(haystack, &mut state);
468 /// assert_eq!(None, state.get_match());
469 /// ```
470 pub fn find_overlapping<'h, I: Into<Input<'h>>>(
471 &self,
472 input: I,
473 state: &mut OverlappingState,
474 ) {
475 self.try_find_overlapping(input, state).expect(
476 "AhoCorasick::try_find_overlapping is not expected to fail",
477 )
478 }
479
480 /// Returns an iterator of non-overlapping matches, using the match
481 /// semantics that this automaton was constructed with.
482 ///
483 /// `input` may be any type that is cheaply convertible to an `Input`. This
484 /// includes, but is not limited to, `&str` and `&[u8]`.
485 ///
486 /// This is the infallible version of [`AhoCorasick::try_find_iter`].
487 ///
488 /// # Panics
489 ///
490 /// This panics when [`AhoCorasick::try_find_iter`] would return an error.
491 ///
492 /// # Examples
493 ///
494 /// Basic usage, with standard semantics:
495 ///
496 /// ```
497 /// use aho_corasick::{AhoCorasick, MatchKind, PatternID};
498 ///
499 /// let patterns = &["append", "appendage", "app"];
500 /// let haystack = "append the app to the appendage";
501 ///
502 /// let ac = AhoCorasick::builder()
503 /// .match_kind(MatchKind::Standard) // default, not necessary
504 /// .build(patterns)
505 /// .unwrap();
506 /// let matches: Vec<PatternID> = ac
507 /// .find_iter(haystack)
508 /// .map(|mat| mat.pattern())
509 /// .collect();
510 /// assert_eq!(vec![
511 /// PatternID::must(2),
512 /// PatternID::must(2),
513 /// PatternID::must(2),
514 /// ], matches);
515 /// ```
516 ///
517 /// Now with leftmost-first semantics:
518 ///
519 /// ```
520 /// use aho_corasick::{AhoCorasick, MatchKind, PatternID};
521 ///
522 /// let patterns = &["append", "appendage", "app"];
523 /// let haystack = "append the app to the appendage";
524 ///
525 /// let ac = AhoCorasick::builder()
526 /// .match_kind(MatchKind::LeftmostFirst)
527 /// .build(patterns)
528 /// .unwrap();
529 /// let matches: Vec<PatternID> = ac
530 /// .find_iter(haystack)
531 /// .map(|mat| mat.pattern())
532 /// .collect();
533 /// assert_eq!(vec![
534 /// PatternID::must(0),
535 /// PatternID::must(2),
536 /// PatternID::must(0),
537 /// ], matches);
538 /// ```
539 ///
540 /// And finally, leftmost-longest semantics:
541 ///
542 /// ```
543 /// use aho_corasick::{AhoCorasick, MatchKind, PatternID};
544 ///
545 /// let patterns = &["append", "appendage", "app"];
546 /// let haystack = "append the app to the appendage";
547 ///
548 /// let ac = AhoCorasick::builder()
549 /// .match_kind(MatchKind::LeftmostLongest)
550 /// .build(patterns)
551 /// .unwrap();
552 /// let matches: Vec<PatternID> = ac
553 /// .find_iter(haystack)
554 /// .map(|mat| mat.pattern())
555 /// .collect();
556 /// assert_eq!(vec![
557 /// PatternID::must(0),
558 /// PatternID::must(2),
559 /// PatternID::must(1),
560 /// ], matches);
561 /// ```
562 pub fn find_iter<'a, 'h, I: Into<Input<'h>>>(
563 &'a self,
564 input: I,
565 ) -> FindIter<'a, 'h> {
566 self.try_find_iter(input)
567 .expect("AhoCorasick::try_find_iter is not expected to fail")
568 }
569
570 /// Returns an iterator of overlapping matches. Stated differently, this
571 /// returns an iterator of all possible matches at every position.
572 ///
573 /// `input` may be any type that is cheaply convertible to an `Input`. This
574 /// includes, but is not limited to, `&str` and `&[u8]`.
575 ///
576 /// This is the infallible version of
577 /// [`AhoCorasick::try_find_overlapping_iter`].
578 ///
579 /// # Panics
580 ///
581 /// This panics when `AhoCorasick::try_find_overlapping_iter` would return
582 /// an error. For example, when the Aho-Corasick searcher is built with
583 /// either leftmost-first or leftmost-longest match semantics. Stated
584 /// differently, overlapping searches require one to build the searcher
585 /// with [`MatchKind::Standard`] (it is the default).
586 ///
587 /// # Example: basic usage
588 ///
589 /// ```
590 /// use aho_corasick::{AhoCorasick, PatternID};
591 ///
592 /// let patterns = &["append", "appendage", "app"];
593 /// let haystack = "append the app to the appendage";
594 ///
595 /// let ac = AhoCorasick::new(patterns).unwrap();
596 /// let matches: Vec<PatternID> = ac
597 /// .find_overlapping_iter(haystack)
598 /// .map(|mat| mat.pattern())
599 /// .collect();
600 /// assert_eq!(vec![
601 /// PatternID::must(2),
602 /// PatternID::must(0),
603 /// PatternID::must(2),
604 /// PatternID::must(2),
605 /// PatternID::must(0),
606 /// PatternID::must(1),
607 /// ], matches);
608 /// ```
609 pub fn find_overlapping_iter<'a, 'h, I: Into<Input<'h>>>(
610 &'a self,
611 input: I,
612 ) -> FindOverlappingIter<'a, 'h> {
613 self.try_find_overlapping_iter(input).expect(
614 "AhoCorasick::try_find_overlapping_iter is not expected to fail",
615 )
616 }
617
618 /// Replace all matches with a corresponding value in the `replace_with`
619 /// slice given. Matches correspond to the same matches as reported by
620 /// [`AhoCorasick::find_iter`].
621 ///
622 /// Replacements are determined by the index of the matching pattern.
623 /// For example, if the pattern with index `2` is found, then it is
624 /// replaced by `replace_with[2]`.
625 ///
626 /// This is the infallible version of [`AhoCorasick::try_replace_all`].
627 ///
628 /// # Panics
629 ///
630 /// This panics when [`AhoCorasick::try_replace_all`] would return an
631 /// error.
632 ///
633 /// This also panics when `replace_with.len()` does not equal
634 /// [`AhoCorasick::patterns_len`].
635 ///
636 /// # Example: basic usage
637 ///
638 /// ```
639 /// use aho_corasick::{AhoCorasick, MatchKind};
640 ///
641 /// let patterns = &["append", "appendage", "app"];
642 /// let haystack = "append the app to the appendage";
643 ///
644 /// let ac = AhoCorasick::builder()
645 /// .match_kind(MatchKind::LeftmostFirst)
646 /// .build(patterns)
647 /// .unwrap();
648 /// let result = ac.replace_all(haystack, &["x", "y", "z"]);
649 /// assert_eq!("x the z to the xage", result);
650 /// ```
651 pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String
652 where
653 B: AsRef<str>,
654 {
655 self.try_replace_all(haystack, replace_with)
656 .expect("AhoCorasick::try_replace_all is not expected to fail")
657 }
658
659 /// Replace all matches using raw bytes with a corresponding value in the
660 /// `replace_with` slice given. Matches correspond to the same matches as
661 /// reported by [`AhoCorasick::find_iter`].
662 ///
663 /// Replacements are determined by the index of the matching pattern.
664 /// For example, if the pattern with index `2` is found, then it is
665 /// replaced by `replace_with[2]`.
666 ///
667 /// This is the infallible version of
668 /// [`AhoCorasick::try_replace_all_bytes`].
669 ///
670 /// # Panics
671 ///
672 /// This panics when [`AhoCorasick::try_replace_all_bytes`] would return an
673 /// error.
674 ///
675 /// This also panics when `replace_with.len()` does not equal
676 /// [`AhoCorasick::patterns_len`].
677 ///
678 /// # Example: basic usage
679 ///
680 /// ```
681 /// use aho_corasick::{AhoCorasick, MatchKind};
682 ///
683 /// let patterns = &["append", "appendage", "app"];
684 /// let haystack = b"append the app to the appendage";
685 ///
686 /// let ac = AhoCorasick::builder()
687 /// .match_kind(MatchKind::LeftmostFirst)
688 /// .build(patterns)
689 /// .unwrap();
690 /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]);
691 /// assert_eq!(b"x the z to the xage".to_vec(), result);
692 /// ```
693 pub fn replace_all_bytes<B>(
694 &self,
695 haystack: &[u8],
696 replace_with: &[B],
697 ) -> Vec<u8>
698 where
699 B: AsRef<[u8]>,
700 {
701 self.try_replace_all_bytes(haystack, replace_with)
702 .expect("AhoCorasick::try_replace_all_bytes should not fail")
703 }
704
705 /// Replace all matches using a closure called on each match.
706 /// Matches correspond to the same matches as reported by
707 /// [`AhoCorasick::find_iter`].
708 ///
709 /// The closure accepts three parameters: the match found, the text of
710 /// the match and a string buffer with which to write the replaced text
711 /// (if any). If the closure returns `true`, then it continues to the next
712 /// match. If the closure returns `false`, then searching is stopped.
713 ///
714 /// Note that any matches with boundaries that don't fall on a valid UTF-8
715 /// boundary are silently skipped.
716 ///
717 /// This is the infallible version of
718 /// [`AhoCorasick::try_replace_all_with`].
719 ///
720 /// # Panics
721 ///
722 /// This panics when [`AhoCorasick::try_replace_all_with`] would return an
723 /// error.
724 ///
725 /// # Examples
726 ///
727 /// Basic usage:
728 ///
729 /// ```
730 /// use aho_corasick::{AhoCorasick, MatchKind};
731 ///
732 /// let patterns = &["append", "appendage", "app"];
733 /// let haystack = "append the app to the appendage";
734 ///
735 /// let ac = AhoCorasick::builder()
736 /// .match_kind(MatchKind::LeftmostFirst)
737 /// .build(patterns)
738 /// .unwrap();
739 /// let mut result = String::new();
740 /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
741 /// dst.push_str(&mat.pattern().as_usize().to_string());
742 /// true
743 /// });
744 /// assert_eq!("0 the 2 to the 0age", result);
745 /// ```
746 ///
747 /// Stopping the replacement by returning `false` (continued from the
748 /// example above):
749 ///
750 /// ```
751 /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID};
752 /// # let patterns = &["append", "appendage", "app"];
753 /// # let haystack = "append the app to the appendage";
754 /// # let ac = AhoCorasick::builder()
755 /// # .match_kind(MatchKind::LeftmostFirst)
756 /// # .build(patterns)
757 /// # .unwrap();
758 /// let mut result = String::new();
759 /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
760 /// dst.push_str(&mat.pattern().as_usize().to_string());
761 /// mat.pattern() != PatternID::must(2)
762 /// });
763 /// assert_eq!("0 the 2 to the appendage", result);
764 /// ```
765 pub fn replace_all_with<F>(
766 &self,
767 haystack: &str,
768 dst: &mut String,
769 replace_with: F,
770 ) where
771 F: FnMut(&Match, &str, &mut String) -> bool,
772 {
773 self.try_replace_all_with(haystack, dst, replace_with)
774 .expect("AhoCorasick::try_replace_all_with should not fail")
775 }
776
777 /// Replace all matches using raw bytes with a closure called on each
778 /// match. Matches correspond to the same matches as reported by
779 /// [`AhoCorasick::find_iter`].
780 ///
781 /// The closure accepts three parameters: the match found, the text of
782 /// the match and a byte buffer with which to write the replaced text
783 /// (if any). If the closure returns `true`, then it continues to the next
784 /// match. If the closure returns `false`, then searching is stopped.
785 ///
786 /// This is the infallible version of
787 /// [`AhoCorasick::try_replace_all_with_bytes`].
788 ///
789 /// # Panics
790 ///
791 /// This panics when [`AhoCorasick::try_replace_all_with_bytes`] would
792 /// return an error.
793 ///
794 /// # Examples
795 ///
796 /// Basic usage:
797 ///
798 /// ```
799 /// use aho_corasick::{AhoCorasick, MatchKind};
800 ///
801 /// let patterns = &["append", "appendage", "app"];
802 /// let haystack = b"append the app to the appendage";
803 ///
804 /// let ac = AhoCorasick::builder()
805 /// .match_kind(MatchKind::LeftmostFirst)
806 /// .build(patterns)
807 /// .unwrap();
808 /// let mut result = vec![];
809 /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
810 /// dst.extend(mat.pattern().as_usize().to_string().bytes());
811 /// true
812 /// });
813 /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
814 /// ```
815 ///
816 /// Stopping the replacement by returning `false` (continued from the
817 /// example above):
818 ///
819 /// ```
820 /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID};
821 /// # let patterns = &["append", "appendage", "app"];
822 /// # let haystack = b"append the app to the appendage";
823 /// # let ac = AhoCorasick::builder()
824 /// # .match_kind(MatchKind::LeftmostFirst)
825 /// # .build(patterns)
826 /// # .unwrap();
827 /// let mut result = vec![];
828 /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
829 /// dst.extend(mat.pattern().as_usize().to_string().bytes());
830 /// mat.pattern() != PatternID::must(2)
831 /// });
832 /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result);
833 /// ```
834 pub fn replace_all_with_bytes<F>(
835 &self,
836 haystack: &[u8],
837 dst: &mut Vec<u8>,
838 replace_with: F,
839 ) where
840 F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
841 {
842 self.try_replace_all_with_bytes(haystack, dst, replace_with)
843 .expect("AhoCorasick::try_replace_all_with_bytes should not fail")
844 }
845
846 /// Returns an iterator of non-overlapping matches in the given
847 /// stream. Matches correspond to the same matches as reported by
848 /// [`AhoCorasick::find_iter`].
849 ///
850 /// The matches yielded by this iterator use absolute position offsets in
851 /// the stream given, where the first byte has index `0`. Matches are
852 /// yieled until the stream is exhausted.
853 ///
854 /// Each item yielded by the iterator is an `Result<Match,
855 /// std::io::Error>`, where an error is yielded if there was a problem
856 /// reading from the reader given.
857 ///
858 /// When searching a stream, an internal buffer is used. Therefore, callers
859 /// should avoiding providing a buffered reader, if possible.
860 ///
861 /// This is the infallible version of
862 /// [`AhoCorasick::try_stream_find_iter`]. Note that both methods return
863 /// iterators that produce `Result` values. The difference is that this
864 /// routine panics if _construction_ of the iterator failed. The `Result`
865 /// values yield by the iterator come from whether the given reader returns
866 /// an error or not during the search.
867 ///
868 /// # Memory usage
869 ///
870 /// In general, searching streams will use a constant amount of memory for
871 /// its internal buffer. The one requirement is that the internal buffer
872 /// must be at least the size of the longest possible match. In most use
873 /// cases, the default buffer size will be much larger than any individual
874 /// match.
875 ///
876 /// # Panics
877 ///
878 /// This panics when [`AhoCorasick::try_stream_find_iter`] would return
879 /// an error. For example, when the Aho-Corasick searcher doesn't support
880 /// stream searches. (Only searchers built with [`MatchKind::Standard`]
881 /// semantics support stream searches.)
882 ///
883 /// # Example: basic usage
884 ///
885 /// ```
886 /// use aho_corasick::{AhoCorasick, PatternID};
887 ///
888 /// let patterns = &["append", "appendage", "app"];
889 /// let haystack = "append the app to the appendage";
890 ///
891 /// let ac = AhoCorasick::new(patterns).unwrap();
892 /// let mut matches = vec![];
893 /// for result in ac.stream_find_iter(haystack.as_bytes()) {
894 /// let mat = result?;
895 /// matches.push(mat.pattern());
896 /// }
897 /// assert_eq!(vec![
898 /// PatternID::must(2),
899 /// PatternID::must(2),
900 /// PatternID::must(2),
901 /// ], matches);
902 ///
903 /// # Ok::<(), Box<dyn std::error::Error>>(())
904 /// ```
905 #[cfg(feature = "std")]
906 pub fn stream_find_iter<'a, R: std::io::Read>(
907 &'a self,
908 rdr: R,
909 ) -> StreamFindIter<'a, R> {
910 self.try_stream_find_iter(rdr)
911 .expect("AhoCorasick::try_stream_find_iter should not fail")
912 }
913}
914
915/// Fallible search routines. These APIs return an error in cases where the
916/// infallible routines would panic.
917impl AhoCorasick {
918 /// Returns the location of the first match according to the match
919 /// semantics that this automaton was constructed with, and according
920 /// to the given `Input` configuration.
921 ///
922 /// This is the fallible version of [`AhoCorasick::find`].
923 ///
924 /// # Errors
925 ///
926 /// This returns an error when this Aho-Corasick searcher does not support
927 /// the given `Input` configuration.
928 ///
929 /// For example, if the Aho-Corasick searcher only supports anchored
930 /// searches or only supports unanchored searches, then providing an
931 /// `Input` that requests an anchored (or unanchored) search when it isn't
932 /// supported would result in an error.
933 ///
934 /// # Example: leftmost-first searching
935 ///
936 /// Basic usage with leftmost-first semantics:
937 ///
938 /// ```
939 /// use aho_corasick::{AhoCorasick, MatchKind, Input};
940 ///
941 /// let patterns = &["b", "abc", "abcd"];
942 /// let haystack = "foo abcd";
943 ///
944 /// let ac = AhoCorasick::builder()
945 /// .match_kind(MatchKind::LeftmostFirst)
946 /// .build(patterns)
947 /// .unwrap();
948 /// let mat = ac.try_find(haystack)?.expect("should have a match");
949 /// assert_eq!("abc", &haystack[mat.span()]);
950 ///
951 /// # Ok::<(), Box<dyn std::error::Error>>(())
952 /// ```
953 ///
954 /// # Example: anchored leftmost-first searching
955 ///
956 /// This shows how to anchor the search, so that even if the haystack
957 /// contains a match somewhere, a match won't be reported unless one can
958 /// be found that starts at the beginning of the search:
959 ///
960 /// ```
961 /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind};
962 ///
963 /// let patterns = &["b", "abc", "abcd"];
964 /// let haystack = "foo abcd";
965 ///
966 /// let ac = AhoCorasick::builder()
967 /// .match_kind(MatchKind::LeftmostFirst)
968 /// .start_kind(StartKind::Anchored)
969 /// .build(patterns)
970 /// .unwrap();
971 /// let input = Input::new(haystack).anchored(Anchored::Yes);
972 /// assert_eq!(None, ac.try_find(input)?);
973 ///
974 /// # Ok::<(), Box<dyn std::error::Error>>(())
975 /// ```
976 ///
977 /// If the beginning of the search is changed to where a match begins, then
978 /// it will be found:
979 ///
980 /// ```
981 /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind};
982 ///
983 /// let patterns = &["b", "abc", "abcd"];
984 /// let haystack = "foo abcd";
985 ///
986 /// let ac = AhoCorasick::builder()
987 /// .match_kind(MatchKind::LeftmostFirst)
988 /// .start_kind(StartKind::Anchored)
989 /// .build(patterns)
990 /// .unwrap();
991 /// let input = Input::new(haystack).range(4..).anchored(Anchored::Yes);
992 /// let mat = ac.try_find(input)?.expect("should have a match");
993 /// assert_eq!("abc", &haystack[mat.span()]);
994 ///
995 /// # Ok::<(), Box<dyn std::error::Error>>(())
996 /// ```
997 ///
998 /// # Example: earliest leftmost-first searching
999 ///
1000 /// This shows how to run an "earliest" search even when the Aho-Corasick
1001 /// searcher was compiled with leftmost-first match semantics. In this
1002 /// case, the search is stopped as soon as it is known that a match has
1003 /// occurred, even if it doesn't correspond to the leftmost-first match.
1004 ///
1005 /// ```
1006 /// use aho_corasick::{AhoCorasick, Input, MatchKind};
1007 ///
1008 /// let patterns = &["b", "abc", "abcd"];
1009 /// let haystack = "foo abcd";
1010 ///
1011 /// let ac = AhoCorasick::builder()
1012 /// .match_kind(MatchKind::LeftmostFirst)
1013 /// .build(patterns)
1014 /// .unwrap();
1015 /// let input = Input::new(haystack).earliest(true);
1016 /// let mat = ac.try_find(input)?.expect("should have a match");
1017 /// assert_eq!("b", &haystack[mat.span()]);
1018 ///
1019 /// # Ok::<(), Box<dyn std::error::Error>>(())
1020 /// ```
1021 pub fn try_find<'h, I: Into<Input<'h>>>(
1022 &self,
1023 input: I,
1024 ) -> Result<Option<Match>, MatchError> {
1025 let input = input.into();
1026 enforce_anchored_consistency(self.start_kind, input.get_anchored())?;
1027 self.aut.try_find(&input)
1028 }
1029
1030 /// Returns the location of the first overlapping match in the given
1031 /// input with respect to the current state of the underlying searcher.
1032 ///
1033 /// Overlapping searches do not report matches in their return value.
1034 /// Instead, matches can be accessed via [`OverlappingState::get_match`]
1035 /// after a search call.
1036 ///
1037 /// This is the fallible version of [`AhoCorasick::find_overlapping`].
1038 ///
1039 /// # Errors
1040 ///
1041 /// This returns an error when this Aho-Corasick searcher does not support
1042 /// the given `Input` configuration or if overlapping search is not
1043 /// supported.
1044 ///
1045 /// One example is that only Aho-Corasicker searchers built with
1046 /// [`MatchKind::Standard`] semantics support overlapping searches. Using
1047 /// any other match semantics will result in this returning an error.
1048 ///
1049 /// # Example: basic usage
1050 ///
1051 /// This shows how we can repeatedly call an overlapping search without
1052 /// ever needing to explicitly re-slice the haystack. Overlapping search
1053 /// works this way because searches depend on state saved during the
1054 /// previous search.
1055 ///
1056 /// ```
1057 /// use aho_corasick::{
1058 /// automaton::OverlappingState,
1059 /// AhoCorasick, Input, Match,
1060 /// };
1061 ///
1062 /// let patterns = &["append", "appendage", "app"];
1063 /// let haystack = "append the app to the appendage";
1064 ///
1065 /// let ac = AhoCorasick::new(patterns).unwrap();
1066 /// let mut state = OverlappingState::start();
1067 ///
1068 /// ac.try_find_overlapping(haystack, &mut state)?;
1069 /// assert_eq!(Some(Match::must(2, 0..3)), state.get_match());
1070 ///
1071 /// ac.try_find_overlapping(haystack, &mut state)?;
1072 /// assert_eq!(Some(Match::must(0, 0..6)), state.get_match());
1073 ///
1074 /// ac.try_find_overlapping(haystack, &mut state)?;
1075 /// assert_eq!(Some(Match::must(2, 11..14)), state.get_match());
1076 ///
1077 /// ac.try_find_overlapping(haystack, &mut state)?;
1078 /// assert_eq!(Some(Match::must(2, 22..25)), state.get_match());
1079 ///
1080 /// ac.try_find_overlapping(haystack, &mut state)?;
1081 /// assert_eq!(Some(Match::must(0, 22..28)), state.get_match());
1082 ///
1083 /// ac.try_find_overlapping(haystack, &mut state)?;
1084 /// assert_eq!(Some(Match::must(1, 22..31)), state.get_match());
1085 ///
1086 /// // No more match matches to be found.
1087 /// ac.try_find_overlapping(haystack, &mut state)?;
1088 /// assert_eq!(None, state.get_match());
1089 ///
1090 /// # Ok::<(), Box<dyn std::error::Error>>(())
1091 /// ```
1092 ///
1093 /// # Example: implementing your own overlapping iteration
1094 ///
1095 /// The previous example can be easily adapted to implement your own
1096 /// iteration by repeatedly calling `try_find_overlapping` until either
1097 /// an error occurs or no more matches are reported.
1098 ///
1099 /// This is effectively equivalent to the iterator returned by
1100 /// [`AhoCorasick::try_find_overlapping_iter`], with the only difference
1101 /// being that the iterator checks for errors before construction and
1102 /// absolves the caller of needing to check for errors on every search
1103 /// call. (Indeed, if the first `try_find_overlapping` call succeeds and
1104 /// the same `Input` is given to subsequent calls, then all subsequent
1105 /// calls are guaranteed to succeed.)
1106 ///
1107 /// ```
1108 /// use aho_corasick::{
1109 /// automaton::OverlappingState,
1110 /// AhoCorasick, Input, Match,
1111 /// };
1112 ///
1113 /// let patterns = &["append", "appendage", "app"];
1114 /// let haystack = "append the app to the appendage";
1115 ///
1116 /// let ac = AhoCorasick::new(patterns).unwrap();
1117 /// let mut state = OverlappingState::start();
1118 /// let mut matches = vec![];
1119 ///
1120 /// loop {
1121 /// ac.try_find_overlapping(haystack, &mut state)?;
1122 /// let mat = match state.get_match() {
1123 /// None => break,
1124 /// Some(mat) => mat,
1125 /// };
1126 /// matches.push(mat);
1127 /// }
1128 /// let expected = vec![
1129 /// Match::must(2, 0..3),
1130 /// Match::must(0, 0..6),
1131 /// Match::must(2, 11..14),
1132 /// Match::must(2, 22..25),
1133 /// Match::must(0, 22..28),
1134 /// Match::must(1, 22..31),
1135 /// ];
1136 /// assert_eq!(expected, matches);
1137 ///
1138 /// # Ok::<(), Box<dyn std::error::Error>>(())
1139 /// ```
1140 ///
1141 /// # Example: anchored iteration
1142 ///
1143 /// The previous example can also be adapted to implement
1144 /// iteration over all anchored matches. In particular,
1145 /// [`AhoCorasick::try_find_overlapping_iter`] does not support this
1146 /// because it isn't totally clear what the match semantics ought to be.
1147 ///
1148 /// In this example, we will find all overlapping matches that start at
1149 /// the beginning of our search.
1150 ///
1151 /// ```
1152 /// use aho_corasick::{
1153 /// automaton::OverlappingState,
1154 /// AhoCorasick, Anchored, Input, Match, StartKind,
1155 /// };
1156 ///
1157 /// let patterns = &["append", "appendage", "app"];
1158 /// let haystack = "append the app to the appendage";
1159 ///
1160 /// let ac = AhoCorasick::builder()
1161 /// .start_kind(StartKind::Anchored)
1162 /// .build(patterns)
1163 /// .unwrap();
1164 /// let input = Input::new(haystack).anchored(Anchored::Yes);
1165 /// let mut state = OverlappingState::start();
1166 /// let mut matches = vec![];
1167 ///
1168 /// loop {
1169 /// ac.try_find_overlapping(input.clone(), &mut state)?;
1170 /// let mat = match state.get_match() {
1171 /// None => break,
1172 /// Some(mat) => mat,
1173 /// };
1174 /// matches.push(mat);
1175 /// }
1176 /// let expected = vec![
1177 /// Match::must(2, 0..3),
1178 /// Match::must(0, 0..6),
1179 /// ];
1180 /// assert_eq!(expected, matches);
1181 ///
1182 /// # Ok::<(), Box<dyn std::error::Error>>(())
1183 /// ```
1184 pub fn try_find_overlapping<'h, I: Into<Input<'h>>>(
1185 &self,
1186 input: I,
1187 state: &mut OverlappingState,
1188 ) -> Result<(), MatchError> {
1189 let input = input.into();
1190 enforce_anchored_consistency(self.start_kind, input.get_anchored())?;
1191 self.aut.try_find_overlapping(&input, state)
1192 }
1193
1194 /// Returns an iterator of non-overlapping matches, using the match
1195 /// semantics that this automaton was constructed with.
1196 ///
1197 /// This is the fallible version of [`AhoCorasick::find_iter`].
1198 ///
1199 /// Note that the error returned by this method occurs during construction
1200 /// of the iterator. The iterator itself yields `Match` values. That is,
1201 /// once the iterator is constructed, the iteration itself will never
1202 /// report an error.
1203 ///
1204 /// # Errors
1205 ///
1206 /// This returns an error when this Aho-Corasick searcher does not support
1207 /// the given `Input` configuration.
1208 ///
1209 /// For example, if the Aho-Corasick searcher only supports anchored
1210 /// searches or only supports unanchored searches, then providing an
1211 /// `Input` that requests an anchored (or unanchored) search when it isn't
1212 /// supported would result in an error.
1213 ///
1214 /// # Example: leftmost-first searching
1215 ///
1216 /// Basic usage with leftmost-first semantics:
1217 ///
1218 /// ```
1219 /// use aho_corasick::{AhoCorasick, Input, MatchKind, PatternID};
1220 ///
1221 /// let patterns = &["append", "appendage", "app"];
1222 /// let haystack = "append the app to the appendage";
1223 ///
1224 /// let ac = AhoCorasick::builder()
1225 /// .match_kind(MatchKind::LeftmostFirst)
1226 /// .build(patterns)
1227 /// .unwrap();
1228 /// let matches: Vec<PatternID> = ac
1229 /// .try_find_iter(Input::new(haystack))?
1230 /// .map(|mat| mat.pattern())
1231 /// .collect();
1232 /// assert_eq!(vec![
1233 /// PatternID::must(0),
1234 /// PatternID::must(2),
1235 /// PatternID::must(0),
1236 /// ], matches);
1237 ///
1238 /// # Ok::<(), Box<dyn std::error::Error>>(())
1239 /// ```
1240 ///
1241 /// # Example: anchored leftmost-first searching
1242 ///
1243 /// This shows how to anchor the search, such that all matches must begin
1244 /// at the starting location of the search. For an iterator, an anchored
1245 /// search implies that all matches are adjacent.
1246 ///
1247 /// ```
1248 /// use aho_corasick::{
1249 /// AhoCorasick, Anchored, Input, MatchKind, PatternID, StartKind,
1250 /// };
1251 ///
1252 /// let patterns = &["foo", "bar", "quux"];
1253 /// let haystack = "fooquuxbar foo";
1254 ///
1255 /// let ac = AhoCorasick::builder()
1256 /// .match_kind(MatchKind::LeftmostFirst)
1257 /// .start_kind(StartKind::Anchored)
1258 /// .build(patterns)
1259 /// .unwrap();
1260 /// let matches: Vec<PatternID> = ac
1261 /// .try_find_iter(Input::new(haystack).anchored(Anchored::Yes))?
1262 /// .map(|mat| mat.pattern())
1263 /// .collect();
1264 /// assert_eq!(vec![
1265 /// PatternID::must(0),
1266 /// PatternID::must(2),
1267 /// PatternID::must(1),
1268 /// // The final 'foo' is not found because it is not adjacent to the
1269 /// // 'bar' match. It needs to be adjacent because our search is
1270 /// // anchored.
1271 /// ], matches);
1272 ///
1273 /// # Ok::<(), Box<dyn std::error::Error>>(())
1274 /// ```
1275 pub fn try_find_iter<'a, 'h, I: Into<Input<'h>>>(
1276 &'a self,
1277 input: I,
1278 ) -> Result<FindIter<'a, 'h>, MatchError> {
1279 let input = input.into();
1280 enforce_anchored_consistency(self.start_kind, input.get_anchored())?;
1281 Ok(FindIter(self.aut.try_find_iter(input)?))
1282 }
1283
1284 /// Returns an iterator of overlapping matches.
1285 ///
1286 /// This is the fallible version of [`AhoCorasick::find_overlapping_iter`].
1287 ///
1288 /// Note that the error returned by this method occurs during construction
1289 /// of the iterator. The iterator itself yields `Match` values. That is,
1290 /// once the iterator is constructed, the iteration itself will never
1291 /// report an error.
1292 ///
1293 /// # Errors
1294 ///
1295 /// This returns an error when this Aho-Corasick searcher does not support
1296 /// the given `Input` configuration or does not support overlapping
1297 /// searches.
1298 ///
1299 /// One example is that only Aho-Corasicker searchers built with
1300 /// [`MatchKind::Standard`] semantics support overlapping searches. Using
1301 /// any other match semantics will result in this returning an error.
1302 ///
1303 /// # Example: basic usage
1304 ///
1305 /// ```
1306 /// use aho_corasick::{AhoCorasick, Input, PatternID};
1307 ///
1308 /// let patterns = &["append", "appendage", "app"];
1309 /// let haystack = "append the app to the appendage";
1310 ///
1311 /// let ac = AhoCorasick::new(patterns).unwrap();
1312 /// let matches: Vec<PatternID> = ac
1313 /// .try_find_overlapping_iter(Input::new(haystack))?
1314 /// .map(|mat| mat.pattern())
1315 /// .collect();
1316 /// assert_eq!(vec![
1317 /// PatternID::must(2),
1318 /// PatternID::must(0),
1319 /// PatternID::must(2),
1320 /// PatternID::must(2),
1321 /// PatternID::must(0),
1322 /// PatternID::must(1),
1323 /// ], matches);
1324 ///
1325 /// # Ok::<(), Box<dyn std::error::Error>>(())
1326 /// ```
1327 ///
1328 /// # Example: anchored overlapping search returns an error
1329 ///
1330 /// It isn't clear what the match semantics for anchored overlapping
1331 /// iterators *ought* to be, so currently an error is returned. Callers
1332 /// may use [`AhoCorasick::try_find_overlapping`] to implement their own
1333 /// semantics if desired.
1334 ///
1335 /// ```
1336 /// use aho_corasick::{AhoCorasick, Anchored, Input, StartKind};
1337 ///
1338 /// let patterns = &["append", "appendage", "app"];
1339 /// let haystack = "appendappendage app";
1340 ///
1341 /// let ac = AhoCorasick::builder()
1342 /// .start_kind(StartKind::Anchored)
1343 /// .build(patterns)
1344 /// .unwrap();
1345 /// let input = Input::new(haystack).anchored(Anchored::Yes);
1346 /// assert!(ac.try_find_overlapping_iter(input).is_err());
1347 ///
1348 /// # Ok::<(), Box<dyn std::error::Error>>(())
1349 /// ```
1350 pub fn try_find_overlapping_iter<'a, 'h, I: Into<Input<'h>>>(
1351 &'a self,
1352 input: I,
1353 ) -> Result<FindOverlappingIter<'a, 'h>, MatchError> {
1354 let input = input.into();
1355 enforce_anchored_consistency(self.start_kind, input.get_anchored())?;
1356 Ok(FindOverlappingIter(self.aut.try_find_overlapping_iter(input)?))
1357 }
1358
1359 /// Replace all matches with a corresponding value in the `replace_with`
1360 /// slice given. Matches correspond to the same matches as reported by
1361 /// [`AhoCorasick::try_find_iter`].
1362 ///
1363 /// Replacements are determined by the index of the matching pattern.
1364 /// For example, if the pattern with index `2` is found, then it is
1365 /// replaced by `replace_with[2]`.
1366 ///
1367 /// # Panics
1368 ///
1369 /// This panics when `replace_with.len()` does not equal
1370 /// [`AhoCorasick::patterns_len`].
1371 ///
1372 /// # Errors
1373 ///
1374 /// This returns an error when this Aho-Corasick searcher does not support
1375 /// the default `Input` configuration. More specifically, this occurs only
1376 /// when the Aho-Corasick searcher does not support unanchored searches
1377 /// since this replacement routine always does an unanchored search.
1378 ///
1379 /// # Example: basic usage
1380 ///
1381 /// ```
1382 /// use aho_corasick::{AhoCorasick, MatchKind};
1383 ///
1384 /// let patterns = &["append", "appendage", "app"];
1385 /// let haystack = "append the app to the appendage";
1386 ///
1387 /// let ac = AhoCorasick::builder()
1388 /// .match_kind(MatchKind::LeftmostFirst)
1389 /// .build(patterns)
1390 /// .unwrap();
1391 /// let result = ac.try_replace_all(haystack, &["x", "y", "z"])?;
1392 /// assert_eq!("x the z to the xage", result);
1393 ///
1394 /// # Ok::<(), Box<dyn std::error::Error>>(())
1395 /// ```
1396 pub fn try_replace_all<B>(
1397 &self,
1398 haystack: &str,
1399 replace_with: &[B],
1400 ) -> Result<String, MatchError>
1401 where
1402 B: AsRef<str>,
1403 {
1404 enforce_anchored_consistency(self.start_kind, Anchored::No)?;
1405 self.aut.try_replace_all(haystack, replace_with)
1406 }
1407
1408 /// Replace all matches using raw bytes with a corresponding value in the
1409 /// `replace_with` slice given. Matches correspond to the same matches as
1410 /// reported by [`AhoCorasick::try_find_iter`].
1411 ///
1412 /// Replacements are determined by the index of the matching pattern.
1413 /// For example, if the pattern with index `2` is found, then it is
1414 /// replaced by `replace_with[2]`.
1415 ///
1416 /// This is the fallible version of [`AhoCorasick::replace_all_bytes`].
1417 ///
1418 /// # Panics
1419 ///
1420 /// This panics when `replace_with.len()` does not equal
1421 /// [`AhoCorasick::patterns_len`].
1422 ///
1423 /// # Errors
1424 ///
1425 /// This returns an error when this Aho-Corasick searcher does not support
1426 /// the default `Input` configuration. More specifically, this occurs only
1427 /// when the Aho-Corasick searcher does not support unanchored searches
1428 /// since this replacement routine always does an unanchored search.
1429 ///
1430 /// # Example: basic usage
1431 ///
1432 /// ```
1433 /// use aho_corasick::{AhoCorasick, MatchKind};
1434 ///
1435 /// let patterns = &["append", "appendage", "app"];
1436 /// let haystack = b"append the app to the appendage";
1437 ///
1438 /// let ac = AhoCorasick::builder()
1439 /// .match_kind(MatchKind::LeftmostFirst)
1440 /// .build(patterns)
1441 /// .unwrap();
1442 /// let result = ac.try_replace_all_bytes(haystack, &["x", "y", "z"])?;
1443 /// assert_eq!(b"x the z to the xage".to_vec(), result);
1444 ///
1445 /// # Ok::<(), Box<dyn std::error::Error>>(())
1446 /// ```
1447 pub fn try_replace_all_bytes<B>(
1448 &self,
1449 haystack: &[u8],
1450 replace_with: &[B],
1451 ) -> Result<Vec<u8>, MatchError>
1452 where
1453 B: AsRef<[u8]>,
1454 {
1455 enforce_anchored_consistency(self.start_kind, Anchored::No)?;
1456 self.aut.try_replace_all_bytes(haystack, replace_with)
1457 }
1458
1459 /// Replace all matches using a closure called on each match.
1460 /// Matches correspond to the same matches as reported by
1461 /// [`AhoCorasick::try_find_iter`].
1462 ///
1463 /// The closure accepts three parameters: the match found, the text of
1464 /// the match and a string buffer with which to write the replaced text
1465 /// (if any). If the closure returns `true`, then it continues to the next
1466 /// match. If the closure returns `false`, then searching is stopped.
1467 ///
1468 /// Note that any matches with boundaries that don't fall on a valid UTF-8
1469 /// boundary are silently skipped.
1470 ///
1471 /// This is the fallible version of [`AhoCorasick::replace_all_with`].
1472 ///
1473 /// # Errors
1474 ///
1475 /// This returns an error when this Aho-Corasick searcher does not support
1476 /// the default `Input` configuration. More specifically, this occurs only
1477 /// when the Aho-Corasick searcher does not support unanchored searches
1478 /// since this replacement routine always does an unanchored search.
1479 ///
1480 /// # Examples
1481 ///
1482 /// Basic usage:
1483 ///
1484 /// ```
1485 /// use aho_corasick::{AhoCorasick, MatchKind};
1486 ///
1487 /// let patterns = &["append", "appendage", "app"];
1488 /// let haystack = "append the app to the appendage";
1489 ///
1490 /// let ac = AhoCorasick::builder()
1491 /// .match_kind(MatchKind::LeftmostFirst)
1492 /// .build(patterns)
1493 /// .unwrap();
1494 /// let mut result = String::new();
1495 /// ac.try_replace_all_with(haystack, &mut result, |mat, _, dst| {
1496 /// dst.push_str(&mat.pattern().as_usize().to_string());
1497 /// true
1498 /// })?;
1499 /// assert_eq!("0 the 2 to the 0age", result);
1500 ///
1501 /// # Ok::<(), Box<dyn std::error::Error>>(())
1502 /// ```
1503 ///
1504 /// Stopping the replacement by returning `false` (continued from the
1505 /// example above):
1506 ///
1507 /// ```
1508 /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID};
1509 /// # let patterns = &["append", "appendage", "app"];
1510 /// # let haystack = "append the app to the appendage";
1511 /// # let ac = AhoCorasick::builder()
1512 /// # .match_kind(MatchKind::LeftmostFirst)
1513 /// # .build(patterns)
1514 /// # .unwrap();
1515 /// let mut result = String::new();
1516 /// ac.try_replace_all_with(haystack, &mut result, |mat, _, dst| {
1517 /// dst.push_str(&mat.pattern().as_usize().to_string());
1518 /// mat.pattern() != PatternID::must(2)
1519 /// })?;
1520 /// assert_eq!("0 the 2 to the appendage", result);
1521 ///
1522 /// # Ok::<(), Box<dyn std::error::Error>>(())
1523 /// ```
1524 pub fn try_replace_all_with<F>(
1525 &self,
1526 haystack: &str,
1527 dst: &mut String,
1528 replace_with: F,
1529 ) -> Result<(), MatchError>
1530 where
1531 F: FnMut(&Match, &str, &mut String) -> bool,
1532 {
1533 enforce_anchored_consistency(self.start_kind, Anchored::No)?;
1534 self.aut.try_replace_all_with(haystack, dst, replace_with)
1535 }
1536
1537 /// Replace all matches using raw bytes with a closure called on each
1538 /// match. Matches correspond to the same matches as reported by
1539 /// [`AhoCorasick::try_find_iter`].
1540 ///
1541 /// The closure accepts three parameters: the match found, the text of
1542 /// the match and a byte buffer with which to write the replaced text
1543 /// (if any). If the closure returns `true`, then it continues to the next
1544 /// match. If the closure returns `false`, then searching is stopped.
1545 ///
1546 /// This is the fallible version of
1547 /// [`AhoCorasick::replace_all_with_bytes`].
1548 ///
1549 /// # Errors
1550 ///
1551 /// This returns an error when this Aho-Corasick searcher does not support
1552 /// the default `Input` configuration. More specifically, this occurs only
1553 /// when the Aho-Corasick searcher does not support unanchored searches
1554 /// since this replacement routine always does an unanchored search.
1555 ///
1556 /// # Examples
1557 ///
1558 /// Basic usage:
1559 ///
1560 /// ```
1561 /// use aho_corasick::{AhoCorasick, MatchKind};
1562 ///
1563 /// let patterns = &["append", "appendage", "app"];
1564 /// let haystack = b"append the app to the appendage";
1565 ///
1566 /// let ac = AhoCorasick::builder()
1567 /// .match_kind(MatchKind::LeftmostFirst)
1568 /// .build(patterns)
1569 /// .unwrap();
1570 /// let mut result = vec![];
1571 /// ac.try_replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
1572 /// dst.extend(mat.pattern().as_usize().to_string().bytes());
1573 /// true
1574 /// })?;
1575 /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
1576 ///
1577 /// # Ok::<(), Box<dyn std::error::Error>>(())
1578 /// ```
1579 ///
1580 /// Stopping the replacement by returning `false` (continued from the
1581 /// example above):
1582 ///
1583 /// ```
1584 /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID};
1585 /// # let patterns = &["append", "appendage", "app"];
1586 /// # let haystack = b"append the app to the appendage";
1587 /// # let ac = AhoCorasick::builder()
1588 /// # .match_kind(MatchKind::LeftmostFirst)
1589 /// # .build(patterns)
1590 /// # .unwrap();
1591 /// let mut result = vec![];
1592 /// ac.try_replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
1593 /// dst.extend(mat.pattern().as_usize().to_string().bytes());
1594 /// mat.pattern() != PatternID::must(2)
1595 /// })?;
1596 /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result);
1597 ///
1598 /// # Ok::<(), Box<dyn std::error::Error>>(())
1599 /// ```
1600 pub fn try_replace_all_with_bytes<F>(
1601 &self,
1602 haystack: &[u8],
1603 dst: &mut Vec<u8>,
1604 replace_with: F,
1605 ) -> Result<(), MatchError>
1606 where
1607 F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
1608 {
1609 enforce_anchored_consistency(self.start_kind, Anchored::No)?;
1610 self.aut.try_replace_all_with_bytes(haystack, dst, replace_with)
1611 }
1612
1613 /// Returns an iterator of non-overlapping matches in the given
1614 /// stream. Matches correspond to the same matches as reported by
1615 /// [`AhoCorasick::try_find_iter`].
1616 ///
1617 /// The matches yielded by this iterator use absolute position offsets in
1618 /// the stream given, where the first byte has index `0`. Matches are
1619 /// yieled until the stream is exhausted.
1620 ///
1621 /// Each item yielded by the iterator is an `Result<Match,
1622 /// std::io::Error>`, where an error is yielded if there was a problem
1623 /// reading from the reader given.
1624 ///
1625 /// When searching a stream, an internal buffer is used. Therefore, callers
1626 /// should avoiding providing a buffered reader, if possible.
1627 ///
1628 /// This is the fallible version of [`AhoCorasick::stream_find_iter`].
1629 /// Note that both methods return iterators that produce `Result` values.
1630 /// The difference is that this routine returns an error if _construction_
1631 /// of the iterator failed. The `Result` values yield by the iterator
1632 /// come from whether the given reader returns an error or not during the
1633 /// search.
1634 ///
1635 /// # Memory usage
1636 ///
1637 /// In general, searching streams will use a constant amount of memory for
1638 /// its internal buffer. The one requirement is that the internal buffer
1639 /// must be at least the size of the longest possible match. In most use
1640 /// cases, the default buffer size will be much larger than any individual
1641 /// match.
1642 ///
1643 /// # Errors
1644 ///
1645 /// This returns an error when this Aho-Corasick searcher does not support
1646 /// the default `Input` configuration. More specifically, this occurs only
1647 /// when the Aho-Corasick searcher does not support unanchored searches
1648 /// since this stream searching routine always does an unanchored search.
1649 ///
1650 /// This also returns an error if the searcher does not support stream
1651 /// searches. Only searchers built with [`MatchKind::Standard`] semantics
1652 /// support stream searches.
1653 ///
1654 /// # Example: basic usage
1655 ///
1656 /// ```
1657 /// use aho_corasick::{AhoCorasick, PatternID};
1658 ///
1659 /// let patterns = &["append", "appendage", "app"];
1660 /// let haystack = "append the app to the appendage";
1661 ///
1662 /// let ac = AhoCorasick::new(patterns).unwrap();
1663 /// let mut matches = vec![];
1664 /// for result in ac.try_stream_find_iter(haystack.as_bytes())? {
1665 /// let mat = result?;
1666 /// matches.push(mat.pattern());
1667 /// }
1668 /// assert_eq!(vec![
1669 /// PatternID::must(2),
1670 /// PatternID::must(2),
1671 /// PatternID::must(2),
1672 /// ], matches);
1673 ///
1674 /// # Ok::<(), Box<dyn std::error::Error>>(())
1675 /// ```
1676 #[cfg(feature = "std")]
1677 pub fn try_stream_find_iter<'a, R: std::io::Read>(
1678 &'a self,
1679 rdr: R,
1680 ) -> Result<StreamFindIter<'a, R>, MatchError> {
1681 enforce_anchored_consistency(self.start_kind, Anchored::No)?;
1682 self.aut.try_stream_find_iter(rdr).map(StreamFindIter)
1683 }
1684
1685 /// Search for and replace all matches of this automaton in
1686 /// the given reader, and write the replacements to the given
1687 /// writer. Matches correspond to the same matches as reported by
1688 /// [`AhoCorasick::try_find_iter`].
1689 ///
1690 /// Replacements are determined by the index of the matching pattern. For
1691 /// example, if the pattern with index `2` is found, then it is replaced by
1692 /// `replace_with[2]`.
1693 ///
1694 /// After all matches are replaced, the writer is _not_ flushed.
1695 ///
1696 /// If there was a problem reading from the given reader or writing to the
1697 /// given writer, then the corresponding `io::Error` is returned and all
1698 /// replacement is stopped.
1699 ///
1700 /// When searching a stream, an internal buffer is used. Therefore, callers
1701 /// should avoiding providing a buffered reader, if possible. However,
1702 /// callers may want to provide a buffered writer.
1703 ///
1704 /// Note that there is currently no infallible version of this routine.
1705 ///
1706 /// # Memory usage
1707 ///
1708 /// In general, searching streams will use a constant amount of memory for
1709 /// its internal buffer. The one requirement is that the internal buffer
1710 /// must be at least the size of the longest possible match. In most use
1711 /// cases, the default buffer size will be much larger than any individual
1712 /// match.
1713 ///
1714 /// # Panics
1715 ///
1716 /// This panics when `replace_with.len()` does not equal
1717 /// [`AhoCorasick::patterns_len`].
1718 ///
1719 /// # Errors
1720 ///
1721 /// This returns an error when this Aho-Corasick searcher does not support
1722 /// the default `Input` configuration. More specifically, this occurs only
1723 /// when the Aho-Corasick searcher does not support unanchored searches
1724 /// since this stream searching routine always does an unanchored search.
1725 ///
1726 /// This also returns an error if the searcher does not support stream
1727 /// searches. Only searchers built with [`MatchKind::Standard`] semantics
1728 /// support stream searches.
1729 ///
1730 /// # Example: basic usage
1731 ///
1732 /// ```
1733 /// use aho_corasick::AhoCorasick;
1734 ///
1735 /// let patterns = &["fox", "brown", "quick"];
1736 /// let haystack = "The quick brown fox.";
1737 /// let replace_with = &["sloth", "grey", "slow"];
1738 ///
1739 /// let ac = AhoCorasick::new(patterns).unwrap();
1740 /// let mut result = vec![];
1741 /// ac.try_stream_replace_all(
1742 /// haystack.as_bytes(),
1743 /// &mut result,
1744 /// replace_with,
1745 /// )?;
1746 /// assert_eq!(b"The slow grey sloth.".to_vec(), result);
1747 ///
1748 /// # Ok::<(), Box<dyn std::error::Error>>(())
1749 /// ```
1750 #[cfg(feature = "std")]
1751 pub fn try_stream_replace_all<R, W, B>(
1752 &self,
1753 rdr: R,
1754 wtr: W,
1755 replace_with: &[B],
1756 ) -> Result<(), std::io::Error>
1757 where
1758 R: std::io::Read,
1759 W: std::io::Write,
1760 B: AsRef<[u8]>,
1761 {
1762 enforce_anchored_consistency(self.start_kind, Anchored::No)
1763 .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
1764 self.aut.try_stream_replace_all(rdr, wtr, replace_with)
1765 }
1766
1767 /// Search the given reader and replace all matches of this automaton
1768 /// using the given closure. The result is written to the given
1769 /// writer. Matches correspond to the same matches as reported by
1770 /// [`AhoCorasick::try_find_iter`].
1771 ///
1772 /// The closure accepts three parameters: the match found, the text of
1773 /// the match and the writer with which to write the replaced text (if any).
1774 ///
1775 /// After all matches are replaced, the writer is _not_ flushed.
1776 ///
1777 /// If there was a problem reading from the given reader or writing to the
1778 /// given writer, then the corresponding `io::Error` is returned and all
1779 /// replacement is stopped.
1780 ///
1781 /// When searching a stream, an internal buffer is used. Therefore, callers
1782 /// should avoiding providing a buffered reader, if possible. However,
1783 /// callers may want to provide a buffered writer.
1784 ///
1785 /// Note that there is currently no infallible version of this routine.
1786 ///
1787 /// # Memory usage
1788 ///
1789 /// In general, searching streams will use a constant amount of memory for
1790 /// its internal buffer. The one requirement is that the internal buffer
1791 /// must be at least the size of the longest possible match. In most use
1792 /// cases, the default buffer size will be much larger than any individual
1793 /// match.
1794 ///
1795 /// # Errors
1796 ///
1797 /// This returns an error when this Aho-Corasick searcher does not support
1798 /// the default `Input` configuration. More specifically, this occurs only
1799 /// when the Aho-Corasick searcher does not support unanchored searches
1800 /// since this stream searching routine always does an unanchored search.
1801 ///
1802 /// This also returns an error if the searcher does not support stream
1803 /// searches. Only searchers built with [`MatchKind::Standard`] semantics
1804 /// support stream searches.
1805 ///
1806 /// # Example: basic usage
1807 ///
1808 /// ```
1809 /// use std::io::Write;
1810 /// use aho_corasick::AhoCorasick;
1811 ///
1812 /// let patterns = &["fox", "brown", "quick"];
1813 /// let haystack = "The quick brown fox.";
1814 ///
1815 /// let ac = AhoCorasick::new(patterns).unwrap();
1816 /// let mut result = vec![];
1817 /// ac.try_stream_replace_all_with(
1818 /// haystack.as_bytes(),
1819 /// &mut result,
1820 /// |mat, _, wtr| {
1821 /// wtr.write_all(mat.pattern().as_usize().to_string().as_bytes())
1822 /// },
1823 /// )?;
1824 /// assert_eq!(b"The 2 1 0.".to_vec(), result);
1825 ///
1826 /// # Ok::<(), Box<dyn std::error::Error>>(())
1827 /// ```
1828 #[cfg(feature = "std")]
1829 pub fn try_stream_replace_all_with<R, W, F>(
1830 &self,
1831 rdr: R,
1832 wtr: W,
1833 replace_with: F,
1834 ) -> Result<(), std::io::Error>
1835 where
1836 R: std::io::Read,
1837 W: std::io::Write,
1838 F: FnMut(&Match, &[u8], &mut W) -> Result<(), std::io::Error>,
1839 {
1840 enforce_anchored_consistency(self.start_kind, Anchored::No)
1841 .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
1842 self.aut.try_stream_replace_all_with(rdr, wtr, replace_with)
1843 }
1844}
1845
1846/// Routines for querying information about the Aho-Corasick automaton.
1847impl AhoCorasick {
1848 /// Returns the kind of the Aho-Corasick automaton used by this searcher.
1849 ///
1850 /// Knowing the Aho-Corasick kind is principally useful for diagnostic
1851 /// purposes. In particular, if no specific kind was given to
1852 /// [`AhoCorasickBuilder::kind`], then one is automatically chosen and
1853 /// this routine will report which one.
1854 ///
1855 /// Note that the heuristics used for choosing which `AhoCorasickKind`
1856 /// may be changed in a semver compatible release.
1857 ///
1858 /// # Examples
1859 ///
1860 /// ```
1861 /// use aho_corasick::{AhoCorasick, AhoCorasickKind};
1862 ///
1863 /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap();
1864 /// // The specific Aho-Corasick kind chosen is not guaranteed!
1865 /// assert_eq!(AhoCorasickKind::DFA, ac.kind());
1866 /// ```
1867 pub fn kind(&self) -> AhoCorasickKind {
1868 self.kind
1869 }
1870
1871 /// Returns the type of starting search configuration supported by this
1872 /// Aho-Corasick automaton.
1873 ///
1874 /// # Examples
1875 ///
1876 /// ```
1877 /// use aho_corasick::{AhoCorasick, StartKind};
1878 ///
1879 /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap();
1880 /// assert_eq!(StartKind::Unanchored, ac.start_kind());
1881 /// ```
1882 pub fn start_kind(&self) -> StartKind {
1883 self.start_kind
1884 }
1885
1886 /// Returns the match kind used by this automaton.
1887 ///
1888 /// The match kind is important because it determines what kinds of
1889 /// matches are returned. Also, some operations (such as overlapping
1890 /// search and stream searching) are only supported when using the
1891 /// [`MatchKind::Standard`] match kind.
1892 ///
1893 /// # Examples
1894 ///
1895 /// ```
1896 /// use aho_corasick::{AhoCorasick, MatchKind};
1897 ///
1898 /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap();
1899 /// assert_eq!(MatchKind::Standard, ac.match_kind());
1900 /// ```
1901 pub fn match_kind(&self) -> MatchKind {
1902 self.aut.match_kind()
1903 }
1904
1905 /// Returns the length of the shortest pattern matched by this automaton.
1906 ///
1907 /// # Examples
1908 ///
1909 /// Basic usage:
1910 ///
1911 /// ```
1912 /// use aho_corasick::AhoCorasick;
1913 ///
1914 /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap();
1915 /// assert_eq!(3, ac.min_pattern_len());
1916 /// ```
1917 ///
1918 /// Note that an `AhoCorasick` automaton has a minimum length of `0` if
1919 /// and only if it can match the empty string:
1920 ///
1921 /// ```
1922 /// use aho_corasick::AhoCorasick;
1923 ///
1924 /// let ac = AhoCorasick::new(&["foo", "", "quux", "baz"]).unwrap();
1925 /// assert_eq!(0, ac.min_pattern_len());
1926 /// ```
1927 pub fn min_pattern_len(&self) -> usize {
1928 self.aut.min_pattern_len()
1929 }
1930
1931 /// Returns the length of the longest pattern matched by this automaton.
1932 ///
1933 /// # Examples
1934 ///
1935 /// Basic usage:
1936 ///
1937 /// ```
1938 /// use aho_corasick::AhoCorasick;
1939 ///
1940 /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap();
1941 /// assert_eq!(4, ac.max_pattern_len());
1942 /// ```
1943 pub fn max_pattern_len(&self) -> usize {
1944 self.aut.max_pattern_len()
1945 }
1946
1947 /// Return the total number of patterns matched by this automaton.
1948 ///
1949 /// This includes patterns that may never participate in a match. For
1950 /// example, if [`MatchKind::LeftmostFirst`] match semantics are used, and
1951 /// the patterns `Sam` and `Samwise` were used to build the automaton (in
1952 /// that order), then `Samwise` can never participate in a match because
1953 /// `Sam` will always take priority.
1954 ///
1955 /// # Examples
1956 ///
1957 /// Basic usage:
1958 ///
1959 /// ```
1960 /// use aho_corasick::AhoCorasick;
1961 ///
1962 /// let ac = AhoCorasick::new(&["foo", "bar", "baz"]).unwrap();
1963 /// assert_eq!(3, ac.patterns_len());
1964 /// ```
1965 pub fn patterns_len(&self) -> usize {
1966 self.aut.patterns_len()
1967 }
1968
1969 /// Returns the approximate total amount of heap used by this automaton, in
1970 /// units of bytes.
1971 ///
1972 /// # Examples
1973 ///
1974 /// This example shows the difference in heap usage between a few
1975 /// configurations:
1976 ///
1977 /// ```
1978 /// # if !cfg!(target_pointer_width = "64") { return; }
1979 /// use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind};
1980 ///
1981 /// let ac = AhoCorasick::builder()
1982 /// .kind(None) // default
1983 /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"])
1984 /// .unwrap();
1985 /// assert_eq!(5_632, ac.memory_usage());
1986 ///
1987 /// let ac = AhoCorasick::builder()
1988 /// .kind(None) // default
1989 /// .ascii_case_insensitive(true)
1990 /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"])
1991 /// .unwrap();
1992 /// assert_eq!(11_136, ac.memory_usage());
1993 ///
1994 /// let ac = AhoCorasick::builder()
1995 /// .kind(Some(AhoCorasickKind::NoncontiguousNFA))
1996 /// .ascii_case_insensitive(true)
1997 /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"])
1998 /// .unwrap();
1999 /// assert_eq!(10_879, ac.memory_usage());
2000 ///
2001 /// let ac = AhoCorasick::builder()
2002 /// .kind(Some(AhoCorasickKind::ContiguousNFA))
2003 /// .ascii_case_insensitive(true)
2004 /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"])
2005 /// .unwrap();
2006 /// assert_eq!(2_584, ac.memory_usage());
2007 ///
2008 /// let ac = AhoCorasick::builder()
2009 /// .kind(Some(AhoCorasickKind::DFA))
2010 /// .ascii_case_insensitive(true)
2011 /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"])
2012 /// .unwrap();
2013 /// // While this shows the DFA being the biggest here by a small margin,
2014 /// // don't let the difference fool you. With such a small number of
2015 /// // patterns, the difference is small, but a bigger number of patterns
2016 /// // will reveal that the rate of growth of the DFA is far bigger than
2017 /// // the NFAs above. For a large number of patterns, it is easy for the
2018 /// // DFA to take an order of magnitude more heap space (or more!).
2019 /// assert_eq!(11_136, ac.memory_usage());
2020 /// ```
2021 pub fn memory_usage(&self) -> usize {
2022 self.aut.memory_usage()
2023 }
2024}
2025
2026// We provide a manual debug impl so that we don't include the 'start_kind',
2027// principally because it's kind of weird to do so and because it screws with
2028// the carefully curated debug output for the underlying automaton.
2029impl core::fmt::Debug for AhoCorasick {
2030 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2031 f.debug_tuple("AhoCorasick").field(&self.aut).finish()
2032 }
2033}
2034
2035/// An iterator of non-overlapping matches in a particular haystack.
2036///
2037/// This iterator yields matches according to the [`MatchKind`] used by this
2038/// automaton.
2039///
2040/// This iterator is constructed via the [`AhoCorasick::find_iter`] and
2041/// [`AhoCorasick::try_find_iter`] methods.
2042///
2043/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
2044///
2045/// The lifetime `'h` refers to the lifetime of the haystack being searched.
2046#[derive(Debug)]
2047pub struct FindIter<'a, 'h>(automaton::FindIter<'a, 'h, Arc<dyn AcAutomaton>>);
2048
2049impl<'a, 'h> Iterator for FindIter<'a, 'h> {
2050 type Item = Match;
2051
2052 #[inline]
2053 fn next(&mut self) -> Option<Match> {
2054 self.0.next()
2055 }
2056}
2057
2058/// An iterator of overlapping matches in a particular haystack.
2059///
2060/// This iterator will report all possible matches in a particular haystack,
2061/// even when the matches overlap.
2062///
2063/// This iterator is constructed via the [`AhoCorasick::find_overlapping_iter`]
2064/// and [`AhoCorasick::try_find_overlapping_iter`] methods.
2065///
2066/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
2067///
2068/// The lifetime `'h` refers to the lifetime of the haystack being searched.
2069#[derive(Debug)]
2070pub struct FindOverlappingIter<'a, 'h>(
2071 automaton::FindOverlappingIter<'a, 'h, Arc<dyn AcAutomaton>>,
2072);
2073
2074impl<'a, 'h> Iterator for FindOverlappingIter<'a, 'h> {
2075 type Item = Match;
2076
2077 #[inline]
2078 fn next(&mut self) -> Option<Match> {
2079 self.0.next()
2080 }
2081}
2082
2083/// An iterator that reports Aho-Corasick matches in a stream.
2084///
2085/// This iterator yields elements of type `Result<Match, std::io::Error>`,
2086/// where an error is reported if there was a problem reading from the
2087/// underlying stream. The iterator terminates only when the underlying stream
2088/// reaches `EOF`.
2089///
2090/// This iterator is constructed via the [`AhoCorasick::stream_find_iter`] and
2091/// [`AhoCorasick::try_stream_find_iter`] methods.
2092///
2093/// The type variable `R` refers to the `io::Read` stream that is being read
2094/// from.
2095///
2096/// The lifetime `'a` refers to the lifetime of the corresponding
2097/// [`AhoCorasick`] searcher.
2098#[cfg(feature = "std")]
2099#[derive(Debug)]
2100pub struct StreamFindIter<'a, R>(
2101 automaton::StreamFindIter<'a, Arc<dyn AcAutomaton>, R>,
2102);
2103
2104#[cfg(feature = "std")]
2105impl<'a, R: std::io::Read> Iterator for StreamFindIter<'a, R> {
2106 type Item = Result<Match, std::io::Error>;
2107
2108 fn next(&mut self) -> Option<Result<Match, std::io::Error>> {
2109 self.0.next()
2110 }
2111}
2112
2113/// A builder for configuring an Aho-Corasick automaton.
2114///
2115/// # Quick advice
2116///
2117/// * Use [`AhoCorasickBuilder::match_kind`] to configure your searcher
2118/// with [`MatchKind::LeftmostFirst`] if you want to match how backtracking
2119/// regex engines execute searches for `pat1|pat2|..|patN`. Use
2120/// [`MatchKind::LeftmostLongest`] if you want to match how POSIX regex engines
2121/// do it.
2122/// * If you need an anchored search, use [`AhoCorasickBuilder::start_kind`] to
2123/// set the [`StartKind::Anchored`] mode since [`StartKind::Unanchored`] is the
2124/// default. Or just use [`StartKind::Both`] to support both types of searches.
2125/// * You might want to use [`AhoCorasickBuilder::kind`] to set your searcher
2126/// to always use a [`AhoCorasickKind::DFA`] if search speed is critical and
2127/// memory usage isn't a concern. Otherwise, not setting a kind will probably
2128/// make the right choice for you. Beware that if you use [`StartKind::Both`]
2129/// to build a searcher that supports both unanchored and anchored searches
2130/// _and_ you set [`AhoCorasickKind::DFA`], then the DFA will essentially be
2131/// duplicated to support both simultaneously. This results in very high memory
2132/// usage.
2133/// * For all other options, their defaults are almost certainly what you want.
2134#[derive(Clone, Debug, Default)]
2135pub struct AhoCorasickBuilder {
2136 nfa_noncontiguous: noncontiguous::Builder,
2137 nfa_contiguous: contiguous::Builder,
2138 dfa: dfa::Builder,
2139 kind: Option<AhoCorasickKind>,
2140 start_kind: StartKind,
2141}
2142
2143impl AhoCorasickBuilder {
2144 /// Create a new builder for configuring an Aho-Corasick automaton.
2145 ///
2146 /// The builder provides a way to configure a number of things, including
2147 /// ASCII case insensitivity and what kind of match semantics are used.
2148 pub fn new() -> AhoCorasickBuilder {
2149 AhoCorasickBuilder::default()
2150 }
2151
2152 /// Build an Aho-Corasick automaton using the configuration set on this
2153 /// builder.
2154 ///
2155 /// A builder may be reused to create more automatons.
2156 ///
2157 /// # Examples
2158 ///
2159 /// Basic usage:
2160 ///
2161 /// ```
2162 /// use aho_corasick::{AhoCorasickBuilder, PatternID};
2163 ///
2164 /// let patterns = &["foo", "bar", "baz"];
2165 /// let ac = AhoCorasickBuilder::new().build(patterns).unwrap();
2166 /// assert_eq!(
2167 /// Some(PatternID::must(1)),
2168 /// ac.find("xxx bar xxx").map(|m| m.pattern()),
2169 /// );
2170 /// ```
2171 pub fn build<I, P>(&self, patterns: I) -> Result<AhoCorasick, BuildError>
2172 where
2173 I: IntoIterator<Item = P>,
2174 P: AsRef<[u8]>,
2175 {
2176 let nfa = self.nfa_noncontiguous.build(patterns)?;
2177 let (aut, kind): (Arc<dyn AcAutomaton>, AhoCorasickKind) =
2178 match self.kind {
2179 None => {
2180 debug!(
2181 "asked for automatic Aho-Corasick implementation, \
2182 criteria: <patterns: {:?}, max pattern len: {:?}, \
2183 start kind: {:?}>",
2184 nfa.patterns_len(),
2185 nfa.max_pattern_len(),
2186 self.start_kind,
2187 );
2188 self.build_auto(nfa)
2189 }
2190 Some(AhoCorasickKind::NoncontiguousNFA) => {
2191 debug!("forcefully chose noncontiguous NFA");
2192 (Arc::new(nfa), AhoCorasickKind::NoncontiguousNFA)
2193 }
2194 Some(AhoCorasickKind::ContiguousNFA) => {
2195 debug!("forcefully chose contiguous NFA");
2196 let cnfa =
2197 self.nfa_contiguous.build_from_noncontiguous(&nfa)?;
2198 (Arc::new(cnfa), AhoCorasickKind::ContiguousNFA)
2199 }
2200 Some(AhoCorasickKind::DFA) => {
2201 debug!("forcefully chose DFA");
2202 let dfa = self.dfa.build_from_noncontiguous(&nfa)?;
2203 (Arc::new(dfa), AhoCorasickKind::DFA)
2204 }
2205 };
2206 Ok(AhoCorasick { aut, kind, start_kind: self.start_kind })
2207 }
2208
2209 /// Implements the automatic selection logic for the Aho-Corasick
2210 /// implementation to use. Since all Aho-Corasick automatons are built
2211 /// from a non-contiguous NFA, the caller is responsible for building
2212 /// that first.
2213 fn build_auto(
2214 &self,
2215 nfa: noncontiguous::NFA,
2216 ) -> (Arc<dyn AcAutomaton>, AhoCorasickKind) {
2217 // We try to build a DFA if we have a very small number of patterns,
2218 // otherwise the memory usage just gets too crazy. We also only do it
2219 // when the start kind is unanchored or anchored, but not both, because
2220 // both implies two full copies of the transition table.
2221 let try_dfa = !matches!(self.start_kind, StartKind::Both)
2222 && nfa.patterns_len() <= 100;
2223 if try_dfa {
2224 match self.dfa.build_from_noncontiguous(&nfa) {
2225 Ok(dfa) => {
2226 debug!("chose a DFA");
2227 return (Arc::new(dfa), AhoCorasickKind::DFA);
2228 }
2229 Err(_err) => {
2230 debug!(
2231 "failed to build DFA, trying something else: {}",
2232 _err
2233 );
2234 }
2235 }
2236 }
2237 // We basically always want a contiguous NFA if the limited
2238 // circumstances in which we use a DFA are not true. It is quite fast
2239 // and has excellent memory usage. The only way we don't use it is if
2240 // there are so many states that it can't fit in a contiguous NFA.
2241 // And the only way to know that is to try to build it. Building a
2242 // contiguous NFA is mostly just reshuffling data from a noncontiguous
2243 // NFA, so it isn't too expensive, especially relative to building a
2244 // noncontiguous NFA in the first place.
2245 match self.nfa_contiguous.build_from_noncontiguous(&nfa) {
2246 Ok(nfa) => {
2247 debug!("chose contiguous NFA");
2248 return (Arc::new(nfa), AhoCorasickKind::ContiguousNFA);
2249 }
2250 #[allow(unused_variables)] // unused when 'logging' is disabled
2251 Err(_err) => {
2252 debug!(
2253 "failed to build contiguous NFA, \
2254 trying something else: {}",
2255 _err
2256 );
2257 }
2258 }
2259 debug!("chose non-contiguous NFA");
2260 (Arc::new(nfa), AhoCorasickKind::NoncontiguousNFA)
2261 }
2262
2263 /// Set the desired match semantics.
2264 ///
2265 /// The default is [`MatchKind::Standard`], which corresponds to the match
2266 /// semantics supported by the standard textbook description of the
2267 /// Aho-Corasick algorithm. Namely, matches are reported as soon as they
2268 /// are found. Moreover, this is the only way to get overlapping matches
2269 /// or do stream searching.
2270 ///
2271 /// The other kinds of match semantics that are supported are
2272 /// [`MatchKind::LeftmostFirst`] and [`MatchKind::LeftmostLongest`]. The
2273 /// former corresponds to the match you would get if you were to try to
2274 /// match each pattern at each position in the haystack in the same order
2275 /// that you give to the automaton. That is, it returns the leftmost match
2276 /// corresponding to the earliest pattern given to the automaton. The
2277 /// latter corresponds to finding the longest possible match among all
2278 /// leftmost matches.
2279 ///
2280 /// For more details on match semantics, see the [documentation for
2281 /// `MatchKind`](MatchKind).
2282 ///
2283 /// Note that setting this to [`MatchKind::LeftmostFirst`] or
2284 /// [`MatchKind::LeftmostLongest`] will cause some search routines on
2285 /// [`AhoCorasick`] to return an error (or panic if you're using the
2286 /// infallible API). Notably, this includes stream and overlapping
2287 /// searches.
2288 ///
2289 /// # Examples
2290 ///
2291 /// In these examples, we demonstrate the differences between match
2292 /// semantics for a particular set of patterns in a specific order:
2293 /// `b`, `abc`, `abcd`.
2294 ///
2295 /// Standard semantics:
2296 ///
2297 /// ```
2298 /// use aho_corasick::{AhoCorasick, MatchKind};
2299 ///
2300 /// let patterns = &["b", "abc", "abcd"];
2301 /// let haystack = "abcd";
2302 ///
2303 /// let ac = AhoCorasick::builder()
2304 /// .match_kind(MatchKind::Standard) // default, not necessary
2305 /// .build(patterns)
2306 /// .unwrap();
2307 /// let mat = ac.find(haystack).expect("should have a match");
2308 /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
2309 /// ```
2310 ///
2311 /// Leftmost-first semantics:
2312 ///
2313 /// ```
2314 /// use aho_corasick::{AhoCorasick, MatchKind};
2315 ///
2316 /// let patterns = &["b", "abc", "abcd"];
2317 /// let haystack = "abcd";
2318 ///
2319 /// let ac = AhoCorasick::builder()
2320 /// .match_kind(MatchKind::LeftmostFirst)
2321 /// .build(patterns)
2322 /// .unwrap();
2323 /// let mat = ac.find(haystack).expect("should have a match");
2324 /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
2325 /// ```
2326 ///
2327 /// Leftmost-longest semantics:
2328 ///
2329 /// ```
2330 /// use aho_corasick::{AhoCorasick, MatchKind};
2331 ///
2332 /// let patterns = &["b", "abc", "abcd"];
2333 /// let haystack = "abcd";
2334 ///
2335 /// let ac = AhoCorasick::builder()
2336 /// .match_kind(MatchKind::LeftmostLongest)
2337 /// .build(patterns)
2338 /// .unwrap();
2339 /// let mat = ac.find(haystack).expect("should have a match");
2340 /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
2341 /// ```
2342 pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder {
2343 self.nfa_noncontiguous.match_kind(kind);
2344 self.nfa_contiguous.match_kind(kind);
2345 self.dfa.match_kind(kind);
2346 self
2347 }
2348
2349 /// Sets the starting state configuration for the automaton.
2350 ///
2351 /// Every Aho-Corasick automaton is capable of having two start states: one
2352 /// that is used for unanchored searches and one that is used for anchored
2353 /// searches. Some automatons, like the NFAs, support this with almost zero
2354 /// additional cost. Other automatons, like the DFA, require two copies of
2355 /// the underlying transition table to support both simultaneously.
2356 ///
2357 /// Because there may be an added non-trivial cost to supporting both, it
2358 /// is possible to configure which starting state configuration is needed.
2359 ///
2360 /// Indeed, since anchored searches tend to be somewhat more rare,
2361 /// _only_ unanchored searches are supported by default. Thus,
2362 /// [`StartKind::Unanchored`] is the default.
2363 ///
2364 /// Note that when this is set to [`StartKind::Unanchored`], then
2365 /// running an anchored search will result in an error (or a panic
2366 /// if using the infallible APIs). Similarly, when this is set to
2367 /// [`StartKind::Anchored`], then running an unanchored search will
2368 /// result in an error (or a panic if using the infallible APIs). When
2369 /// [`StartKind::Both`] is used, then both unanchored and anchored searches
2370 /// are always supported.
2371 ///
2372 /// Also note that even if an `AhoCorasick` searcher is using an NFA
2373 /// internally (which always supports both unanchored and anchored
2374 /// searches), an error will still be reported for a search that isn't
2375 /// supported by the configuration set via this method. This means,
2376 /// for example, that an error is never dependent on which internal
2377 /// implementation of Aho-Corasick is used.
2378 ///
2379 /// # Example: anchored search
2380 ///
2381 /// This shows how to build a searcher that only supports anchored
2382 /// searches:
2383 ///
2384 /// ```
2385 /// use aho_corasick::{
2386 /// AhoCorasick, Anchored, Input, Match, MatchKind, StartKind,
2387 /// };
2388 ///
2389 /// let ac = AhoCorasick::builder()
2390 /// .match_kind(MatchKind::LeftmostFirst)
2391 /// .start_kind(StartKind::Anchored)
2392 /// .build(&["b", "abc", "abcd"])
2393 /// .unwrap();
2394 ///
2395 /// // An unanchored search is not supported! An error here is guaranteed
2396 /// // given the configuration above regardless of which kind of
2397 /// // Aho-Corasick implementation ends up being used internally.
2398 /// let input = Input::new("foo abcd").anchored(Anchored::No);
2399 /// assert!(ac.try_find(input).is_err());
2400 ///
2401 /// let input = Input::new("foo abcd").anchored(Anchored::Yes);
2402 /// assert_eq!(None, ac.try_find(input)?);
2403 ///
2404 /// let input = Input::new("abcd").anchored(Anchored::Yes);
2405 /// assert_eq!(Some(Match::must(1, 0..3)), ac.try_find(input)?);
2406 ///
2407 /// # Ok::<(), Box<dyn std::error::Error>>(())
2408 /// ```
2409 ///
2410 /// # Example: unanchored and anchored searches
2411 ///
2412 /// This shows how to build a searcher that supports both unanchored and
2413 /// anchored searches:
2414 ///
2415 /// ```
2416 /// use aho_corasick::{
2417 /// AhoCorasick, Anchored, Input, Match, MatchKind, StartKind,
2418 /// };
2419 ///
2420 /// let ac = AhoCorasick::builder()
2421 /// .match_kind(MatchKind::LeftmostFirst)
2422 /// .start_kind(StartKind::Both)
2423 /// .build(&["b", "abc", "abcd"])
2424 /// .unwrap();
2425 ///
2426 /// let input = Input::new("foo abcd").anchored(Anchored::No);
2427 /// assert_eq!(Some(Match::must(1, 4..7)), ac.try_find(input)?);
2428 ///
2429 /// let input = Input::new("foo abcd").anchored(Anchored::Yes);
2430 /// assert_eq!(None, ac.try_find(input)?);
2431 ///
2432 /// let input = Input::new("abcd").anchored(Anchored::Yes);
2433 /// assert_eq!(Some(Match::must(1, 0..3)), ac.try_find(input)?);
2434 ///
2435 /// # Ok::<(), Box<dyn std::error::Error>>(())
2436 /// ```
2437 pub fn start_kind(&mut self, kind: StartKind) -> &mut AhoCorasickBuilder {
2438 self.dfa.start_kind(kind);
2439 self.start_kind = kind;
2440 self
2441 }
2442
2443 /// Enable ASCII-aware case insensitive matching.
2444 ///
2445 /// When this option is enabled, searching will be performed without
2446 /// respect to case for ASCII letters (`a-z` and `A-Z`) only.
2447 ///
2448 /// Enabling this option does not change the search algorithm, but it may
2449 /// increase the size of the automaton.
2450 ///
2451 /// **NOTE:** It is unlikely that support for Unicode case folding will
2452 /// be added in the future. The ASCII case works via a simple hack to the
2453 /// underlying automaton, but full Unicode handling requires a fair bit of
2454 /// sophistication. If you do need Unicode handling, you might consider
2455 /// using the [`regex` crate](https://docs.rs/regex) or the lower level
2456 /// [`regex-automata` crate](https://docs.rs/regex-automata).
2457 ///
2458 /// # Examples
2459 ///
2460 /// Basic usage:
2461 ///
2462 /// ```
2463 /// use aho_corasick::AhoCorasick;
2464 ///
2465 /// let patterns = &["FOO", "bAr", "BaZ"];
2466 /// let haystack = "foo bar baz";
2467 ///
2468 /// let ac = AhoCorasick::builder()
2469 /// .ascii_case_insensitive(true)
2470 /// .build(patterns)
2471 /// .unwrap();
2472 /// assert_eq!(3, ac.find_iter(haystack).count());
2473 /// ```
2474 pub fn ascii_case_insensitive(
2475 &mut self,
2476 yes: bool,
2477 ) -> &mut AhoCorasickBuilder {
2478 self.nfa_noncontiguous.ascii_case_insensitive(yes);
2479 self.nfa_contiguous.ascii_case_insensitive(yes);
2480 self.dfa.ascii_case_insensitive(yes);
2481 self
2482 }
2483
2484 /// Choose the type of underlying automaton to use.
2485 ///
2486 /// Currently, there are four choices:
2487 ///
2488 /// * [`AhoCorasickKind::NoncontiguousNFA`] instructs the searcher to
2489 /// use a [`noncontiguous::NFA`]. A noncontiguous NFA is the fastest to
2490 /// be built, has moderate memory usage and is typically the slowest to
2491 /// execute a search.
2492 /// * [`AhoCorasickKind::ContiguousNFA`] instructs the searcher to use a
2493 /// [`contiguous::NFA`]. A contiguous NFA is a little slower to build than
2494 /// a noncontiguous NFA, has excellent memory usage and is typically a
2495 /// little slower than a DFA for a search.
2496 /// * [`AhoCorasickKind::DFA`] instructs the searcher to use a
2497 /// [`dfa::DFA`]. A DFA is very slow to build, uses exorbitant amounts of
2498 /// memory, but will typically execute searches the fastest.
2499 /// * `None` (the default) instructs the searcher to choose the "best"
2500 /// Aho-Corasick implementation. This choice is typically based primarily
2501 /// on the number of patterns.
2502 ///
2503 /// Setting this configuration does not change the time complexity for
2504 /// constructing the Aho-Corasick automaton (which is `O(p)` where `p`
2505 /// is the total number of patterns being compiled). Setting this to
2506 /// [`AhoCorasickKind::DFA`] does however reduce the time complexity of
2507 /// non-overlapping searches from `O(n + p)` to `O(n)`, where `n` is the
2508 /// length of the haystack.
2509 ///
2510 /// In general, you should probably stick to the default unless you have
2511 /// some kind of reason to use a specific Aho-Corasick implementation. For
2512 /// example, you might choose `AhoCorasickKind::DFA` if you don't care
2513 /// about memory usage and want the fastest possible search times.
2514 ///
2515 /// Setting this guarantees that the searcher returned uses the chosen
2516 /// implementation. If that implementation could not be constructed, then
2517 /// an error will be returned. In contrast, when `None` is used, it is
2518 /// possible for it to attempt to construct, for example, a contiguous
2519 /// NFA and have it fail. In which case, it will fall back to using a
2520 /// noncontiguous NFA.
2521 ///
2522 /// If `None` is given, then one may use [`AhoCorasick::kind`] to determine
2523 /// which Aho-Corasick implementation was chosen.
2524 ///
2525 /// Note that the heuristics used for choosing which `AhoCorasickKind`
2526 /// may be changed in a semver compatible release.
2527 pub fn kind(
2528 &mut self,
2529 kind: Option<AhoCorasickKind>,
2530 ) -> &mut AhoCorasickBuilder {
2531 self.kind = kind;
2532 self
2533 }
2534
2535 /// Enable heuristic prefilter optimizations.
2536 ///
2537 /// When enabled, searching will attempt to quickly skip to match
2538 /// candidates using specialized literal search routines. A prefilter
2539 /// cannot always be used, and is generally treated as a heuristic. It
2540 /// can be useful to disable this if the prefilter is observed to be
2541 /// sub-optimal for a particular workload.
2542 ///
2543 /// Currently, prefilters are typically only active when building searchers
2544 /// with a small (less than 100) number of patterns.
2545 ///
2546 /// This is enabled by default.
2547 pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
2548 self.nfa_noncontiguous.prefilter(yes);
2549 self.nfa_contiguous.prefilter(yes);
2550 self.dfa.prefilter(yes);
2551 self
2552 }
2553
2554 /// Set the limit on how many states use a dense representation for their
2555 /// transitions. Other states will generally use a sparse representation.
2556 ///
2557 /// A dense representation uses more memory but is generally faster, since
2558 /// the next transition in a dense representation can be computed in a
2559 /// constant number of instructions. A sparse representation uses less
2560 /// memory but is generally slower, since the next transition in a sparse
2561 /// representation requires executing a variable number of instructions.
2562 ///
2563 /// This setting is only used when an Aho-Corasick implementation is used
2564 /// that supports the dense versus sparse representation trade off. Not all
2565 /// do.
2566 ///
2567 /// This limit is expressed in terms of the depth of a state, i.e., the
2568 /// number of transitions from the starting state of the automaton. The
2569 /// idea is that most of the time searching will be spent near the starting
2570 /// state of the automaton, so states near the start state should use a
2571 /// dense representation. States further away from the start state would
2572 /// then use a sparse representation.
2573 ///
2574 /// By default, this is set to a low but non-zero number. Setting this to
2575 /// `0` is almost never what you want, since it is likely to make searches
2576 /// very slow due to the start state itself being forced to use a sparse
2577 /// representation. However, it is unlikely that increasing this number
2578 /// will help things much, since the most active states have a small depth.
2579 /// More to the point, the memory usage increases superlinearly as this
2580 /// number increases.
2581 pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder {
2582 self.nfa_noncontiguous.dense_depth(depth);
2583 self.nfa_contiguous.dense_depth(depth);
2584 self
2585 }
2586
2587 /// A debug settting for whether to attempt to shrink the size of the
2588 /// automaton's alphabet or not.
2589 ///
2590 /// This option is enabled by default and should never be disabled unless
2591 /// one is debugging the underlying automaton.
2592 ///
2593 /// When enabled, some (but not all) Aho-Corasick automatons will use a map
2594 /// from all possible bytes to their corresponding equivalence class. Each
2595 /// equivalence class represents a set of bytes that does not discriminate
2596 /// between a match and a non-match in the automaton.
2597 ///
2598 /// The advantage of this map is that the size of the transition table can
2599 /// be reduced drastically from `#states * 256 * sizeof(u32)` to
2600 /// `#states * k * sizeof(u32)` where `k` is the number of equivalence
2601 /// classes (rounded up to the nearest power of 2). As a result, total
2602 /// space usage can decrease substantially. Moreover, since a smaller
2603 /// alphabet is used, automaton compilation becomes faster as well.
2604 ///
2605 /// **WARNING:** This is only useful for debugging automatons. Disabling
2606 /// this does not yield any speed advantages. Namely, even when this is
2607 /// disabled, a byte class map is still used while searching. The only
2608 /// difference is that every byte will be forced into its own distinct
2609 /// equivalence class. This is useful for debugging the actual generated
2610 /// transitions because it lets one see the transitions defined on actual
2611 /// bytes instead of the equivalence classes.
2612 pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
2613 self.nfa_contiguous.byte_classes(yes);
2614 self.dfa.byte_classes(yes);
2615 self
2616 }
2617}
2618
2619/// The type of Aho-Corasick implementation to use in an [`AhoCorasick`]
2620/// searcher.
2621///
2622/// This is principally used as an input to the
2623/// [`AhoCorasickBuilder::start_kind`] method. Its documentation goes into more
2624/// detail about each choice.
2625#[non_exhaustive]
2626#[derive(Clone, Copy, Debug, Eq, PartialEq)]
2627pub enum AhoCorasickKind {
2628 /// Use a noncontiguous NFA.
2629 NoncontiguousNFA,
2630 /// Use a contiguous NFA.
2631 ContiguousNFA,
2632 /// Use a DFA. Warning: DFAs typically use a large amount of memory.
2633 DFA,
2634}
2635
2636/// A trait that effectively gives us practical dynamic dispatch over anything
2637/// that impls `Automaton`, but without needing to add a bunch of bounds to
2638/// the core `Automaton` trait. Basically, we provide all of the marker traits
2639/// that our automatons have, in addition to `Debug` impls and requiring that
2640/// there is no borrowed data. Without these, the main `AhoCorasick` type would
2641/// not be able to meaningfully impl `Debug` or the marker traits without also
2642/// requiring that all impls of `Automaton` do so, which would be not great.
2643trait AcAutomaton:
2644 Automaton + Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static
2645{
2646}
2647
2648impl<A> AcAutomaton for A where
2649 A: Automaton + Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static
2650{
2651}
2652
2653impl crate::automaton::private::Sealed for Arc<dyn AcAutomaton> {}
2654
2655// I'm not sure why this trait impl shows up in the docs, as the AcAutomaton
2656// trait is not exported. So we forcefully hide it.
2657//
2658// SAFETY: This just defers to the underlying 'AcAutomaton' and thus inherits
2659// its safety properties.
2660#[doc(hidden)]
2661unsafe impl Automaton for Arc<dyn AcAutomaton> {
2662 #[inline(always)]
2663 fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
2664 (**self).start_state(anchored)
2665 }
2666
2667 #[inline(always)]
2668 fn next_state(
2669 &self,
2670 anchored: Anchored,
2671 sid: StateID,
2672 byte: u8,
2673 ) -> StateID {
2674 (**self).next_state(anchored, sid, byte)
2675 }
2676
2677 #[inline(always)]
2678 fn is_special(&self, sid: StateID) -> bool {
2679 (**self).is_special(sid)
2680 }
2681
2682 #[inline(always)]
2683 fn is_dead(&self, sid: StateID) -> bool {
2684 (**self).is_dead(sid)
2685 }
2686
2687 #[inline(always)]
2688 fn is_match(&self, sid: StateID) -> bool {
2689 (**self).is_match(sid)
2690 }
2691
2692 #[inline(always)]
2693 fn is_start(&self, sid: StateID) -> bool {
2694 (**self).is_start(sid)
2695 }
2696
2697 #[inline(always)]
2698 fn match_kind(&self) -> MatchKind {
2699 (**self).match_kind()
2700 }
2701
2702 #[inline(always)]
2703 fn match_len(&self, sid: StateID) -> usize {
2704 (**self).match_len(sid)
2705 }
2706
2707 #[inline(always)]
2708 fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
2709 (**self).match_pattern(sid, index)
2710 }
2711
2712 #[inline(always)]
2713 fn patterns_len(&self) -> usize {
2714 (**self).patterns_len()
2715 }
2716
2717 #[inline(always)]
2718 fn pattern_len(&self, pid: PatternID) -> usize {
2719 (**self).pattern_len(pid)
2720 }
2721
2722 #[inline(always)]
2723 fn min_pattern_len(&self) -> usize {
2724 (**self).min_pattern_len()
2725 }
2726
2727 #[inline(always)]
2728 fn max_pattern_len(&self) -> usize {
2729 (**self).max_pattern_len()
2730 }
2731
2732 #[inline(always)]
2733 fn memory_usage(&self) -> usize {
2734 (**self).memory_usage()
2735 }
2736
2737 #[inline(always)]
2738 fn prefilter(&self) -> Option<&Prefilter> {
2739 (**self).prefilter()
2740 }
2741
2742 // Even though 'try_find' and 'try_find_overlapping' each have their
2743 // own default impls, we explicitly define them here to fix a perf bug.
2744 // Without these explicit definitions, the default impl will wind up using
2745 // dynamic dispatch for all 'Automaton' method calls, including things like
2746 // 'next_state' that absolutely must get inlined or else perf is trashed.
2747 // Defining them explicitly here like this still requires dynamic dispatch
2748 // to call 'try_find' itself, but all uses of 'Automaton' within 'try_find'
2749 // are monomorphized.
2750 //
2751 // We don't need to explicitly impl any other methods, I think, because
2752 // they are all implemented themselves in terms of 'try_find' and
2753 // 'try_find_overlapping'. We still might wind up with an extra virtual
2754 // call here or there, but that's okay since it's outside of any perf
2755 // critical areas.
2756
2757 #[inline(always)]
2758 fn try_find(
2759 &self,
2760 input: &Input<'_>,
2761 ) -> Result<Option<Match>, MatchError> {
2762 (**self).try_find(input)
2763 }
2764
2765 #[inline(always)]
2766 fn try_find_overlapping(
2767 &self,
2768 input: &Input<'_>,
2769 state: &mut OverlappingState,
2770 ) -> Result<(), MatchError> {
2771 (**self).try_find_overlapping(input, state)
2772 }
2773}
2774
2775/// Returns an error if the start state configuration does not support the
2776/// desired search configuration. See the internal 'AhoCorasick::start_kind'
2777/// field docs for more details.
2778fn enforce_anchored_consistency(
2779 have: StartKind,
2780 want: Anchored,
2781) -> Result<(), MatchError> {
2782 match have {
2783 StartKind::Both => Ok(()),
2784 StartKind::Unanchored if !want.is_anchored() => Ok(()),
2785 StartKind::Unanchored => Err(MatchError::invalid_input_anchored()),
2786 StartKind::Anchored if want.is_anchored() => Ok(()),
2787 StartKind::Anchored => Err(MatchError::invalid_input_unanchored()),
2788 }
2789}
2790