1use crate::{
2 packed::{
3 pattern::Patterns,
4 rabinkarp::RabinKarp,
5 teddy::{self, Teddy},
6 },
7 util::search::{Match, Span},
8};
9
10/// This is a limit placed on the total number of patterns we're willing to try
11/// and match at once. As more sophisticated algorithms are added, this number
12/// may be increased.
13const PATTERN_LIMIT: usize = 128;
14
15/// A knob for controlling the match semantics of a packed multiple string
16/// searcher.
17///
18/// This differs from the [`MatchKind`](crate::MatchKind) type in the top-level
19/// crate module in that it doesn't support "standard" match semantics,
20/// and instead only supports leftmost-first or leftmost-longest. Namely,
21/// "standard" semantics cannot be easily supported by packed searchers.
22///
23/// For more information on the distinction between leftmost-first and
24/// leftmost-longest, see the docs on the top-level `MatchKind` type.
25///
26/// Unlike the top-level `MatchKind` type, the default match semantics for this
27/// type are leftmost-first.
28#[derive(Clone, Copy, Debug, Eq, PartialEq)]
29#[non_exhaustive]
30pub enum MatchKind {
31 /// Use leftmost-first match semantics, which reports leftmost matches.
32 /// When there are multiple possible leftmost matches, the match
33 /// corresponding to the pattern that appeared earlier when constructing
34 /// the automaton is reported.
35 ///
36 /// This is the default.
37 LeftmostFirst,
38 /// Use leftmost-longest match semantics, which reports leftmost matches.
39 /// When there are multiple possible leftmost matches, the longest match
40 /// is chosen.
41 LeftmostLongest,
42}
43
44impl Default for MatchKind {
45 fn default() -> MatchKind {
46 MatchKind::LeftmostFirst
47 }
48}
49
50/// The configuration for a packed multiple pattern searcher.
51///
52/// The configuration is currently limited only to being able to select the
53/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
54/// future, more knobs may be made available.
55///
56/// A configuration produces a [`packed::Builder`](Builder), which in turn can
57/// be used to construct a [`packed::Searcher`](Searcher) for searching.
58///
59/// # Example
60///
61/// This example shows how to use leftmost-longest semantics instead of the
62/// default (leftmost-first).
63///
64/// ```
65/// use aho_corasick::{packed::{Config, MatchKind}, PatternID};
66///
67/// # fn example() -> Option<()> {
68/// let searcher = Config::new()
69/// .match_kind(MatchKind::LeftmostLongest)
70/// .builder()
71/// .add("foo")
72/// .add("foobar")
73/// .build()?;
74/// let matches: Vec<PatternID> = searcher
75/// .find_iter("foobar")
76/// .map(|mat| mat.pattern())
77/// .collect();
78/// assert_eq!(vec![PatternID::must(1)], matches);
79/// # Some(()) }
80/// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
81/// # example().unwrap()
82/// # } else {
83/// # assert!(example().is_none());
84/// # }
85/// ```
86#[derive(Clone, Debug)]
87pub struct Config {
88 kind: MatchKind,
89 force: Option<ForceAlgorithm>,
90 force_teddy_fat: Option<bool>,
91 force_avx: Option<bool>,
92}
93
94/// An internal option for forcing the use of a particular packed algorithm.
95///
96/// When an algorithm is forced, if a searcher could not be constructed for it,
97/// then no searcher will be returned even if an alternative algorithm would
98/// work.
99#[derive(Clone, Debug)]
100enum ForceAlgorithm {
101 Teddy,
102 RabinKarp,
103}
104
105impl Default for Config {
106 fn default() -> Config {
107 Config::new()
108 }
109}
110
111impl Config {
112 /// Create a new default configuration. A default configuration uses
113 /// leftmost-first match semantics.
114 pub fn new() -> Config {
115 Config {
116 kind: MatchKind::LeftmostFirst,
117 force: None,
118 force_teddy_fat: None,
119 force_avx: None,
120 }
121 }
122
123 /// Create a packed builder from this configuration. The builder can be
124 /// used to accumulate patterns and create a [`Searcher`] from them.
125 pub fn builder(&self) -> Builder {
126 Builder::from_config(self.clone())
127 }
128
129 /// Set the match semantics for this configuration.
130 pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
131 self.kind = kind;
132 self
133 }
134
135 /// An undocumented method for forcing the use of the Teddy algorithm.
136 ///
137 /// This is only exposed for more precise testing and benchmarks. Callers
138 /// should not use it as it is not part of the API stability guarantees of
139 /// this crate.
140 #[doc(hidden)]
141 pub fn force_teddy(&mut self, yes: bool) -> &mut Config {
142 if yes {
143 self.force = Some(ForceAlgorithm::Teddy);
144 } else {
145 self.force = None;
146 }
147 self
148 }
149
150 /// An undocumented method for forcing the use of the Fat Teddy algorithm.
151 ///
152 /// This is only exposed for more precise testing and benchmarks. Callers
153 /// should not use it as it is not part of the API stability guarantees of
154 /// this crate.
155 #[doc(hidden)]
156 pub fn force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
157 self.force_teddy_fat = yes;
158 self
159 }
160
161 /// An undocumented method for forcing the use of SSE (`Some(false)`) or
162 /// AVX (`Some(true)`) algorithms.
163 ///
164 /// This is only exposed for more precise testing and benchmarks. Callers
165 /// should not use it as it is not part of the API stability guarantees of
166 /// this crate.
167 #[doc(hidden)]
168 pub fn force_avx(&mut self, yes: Option<bool>) -> &mut Config {
169 self.force_avx = yes;
170 self
171 }
172
173 /// An undocumented method for forcing the use of the Rabin-Karp algorithm.
174 ///
175 /// This is only exposed for more precise testing and benchmarks. Callers
176 /// should not use it as it is not part of the API stability guarantees of
177 /// this crate.
178 #[doc(hidden)]
179 pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config {
180 if yes {
181 self.force = Some(ForceAlgorithm::RabinKarp);
182 } else {
183 self.force = None;
184 }
185 self
186 }
187}
188
189/// A builder for constructing a packed searcher from a collection of patterns.
190///
191/// # Example
192///
193/// This example shows how to use a builder to construct a searcher. By
194/// default, leftmost-first match semantics are used.
195///
196/// ```
197/// use aho_corasick::{packed::{Builder, MatchKind}, PatternID};
198///
199/// # fn example() -> Option<()> {
200/// let searcher = Builder::new()
201/// .add("foobar")
202/// .add("foo")
203/// .build()?;
204/// let matches: Vec<PatternID> = searcher
205/// .find_iter("foobar")
206/// .map(|mat| mat.pattern())
207/// .collect();
208/// assert_eq!(vec![PatternID::ZERO], matches);
209/// # Some(()) }
210/// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
211/// # example().unwrap()
212/// # } else {
213/// # assert!(example().is_none());
214/// # }
215/// ```
216#[derive(Clone, Debug)]
217pub struct Builder {
218 /// The configuration of this builder and subsequent matcher.
219 config: Config,
220 /// Set to true if the builder detects that a matcher cannot be built.
221 inert: bool,
222 /// The patterns provided by the caller.
223 patterns: Patterns,
224}
225
226impl Builder {
227 /// Create a new builder for constructing a multi-pattern searcher. This
228 /// constructor uses the default configuration.
229 pub fn new() -> Builder {
230 Builder::from_config(Config::new())
231 }
232
233 fn from_config(config: Config) -> Builder {
234 Builder { config, inert: false, patterns: Patterns::new() }
235 }
236
237 /// Build a searcher from the patterns added to this builder so far.
238 pub fn build(&self) -> Option<Searcher> {
239 if self.inert || self.patterns.is_empty() {
240 return None;
241 }
242 let mut patterns = self.patterns.clone();
243 patterns.set_match_kind(self.config.kind);
244 let rabinkarp = RabinKarp::new(&patterns);
245 // Effectively, we only want to return a searcher if we can use Teddy,
246 // since Teddy is our only fast packed searcher at the moment.
247 // Rabin-Karp is only used when searching haystacks smaller than what
248 // Teddy can support. Thus, the only way to get a Rabin-Karp searcher
249 // is to force it using undocumented APIs (for tests/benchmarks).
250 let (search_kind, minimum_len) = match self.config.force {
251 None | Some(ForceAlgorithm::Teddy) => {
252 debug!("trying to build Teddy packed matcher");
253 let teddy = match self.build_teddy(&patterns) {
254 None => return None,
255 Some(teddy) => teddy,
256 };
257 let minimum_len = teddy.minimum_len();
258 (SearchKind::Teddy(teddy), minimum_len)
259 }
260 Some(ForceAlgorithm::RabinKarp) => {
261 debug!("using Rabin-Karp packed matcher");
262 (SearchKind::RabinKarp, 0)
263 }
264 };
265 Some(Searcher { patterns, rabinkarp, search_kind, minimum_len })
266 }
267
268 fn build_teddy(&self, patterns: &Patterns) -> Option<Teddy> {
269 teddy::Builder::new()
270 .avx(self.config.force_avx)
271 .fat(self.config.force_teddy_fat)
272 .build(&patterns)
273 }
274
275 /// Add the given pattern to this set to match.
276 ///
277 /// The order in which patterns are added is significant. Namely, when
278 /// using leftmost-first match semantics, then when multiple patterns can
279 /// match at a particular location, the pattern that was added first is
280 /// used as the match.
281 ///
282 /// If the number of patterns added exceeds the amount supported by packed
283 /// searchers, then the builder will stop accumulating patterns and render
284 /// itself inert. At this point, constructing a searcher will always return
285 /// `None`.
286 pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
287 if self.inert {
288 return self;
289 } else if self.patterns.len() >= PATTERN_LIMIT {
290 self.inert = true;
291 self.patterns.reset();
292 return self;
293 }
294 // Just in case PATTERN_LIMIT increases beyond u16::MAX.
295 assert!(self.patterns.len() <= core::u16::MAX as usize);
296
297 let pattern = pattern.as_ref();
298 if pattern.is_empty() {
299 self.inert = true;
300 self.patterns.reset();
301 return self;
302 }
303 self.patterns.add(pattern);
304 self
305 }
306
307 /// Add the given iterator of patterns to this set to match.
308 ///
309 /// The iterator must yield elements that can be converted into a `&[u8]`.
310 ///
311 /// The order in which patterns are added is significant. Namely, when
312 /// using leftmost-first match semantics, then when multiple patterns can
313 /// match at a particular location, the pattern that was added first is
314 /// used as the match.
315 ///
316 /// If the number of patterns added exceeds the amount supported by packed
317 /// searchers, then the builder will stop accumulating patterns and render
318 /// itself inert. At this point, constructing a searcher will always return
319 /// `None`.
320 pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
321 where
322 I: IntoIterator<Item = P>,
323 P: AsRef<[u8]>,
324 {
325 for p in patterns {
326 self.add(p);
327 }
328 self
329 }
330}
331
332impl Default for Builder {
333 fn default() -> Builder {
334 Builder::new()
335 }
336}
337
338/// A packed searcher for quickly finding occurrences of multiple patterns.
339///
340/// If callers need more flexible construction, or if one wants to change the
341/// match semantics (either leftmost-first or leftmost-longest), then one can
342/// use the [`Config`] and/or [`Builder`] types for more fine grained control.
343///
344/// # Example
345///
346/// This example shows how to create a searcher from an iterator of patterns.
347/// By default, leftmost-first match semantics are used.
348///
349/// ```
350/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
351///
352/// # fn example() -> Option<()> {
353/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
354/// let matches: Vec<PatternID> = searcher
355/// .find_iter("foobar")
356/// .map(|mat| mat.pattern())
357/// .collect();
358/// assert_eq!(vec![PatternID::ZERO], matches);
359/// # Some(()) }
360/// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
361/// # example().unwrap()
362/// # } else {
363/// # assert!(example().is_none());
364/// # }
365/// ```
366#[derive(Clone, Debug)]
367pub struct Searcher {
368 patterns: Patterns,
369 rabinkarp: RabinKarp,
370 search_kind: SearchKind,
371 minimum_len: usize,
372}
373
374#[derive(Clone, Debug)]
375enum SearchKind {
376 Teddy(Teddy),
377 RabinKarp,
378}
379
380impl Searcher {
381 /// A convenience function for constructing a searcher from an iterator
382 /// of things that can be converted to a `&[u8]`.
383 ///
384 /// If a searcher could not be constructed (either because of an
385 /// unsupported CPU or because there are too many patterns), then `None`
386 /// is returned.
387 ///
388 /// # Example
389 ///
390 /// Basic usage:
391 ///
392 /// ```
393 /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
394 ///
395 /// # fn example() -> Option<()> {
396 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
397 /// let matches: Vec<PatternID> = searcher
398 /// .find_iter("foobar")
399 /// .map(|mat| mat.pattern())
400 /// .collect();
401 /// assert_eq!(vec![PatternID::ZERO], matches);
402 /// # Some(()) }
403 /// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
404 /// # example().unwrap()
405 /// # } else {
406 /// # assert!(example().is_none());
407 /// # }
408 /// ```
409 pub fn new<I, P>(patterns: I) -> Option<Searcher>
410 where
411 I: IntoIterator<Item = P>,
412 P: AsRef<[u8]>,
413 {
414 Builder::new().extend(patterns).build()
415 }
416
417 /// A convenience function for calling `Config::new()`.
418 ///
419 /// This is useful for avoiding an additional import.
420 pub fn config() -> Config {
421 Config::new()
422 }
423
424 /// A convenience function for calling `Builder::new()`.
425 ///
426 /// This is useful for avoiding an additional import.
427 pub fn builder() -> Builder {
428 Builder::new()
429 }
430
431 /// Return the first occurrence of any of the patterns in this searcher,
432 /// according to its match semantics, in the given haystack. The `Match`
433 /// returned will include the identifier of the pattern that matched, which
434 /// corresponds to the index of the pattern (starting from `0`) in which it
435 /// was added.
436 ///
437 /// # Example
438 ///
439 /// Basic usage:
440 ///
441 /// ```
442 /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
443 ///
444 /// # fn example() -> Option<()> {
445 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
446 /// let mat = searcher.find("foobar")?;
447 /// assert_eq!(PatternID::ZERO, mat.pattern());
448 /// assert_eq!(0, mat.start());
449 /// assert_eq!(6, mat.end());
450 /// # Some(()) }
451 /// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
452 /// # example().unwrap()
453 /// # } else {
454 /// # assert!(example().is_none());
455 /// # }
456 /// ```
457 #[inline]
458 pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
459 let haystack = haystack.as_ref();
460 self.find_in(haystack, Span::from(0..haystack.len()))
461 }
462
463 /// Return the first occurrence of any of the patterns in this searcher,
464 /// according to its match semantics, in the given haystack starting from
465 /// the given position.
466 ///
467 /// The `Match` returned will include the identifier of the pattern that
468 /// matched, which corresponds to the index of the pattern (starting from
469 /// `0`) in which it was added. The offsets in the `Match` will be relative
470 /// to the start of `haystack` (and not `at`).
471 ///
472 /// # Example
473 ///
474 /// Basic usage:
475 ///
476 /// ```
477 /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID, Span};
478 ///
479 /// # fn example() -> Option<()> {
480 /// let haystack = "foofoobar";
481 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
482 /// let mat = searcher.find_in(haystack, Span::from(3..haystack.len()))?;
483 /// assert_eq!(PatternID::ZERO, mat.pattern());
484 /// assert_eq!(3, mat.start());
485 /// assert_eq!(9, mat.end());
486 /// # Some(()) }
487 /// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
488 /// # example().unwrap()
489 /// # } else {
490 /// # assert!(example().is_none());
491 /// # }
492 /// ```
493 #[inline]
494 pub fn find_in<B: AsRef<[u8]>>(
495 &self,
496 haystack: B,
497 span: Span,
498 ) -> Option<Match> {
499 let haystack = haystack.as_ref();
500 match self.search_kind {
501 SearchKind::Teddy(ref teddy) => {
502 if haystack[span].len() < teddy.minimum_len() {
503 return self.find_in_slow(haystack, span);
504 }
505 teddy.find_at(
506 &self.patterns,
507 &haystack[..span.end],
508 span.start,
509 )
510 }
511 SearchKind::RabinKarp => self.rabinkarp.find_at(
512 &self.patterns,
513 &haystack[..span.end],
514 span.start,
515 ),
516 }
517 }
518
519 /// Return an iterator of non-overlapping occurrences of the patterns in
520 /// this searcher, according to its match semantics, in the given haystack.
521 ///
522 /// # Example
523 ///
524 /// Basic usage:
525 ///
526 /// ```
527 /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
528 ///
529 /// # fn example() -> Option<()> {
530 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
531 /// let matches: Vec<PatternID> = searcher
532 /// .find_iter("foobar fooba foofoo")
533 /// .map(|mat| mat.pattern())
534 /// .collect();
535 /// assert_eq!(vec![
536 /// PatternID::must(0),
537 /// PatternID::must(1),
538 /// PatternID::must(1),
539 /// PatternID::must(1),
540 /// ], matches);
541 /// # Some(()) }
542 /// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
543 /// # example().unwrap()
544 /// # } else {
545 /// # assert!(example().is_none());
546 /// # }
547 /// ```
548 pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
549 &'a self,
550 haystack: &'b B,
551 ) -> FindIter<'a, 'b> {
552 let haystack = haystack.as_ref();
553 let span = Span::from(0..haystack.len());
554 FindIter { searcher: self, haystack, span }
555 }
556
557 /// Returns the match kind used by this packed searcher.
558 ///
559 /// # Examples
560 ///
561 /// Basic usage:
562 ///
563 /// ```
564 /// use aho_corasick::packed::{MatchKind, Searcher};
565 ///
566 /// # fn example() -> Option<()> {
567 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
568 /// // leftmost-first is the default.
569 /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
570 /// # Some(()) }
571 /// # if cfg!(all(feature = "std", target_arch = "x86_64")) {
572 /// # example().unwrap()
573 /// # } else {
574 /// # assert!(example().is_none());
575 /// # }
576 /// ```
577 pub fn match_kind(&self) -> &MatchKind {
578 self.patterns.match_kind()
579 }
580
581 /// Returns the minimum length of a haystack that is required in order for
582 /// packed searching to be effective.
583 ///
584 /// In some cases, the underlying packed searcher may not be able to search
585 /// very short haystacks. When that occurs, the implementation will defer
586 /// to a slower non-packed searcher (which is still generally faster than
587 /// Aho-Corasick for a small number of patterns). However, callers may
588 /// want to avoid ever using the slower variant, which one can do by
589 /// never passing a haystack shorter than the minimum length returned by
590 /// this method.
591 pub fn minimum_len(&self) -> usize {
592 self.minimum_len
593 }
594
595 /// Returns the approximate total amount of heap used by this searcher, in
596 /// units of bytes.
597 pub fn memory_usage(&self) -> usize {
598 self.patterns.memory_usage()
599 + self.rabinkarp.memory_usage()
600 + self.search_kind.memory_usage()
601 }
602
603 /// Use a slow (non-packed) searcher.
604 ///
605 /// This is useful when a packed searcher could be constructed, but could
606 /// not be used to search a specific haystack. For example, if Teddy was
607 /// built but the haystack is smaller than ~34 bytes, then Teddy might not
608 /// be able to run.
609 fn find_in_slow(&self, haystack: &[u8], span: Span) -> Option<Match> {
610 self.rabinkarp.find_at(
611 &self.patterns,
612 &haystack[..span.end],
613 span.start,
614 )
615 }
616}
617
618impl SearchKind {
619 fn memory_usage(&self) -> usize {
620 match *self {
621 SearchKind::Teddy(ref ted: &Teddy) => ted.memory_usage(),
622 SearchKind::RabinKarp => 0,
623 }
624 }
625}
626
627/// An iterator over non-overlapping matches from a packed searcher.
628///
629/// The lifetime `'s` refers to the lifetime of the underlying [`Searcher`],
630/// while the lifetime `'h` refers to the lifetime of the haystack being
631/// searched.
632#[derive(Debug)]
633pub struct FindIter<'s, 'h> {
634 searcher: &'s Searcher,
635 haystack: &'h [u8],
636 span: Span,
637}
638
639impl<'s, 'h> Iterator for FindIter<'s, 'h> {
640 type Item = Match;
641
642 fn next(&mut self) -> Option<Match> {
643 if self.span.start > self.span.end {
644 return None;
645 }
646 match self.searcher.find_in(&self.haystack, self.span) {
647 None => None,
648 Some(m: Match) => {
649 self.span.start = m.end();
650 Some(m)
651 }
652 }
653 }
654}
655