1use std::u16;
2
3use crate::packed::pattern::Patterns;
4use crate::packed::rabinkarp::RabinKarp;
5use crate::packed::teddy::{self, Teddy};
6use crate::Match;
7
8/// This is a limit placed on the total number of patterns we're willing to try
9/// and match at once. As more sophisticated algorithms are added, this number
10/// may be increased.
11const PATTERN_LIMIT: usize = 128;
12
13/// A knob for controlling the match semantics of a packed multiple string
14/// searcher.
15///
16/// This differs from the
17/// [`MatchKind`](../enum.MatchKind.html)
18/// type in the top-level crate module in that it doesn't support
19/// "standard" match semantics, and instead only supports leftmost-first or
20/// leftmost-longest. Namely, "standard" semantics cannot be easily supported
21/// by packed searchers.
22///
23/// For more information on the distinction between leftmost-first and
24/// leftmost-longest, see the docs on the top-level `MatchKind` type.
25///
26/// Unlike the top-level `MatchKind` type, the default match semantics for this
27/// type are leftmost-first.
28#[derive(Clone, Copy, Debug, Eq, PartialEq)]
29pub enum MatchKind {
30 /// Use leftmost-first match semantics, which reports leftmost matches.
31 /// When there are multiple possible leftmost matches, the match
32 /// corresponding to the pattern that appeared earlier when constructing
33 /// the automaton is reported.
34 ///
35 /// This is the default.
36 LeftmostFirst,
37 /// Use leftmost-longest match semantics, which reports leftmost matches.
38 /// When there are multiple possible leftmost matches, the longest match
39 /// is chosen.
40 LeftmostLongest,
41 /// Hints that destructuring should not be exhaustive.
42 ///
43 /// This enum may grow additional variants, so this makes sure clients
44 /// don't count on exhaustive matching. (Otherwise, adding a new variant
45 /// could break existing code.)
46 #[doc(hidden)]
47 __Nonexhaustive,
48}
49
50impl Default for MatchKind {
51 fn default() -> MatchKind {
52 MatchKind::LeftmostFirst
53 }
54}
55
56/// The configuration for a packed multiple pattern searcher.
57///
58/// The configuration is currently limited only to being able to select the
59/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
60/// future, more knobs may be made available.
61///
62/// A configuration produces a [`packed::Builder`](struct.Builder.html), which
63/// in turn can be used to construct a
64/// [`packed::Searcher`](struct.Searcher.html) for searching.
65///
66/// # Example
67///
68/// This example shows how to use leftmost-longest semantics instead of the
69/// default (leftmost-first).
70///
71/// ```
72/// use aho_corasick::packed::{Config, MatchKind};
73///
74/// # fn example() -> Option<()> {
75/// let searcher = Config::new()
76/// .match_kind(MatchKind::LeftmostLongest)
77/// .builder()
78/// .add("foo")
79/// .add("foobar")
80/// .build()?;
81/// let matches: Vec<usize> = searcher
82/// .find_iter("foobar")
83/// .map(|mat| mat.pattern())
84/// .collect();
85/// assert_eq!(vec![1], matches);
86/// # Some(()) }
87/// # if cfg!(target_arch = "x86_64") {
88/// # example().unwrap()
89/// # } else {
90/// # assert!(example().is_none());
91/// # }
92/// ```
93#[derive(Clone, Debug)]
94pub struct Config {
95 kind: MatchKind,
96 force: Option<ForceAlgorithm>,
97 force_teddy_fat: Option<bool>,
98 force_avx: Option<bool>,
99}
100
101/// An internal option for forcing the use of a particular packed algorithm.
102///
103/// When an algorithm is forced, if a searcher could not be constructed for it,
104/// then no searcher will be returned even if an alternative algorithm would
105/// work.
106#[derive(Clone, Debug)]
107enum ForceAlgorithm {
108 Teddy,
109 RabinKarp,
110}
111
112impl Default for Config {
113 fn default() -> Config {
114 Config::new()
115 }
116}
117
118impl Config {
119 /// Create a new default configuration. A default configuration uses
120 /// leftmost-first match semantics.
121 pub fn new() -> Config {
122 Config {
123 kind: MatchKind::LeftmostFirst,
124 force: None,
125 force_teddy_fat: None,
126 force_avx: None,
127 }
128 }
129
130 /// Create a packed builder from this configuration. The builder can be
131 /// used to accumulate patterns and create a
132 /// [`Searcher`](struct.Searcher.html)
133 /// from them.
134 pub fn builder(&self) -> Builder {
135 Builder::from_config(self.clone())
136 }
137
138 /// Set the match semantics for this configuration.
139 pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
140 self.kind = kind;
141 self
142 }
143
144 /// An undocumented method for forcing the use of the Teddy algorithm.
145 ///
146 /// This is only exposed for more precise testing and benchmarks. Callers
147 /// should not use it as it is not part of the API stability guarantees of
148 /// this crate.
149 #[doc(hidden)]
150 pub fn force_teddy(&mut self, yes: bool) -> &mut Config {
151 if yes {
152 self.force = Some(ForceAlgorithm::Teddy);
153 } else {
154 self.force = None;
155 }
156 self
157 }
158
159 /// An undocumented method for forcing the use of the Fat Teddy algorithm.
160 ///
161 /// This is only exposed for more precise testing and benchmarks. Callers
162 /// should not use it as it is not part of the API stability guarantees of
163 /// this crate.
164 #[doc(hidden)]
165 pub fn force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
166 self.force_teddy_fat = yes;
167 self
168 }
169
170 /// An undocumented method for forcing the use of SSE (`Some(false)`) or
171 /// AVX (`Some(true)`) algorithms.
172 ///
173 /// This is only exposed for more precise testing and benchmarks. Callers
174 /// should not use it as it is not part of the API stability guarantees of
175 /// this crate.
176 #[doc(hidden)]
177 pub fn force_avx(&mut self, yes: Option<bool>) -> &mut Config {
178 self.force_avx = yes;
179 self
180 }
181
182 /// An undocumented method for forcing the use of the Rabin-Karp algorithm.
183 ///
184 /// This is only exposed for more precise testing and benchmarks. Callers
185 /// should not use it as it is not part of the API stability guarantees of
186 /// this crate.
187 #[doc(hidden)]
188 pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config {
189 if yes {
190 self.force = Some(ForceAlgorithm::RabinKarp);
191 } else {
192 self.force = None;
193 }
194 self
195 }
196}
197
198/// A builder for constructing a packed searcher from a collection of patterns.
199///
200/// # Example
201///
202/// This example shows how to use a builder to construct a searcher. By
203/// default, leftmost-first match semantics are used.
204///
205/// ```
206/// use aho_corasick::packed::{Builder, MatchKind};
207///
208/// # fn example() -> Option<()> {
209/// let searcher = Builder::new()
210/// .add("foobar")
211/// .add("foo")
212/// .build()?;
213/// let matches: Vec<usize> = searcher
214/// .find_iter("foobar")
215/// .map(|mat| mat.pattern())
216/// .collect();
217/// assert_eq!(vec![0], matches);
218/// # Some(()) }
219/// # if cfg!(target_arch = "x86_64") {
220/// # example().unwrap()
221/// # } else {
222/// # assert!(example().is_none());
223/// # }
224/// ```
225#[derive(Clone, Debug)]
226pub struct Builder {
227 /// The configuration of this builder and subsequent matcher.
228 config: Config,
229 /// Set to true if the builder detects that a matcher cannot be built.
230 inert: bool,
231 /// The patterns provided by the caller.
232 patterns: Patterns,
233}
234
235impl Builder {
236 /// Create a new builder for constructing a multi-pattern searcher. This
237 /// constructor uses the default configuration.
238 pub fn new() -> Builder {
239 Builder::from_config(Config::new())
240 }
241
242 fn from_config(config: Config) -> Builder {
243 Builder { config, inert: false, patterns: Patterns::new() }
244 }
245
246 /// Build a searcher from the patterns added to this builder so far.
247 pub fn build(&self) -> Option<Searcher> {
248 if self.inert || self.patterns.is_empty() {
249 return None;
250 }
251 let mut patterns = self.patterns.clone();
252 patterns.set_match_kind(self.config.kind);
253 let rabinkarp = RabinKarp::new(&patterns);
254 // Effectively, we only want to return a searcher if we can use Teddy,
255 // since Teddy is our only fast packed searcher at the moment.
256 // Rabin-Karp is only used when searching haystacks smaller than what
257 // Teddy can support. Thus, the only way to get a Rabin-Karp searcher
258 // is to force it using undocumented APIs (for tests/benchmarks).
259 let (search_kind, minimum_len) = match self.config.force {
260 None | Some(ForceAlgorithm::Teddy) => {
261 let teddy = match self.build_teddy(&patterns) {
262 None => return None,
263 Some(teddy) => teddy,
264 };
265 let minimum_len = teddy.minimum_len();
266 (SearchKind::Teddy(teddy), minimum_len)
267 }
268 Some(ForceAlgorithm::RabinKarp) => (SearchKind::RabinKarp, 0),
269 };
270 Some(Searcher { patterns, rabinkarp, search_kind, minimum_len })
271 }
272
273 fn build_teddy(&self, patterns: &Patterns) -> Option<Teddy> {
274 teddy::Builder::new()
275 .avx(self.config.force_avx)
276 .fat(self.config.force_teddy_fat)
277 .build(&patterns)
278 }
279
280 /// Add the given pattern to this set to match.
281 ///
282 /// The order in which patterns are added is significant. Namely, when
283 /// using leftmost-first match semantics, then when multiple patterns can
284 /// match at a particular location, the pattern that was added first is
285 /// used as the match.
286 ///
287 /// If the number of patterns added exceeds the amount supported by packed
288 /// searchers, then the builder will stop accumulating patterns and render
289 /// itself inert. At this point, constructing a searcher will always return
290 /// `None`.
291 pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
292 if self.inert {
293 return self;
294 } else if self.patterns.len() >= PATTERN_LIMIT {
295 self.inert = true;
296 self.patterns.reset();
297 return self;
298 }
299 // Just in case PATTERN_LIMIT increases beyond u16::MAX.
300 assert!(self.patterns.len() <= u16::MAX as usize);
301
302 let pattern = pattern.as_ref();
303 if pattern.is_empty() {
304 self.inert = true;
305 self.patterns.reset();
306 return self;
307 }
308 self.patterns.add(pattern);
309 self
310 }
311
312 /// Add the given iterator of patterns to this set to match.
313 ///
314 /// The iterator must yield elements that can be converted into a `&[u8]`.
315 ///
316 /// The order in which patterns are added is significant. Namely, when
317 /// using leftmost-first match semantics, then when multiple patterns can
318 /// match at a particular location, the pattern that was added first is
319 /// used as the match.
320 ///
321 /// If the number of patterns added exceeds the amount supported by packed
322 /// searchers, then the builder will stop accumulating patterns and render
323 /// itself inert. At this point, constructing a searcher will always return
324 /// `None`.
325 pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
326 where
327 I: IntoIterator<Item = P>,
328 P: AsRef<[u8]>,
329 {
330 for p in patterns {
331 self.add(p);
332 }
333 self
334 }
335}
336
337impl Default for Builder {
338 fn default() -> Builder {
339 Builder::new()
340 }
341}
342
343/// A packed searcher for quickly finding occurrences of multiple patterns.
344///
345/// If callers need more flexible construction, or if one wants to change the
346/// match semantics (either leftmost-first or leftmost-longest), then one can
347/// use the [`Config`](struct.Config.html) and/or
348/// [`Builder`](struct.Builder.html) types for more fine grained control.
349///
350/// # Example
351///
352/// This example shows how to create a searcher from an iterator of patterns.
353/// By default, leftmost-first match semantics are used.
354///
355/// ```
356/// use aho_corasick::packed::{MatchKind, Searcher};
357///
358/// # fn example() -> Option<()> {
359/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
360/// let matches: Vec<usize> = searcher
361/// .find_iter("foobar")
362/// .map(|mat| mat.pattern())
363/// .collect();
364/// assert_eq!(vec![0], matches);
365/// # Some(()) }
366/// # if cfg!(target_arch = "x86_64") {
367/// # example().unwrap()
368/// # } else {
369/// # assert!(example().is_none());
370/// # }
371/// ```
372#[derive(Clone, Debug)]
373pub struct Searcher {
374 patterns: Patterns,
375 rabinkarp: RabinKarp,
376 search_kind: SearchKind,
377 minimum_len: usize,
378}
379
380#[derive(Clone, Debug)]
381enum SearchKind {
382 Teddy(Teddy),
383 RabinKarp,
384}
385
386impl Searcher {
387 /// A convenience function for constructing a searcher from an iterator
388 /// of things that can be converted to a `&[u8]`.
389 ///
390 /// If a searcher could not be constructed (either because of an
391 /// unsupported CPU or because there are too many patterns), then `None`
392 /// is returned.
393 ///
394 /// # Example
395 ///
396 /// Basic usage:
397 ///
398 /// ```
399 /// use aho_corasick::packed::{MatchKind, Searcher};
400 ///
401 /// # fn example() -> Option<()> {
402 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
403 /// let matches: Vec<usize> = searcher
404 /// .find_iter("foobar")
405 /// .map(|mat| mat.pattern())
406 /// .collect();
407 /// assert_eq!(vec![0], matches);
408 /// # Some(()) }
409 /// # if cfg!(target_arch = "x86_64") {
410 /// # example().unwrap()
411 /// # } else {
412 /// # assert!(example().is_none());
413 /// # }
414 /// ```
415 pub fn new<I, P>(patterns: I) -> Option<Searcher>
416 where
417 I: IntoIterator<Item = P>,
418 P: AsRef<[u8]>,
419 {
420 Builder::new().extend(patterns).build()
421 }
422
423 /// Return the first occurrence of any of the patterns in this searcher,
424 /// according to its match semantics, in the given haystack. The `Match`
425 /// returned will include the identifier of the pattern that matched, which
426 /// corresponds to the index of the pattern (starting from `0`) in which it
427 /// was added.
428 ///
429 /// # Example
430 ///
431 /// Basic usage:
432 ///
433 /// ```
434 /// use aho_corasick::packed::{MatchKind, Searcher};
435 ///
436 /// # fn example() -> Option<()> {
437 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
438 /// let mat = searcher.find("foobar")?;
439 /// assert_eq!(0, mat.pattern());
440 /// assert_eq!(0, mat.start());
441 /// assert_eq!(6, mat.end());
442 /// # Some(()) }
443 /// # if cfg!(target_arch = "x86_64") {
444 /// # example().unwrap()
445 /// # } else {
446 /// # assert!(example().is_none());
447 /// # }
448 /// ```
449 pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
450 self.find_at(haystack, 0)
451 }
452
453 /// Return the first occurrence of any of the patterns in this searcher,
454 /// according to its match semantics, in the given haystack starting from
455 /// the given position.
456 ///
457 /// The `Match` returned will include the identifier of the pattern that
458 /// matched, which corresponds to the index of the pattern (starting from
459 /// `0`) in which it was added. The offsets in the `Match` will be relative
460 /// to the start of `haystack` (and not `at`).
461 ///
462 /// # Example
463 ///
464 /// Basic usage:
465 ///
466 /// ```
467 /// use aho_corasick::packed::{MatchKind, Searcher};
468 ///
469 /// # fn example() -> Option<()> {
470 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
471 /// let mat = searcher.find_at("foofoobar", 3)?;
472 /// assert_eq!(0, mat.pattern());
473 /// assert_eq!(3, mat.start());
474 /// assert_eq!(9, mat.end());
475 /// # Some(()) }
476 /// # if cfg!(target_arch = "x86_64") {
477 /// # example().unwrap()
478 /// # } else {
479 /// # assert!(example().is_none());
480 /// # }
481 /// ```
482 pub fn find_at<B: AsRef<[u8]>>(
483 &self,
484 haystack: B,
485 at: usize,
486 ) -> Option<Match> {
487 let haystack = haystack.as_ref();
488 match self.search_kind {
489 SearchKind::Teddy(ref teddy) => {
490 if haystack[at..].len() < teddy.minimum_len() {
491 return self.slow_at(haystack, at);
492 }
493 teddy.find_at(&self.patterns, haystack, at)
494 }
495 SearchKind::RabinKarp => {
496 self.rabinkarp.find_at(&self.patterns, haystack, at)
497 }
498 }
499 }
500
501 /// Return an iterator of non-overlapping occurrences of the patterns in
502 /// this searcher, according to its match semantics, in the given haystack.
503 ///
504 /// # Example
505 ///
506 /// Basic usage:
507 ///
508 /// ```
509 /// use aho_corasick::packed::{MatchKind, Searcher};
510 ///
511 /// # fn example() -> Option<()> {
512 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
513 /// let matches: Vec<usize> = searcher
514 /// .find_iter("foobar fooba foofoo")
515 /// .map(|mat| mat.pattern())
516 /// .collect();
517 /// assert_eq!(vec![0, 1, 1, 1], matches);
518 /// # Some(()) }
519 /// # if cfg!(target_arch = "x86_64") {
520 /// # example().unwrap()
521 /// # } else {
522 /// # assert!(example().is_none());
523 /// # }
524 /// ```
525 pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
526 &'a self,
527 haystack: &'b B,
528 ) -> FindIter<'a, 'b> {
529 FindIter { searcher: self, haystack: haystack.as_ref(), at: 0 }
530 }
531
532 /// Returns the match kind used by this packed searcher.
533 ///
534 /// # Examples
535 ///
536 /// Basic usage:
537 ///
538 /// ```
539 /// use aho_corasick::packed::{MatchKind, Searcher};
540 ///
541 /// # fn example() -> Option<()> {
542 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
543 /// // leftmost-first is the default.
544 /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
545 /// # Some(()) }
546 /// # if cfg!(target_arch = "x86_64") {
547 /// # example().unwrap()
548 /// # } else {
549 /// # assert!(example().is_none());
550 /// # }
551 /// ```
552 pub fn match_kind(&self) -> &MatchKind {
553 self.patterns.match_kind()
554 }
555
556 /// Returns the minimum length of a haystack that is required in order for
557 /// packed searching to be effective.
558 ///
559 /// In some cases, the underlying packed searcher may not be able to search
560 /// very short haystacks. When that occurs, the implementation will defer
561 /// to a slower non-packed searcher (which is still generally faster than
562 /// Aho-Corasick for a small number of patterns). However, callers may
563 /// want to avoid ever using the slower variant, which one can do by
564 /// never passing a haystack shorter than the minimum length returned by
565 /// this method.
566 pub fn minimum_len(&self) -> usize {
567 self.minimum_len
568 }
569
570 /// Returns the approximate total amount of heap used by this searcher, in
571 /// units of bytes.
572 pub fn heap_bytes(&self) -> usize {
573 self.patterns.heap_bytes()
574 + self.rabinkarp.heap_bytes()
575 + self.search_kind.heap_bytes()
576 }
577
578 /// Use a slow (non-packed) searcher.
579 ///
580 /// This is useful when a packed searcher could be constructed, but could
581 /// not be used to search a specific haystack. For example, if Teddy was
582 /// built but the haystack is smaller than ~34 bytes, then Teddy might not
583 /// be able to run.
584 fn slow_at(&self, haystack: &[u8], at: usize) -> Option<Match> {
585 self.rabinkarp.find_at(&self.patterns, haystack, at)
586 }
587}
588
589impl SearchKind {
590 fn heap_bytes(&self) -> usize {
591 match *self {
592 SearchKind::Teddy(ref ted: &Teddy) => ted.heap_bytes(),
593 SearchKind::RabinKarp => 0,
594 }
595 }
596}
597
598/// An iterator over non-overlapping matches from a packed searcher.
599///
600/// The lifetime `'s` refers to the lifetime of the underlying
601/// [`Searcher`](struct.Searcher.html), while the lifetime `'h` refers to the
602/// lifetime of the haystack being searched.
603#[derive(Debug)]
604pub struct FindIter<'s, 'h> {
605 searcher: &'s Searcher,
606 haystack: &'h [u8],
607 at: usize,
608}
609
610impl<'s, 'h> Iterator for FindIter<'s, 'h> {
611 type Item = Match;
612
613 fn next(&mut self) -> Option<Match> {
614 if self.at > self.haystack.len() {
615 return None;
616 }
617 match self.searcher.find_at(&self.haystack, self.at) {
618 None => None,
619 Some(c: Match) => {
620 self.at = c.end;
621 Some(c)
622 }
623 }
624 }
625}
626