1/*!
2This module provides forward and reverse substring search routines.
3
4Unlike the standard library's substring search routines, these work on
5arbitrary bytes. For all non-empty needles, these routines will report exactly
6the same values as the corresponding routines in the standard library. For
7the empty needle, the standard library reports matches only at valid UTF-8
8boundaries, where as these routines will report matches at every position.
9
10Other than being able to work on arbitrary bytes, the primary reason to prefer
11these routines over the standard library routines is that these will generally
12be faster. In some cases, significantly so.
13
14# Example: iterating over substring matches
15
16This example shows how to use [`find_iter`] to find occurrences of a substring
17in a haystack.
18
19```
20use memchr::memmem;
21
22let haystack = b"foo bar foo baz foo";
23
24let mut it = memmem::find_iter(haystack, "foo");
25assert_eq!(Some(0), it.next());
26assert_eq!(Some(8), it.next());
27assert_eq!(Some(16), it.next());
28assert_eq!(None, it.next());
29```
30
31# Example: iterating over substring matches in reverse
32
33This example shows how to use [`rfind_iter`] to find occurrences of a substring
34in a haystack starting from the end of the haystack.
35
36**NOTE:** This module does not implement double ended iterators, so reverse
37searches aren't done by calling `rev` on a forward iterator.
38
39```
40use memchr::memmem;
41
42let haystack = b"foo bar foo baz foo";
43
44let mut it = memmem::rfind_iter(haystack, "foo");
45assert_eq!(Some(16), it.next());
46assert_eq!(Some(8), it.next());
47assert_eq!(Some(0), it.next());
48assert_eq!(None, it.next());
49```
50
51# Example: repeating a search for the same needle
52
53It may be possible for the overhead of constructing a substring searcher to be
54measurable in some workloads. In cases where the same needle is used to search
55many haystacks, it is possible to do construction once and thus to avoid it for
56subsequent searches. This can be done with a [`Finder`] (or a [`FinderRev`] for
57reverse searches).
58
59```
60use memchr::memmem;
61
62let finder = memmem::Finder::new("foo");
63
64assert_eq!(Some(4), finder.find(b"baz foo quux"));
65assert_eq!(None, finder.find(b"quux baz bar"));
66```
67*/
68
69pub use self::prefilter::Prefilter;
70
71use crate::{
72 cow::CowBytes,
73 memmem::{
74 prefilter::{Pre, PrefilterFn, PrefilterState},
75 rabinkarp::NeedleHash,
76 rarebytes::RareNeedleBytes,
77 },
78};
79
80/// Defines a suite of quickcheck properties for forward and reverse
81/// substring searching.
82///
83/// This is defined in this specific spot so that it can be used freely among
84/// the different substring search implementations. I couldn't be bothered to
85/// fight with the macro-visibility rules enough to figure out how to stuff it
86/// somewhere more convenient.
87#[cfg(all(test, feature = "std"))]
88macro_rules! define_memmem_quickcheck_tests {
89 ($fwd:expr, $rev:expr) => {
90 use crate::memmem::proptests;
91
92 quickcheck::quickcheck! {
93 fn qc_fwd_prefix_is_substring(bs: Vec<u8>) -> bool {
94 proptests::prefix_is_substring(false, &bs, $fwd)
95 }
96
97 fn qc_fwd_suffix_is_substring(bs: Vec<u8>) -> bool {
98 proptests::suffix_is_substring(false, &bs, $fwd)
99 }
100
101 fn qc_fwd_matches_naive(
102 haystack: Vec<u8>,
103 needle: Vec<u8>
104 ) -> bool {
105 proptests::matches_naive(false, &haystack, &needle, $fwd)
106 }
107
108 fn qc_rev_prefix_is_substring(bs: Vec<u8>) -> bool {
109 proptests::prefix_is_substring(true, &bs, $rev)
110 }
111
112 fn qc_rev_suffix_is_substring(bs: Vec<u8>) -> bool {
113 proptests::suffix_is_substring(true, &bs, $rev)
114 }
115
116 fn qc_rev_matches_naive(
117 haystack: Vec<u8>,
118 needle: Vec<u8>
119 ) -> bool {
120 proptests::matches_naive(true, &haystack, &needle, $rev)
121 }
122 }
123 };
124}
125
126/// Defines a suite of "simple" hand-written tests for a substring
127/// implementation.
128///
129/// This is defined here for the same reason that
130/// define_memmem_quickcheck_tests is defined here.
131#[cfg(test)]
132macro_rules! define_memmem_simple_tests {
133 ($fwd:expr, $rev:expr) => {
134 use crate::memmem::testsimples;
135
136 #[test]
137 fn simple_forward() {
138 testsimples::run_search_tests_fwd($fwd);
139 }
140
141 #[test]
142 fn simple_reverse() {
143 testsimples::run_search_tests_rev($rev);
144 }
145 };
146}
147
148mod byte_frequencies;
149#[cfg(memchr_runtime_simd)]
150mod genericsimd;
151mod prefilter;
152mod rabinkarp;
153mod rarebytes;
154mod twoway;
155mod util;
156#[cfg(memchr_runtime_simd)]
157mod vector;
158#[cfg(all(memchr_runtime_wasm128))]
159mod wasm;
160#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
161mod x86;
162
163/// Returns an iterator over all non-overlapping occurrences of a substring in
164/// a haystack.
165///
166/// # Complexity
167///
168/// This routine is guaranteed to have worst case linear time complexity
169/// with respect to both the needle and the haystack. That is, this runs
170/// in `O(needle.len() + haystack.len())` time.
171///
172/// This routine is also guaranteed to have worst case constant space
173/// complexity.
174///
175/// # Examples
176///
177/// Basic usage:
178///
179/// ```
180/// use memchr::memmem;
181///
182/// let haystack = b"foo bar foo baz foo";
183/// let mut it = memmem::find_iter(haystack, b"foo");
184/// assert_eq!(Some(0), it.next());
185/// assert_eq!(Some(8), it.next());
186/// assert_eq!(Some(16), it.next());
187/// assert_eq!(None, it.next());
188/// ```
189#[inline]
190pub fn find_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>(
191 haystack: &'h [u8],
192 needle: &'n N,
193) -> FindIter<'h, 'n> {
194 FindIter::new(haystack, Finder::new(needle))
195}
196
197/// Returns a reverse iterator over all non-overlapping occurrences of a
198/// substring in a haystack.
199///
200/// # Complexity
201///
202/// This routine is guaranteed to have worst case linear time complexity
203/// with respect to both the needle and the haystack. That is, this runs
204/// in `O(needle.len() + haystack.len())` time.
205///
206/// This routine is also guaranteed to have worst case constant space
207/// complexity.
208///
209/// # Examples
210///
211/// Basic usage:
212///
213/// ```
214/// use memchr::memmem;
215///
216/// let haystack = b"foo bar foo baz foo";
217/// let mut it = memmem::rfind_iter(haystack, b"foo");
218/// assert_eq!(Some(16), it.next());
219/// assert_eq!(Some(8), it.next());
220/// assert_eq!(Some(0), it.next());
221/// assert_eq!(None, it.next());
222/// ```
223#[inline]
224pub fn rfind_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>(
225 haystack: &'h [u8],
226 needle: &'n N,
227) -> FindRevIter<'h, 'n> {
228 FindRevIter::new(haystack, finder:FinderRev::new(needle))
229}
230
231/// Returns the index of the first occurrence of the given needle.
232///
233/// Note that if you're are searching for the same needle in many different
234/// small haystacks, it may be faster to initialize a [`Finder`] once,
235/// and reuse it for each search.
236///
237/// # Complexity
238///
239/// This routine is guaranteed to have worst case linear time complexity
240/// with respect to both the needle and the haystack. That is, this runs
241/// in `O(needle.len() + haystack.len())` time.
242///
243/// This routine is also guaranteed to have worst case constant space
244/// complexity.
245///
246/// # Examples
247///
248/// Basic usage:
249///
250/// ```
251/// use memchr::memmem;
252///
253/// let haystack = b"foo bar baz";
254/// assert_eq!(Some(0), memmem::find(haystack, b"foo"));
255/// assert_eq!(Some(4), memmem::find(haystack, b"bar"));
256/// assert_eq!(None, memmem::find(haystack, b"quux"));
257/// ```
258#[inline]
259pub fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
260 if haystack.len() < 64 {
261 rabinkarp::find(haystack, needle)
262 } else {
263 Finder::new(needle).find(haystack)
264 }
265}
266
267/// Returns the index of the last occurrence of the given needle.
268///
269/// Note that if you're are searching for the same needle in many different
270/// small haystacks, it may be faster to initialize a [`FinderRev`] once,
271/// and reuse it for each search.
272///
273/// # Complexity
274///
275/// This routine is guaranteed to have worst case linear time complexity
276/// with respect to both the needle and the haystack. That is, this runs
277/// in `O(needle.len() + haystack.len())` time.
278///
279/// This routine is also guaranteed to have worst case constant space
280/// complexity.
281///
282/// # Examples
283///
284/// Basic usage:
285///
286/// ```
287/// use memchr::memmem;
288///
289/// let haystack = b"foo bar baz";
290/// assert_eq!(Some(0), memmem::rfind(haystack, b"foo"));
291/// assert_eq!(Some(4), memmem::rfind(haystack, b"bar"));
292/// assert_eq!(Some(8), memmem::rfind(haystack, b"ba"));
293/// assert_eq!(None, memmem::rfind(haystack, b"quux"));
294/// ```
295#[inline]
296pub fn rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> {
297 if haystack.len() < 64 {
298 rabinkarp::rfind(haystack, needle)
299 } else {
300 FinderRev::new(needle).rfind(haystack)
301 }
302}
303
304/// An iterator over non-overlapping substring matches.
305///
306/// Matches are reported by the byte offset at which they begin.
307///
308/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
309/// needle.
310#[derive(Debug)]
311pub struct FindIter<'h, 'n> {
312 haystack: &'h [u8],
313 prestate: PrefilterState,
314 finder: Finder<'n>,
315 pos: usize,
316}
317
318impl<'h, 'n> FindIter<'h, 'n> {
319 #[inline(always)]
320 pub(crate) fn new(
321 haystack: &'h [u8],
322 finder: Finder<'n>,
323 ) -> FindIter<'h, 'n> {
324 let prestate = finder.searcher.prefilter_state();
325 FindIter { haystack, prestate, finder, pos: 0 }
326 }
327
328 /// Convert this iterator into its owned variant, such that it no longer
329 /// borrows the finder and needle.
330 ///
331 /// If this is already an owned iterator, then this is a no-op. Otherwise,
332 /// this copies the needle.
333 ///
334 /// This is only available when the `std` feature is enabled.
335 #[cfg(feature = "std")]
336 #[inline]
337 pub fn into_owned(self) -> FindIter<'h, 'static> {
338 FindIter {
339 haystack: self.haystack,
340 prestate: self.prestate,
341 finder: self.finder.into_owned(),
342 pos: self.pos,
343 }
344 }
345}
346
347impl<'h, 'n> Iterator for FindIter<'h, 'n> {
348 type Item = usize;
349
350 fn next(&mut self) -> Option<usize> {
351 if self.pos > self.haystack.len() {
352 return None;
353 }
354 let result: Option = self
355 .finder
356 .searcher
357 .find(&mut self.prestate, &self.haystack[self.pos..]);
358 match result {
359 None => None,
360 Some(i: usize) => {
361 let pos: usize = self.pos + i;
362 self.pos = pos + core::cmp::max(v1:1, self.finder.needle().len());
363 Some(pos)
364 }
365 }
366 }
367}
368
369/// An iterator over non-overlapping substring matches in reverse.
370///
371/// Matches are reported by the byte offset at which they begin.
372///
373/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
374/// needle.
375#[derive(Debug)]
376pub struct FindRevIter<'h, 'n> {
377 haystack: &'h [u8],
378 finder: FinderRev<'n>,
379 /// When searching with an empty needle, this gets set to `None` after
380 /// we've yielded the last element at `0`.
381 pos: Option<usize>,
382}
383
384impl<'h, 'n> FindRevIter<'h, 'n> {
385 #[inline(always)]
386 pub(crate) fn new(
387 haystack: &'h [u8],
388 finder: FinderRev<'n>,
389 ) -> FindRevIter<'h, 'n> {
390 let pos = Some(haystack.len());
391 FindRevIter { haystack, finder, pos }
392 }
393
394 /// Convert this iterator into its owned variant, such that it no longer
395 /// borrows the finder and needle.
396 ///
397 /// If this is already an owned iterator, then this is a no-op. Otherwise,
398 /// this copies the needle.
399 ///
400 /// This is only available when the `std` feature is enabled.
401 #[cfg(feature = "std")]
402 #[inline]
403 pub fn into_owned(self) -> FindRevIter<'h, 'static> {
404 FindRevIter {
405 haystack: self.haystack,
406 finder: self.finder.into_owned(),
407 pos: self.pos,
408 }
409 }
410}
411
412impl<'h, 'n> Iterator for FindRevIter<'h, 'n> {
413 type Item = usize;
414
415 fn next(&mut self) -> Option<usize> {
416 let pos: usize = match self.pos {
417 None => return None,
418 Some(pos: usize) => pos,
419 };
420 let result: Option = self.finder.rfind(&self.haystack[..pos]);
421 match result {
422 None => None,
423 Some(i: usize) => {
424 if pos == i {
425 self.pos = pos.checked_sub(1);
426 } else {
427 self.pos = Some(i);
428 }
429 Some(i)
430 }
431 }
432 }
433}
434
435/// A single substring searcher fixed to a particular needle.
436///
437/// The purpose of this type is to permit callers to construct a substring
438/// searcher that can be used to search haystacks without the overhead of
439/// constructing the searcher in the first place. This is a somewhat niche
440/// concern when it's necessary to re-use the same needle to search multiple
441/// different haystacks with as little overhead as possible. In general, using
442/// [`find`] is good enough, but `Finder` is useful when you can meaningfully
443/// observe searcher construction time in a profile.
444///
445/// When the `std` feature is enabled, then this type has an `into_owned`
446/// version which permits building a `Finder` that is not connected to
447/// the lifetime of its needle.
448#[derive(Clone, Debug)]
449pub struct Finder<'n> {
450 searcher: Searcher<'n>,
451}
452
453impl<'n> Finder<'n> {
454 /// Create a new finder for the given needle.
455 #[inline]
456 pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> Finder<'n> {
457 FinderBuilder::new().build_forward(needle)
458 }
459
460 /// Returns the index of the first occurrence of this needle in the given
461 /// haystack.
462 ///
463 /// # Complexity
464 ///
465 /// This routine is guaranteed to have worst case linear time complexity
466 /// with respect to both the needle and the haystack. That is, this runs
467 /// in `O(needle.len() + haystack.len())` time.
468 ///
469 /// This routine is also guaranteed to have worst case constant space
470 /// complexity.
471 ///
472 /// # Examples
473 ///
474 /// Basic usage:
475 ///
476 /// ```
477 /// use memchr::memmem::Finder;
478 ///
479 /// let haystack = b"foo bar baz";
480 /// assert_eq!(Some(0), Finder::new("foo").find(haystack));
481 /// assert_eq!(Some(4), Finder::new("bar").find(haystack));
482 /// assert_eq!(None, Finder::new("quux").find(haystack));
483 /// ```
484 pub fn find(&self, haystack: &[u8]) -> Option<usize> {
485 self.searcher.find(&mut self.searcher.prefilter_state(), haystack)
486 }
487
488 /// Returns an iterator over all occurrences of a substring in a haystack.
489 ///
490 /// # Complexity
491 ///
492 /// This routine is guaranteed to have worst case linear time complexity
493 /// with respect to both the needle and the haystack. That is, this runs
494 /// in `O(needle.len() + haystack.len())` time.
495 ///
496 /// This routine is also guaranteed to have worst case constant space
497 /// complexity.
498 ///
499 /// # Examples
500 ///
501 /// Basic usage:
502 ///
503 /// ```
504 /// use memchr::memmem::Finder;
505 ///
506 /// let haystack = b"foo bar foo baz foo";
507 /// let finder = Finder::new(b"foo");
508 /// let mut it = finder.find_iter(haystack);
509 /// assert_eq!(Some(0), it.next());
510 /// assert_eq!(Some(8), it.next());
511 /// assert_eq!(Some(16), it.next());
512 /// assert_eq!(None, it.next());
513 /// ```
514 #[inline]
515 pub fn find_iter<'a, 'h>(
516 &'a self,
517 haystack: &'h [u8],
518 ) -> FindIter<'h, 'a> {
519 FindIter::new(haystack, self.as_ref())
520 }
521
522 /// Convert this finder into its owned variant, such that it no longer
523 /// borrows the needle.
524 ///
525 /// If this is already an owned finder, then this is a no-op. Otherwise,
526 /// this copies the needle.
527 ///
528 /// This is only available when the `std` feature is enabled.
529 #[cfg(feature = "std")]
530 #[inline]
531 pub fn into_owned(self) -> Finder<'static> {
532 Finder { searcher: self.searcher.into_owned() }
533 }
534
535 /// Convert this finder into its borrowed variant.
536 ///
537 /// This is primarily useful if your finder is owned and you'd like to
538 /// store its borrowed variant in some intermediate data structure.
539 ///
540 /// Note that the lifetime parameter of the returned finder is tied to the
541 /// lifetime of `self`, and may be shorter than the `'n` lifetime of the
542 /// needle itself. Namely, a finder's needle can be either borrowed or
543 /// owned, so the lifetime of the needle returned must necessarily be the
544 /// shorter of the two.
545 #[inline]
546 pub fn as_ref(&self) -> Finder<'_> {
547 Finder { searcher: self.searcher.as_ref() }
548 }
549
550 /// Returns the needle that this finder searches for.
551 ///
552 /// Note that the lifetime of the needle returned is tied to the lifetime
553 /// of the finder, and may be shorter than the `'n` lifetime. Namely, a
554 /// finder's needle can be either borrowed or owned, so the lifetime of the
555 /// needle returned must necessarily be the shorter of the two.
556 #[inline]
557 pub fn needle(&self) -> &[u8] {
558 self.searcher.needle()
559 }
560}
561
562/// A single substring reverse searcher fixed to a particular needle.
563///
564/// The purpose of this type is to permit callers to construct a substring
565/// searcher that can be used to search haystacks without the overhead of
566/// constructing the searcher in the first place. This is a somewhat niche
567/// concern when it's necessary to re-use the same needle to search multiple
568/// different haystacks with as little overhead as possible. In general,
569/// using [`rfind`] is good enough, but `FinderRev` is useful when you can
570/// meaningfully observe searcher construction time in a profile.
571///
572/// When the `std` feature is enabled, then this type has an `into_owned`
573/// version which permits building a `FinderRev` that is not connected to
574/// the lifetime of its needle.
575#[derive(Clone, Debug)]
576pub struct FinderRev<'n> {
577 searcher: SearcherRev<'n>,
578}
579
580impl<'n> FinderRev<'n> {
581 /// Create a new reverse finder for the given needle.
582 #[inline]
583 pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> FinderRev<'n> {
584 FinderBuilder::new().build_reverse(needle)
585 }
586
587 /// Returns the index of the last occurrence of this needle in the given
588 /// haystack.
589 ///
590 /// The haystack may be any type that can be cheaply converted into a
591 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
592 ///
593 /// # Complexity
594 ///
595 /// This routine is guaranteed to have worst case linear time complexity
596 /// with respect to both the needle and the haystack. That is, this runs
597 /// in `O(needle.len() + haystack.len())` time.
598 ///
599 /// This routine is also guaranteed to have worst case constant space
600 /// complexity.
601 ///
602 /// # Examples
603 ///
604 /// Basic usage:
605 ///
606 /// ```
607 /// use memchr::memmem::FinderRev;
608 ///
609 /// let haystack = b"foo bar baz";
610 /// assert_eq!(Some(0), FinderRev::new("foo").rfind(haystack));
611 /// assert_eq!(Some(4), FinderRev::new("bar").rfind(haystack));
612 /// assert_eq!(None, FinderRev::new("quux").rfind(haystack));
613 /// ```
614 pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
615 self.searcher.rfind(haystack.as_ref())
616 }
617
618 /// Returns a reverse iterator over all occurrences of a substring in a
619 /// haystack.
620 ///
621 /// # Complexity
622 ///
623 /// This routine is guaranteed to have worst case linear time complexity
624 /// with respect to both the needle and the haystack. That is, this runs
625 /// in `O(needle.len() + haystack.len())` time.
626 ///
627 /// This routine is also guaranteed to have worst case constant space
628 /// complexity.
629 ///
630 /// # Examples
631 ///
632 /// Basic usage:
633 ///
634 /// ```
635 /// use memchr::memmem::FinderRev;
636 ///
637 /// let haystack = b"foo bar foo baz foo";
638 /// let finder = FinderRev::new(b"foo");
639 /// let mut it = finder.rfind_iter(haystack);
640 /// assert_eq!(Some(16), it.next());
641 /// assert_eq!(Some(8), it.next());
642 /// assert_eq!(Some(0), it.next());
643 /// assert_eq!(None, it.next());
644 /// ```
645 #[inline]
646 pub fn rfind_iter<'a, 'h>(
647 &'a self,
648 haystack: &'h [u8],
649 ) -> FindRevIter<'h, 'a> {
650 FindRevIter::new(haystack, self.as_ref())
651 }
652
653 /// Convert this finder into its owned variant, such that it no longer
654 /// borrows the needle.
655 ///
656 /// If this is already an owned finder, then this is a no-op. Otherwise,
657 /// this copies the needle.
658 ///
659 /// This is only available when the `std` feature is enabled.
660 #[cfg(feature = "std")]
661 #[inline]
662 pub fn into_owned(self) -> FinderRev<'static> {
663 FinderRev { searcher: self.searcher.into_owned() }
664 }
665
666 /// Convert this finder into its borrowed variant.
667 ///
668 /// This is primarily useful if your finder is owned and you'd like to
669 /// store its borrowed variant in some intermediate data structure.
670 ///
671 /// Note that the lifetime parameter of the returned finder is tied to the
672 /// lifetime of `self`, and may be shorter than the `'n` lifetime of the
673 /// needle itself. Namely, a finder's needle can be either borrowed or
674 /// owned, so the lifetime of the needle returned must necessarily be the
675 /// shorter of the two.
676 #[inline]
677 pub fn as_ref(&self) -> FinderRev<'_> {
678 FinderRev { searcher: self.searcher.as_ref() }
679 }
680
681 /// Returns the needle that this finder searches for.
682 ///
683 /// Note that the lifetime of the needle returned is tied to the lifetime
684 /// of the finder, and may be shorter than the `'n` lifetime. Namely, a
685 /// finder's needle can be either borrowed or owned, so the lifetime of the
686 /// needle returned must necessarily be the shorter of the two.
687 #[inline]
688 pub fn needle(&self) -> &[u8] {
689 self.searcher.needle()
690 }
691}
692
693/// A builder for constructing non-default forward or reverse memmem finders.
694///
695/// A builder is primarily useful for configuring a substring searcher.
696/// Currently, the only configuration exposed is the ability to disable
697/// heuristic prefilters used to speed up certain searches.
698#[derive(Clone, Debug, Default)]
699pub struct FinderBuilder {
700 config: SearcherConfig,
701}
702
703impl FinderBuilder {
704 /// Create a new finder builder with default settings.
705 pub fn new() -> FinderBuilder {
706 FinderBuilder::default()
707 }
708
709 /// Build a forward finder using the given needle from the current
710 /// settings.
711 pub fn build_forward<'n, B: ?Sized + AsRef<[u8]>>(
712 &self,
713 needle: &'n B,
714 ) -> Finder<'n> {
715 Finder { searcher: Searcher::new(self.config, needle.as_ref()) }
716 }
717
718 /// Build a reverse finder using the given needle from the current
719 /// settings.
720 pub fn build_reverse<'n, B: ?Sized + AsRef<[u8]>>(
721 &self,
722 needle: &'n B,
723 ) -> FinderRev<'n> {
724 FinderRev { searcher: SearcherRev::new(needle.as_ref()) }
725 }
726
727 /// Configure the prefilter setting for the finder.
728 ///
729 /// See the documentation for [`Prefilter`] for more discussion on why
730 /// you might want to configure this.
731 pub fn prefilter(&mut self, prefilter: Prefilter) -> &mut FinderBuilder {
732 self.config.prefilter = prefilter;
733 self
734 }
735}
736
737/// The internal implementation of a forward substring searcher.
738///
739/// The reality is that this is a "meta" searcher. Namely, depending on a
740/// variety of parameters (CPU support, target, needle size, haystack size and
741/// even dynamic properties such as prefilter effectiveness), the actual
742/// algorithm employed to do substring search may change.
743#[derive(Clone, Debug)]
744struct Searcher<'n> {
745 /// The actual needle we're searching for.
746 ///
747 /// A CowBytes is like a Cow<[u8]>, except in no_std environments, it is
748 /// specialized to a single variant (the borrowed form).
749 needle: CowBytes<'n>,
750 /// A collection of facts computed on the needle that are useful for more
751 /// than one substring search algorithm.
752 ninfo: NeedleInfo,
753 /// A prefilter function, if it was deemed appropriate.
754 ///
755 /// Some substring search implementations (like Two-Way) benefit greatly
756 /// if we can quickly find candidate starting positions for a match.
757 prefn: Option<PrefilterFn>,
758 /// The actual substring implementation in use.
759 kind: SearcherKind,
760}
761
762/// A collection of facts computed about a search needle.
763///
764/// We group these things together because it's useful to be able to hand them
765/// to prefilters or substring algorithms that want them.
766#[derive(Clone, Copy, Debug)]
767pub(crate) struct NeedleInfo {
768 /// The offsets of "rare" bytes detected in the needle.
769 ///
770 /// This is meant to be a heuristic in order to maximize the effectiveness
771 /// of vectorized code. Namely, vectorized code tends to focus on only
772 /// one or two bytes. If we pick bytes from the needle that occur
773 /// infrequently, then more time will be spent in the vectorized code and
774 /// will likely make the overall search (much) faster.
775 ///
776 /// Of course, this is only a heuristic based on a background frequency
777 /// distribution of bytes. But it tends to work very well in practice.
778 pub(crate) rarebytes: RareNeedleBytes,
779 /// A Rabin-Karp hash of the needle.
780 ///
781 /// This is store here instead of in a more specific Rabin-Karp search
782 /// since Rabin-Karp may be used even if another SearchKind corresponds
783 /// to some other search implementation. e.g., If measurements suggest RK
784 /// is faster in some cases or if a search implementation can't handle
785 /// particularly small haystack. (Moreover, we cannot use RK *generally*,
786 /// since its worst case time is multiplicative. Instead, we only use it
787 /// some small haystacks, where "small" is a constant.)
788 pub(crate) nhash: NeedleHash,
789}
790
791/// Configuration for substring search.
792#[derive(Clone, Copy, Debug, Default)]
793struct SearcherConfig {
794 /// This permits changing the behavior of the prefilter, since it can have
795 /// a variable impact on performance.
796 prefilter: Prefilter,
797}
798
799#[derive(Clone, Debug)]
800enum SearcherKind {
801 /// A special case for empty needles. An empty needle always matches, even
802 /// in an empty haystack.
803 Empty,
804 /// This is used whenever the needle is a single byte. In this case, we
805 /// always use memchr.
806 OneByte(u8),
807 /// Two-Way is the generic work horse and is what provides our additive
808 /// linear time guarantee. In general, it's used when the needle is bigger
809 /// than 8 bytes or so.
810 TwoWay(twoway::Forward),
811 #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
812 GenericSIMD128(x86::sse::Forward),
813 #[cfg(memchr_runtime_wasm128)]
814 GenericSIMD128(wasm::Forward),
815 #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
816 GenericSIMD256(x86::avx::Forward),
817}
818
819impl<'n> Searcher<'n> {
820 fn new(config: SearcherConfig, needle: &'n [u8]) -> Searcher<'n> {
821 use self::SearcherKind::*;
822
823 let ninfo = NeedleInfo::new(needle);
824 let mk = |kind: SearcherKind| {
825 let prefn = prefilter::forward(
826 &config.prefilter,
827 &ninfo.rarebytes,
828 needle,
829 );
830 Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind }
831 };
832 if needle.len() == 0 {
833 return mk(Empty);
834 }
835 if needle.len() == 1 {
836 return mk(OneByte(needle[0]));
837 }
838 #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
839 {
840 if let Some(fwd) = x86::avx::Forward::new(&ninfo, needle) {
841 return mk(GenericSIMD256(fwd));
842 } else if let Some(fwd) = x86::sse::Forward::new(&ninfo, needle) {
843 return mk(GenericSIMD128(fwd));
844 }
845 }
846 #[cfg(all(target_arch = "wasm32", memchr_runtime_simd))]
847 {
848 if let Some(fwd) = wasm::Forward::new(&ninfo, needle) {
849 return mk(GenericSIMD128(fwd));
850 }
851 }
852
853 mk(TwoWay(twoway::Forward::new(needle)))
854 }
855
856 /// Return a fresh prefilter state that can be used with this searcher.
857 /// A prefilter state is used to track the effectiveness of a searcher's
858 /// prefilter for speeding up searches. Therefore, the prefilter state
859 /// should generally be reused on subsequent searches (such as in an
860 /// iterator). For searches on a different haystack, then a new prefilter
861 /// state should be used.
862 ///
863 /// This always initializes a valid (but possibly inert) prefilter state
864 /// even if this searcher does not have a prefilter enabled.
865 fn prefilter_state(&self) -> PrefilterState {
866 if self.prefn.is_none() {
867 PrefilterState::inert()
868 } else {
869 PrefilterState::new()
870 }
871 }
872
873 fn needle(&self) -> &[u8] {
874 self.needle.as_slice()
875 }
876
877 fn as_ref(&self) -> Searcher<'_> {
878 use self::SearcherKind::*;
879
880 let kind = match self.kind {
881 Empty => Empty,
882 OneByte(b) => OneByte(b),
883 TwoWay(tw) => TwoWay(tw),
884 #[cfg(all(not(miri), memchr_runtime_simd))]
885 GenericSIMD128(gs) => GenericSIMD128(gs),
886 #[cfg(all(
887 not(miri),
888 target_arch = "x86_64",
889 memchr_runtime_simd
890 ))]
891 GenericSIMD256(gs) => GenericSIMD256(gs),
892 };
893 Searcher {
894 needle: CowBytes::new(self.needle()),
895 ninfo: self.ninfo,
896 prefn: self.prefn,
897 kind,
898 }
899 }
900
901 #[cfg(feature = "std")]
902 fn into_owned(self) -> Searcher<'static> {
903 use self::SearcherKind::*;
904
905 let kind = match self.kind {
906 Empty => Empty,
907 OneByte(b) => OneByte(b),
908 TwoWay(tw) => TwoWay(tw),
909 #[cfg(all(not(miri), memchr_runtime_simd))]
910 GenericSIMD128(gs) => GenericSIMD128(gs),
911 #[cfg(all(
912 not(miri),
913 target_arch = "x86_64",
914 memchr_runtime_simd
915 ))]
916 GenericSIMD256(gs) => GenericSIMD256(gs),
917 };
918 Searcher {
919 needle: self.needle.into_owned(),
920 ninfo: self.ninfo,
921 prefn: self.prefn,
922 kind,
923 }
924 }
925
926 /// Implements forward substring search by selecting the implementation
927 /// chosen at construction and executing it on the given haystack with the
928 /// prefilter's current state of effectiveness.
929 #[inline(always)]
930 fn find(
931 &self,
932 state: &mut PrefilterState,
933 haystack: &[u8],
934 ) -> Option<usize> {
935 use self::SearcherKind::*;
936
937 let needle = self.needle();
938 if haystack.len() < needle.len() {
939 return None;
940 }
941 match self.kind {
942 Empty => Some(0),
943 OneByte(b) => crate::memchr(b, haystack),
944 TwoWay(ref tw) => {
945 // For very short haystacks (e.g., where the prefilter probably
946 // can't run), it's faster to just run RK.
947 if rabinkarp::is_fast(haystack, needle) {
948 rabinkarp::find_with(&self.ninfo.nhash, haystack, needle)
949 } else {
950 self.find_tw(tw, state, haystack, needle)
951 }
952 }
953 #[cfg(all(not(miri), memchr_runtime_simd))]
954 GenericSIMD128(ref gs) => {
955 // The SIMD matcher can't handle particularly short haystacks,
956 // so we fall back to RK in these cases.
957 if haystack.len() < gs.min_haystack_len() {
958 rabinkarp::find_with(&self.ninfo.nhash, haystack, needle)
959 } else {
960 gs.find(haystack, needle)
961 }
962 }
963 #[cfg(all(
964 not(miri),
965 target_arch = "x86_64",
966 memchr_runtime_simd
967 ))]
968 GenericSIMD256(ref gs) => {
969 // The SIMD matcher can't handle particularly short haystacks,
970 // so we fall back to RK in these cases.
971 if haystack.len() < gs.min_haystack_len() {
972 rabinkarp::find_with(&self.ninfo.nhash, haystack, needle)
973 } else {
974 gs.find(haystack, needle)
975 }
976 }
977 }
978 }
979
980 /// Calls Two-Way on the given haystack/needle.
981 ///
982 /// This is marked as unlineable since it seems to have a better overall
983 /// effect on benchmarks. However, this is one of those cases where
984 /// inlining it results an improvement in other benchmarks too, so I
985 /// suspect we just don't have enough data yet to make the right call here.
986 ///
987 /// I suspect the main problem is that this function contains two different
988 /// inlined copies of Two-Way: one with and one without prefilters enabled.
989 #[inline(never)]
990 fn find_tw(
991 &self,
992 tw: &twoway::Forward,
993 state: &mut PrefilterState,
994 haystack: &[u8],
995 needle: &[u8],
996 ) -> Option<usize> {
997 if let Some(prefn) = self.prefn {
998 // We used to look at the length of a haystack here. That is, if
999 // it was too small, then don't bother with the prefilter. But two
1000 // things changed: the prefilter falls back to memchr for small
1001 // haystacks, and, above, Rabin-Karp is employed for tiny haystacks
1002 // anyway.
1003 if state.is_effective() {
1004 let mut pre = Pre { state, prefn, ninfo: &self.ninfo };
1005 return tw.find(Some(&mut pre), haystack, needle);
1006 }
1007 }
1008 tw.find(None, haystack, needle)
1009 }
1010}
1011
1012impl NeedleInfo {
1013 pub(crate) fn new(needle: &[u8]) -> NeedleInfo {
1014 NeedleInfo {
1015 rarebytes: RareNeedleBytes::forward(needle),
1016 nhash: NeedleHash::forward(needle),
1017 }
1018 }
1019}
1020
1021/// The internal implementation of a reverse substring searcher.
1022///
1023/// See the forward searcher docs for more details. Currently, the reverse
1024/// searcher is considerably simpler since it lacks prefilter support. This
1025/// was done because it adds a lot of code, and more surface area to test. And
1026/// in particular, it's not clear whether a prefilter on reverse searching is
1027/// worth it. (If you have a compelling use case, please file an issue!)
1028#[derive(Clone, Debug)]
1029struct SearcherRev<'n> {
1030 /// The actual needle we're searching for.
1031 needle: CowBytes<'n>,
1032 /// A Rabin-Karp hash of the needle.
1033 nhash: NeedleHash,
1034 /// The actual substring implementation in use.
1035 kind: SearcherRevKind,
1036}
1037
1038#[derive(Clone, Debug)]
1039enum SearcherRevKind {
1040 /// A special case for empty needles. An empty needle always matches, even
1041 /// in an empty haystack.
1042 Empty,
1043 /// This is used whenever the needle is a single byte. In this case, we
1044 /// always use memchr.
1045 OneByte(u8),
1046 /// Two-Way is the generic work horse and is what provides our additive
1047 /// linear time guarantee. In general, it's used when the needle is bigger
1048 /// than 8 bytes or so.
1049 TwoWay(twoway::Reverse),
1050}
1051
1052impl<'n> SearcherRev<'n> {
1053 fn new(needle: &'n [u8]) -> SearcherRev<'n> {
1054 use self::SearcherRevKind::*;
1055
1056 let kind = if needle.len() == 0 {
1057 Empty
1058 } else if needle.len() == 1 {
1059 OneByte(needle[0])
1060 } else {
1061 TwoWay(twoway::Reverse::new(needle))
1062 };
1063 SearcherRev {
1064 needle: CowBytes::new(needle),
1065 nhash: NeedleHash::reverse(needle),
1066 kind,
1067 }
1068 }
1069
1070 fn needle(&self) -> &[u8] {
1071 self.needle.as_slice()
1072 }
1073
1074 fn as_ref(&self) -> SearcherRev<'_> {
1075 use self::SearcherRevKind::*;
1076
1077 let kind = match self.kind {
1078 Empty => Empty,
1079 OneByte(b) => OneByte(b),
1080 TwoWay(tw) => TwoWay(tw),
1081 };
1082 SearcherRev {
1083 needle: CowBytes::new(self.needle()),
1084 nhash: self.nhash,
1085 kind,
1086 }
1087 }
1088
1089 #[cfg(feature = "std")]
1090 fn into_owned(self) -> SearcherRev<'static> {
1091 use self::SearcherRevKind::*;
1092
1093 let kind = match self.kind {
1094 Empty => Empty,
1095 OneByte(b) => OneByte(b),
1096 TwoWay(tw) => TwoWay(tw),
1097 };
1098 SearcherRev {
1099 needle: self.needle.into_owned(),
1100 nhash: self.nhash,
1101 kind,
1102 }
1103 }
1104
1105 /// Implements reverse substring search by selecting the implementation
1106 /// chosen at construction and executing it on the given haystack with the
1107 /// prefilter's current state of effectiveness.
1108 #[inline(always)]
1109 fn rfind(&self, haystack: &[u8]) -> Option<usize> {
1110 use self::SearcherRevKind::*;
1111
1112 let needle = self.needle();
1113 if haystack.len() < needle.len() {
1114 return None;
1115 }
1116 match self.kind {
1117 Empty => Some(haystack.len()),
1118 OneByte(b) => crate::memrchr(b, haystack),
1119 TwoWay(ref tw) => {
1120 // For very short haystacks (e.g., where the prefilter probably
1121 // can't run), it's faster to just run RK.
1122 if rabinkarp::is_fast(haystack, needle) {
1123 rabinkarp::rfind_with(&self.nhash, haystack, needle)
1124 } else {
1125 tw.rfind(haystack, needle)
1126 }
1127 }
1128 }
1129 }
1130}
1131
1132/// This module defines some generic quickcheck properties useful for testing
1133/// any substring search algorithm. It also runs those properties for the
1134/// top-level public API memmem routines. (The properties are also used to
1135/// test various substring search implementations more granularly elsewhere as
1136/// well.)
1137#[cfg(all(test, feature = "std", not(miri)))]
1138mod proptests {
1139 // N.B. This defines the quickcheck tests using the properties defined
1140 // below. Because of macro-visibility weirdness, the actual macro is
1141 // defined at the top of this file.
1142 define_memmem_quickcheck_tests!(super::find, super::rfind);
1143
1144 /// Check that every prefix of the given byte string is a substring.
1145 pub(crate) fn prefix_is_substring(
1146 reverse: bool,
1147 bs: &[u8],
1148 mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
1149 ) -> bool {
1150 if bs.is_empty() {
1151 return true;
1152 }
1153 for i in 0..(bs.len() - 1) {
1154 let prefix = &bs[..i];
1155 if reverse {
1156 assert_eq!(naive_rfind(bs, prefix), search(bs, prefix));
1157 } else {
1158 assert_eq!(naive_find(bs, prefix), search(bs, prefix));
1159 }
1160 }
1161 true
1162 }
1163
1164 /// Check that every suffix of the given byte string is a substring.
1165 pub(crate) fn suffix_is_substring(
1166 reverse: bool,
1167 bs: &[u8],
1168 mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
1169 ) -> bool {
1170 if bs.is_empty() {
1171 return true;
1172 }
1173 for i in 0..(bs.len() - 1) {
1174 let suffix = &bs[i..];
1175 if reverse {
1176 assert_eq!(naive_rfind(bs, suffix), search(bs, suffix));
1177 } else {
1178 assert_eq!(naive_find(bs, suffix), search(bs, suffix));
1179 }
1180 }
1181 true
1182 }
1183
1184 /// Check that naive substring search matches the result of the given search
1185 /// algorithm.
1186 pub(crate) fn matches_naive(
1187 reverse: bool,
1188 haystack: &[u8],
1189 needle: &[u8],
1190 mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
1191 ) -> bool {
1192 if reverse {
1193 naive_rfind(haystack, needle) == search(haystack, needle)
1194 } else {
1195 naive_find(haystack, needle) == search(haystack, needle)
1196 }
1197 }
1198
1199 /// Naively search forwards for the given needle in the given haystack.
1200 fn naive_find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
1201 if needle.is_empty() {
1202 return Some(0);
1203 } else if haystack.len() < needle.len() {
1204 return None;
1205 }
1206 for i in 0..(haystack.len() - needle.len() + 1) {
1207 if needle == &haystack[i..i + needle.len()] {
1208 return Some(i);
1209 }
1210 }
1211 None
1212 }
1213
1214 /// Naively search in reverse for the given needle in the given haystack.
1215 fn naive_rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> {
1216 if needle.is_empty() {
1217 return Some(haystack.len());
1218 } else if haystack.len() < needle.len() {
1219 return None;
1220 }
1221 for i in (0..(haystack.len() - needle.len() + 1)).rev() {
1222 if needle == &haystack[i..i + needle.len()] {
1223 return Some(i);
1224 }
1225 }
1226 None
1227 }
1228}
1229
1230/// This module defines some hand-written "simple" substring tests. It
1231/// also provides routines for easily running them on any substring search
1232/// implementation.
1233#[cfg(test)]
1234mod testsimples {
1235 define_memmem_simple_tests!(super::find, super::rfind);
1236
1237 /// Each test is a (needle, haystack, expected_fwd, expected_rev) tuple.
1238 type SearchTest =
1239 (&'static str, &'static str, Option<usize>, Option<usize>);
1240
1241 const SEARCH_TESTS: &'static [SearchTest] = &[
1242 ("", "", Some(0), Some(0)),
1243 ("", "a", Some(0), Some(1)),
1244 ("", "ab", Some(0), Some(2)),
1245 ("", "abc", Some(0), Some(3)),
1246 ("a", "", None, None),
1247 ("a", "a", Some(0), Some(0)),
1248 ("a", "aa", Some(0), Some(1)),
1249 ("a", "ba", Some(1), Some(1)),
1250 ("a", "bba", Some(2), Some(2)),
1251 ("a", "bbba", Some(3), Some(3)),
1252 ("a", "bbbab", Some(3), Some(3)),
1253 ("a", "bbbabb", Some(3), Some(3)),
1254 ("a", "bbbabbb", Some(3), Some(3)),
1255 ("a", "bbbbbb", None, None),
1256 ("ab", "", None, None),
1257 ("ab", "a", None, None),
1258 ("ab", "b", None, None),
1259 ("ab", "ab", Some(0), Some(0)),
1260 ("ab", "aab", Some(1), Some(1)),
1261 ("ab", "aaab", Some(2), Some(2)),
1262 ("ab", "abaab", Some(0), Some(3)),
1263 ("ab", "baaab", Some(3), Some(3)),
1264 ("ab", "acb", None, None),
1265 ("ab", "abba", Some(0), Some(0)),
1266 ("abc", "ab", None, None),
1267 ("abc", "abc", Some(0), Some(0)),
1268 ("abc", "abcz", Some(0), Some(0)),
1269 ("abc", "abczz", Some(0), Some(0)),
1270 ("abc", "zabc", Some(1), Some(1)),
1271 ("abc", "zzabc", Some(2), Some(2)),
1272 ("abc", "azbc", None, None),
1273 ("abc", "abzc", None, None),
1274 ("abczdef", "abczdefzzzzzzzzzzzzzzzzzzzz", Some(0), Some(0)),
1275 ("abczdef", "zzzzzzzzzzzzzzzzzzzzabczdef", Some(20), Some(20)),
1276 ("xyz", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaxyz", Some(32), Some(32)),
1277 // Failures caught by quickcheck.
1278 ("\u{0}\u{15}", "\u{0}\u{15}\u{15}\u{0}", Some(0), Some(0)),
1279 ("\u{0}\u{1e}", "\u{1e}\u{0}", None, None),
1280 ];
1281
1282 /// Run the substring search tests. `search` should be a closure that
1283 /// accepts a haystack and a needle and returns the starting position
1284 /// of the first occurrence of needle in the haystack, or `None` if one
1285 /// doesn't exist.
1286 pub(crate) fn run_search_tests_fwd(
1287 mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
1288 ) {
1289 for &(needle, haystack, expected_fwd, _) in SEARCH_TESTS {
1290 let (n, h) = (needle.as_bytes(), haystack.as_bytes());
1291 assert_eq!(
1292 expected_fwd,
1293 search(h, n),
1294 "needle: {:?}, haystack: {:?}, expected: {:?}",
1295 n,
1296 h,
1297 expected_fwd
1298 );
1299 }
1300 }
1301
1302 /// Run the substring search tests. `search` should be a closure that
1303 /// accepts a haystack and a needle and returns the starting position of
1304 /// the last occurrence of needle in the haystack, or `None` if one doesn't
1305 /// exist.
1306 pub(crate) fn run_search_tests_rev(
1307 mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
1308 ) {
1309 for &(needle, haystack, _, expected_rev) in SEARCH_TESTS {
1310 let (n, h) = (needle.as_bytes(), haystack.as_bytes());
1311 assert_eq!(
1312 expected_rev,
1313 search(h, n),
1314 "needle: {:?}, haystack: {:?}, expected: {:?}",
1315 n,
1316 h,
1317 expected_rev
1318 );
1319 }
1320 }
1321}
1322