mod.rs source code [crates/memchr-2.5.0/src/memmem/mod.rs]

1	/!*
2	This module provides forward and reverse substring search routines.
3
4	Unlike the standard library's substring search routines, these work on
5	arbitrary bytes. For all non-empty needles, these routines will report exactly
6	the same values as the corresponding routines in the standard library. For
7	the empty needle, the standard library reports matches only at valid UTF-8
8	boundaries, where as these routines will report matches at every position.
9
10	Other than being able to work on arbitrary bytes, the primary reason to prefer
11	these routines over the standard library routines is that these will generally
12	be faster. In some cases, significantly so.
13
14	# Example: iterating over substring matches
15
16	This example shows how to use [`find_iter`] to find occurrences of a substring
17	in a haystack.
18
19	```
20	use memchr::memmem;
21
22	let haystack = b"foo bar foo baz foo";
23
24	let mut it = memmem::find_iter(haystack, "foo");
25	assert_eq!(Some(`0`), it.next());
26	assert_eq!(Some(`8`), it.next());
27	assert_eq!(Some(`16`), it.next());
28	assert_eq!(None, it.next());
29	```
30
31	# Example: iterating over substring matches in reverse
32
33	This example shows how to use [`rfind_iter`] to find occurrences of a substring
34	in a haystack starting from the end of the haystack.
35
36	NOTE: This module does not implement double ended iterators, so reverse
37	searches aren't done by calling `rev` on a forward iterator.
38
39	```
40	use memchr::memmem;
41
42	let haystack = b"foo bar foo baz foo";
43
44	let mut it = memmem::rfind_iter(haystack, "foo");
45	assert_eq!(Some(`16`), it.next());
46	assert_eq!(Some(`8`), it.next());
47	assert_eq!(Some(`0`), it.next());
48	assert_eq!(None, it.next());
49	```
50
51	# Example: repeating a search for the same needle
52
53	It may be possible for the overhead of constructing a substring searcher to be
54	measurable in some workloads. In cases where the same needle is used to search
55	many haystacks, it is possible to do construction once and thus to avoid it for
56	subsequent searches. This can be done with a [`Finder`] (or a [`FinderRev`] for
57	reverse searches).
58
59	```
60	use memchr::memmem;
61
62	let finder = memmem::Finder::new("foo");
63
64	assert_eq!(Some(`4`), finder.find(b"baz foo quux"));
65	assert_eq!(None, finder.find(b"quux baz bar"));
66	```
67	*/
68
69	pub use self::prefilter::Prefilter;
70
71	use crate::{
72	cow::CowBytes,
73	memmem::{
74	prefilter::{Pre, PrefilterFn, PrefilterState},
75	rabinkarp::NeedleHash,
76	rarebytes::RareNeedleBytes,
77	},
78	};
79
80	/// Defines a suite of quickcheck properties for forward and reverse
81	/// substring searching.
82	///
83	/// This is defined in this specific spot so that it can be used freely among
84	/// the different substring search implementations. I couldn't be bothered to
85	/// fight with the macro-visibility rules enough to figure out how to stuff it
86	/// somewhere more convenient.
87	#[cfg(all(test, feature = "std"))]
88	macro_rules! define_memmem_quickcheck_tests {
89	($fwd:expr, $rev:expr) => {
90	use crate::memmem::proptests;
91
92	quickcheck::quickcheck! {
93	fn qc_fwd_prefix_is_substring(bs: Vec<u8>) -> bool {
94	proptests::prefix_is_substring(`false`, &bs, $fwd)
95	}
96
97	fn qc_fwd_suffix_is_substring(bs: Vec<u8>) -> bool {
98	proptests::suffix_is_substring(`false`, &bs, $fwd)
99	}
100
101	fn qc_fwd_matches_naive(
102	haystack: Vec<u8>,
103	needle: Vec<u8>
104	) -> bool {
105	proptests::matches_naive(`false`, &haystack, &needle, $fwd)
106	}
107
108	fn qc_rev_prefix_is_substring(bs: Vec<u8>) -> bool {
109	proptests::prefix_is_substring(`true`, &bs, $rev)
110	}
111
112	fn qc_rev_suffix_is_substring(bs: Vec<u8>) -> bool {
113	proptests::suffix_is_substring(`true`, &bs, $rev)
114	}
115
116	fn qc_rev_matches_naive(
117	haystack: Vec<u8>,
118	needle: Vec<u8>
119	) -> bool {
120	proptests::matches_naive(`true`, &haystack, &needle, $rev)
121	}
122	}
123	};
124	}
125
126	/// Defines a suite of "simple" hand-written tests for a substring
127	/// implementation.
128	///
129	/// This is defined here for the same reason that
130	/// define_memmem_quickcheck_tests is defined here.
131	#[cfg(test)]
132	macro_rules! define_memmem_simple_tests {
133	($fwd:expr, $rev:expr) => {
134	use crate::memmem::testsimples;
135
136	#[test]
137	fn simple_forward() {
138	testsimples::run_search_tests_fwd($fwd);
139	}
140
141	#[test]
142	fn simple_reverse() {
143	testsimples::run_search_tests_rev($rev);
144	}
145	};
146	}
147
148	mod byte_frequencies;
149	#[cfg(memchr_runtime_simd)]
150	mod genericsimd;
151	mod prefilter;
152	mod rabinkarp;
153	mod rarebytes;
154	mod twoway;
155	mod util;
156	#[cfg(memchr_runtime_simd)]
157	mod vector;
158	#[cfg(all(memchr_runtime_wasm128))]
159	mod wasm;
160	#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
161	mod x86;
162
163	/// Returns an iterator over all non-overlapping occurrences of a substring in
164	/// a haystack.
165	///
166	/// # Complexity
167	///
168	/// This routine is guaranteed to have worst case linear time complexity
169	/// with respect to both the needle and the haystack. That is, this runs
170	/// in `O(needle.len() + haystack.len())` time.
171	///
172	/// This routine is also guaranteed to have worst case constant space
173	/// complexity.
174	///
175	/// # Examples
176	///
177	/// Basic usage:
178	///
179	/// ```
180	/// use memchr::memmem;
181	///
182	/// let haystack = b"foo bar foo baz foo";
183	/// let mut it = memmem::find_iter(haystack, b"foo");
184	/// assert_eq!(Some(`0`), it.next());
185	/// assert_eq!(Some(`8`), it.next());
186	/// assert_eq!(Some(`16`), it.next());
187	/// assert_eq!(None, it.next());
188	/// ```
189	#[inline]
190	pub fn find_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>(
191	haystack: &'h [u8],
192	needle: &'n N,
193	) -> FindIter<'h, 'n> {
194	FindIter::new(haystack, Finder::new(needle))
195	}
196
197	/// Returns a reverse iterator over all non-overlapping occurrences of a
198	/// substring in a haystack.
199	///
200	/// # Complexity
201	///
202	/// This routine is guaranteed to have worst case linear time complexity
203	/// with respect to both the needle and the haystack. That is, this runs
204	/// in `O(needle.len() + haystack.len())` time.
205	///
206	/// This routine is also guaranteed to have worst case constant space
207	/// complexity.
208	///
209	/// # Examples
210	///
211	/// Basic usage:
212	///
213	/// ```
214	/// use memchr::memmem;
215	///
216	/// let haystack = b"foo bar foo baz foo";
217	/// let mut it = memmem::rfind_iter(haystack, b"foo");
218	/// assert_eq!(Some(`16`), it.next());
219	/// assert_eq!(Some(`8`), it.next());
220	/// assert_eq!(Some(`0`), it.next());
221	/// assert_eq!(None, it.next());
222	/// ```
223	#[inline]
224	pub fn rfind_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>(
225	haystack: &'h [u8],
226	needle: &'n N,
227	) -> FindRevIter<'h, 'n> {
228	FindRevIter::new(haystack, finder:FinderRev::new(needle))
229	}
230
231	/// Returns the index of the first occurrence of the given needle.
232	///
233	/// Note that if you're are searching for the same needle in many different
234	/// small haystacks, it may be faster to initialize a [`Finder`] once,
235	/// and reuse it for each search.
236	///
237	/// # Complexity
238	///
239	/// This routine is guaranteed to have worst case linear time complexity
240	/// with respect to both the needle and the haystack. That is, this runs
241	/// in `O(needle.len() + haystack.len())` time.
242	///
243	/// This routine is also guaranteed to have worst case constant space
244	/// complexity.
245	///
246	/// # Examples
247	///
248	/// Basic usage:
249	///
250	/// ```
251	/// use memchr::memmem;
252	///
253	/// let haystack = b"foo bar baz";
254	/// assert_eq!(Some(`0`), memmem::find(haystack, b"foo"));
255	/// assert_eq!(Some(`4`), memmem::find(haystack, b"bar"));
256	/// assert_eq!(None, memmem::find(haystack, b"quux"));
257	/// ```
258	#[inline]
259	pub fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
260	if haystack.len() < `64` {
261	rabinkarp::find(haystack, needle)
262	} else {
263	Finder::new(needle).find(haystack)
264	}
265	}
266
267	/// Returns the index of the last occurrence of the given needle.
268	///
269	/// Note that if you're are searching for the same needle in many different
270	/// small haystacks, it may be faster to initialize a [`FinderRev`] once,
271	/// and reuse it for each search.
272	///
273	/// # Complexity
274	///
275	/// This routine is guaranteed to have worst case linear time complexity
276	/// with respect to both the needle and the haystack. That is, this runs
277	/// in `O(needle.len() + haystack.len())` time.
278	///
279	/// This routine is also guaranteed to have worst case constant space
280	/// complexity.
281	///
282	/// # Examples
283	///
284	/// Basic usage:
285	///
286	/// ```
287	/// use memchr::memmem;
288	///
289	/// let haystack = b"foo bar baz";
290	/// assert_eq!(Some(`0`), memmem::rfind(haystack, b"foo"));
291	/// assert_eq!(Some(`4`), memmem::rfind(haystack, b"bar"));
292	/// assert_eq!(Some(`8`), memmem::rfind(haystack, b"ba"));
293	/// assert_eq!(None, memmem::rfind(haystack, b"quux"));
294	/// ```
295	#[inline]
296	pub fn rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> {
297	if haystack.len() < `64` {
298	rabinkarp::rfind(haystack, needle)
299	} else {
300	FinderRev::new(needle).rfind(haystack)
301	}
302	}
303
304	/// An iterator over non-overlapping substring matches.
305	///
306	/// Matches are reported by the byte offset at which they begin.
307	///
308	/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
309	/// needle.
310	#[derive(Debug)]
311	pub struct FindIter<'h, 'n> {
312	haystack: &'h [u8],
313	prestate: PrefilterState,
314	finder: Finder<'n>,
315	pos: usize,
316	}
317
318	impl<'h, 'n> FindIter<'h, 'n> {
319	#[inline(always)]
320	pub(crate) fn new(
321	haystack: &'h [u8],
322	finder: Finder<'n>,
323	) -> FindIter<'h, 'n> {
324	let prestate = finder.searcher.prefilter_state();
325	FindIter { haystack, prestate, finder, pos: `0` }
326	}
327
328	/// Convert this iterator into its owned variant, such that it no longer
329	/// borrows the finder and needle.
330	///
331	/// If this is already an owned iterator, then this is a no-op. Otherwise,
332	/// this copies the needle.
333	///
334	/// This is only available when the `std` feature is enabled.
335	#[cfg(feature = "std")]
336	#[inline]
337	pub fn into_owned(self) -> FindIter<'h, 'static> {
338	FindIter {
339	haystack: self.haystack,
340	prestate: self.prestate,
341	finder: self.finder.into_owned(),
342	pos: self.pos,
343	}
344	}
345	}
346
347	impl<'h, 'n> Iterator for FindIter<'h, 'n> {
348	type Item = usize;
349
350	fn next(&mut self) -> Option<usize> {
351	if self.pos > self.haystack.len() {
352	return None;
353	}
354	let result: Option = self
355	.finder
356	.searcher
357	.find(&mut self.prestate, &self.haystack[self.pos..]);
358	match result {
359	None => None,
360	Some(i: usize) => {
361	let pos: usize = self.pos + i;
362	self.pos = pos + core::cmp::max(v1:`1`, self.finder.needle().len());
363	Some(pos)
364	}
365	}
366	}
367	}
368
369	/// An iterator over non-overlapping substring matches in reverse.
370	///
371	/// Matches are reported by the byte offset at which they begin.
372	///
373	/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
374	/// needle.
375	#[derive(Debug)]
376	pub struct FindRevIter<'h, 'n> {
377	haystack: &'h [u8],
378	finder: FinderRev<'n>,
379	/// When searching with an empty needle, this gets set to `None` after
380	/// we've yielded the last element at `0`.
381	pos: Option<usize>,
382	}
383
384	impl<'h, 'n> FindRevIter<'h, 'n> {
385	#[inline(always)]
386	pub(crate) fn new(
387	haystack: &'h [u8],
388	finder: FinderRev<'n>,
389	) -> FindRevIter<'h, 'n> {
390	let pos = Some(haystack.len());
391	FindRevIter { haystack, finder, pos }
392	}
393
394	/// Convert this iterator into its owned variant, such that it no longer
395	/// borrows the finder and needle.
396	///
397	/// If this is already an owned iterator, then this is a no-op. Otherwise,
398	/// this copies the needle.
399	///
400	/// This is only available when the `std` feature is enabled.
401	#[cfg(feature = "std")]
402	#[inline]
403	pub fn into_owned(self) -> FindRevIter<'h, 'static> {
404	FindRevIter {
405	haystack: self.haystack,
406	finder: self.finder.into_owned(),
407	pos: self.pos,
408	}
409	}
410	}
411
412	impl<'h, 'n> Iterator for FindRevIter<'h, 'n> {
413	type Item = usize;
414
415	fn next(&mut self) -> Option<usize> {
416	let pos: usize = match self.pos {
417	None => return None,
418	Some(pos: usize) => pos,
419	};
420	let result: Option = self.finder.rfind(&self.haystack[..pos]);
421	match result {
422	None => None,
423	Some(i: usize) => {
424	if pos == i {
425	self.pos = pos.checked_sub(`1`);
426	} else {
427	self.pos = Some(i);
428	}
429	Some(i)
430	}
431	}
432	}
433	}
434
435	/// A single substring searcher fixed to a particular needle.
436	///
437	/// The purpose of this type is to permit callers to construct a substring
438	/// searcher that can be used to search haystacks without the overhead of
439	/// constructing the searcher in the first place. This is a somewhat niche
440	/// concern when it's necessary to re-use the same needle to search multiple
441	/// different haystacks with as little overhead as possible. In general, using
442	/// [`find`] is good enough, but `Finder` is useful when you can meaningfully
443	/// observe searcher construction time in a profile.
444	///
445	/// When the `std` feature is enabled, then this type has an `into_owned`
446	/// version which permits building a `Finder` that is not connected to
447	/// the lifetime of its needle.
448	#[derive(Clone, Debug)]
449	pub struct Finder<'n> {
450	searcher: Searcher<'n>,
451	}
452
453	impl<'n> Finder<'n> {
454	/// Create a new finder for the given needle.
455	#[inline]
456	pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> Finder<'n> {
457	FinderBuilder::new().build_forward(needle)
458	}
459
460	/// Returns the index of the first occurrence of this needle in the given
461	/// haystack.
462	///
463	/// # Complexity
464	///
465	/// This routine is guaranteed to have worst case linear time complexity
466	/// with respect to both the needle and the haystack. That is, this runs
467	/// in `O(needle.len() + haystack.len())` time.
468	///
469	/// This routine is also guaranteed to have worst case constant space
470	/// complexity.
471	///
472	/// # Examples
473	///
474	/// Basic usage:
475	///
476	/// ```
477	/// use memchr::memmem::Finder;
478	///
479	/// let haystack = b"foo bar baz";
480	/// assert_eq!(Some(`0`), Finder::new("foo").find(haystack));
481	/// assert_eq!(Some(`4`), Finder::new("bar").find(haystack));
482	/// assert_eq!(None, Finder::new("quux").find(haystack));
483	/// ```
484	pub fn find(&self, haystack: &[u8]) -> Option<usize> {
485	self.searcher.find(&mut self.searcher.prefilter_state(), haystack)
486	}
487
488	/// Returns an iterator over all occurrences of a substring in a haystack.
489	///
490	/// # Complexity
491	///
492	/// This routine is guaranteed to have worst case linear time complexity
493	/// with respect to both the needle and the haystack. That is, this runs
494	/// in `O(needle.len() + haystack.len())` time.
495	///
496	/// This routine is also guaranteed to have worst case constant space
497	/// complexity.
498	///
499	/// # Examples
500	///
501	/// Basic usage:
502	///
503	/// ```
504	/// use memchr::memmem::Finder;
505	///
506	/// let haystack = b"foo bar foo baz foo";
507	/// let finder = Finder::new(b"foo");
508	/// let mut it = finder.find_iter(haystack);
509	/// assert_eq!(Some(`0`), it.next());
510	/// assert_eq!(Some(`8`), it.next());
511	/// assert_eq!(Some(`16`), it.next());
512	/// assert_eq!(None, it.next());
513	/// ```
514	#[inline]
515	pub fn find_iter<'a, 'h>(
516	&'a self,
517	haystack: &'h [u8],
518	) -> FindIter<'h, 'a> {
519	FindIter::new(haystack, self.as_ref())
520	}
521
522	/// Convert this finder into its owned variant, such that it no longer
523	/// borrows the needle.
524	///
525	/// If this is already an owned finder, then this is a no-op. Otherwise,
526	/// this copies the needle.
527	///
528	/// This is only available when the `std` feature is enabled.
529	#[cfg(feature = "std")]
530	#[inline]
531	pub fn into_owned(self) -> Finder<'static> {
532	Finder { searcher: self.searcher.into_owned() }
533	}
534
535	/// Convert this finder into its borrowed variant.
536	///
537	/// This is primarily useful if your finder is owned and you'd like to
538	/// store its borrowed variant in some intermediate data structure.
539	///
540	/// Note that the lifetime parameter of the returned finder is tied to the
541	/// lifetime of `self`, and may be shorter than the `'n` lifetime of the
542	/// needle itself. Namely, a finder's needle can be either borrowed or
543	/// owned, so the lifetime of the needle returned must necessarily be the
544	/// shorter of the two.
545	#[inline]
546	pub fn as_ref(&self) -> Finder<'_> {
547	Finder { searcher: self.searcher.as_ref() }
548	}
549
550	/// Returns the needle that this finder searches for.
551	///
552	/// Note that the lifetime of the needle returned is tied to the lifetime
553	/// of the finder, and may be shorter than the `'n` lifetime. Namely, a
554	/// finder's needle can be either borrowed or owned, so the lifetime of the
555	/// needle returned must necessarily be the shorter of the two.
556	#[inline]
557	pub fn needle(&self) -> &[u8] {
558	self.searcher.needle()
559	}
560	}
561
562	/// A single substring reverse searcher fixed to a particular needle.
563	///
564	/// The purpose of this type is to permit callers to construct a substring
565	/// searcher that can be used to search haystacks without the overhead of
566	/// constructing the searcher in the first place. This is a somewhat niche
567	/// concern when it's necessary to re-use the same needle to search multiple
568	/// different haystacks with as little overhead as possible. In general,
569	/// using [`rfind`] is good enough, but `FinderRev` is useful when you can
570	/// meaningfully observe searcher construction time in a profile.
571	///
572	/// When the `std` feature is enabled, then this type has an `into_owned`
573	/// version which permits building a `FinderRev` that is not connected to
574	/// the lifetime of its needle.
575	#[derive(Clone, Debug)]
576	pub struct FinderRev<'n> {
577	searcher: SearcherRev<'n>,
578	}
579
580	impl<'n> FinderRev<'n> {
581	/// Create a new reverse finder for the given needle.
582	#[inline]
583	pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> FinderRev<'n> {
584	FinderBuilder::new().build_reverse(needle)
585	}
586
587	/// Returns the index of the last occurrence of this needle in the given
588	/// haystack.
589	///
590	/// The haystack may be any type that can be cheaply converted into a
591	/// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
592	///
593	/// # Complexity
594	///
595	/// This routine is guaranteed to have worst case linear time complexity
596	/// with respect to both the needle and the haystack. That is, this runs
597	/// in `O(needle.len() + haystack.len())` time.
598	///
599	/// This routine is also guaranteed to have worst case constant space
600	/// complexity.
601	///
602	/// # Examples
603	///
604	/// Basic usage:
605	///
606	/// ```
607	/// use memchr::memmem::FinderRev;
608	///
609	/// let haystack = b"foo bar baz";
610	/// assert_eq!(Some(`0`), FinderRev::new("foo").rfind(haystack));
611	/// assert_eq!(Some(`4`), FinderRev::new("bar").rfind(haystack));
612	/// assert_eq!(None, FinderRev::new("quux").rfind(haystack));
613	/// ```
614	pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
615	self.searcher.rfind(haystack.as_ref())
616	}
617
618	/// Returns a reverse iterator over all occurrences of a substring in a
619	/// haystack.
620	///
621	/// # Complexity
622	///
623	/// This routine is guaranteed to have worst case linear time complexity
624	/// with respect to both the needle and the haystack. That is, this runs
625	/// in `O(needle.len() + haystack.len())` time.
626	///
627	/// This routine is also guaranteed to have worst case constant space
628	/// complexity.
629	///
630	/// # Examples
631	///
632	/// Basic usage:
633	///
634	/// ```
635	/// use memchr::memmem::FinderRev;
636	///
637	/// let haystack = b"foo bar foo baz foo";
638	/// let finder = FinderRev::new(b"foo");
639	/// let mut it = finder.rfind_iter(haystack);
640	/// assert_eq!(Some(`16`), it.next());
641	/// assert_eq!(Some(`8`), it.next());
642	/// assert_eq!(Some(`0`), it.next());
643	/// assert_eq!(None, it.next());
644	/// ```
645	#[inline]
646	pub fn rfind_iter<'a, 'h>(
647	&'a self,
648	haystack: &'h [u8],
649	) -> FindRevIter<'h, 'a> {
650	FindRevIter::new(haystack, self.as_ref())
651	}
652
653	/// Convert this finder into its owned variant, such that it no longer
654	/// borrows the needle.
655	///
656	/// If this is already an owned finder, then this is a no-op. Otherwise,
657	/// this copies the needle.
658	///
659	/// This is only available when the `std` feature is enabled.
660	#[cfg(feature = "std")]
661	#[inline]
662	pub fn into_owned(self) -> FinderRev<'static> {
663	FinderRev { searcher: self.searcher.into_owned() }
664	}
665
666	/// Convert this finder into its borrowed variant.
667	///
668	/// This is primarily useful if your finder is owned and you'd like to
669	/// store its borrowed variant in some intermediate data structure.
670	///
671	/// Note that the lifetime parameter of the returned finder is tied to the
672	/// lifetime of `self`, and may be shorter than the `'n` lifetime of the
673	/// needle itself. Namely, a finder's needle can be either borrowed or
674	/// owned, so the lifetime of the needle returned must necessarily be the
675	/// shorter of the two.
676	#[inline]
677	pub fn as_ref(&self) -> FinderRev<'_> {
678	FinderRev { searcher: self.searcher.as_ref() }
679	}
680
681	/// Returns the needle that this finder searches for.
682	///
683	/// Note that the lifetime of the needle returned is tied to the lifetime
684	/// of the finder, and may be shorter than the `'n` lifetime. Namely, a
685	/// finder's needle can be either borrowed or owned, so the lifetime of the
686	/// needle returned must necessarily be the shorter of the two.
687	#[inline]
688	pub fn needle(&self) -> &[u8] {
689	self.searcher.needle()
690	}
691	}
692
693	/// A builder for constructing non-default forward or reverse memmem finders.
694	///
695	/// A builder is primarily useful for configuring a substring searcher.
696	/// Currently, the only configuration exposed is the ability to disable
697	/// heuristic prefilters used to speed up certain searches.
698	#[derive(Clone, Debug, Default)]
699	pub struct FinderBuilder {
700	config: SearcherConfig,
701	}
702
703	impl FinderBuilder {
704	/// Create a new finder builder with default settings.
705	pub fn new() -> FinderBuilder {
706	FinderBuilder::default()
707	}
708
709	/// Build a forward finder using the given needle from the current
710	/// settings.
711	pub fn build_forward<'n, B: ?Sized + AsRef<[u8]>>(
712	&self,
713	needle: &'n B,
714	) -> Finder<'n> {
715	Finder { searcher: Searcher::new(self.config, needle.as_ref()) }
716	}
717
718	/// Build a reverse finder using the given needle from the current
719	/// settings.
720	pub fn build_reverse<'n, B: ?Sized + AsRef<[u8]>>(
721	&self,
722	needle: &'n B,
723	) -> FinderRev<'n> {
724	FinderRev { searcher: SearcherRev::new(needle.as_ref()) }
725	}
726
727	/// Configure the prefilter setting for the finder.
728	///
729	/// See the documentation for [`Prefilter`] for more discussion on why
730	/// you might want to configure this.
731	pub fn prefilter(&mut self, prefilter: Prefilter) -> &mut FinderBuilder {
732	self.config.prefilter = prefilter;
733	self
734	}
735	}
736
737	/// The internal implementation of a forward substring searcher.
738	///
739	/// The reality is that this is a "meta" searcher. Namely, depending on a
740	/// variety of parameters (CPU support, target, needle size, haystack size and
741	/// even dynamic properties such as prefilter effectiveness), the actual
742	/// algorithm employed to do substring search may change.
743	#[derive(Clone, Debug)]
744	struct Searcher<'n> {
745	/// The actual needle we're searching for.
746	///
747	/// A CowBytes is like a Cow<[u8]>, except in no_std environments, it is
748	/// specialized to a single variant (the borrowed form).
749	needle: CowBytes<'n>,
750	/// A collection of facts computed on the needle that are useful for more
751	/// than one substring search algorithm.
752	ninfo: NeedleInfo,
753	/// A prefilter function, if it was deemed appropriate.
754	///
755	/// Some substring search implementations (like Two-Way) benefit greatly
756	/// if we can quickly find candidate starting positions for a match.
757	prefn: Option<PrefilterFn>,
758	/// The actual substring implementation in use.
759	kind: SearcherKind,
760	}
761
762	/// A collection of facts computed about a search needle.
763	///
764	/// We group these things together because it's useful to be able to hand them
765	/// to prefilters or substring algorithms that want them.
766	#[derive(Clone, Copy, Debug)]
767	pub(crate) struct NeedleInfo {
768	/// The offsets of "rare" bytes detected in the needle.
769	///
770	/// This is meant to be a heuristic in order to maximize the effectiveness
771	/// of vectorized code. Namely, vectorized code tends to focus on only
772	/// one or two bytes. If we pick bytes from the needle that occur
773	/// infrequently, then more time will be spent in the vectorized code and
774	/// will likely make the overall search (much) faster.
775	///
776	/// Of course, this is only a heuristic based on a background frequency
777	/// distribution of bytes. But it tends to work very well in practice.
778	pub(crate) rarebytes: RareNeedleBytes,
779	/// A Rabin-Karp hash of the needle.
780	///
781	/// This is store here instead of in a more specific Rabin-Karp search
782	/// since Rabin-Karp may be used even if another SearchKind corresponds
783	/// to some other search implementation. e.g., If measurements suggest RK
784	/// is faster in some cases or if a search implementation can't handle
785	/// particularly small haystack. (Moreover, we cannot use RK generally,
786	/// since its worst case time is multiplicative. Instead, we only use it
787	/// some small haystacks, where "small" is a constant.)
788	pub(crate) nhash: NeedleHash,
789	}
790
791	/// Configuration for substring search.
792	#[derive(Clone, Copy, Debug, Default)]
793	struct SearcherConfig {
794	/// This permits changing the behavior of the prefilter, since it can have
795	/// a variable impact on performance.
796	prefilter: Prefilter,
797	}
798
799	#[derive(Clone, Debug)]
800	enum SearcherKind {
801	/// A special case for empty needles. An empty needle always matches, even
802	/// in an empty haystack.
803	Empty,
804	/// This is used whenever the needle is a single byte. In this case, we
805	/// always use memchr.
806	OneByte(u8),
807	/// Two-Way is the generic work horse and is what provides our additive
808	/// linear time guarantee. In general, it's used when the needle is bigger
809	/// than 8 bytes or so.
810	TwoWay(twoway::Forward),
811	#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
812	GenericSIMD128(x86::sse::Forward),
813	#[cfg(memchr_runtime_wasm128)]
814	GenericSIMD128(wasm::Forward),
815	#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
816	GenericSIMD256(x86::avx::Forward),
817	}
818
819	impl<'n> Searcher<'n> {
820	fn new(config: SearcherConfig, needle: &'n [u8]) -> Searcher<'n> {
821	use self::SearcherKind::*;
822
823	let ninfo = NeedleInfo::new(needle);
824	let mk = \|kind: SearcherKind\| {
825	let prefn = prefilter::forward(
826	&config.prefilter,
827	&ninfo.rarebytes,
828	needle,
829	);
830	Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind }
831	};
832	if needle.len() == `0` {
833	return mk(Empty);
834	}
835	if needle.len() == `1` {
836	return mk(OneByte(needle[`0`]));
837	}
838	#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
839	{
840	if let Some(fwd) = x86::avx::Forward::new(&ninfo, needle) {
841	return mk(GenericSIMD256(fwd));
842	} else if let Some(fwd) = x86::sse::Forward::new(&ninfo, needle) {
843	return mk(GenericSIMD128(fwd));
844	}
845	}
846	#[cfg(all(target_arch = "wasm32", memchr_runtime_simd))]
847	{
848	if let Some(fwd) = wasm::Forward::new(&ninfo, needle) {
849	return mk(GenericSIMD128(fwd));
850	}
851	}
852
853	mk(TwoWay(twoway::Forward::new(needle)))
854	}
855
856	/// Return a fresh prefilter state that can be used with this searcher.
857	/// A prefilter state is used to track the effectiveness of a searcher's
858	/// prefilter for speeding up searches. Therefore, the prefilter state
859	/// should generally be reused on subsequent searches (such as in an
860	/// iterator). For searches on a different haystack, then a new prefilter
861	/// state should be used.
862	///
863	/// This always initializes a valid (but possibly inert) prefilter state
864	/// even if this searcher does not have a prefilter enabled.
865	fn prefilter_state(&self) -> PrefilterState {
866	if self.prefn.is_none() {
867	PrefilterState::inert()
868	} else {
869	PrefilterState::new()
870	}
871	}
872
873	fn needle(&self) -> &[u8] {
874	self.needle.as_slice()
875	}
876
877	fn as_ref(&self) -> Searcher<'_> {
878	use self::SearcherKind::*;
879
880	let kind = match self.kind {
881	Empty => Empty,
882	OneByte(b) => OneByte(b),
883	TwoWay(tw) => TwoWay(tw),
884	#[cfg(all(not(miri), memchr_runtime_simd))]
885	GenericSIMD128(gs) => GenericSIMD128(gs),
886	#[cfg(all(
887	not(miri),
888	target_arch = "x86_64",
889	memchr_runtime_simd
890	))]
891	GenericSIMD256(gs) => GenericSIMD256(gs),
892	};
893	Searcher {
894	needle: CowBytes::new(self.needle()),
895	ninfo: self.ninfo,
896	prefn: self.prefn,
897	kind,
898	}
899	}
900
901	#[cfg(feature = "std")]
902	fn into_owned(self) -> Searcher<'static> {
903	use self::SearcherKind::*;
904
905	let kind = match self.kind {
906	Empty => Empty,
907	OneByte(b) => OneByte(b),
908	TwoWay(tw) => TwoWay(tw),
909	#[cfg(all(not(miri), memchr_runtime_simd))]
910	GenericSIMD128(gs) => GenericSIMD128(gs),
911	#[cfg(all(
912	not(miri),
913	target_arch = "x86_64",
914	memchr_runtime_simd
915	))]
916	GenericSIMD256(gs) => GenericSIMD256(gs),
917	};
918	Searcher {
919	needle: self.needle.into_owned(),
920	ninfo: self.ninfo,
921	prefn: self.prefn,
922	kind,
923	}
924	}
925
926	/// Implements forward substring search by selecting the implementation
927	/// chosen at construction and executing it on the given haystack with the
928	/// prefilter's current state of effectiveness.
929	#[inline(always)]
930	fn find(
931	&self,
932	state: &mut PrefilterState,
933	haystack: &[u8],
934	) -> Option<usize> {
935	use self::SearcherKind::*;
936
937	let needle = self.needle();
938	if haystack.len() < needle.len() {
939	return None;
940	}
941	match self.kind {
942	Empty => Some(`0`),
943	OneByte(b) => crate::memchr(b, haystack),
944	TwoWay(ref tw) => {
945	// For very short haystacks (e.g., where the prefilter probably
946	// can't run), it's faster to just run RK.
947	if rabinkarp::is_fast(haystack, needle) {
948	rabinkarp::find_with(&self.ninfo.nhash, haystack, needle)
949	} else {
950	self.find_tw(tw, state, haystack, needle)
951	}
952	}
953	#[cfg(all(not(miri), memchr_runtime_simd))]
954	GenericSIMD128(ref gs) => {
955	// The SIMD matcher can't handle particularly short haystacks,
956	// so we fall back to RK in these cases.
957	if haystack.len() < gs.min_haystack_len() {
958	rabinkarp::find_with(&self.ninfo.nhash, haystack, needle)
959	} else {
960	gs.find(haystack, needle)
961	}
962	}
963	#[cfg(all(
964	not(miri),
965	target_arch = "x86_64",
966	memchr_runtime_simd
967	))]
968	GenericSIMD256(ref gs) => {
969	// The SIMD matcher can't handle particularly short haystacks,
970	// so we fall back to RK in these cases.
971	if haystack.len() < gs.min_haystack_len() {
972	rabinkarp::find_with(&self.ninfo.nhash, haystack, needle)
973	} else {
974	gs.find(haystack, needle)
975	}
976	}
977	}
978	}
979
980	/// Calls Two-Way on the given haystack/needle.
981	///
982	/// This is marked as unlineable since it seems to have a better overall
983	/// effect on benchmarks. However, this is one of those cases where
984	/// inlining it results an improvement in other benchmarks too, so I
985	/// suspect we just don't have enough data yet to make the right call here.
986	///
987	/// I suspect the main problem is that this function contains two different
988	/// inlined copies of Two-Way: one with and one without prefilters enabled.
989	#[inline(never)]
990	fn find_tw(
991	&self,
992	tw: &twoway::Forward,
993	state: &mut PrefilterState,
994	haystack: &[u8],
995	needle: &[u8],
996	) -> Option<usize> {
997	if let Some(prefn) = self.prefn {
998	// We used to look at the length of a haystack here. That is, if
999	// it was too small, then don't bother with the prefilter. But two
1000	// things changed: the prefilter falls back to memchr for small
1001	// haystacks, and, above, Rabin-Karp is employed for tiny haystacks
1002	// anyway.
1003	if state.is_effective() {
1004	let mut pre = Pre { state, prefn, ninfo: &self.ninfo };
1005	return tw.find(Some(&mut pre), haystack, needle);
1006	}
1007	}
1008	tw.find(None, haystack, needle)
1009	}
1010	}
1011
1012	impl NeedleInfo {
1013	pub(crate) fn new(needle: &[u8]) -> NeedleInfo {
1014	NeedleInfo {
1015	rarebytes: RareNeedleBytes::forward(needle),
1016	nhash: NeedleHash::forward(needle),
1017	}
1018	}
1019	}
1020
1021	/// The internal implementation of a reverse substring searcher.
1022	///
1023	/// See the forward searcher docs for more details. Currently, the reverse
1024	/// searcher is considerably simpler since it lacks prefilter support. This
1025	/// was done because it adds a lot of code, and more surface area to test. And
1026	/// in particular, it's not clear whether a prefilter on reverse searching is
1027	/// worth it. (If you have a compelling use case, please file an issue!)
1028	#[derive(Clone, Debug)]
1029	struct SearcherRev<'n> {
1030	/// The actual needle we're searching for.
1031	needle: CowBytes<'n>,
1032	/// A Rabin-Karp hash of the needle.
1033	nhash: NeedleHash,
1034	/// The actual substring implementation in use.
1035	kind: SearcherRevKind,
1036	}
1037
1038	#[derive(Clone, Debug)]
1039	enum SearcherRevKind {
1040	/// A special case for empty needles. An empty needle always matches, even
1041	/// in an empty haystack.
1042	Empty,
1043	/// This is used whenever the needle is a single byte. In this case, we
1044	/// always use memchr.
1045	OneByte(u8),
1046	/// Two-Way is the generic work horse and is what provides our additive
1047	/// linear time guarantee. In general, it's used when the needle is bigger
1048	/// than 8 bytes or so.
1049	TwoWay(twoway::Reverse),
1050	}
1051
1052	impl<'n> SearcherRev<'n> {
1053	fn new(needle: &'n [u8]) -> SearcherRev<'n> {
1054	use self::SearcherRevKind::*;
1055
1056	let kind = if needle.len() == `0` {
1057	Empty
1058	} else if needle.len() == `1` {
1059	OneByte(needle[`0`])
1060	} else {
1061	TwoWay(twoway::Reverse::new(needle))
1062	};
1063	SearcherRev {
1064	needle: CowBytes::new(needle),
1065	nhash: NeedleHash::reverse(needle),
1066	kind,
1067	}
1068	}
1069
1070	fn needle(&self) -> &[u8] {
1071	self.needle.as_slice()
1072	}
1073
1074	fn as_ref(&self) -> SearcherRev<'_> {
1075	use self::SearcherRevKind::*;
1076
1077	let kind = match self.kind {
1078	Empty => Empty,
1079	OneByte(b) => OneByte(b),
1080	TwoWay(tw) => TwoWay(tw),
1081	};
1082	SearcherRev {
1083	needle: CowBytes::new(self.needle()),
1084	nhash: self.nhash,
1085	kind,
1086	}
1087	}
1088
1089	#[cfg(feature = "std")]
1090	fn into_owned(self) -> SearcherRev<'static> {
1091	use self::SearcherRevKind::*;
1092
1093	let kind = match self.kind {
1094	Empty => Empty,
1095	OneByte(b) => OneByte(b),
1096	TwoWay(tw) => TwoWay(tw),
1097	};
1098	SearcherRev {
1099	needle: self.needle.into_owned(),
1100	nhash: self.nhash,
1101	kind,
1102	}
1103	}
1104
1105	/// Implements reverse substring search by selecting the implementation
1106	/// chosen at construction and executing it on the given haystack with the
1107	/// prefilter's current state of effectiveness.
1108	#[inline(always)]
1109	fn rfind(&self, haystack: &[u8]) -> Option<usize> {
1110	use self::SearcherRevKind::*;
1111
1112	let needle = self.needle();
1113	if haystack.len() < needle.len() {
1114	return None;
1115	}
1116	match self.kind {
1117	Empty => Some(haystack.len()),
1118	OneByte(b) => crate::memrchr(b, haystack),
1119	TwoWay(ref tw) => {
1120	// For very short haystacks (e.g., where the prefilter probably
1121	// can't run), it's faster to just run RK.
1122	if rabinkarp::is_fast(haystack, needle) {
1123	rabinkarp::rfind_with(&self.nhash, haystack, needle)
1124	} else {
1125	tw.rfind(haystack, needle)
1126	}
1127	}
1128	}
1129	}
1130	}
1131
1132	/// This module defines some generic quickcheck properties useful for testing
1133	/// any substring search algorithm. It also runs those properties for the
1134	/// top-level public API memmem routines. (The properties are also used to
1135	/// test various substring search implementations more granularly elsewhere as
1136	/// well.)
1137	#[cfg(all(test, feature = "std", not(miri)))]
1138	mod proptests {
1139	// N.B. This defines the quickcheck tests using the properties defined
1140	// below. Because of macro-visibility weirdness, the actual macro is
1141	// defined at the top of this file.
1142	define_memmem_quickcheck_tests!(super::find, super::rfind);
1143
1144	/// Check that every prefix of the given byte string is a substring.
1145	pub(crate) fn prefix_is_substring(
1146	reverse: bool,
1147	bs: &[u8],
1148	mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
1149	) -> bool {
1150	if bs.is_empty() {
1151	return `true`;
1152	}
1153	for i in `0`..(bs.len() - `1`) {
1154	let prefix = &bs[..i];
1155	if reverse {
1156	assert_eq!(naive_rfind(bs, prefix), search(bs, prefix));
1157	} else {
1158	assert_eq!(naive_find(bs, prefix), search(bs, prefix));
1159	}
1160	}
1161	`true`
1162	}
1163
1164	/// Check that every suffix of the given byte string is a substring.
1165	pub(crate) fn suffix_is_substring(
1166	reverse: bool,
1167	bs: &[u8],
1168	mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
1169	) -> bool {
1170	if bs.is_empty() {
1171	return `true`;
1172	}
1173	for i in `0`..(bs.len() - `1`) {
1174	let suffix = &bs[i..];
1175	if reverse {
1176	assert_eq!(naive_rfind(bs, suffix), search(bs, suffix));
1177	} else {
1178	assert_eq!(naive_find(bs, suffix), search(bs, suffix));
1179	}
1180	}
1181	`true`
1182	}
1183
1184	/// Check that naive substring search matches the result of the given search
1185	/// algorithm.
1186	pub(crate) fn matches_naive(
1187	reverse: bool,
1188	haystack: &[u8],
1189	needle: &[u8],
1190	mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
1191	) -> bool {
1192	if reverse {
1193	naive_rfind(haystack, needle) == search(haystack, needle)
1194	} else {
1195	naive_find(haystack, needle) == search(haystack, needle)
1196	}
1197	}
1198
1199	/// Naively search forwards for the given needle in the given haystack.
1200	fn naive_find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
1201	if needle.is_empty() {
1202	return Some(`0`);
1203	} else if haystack.len() < needle.len() {
1204	return None;
1205	}
1206	for i in `0`..(haystack.len() - needle.len() + `1`) {
1207	if needle == &haystack[i..i + needle.len()] {
1208	return Some(i);
1209	}
1210	}
1211	None
1212	}
1213
1214	/// Naively search in reverse for the given needle in the given haystack.
1215	fn naive_rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> {
1216	if needle.is_empty() {
1217	return Some(haystack.len());
1218	} else if haystack.len() < needle.len() {
1219	return None;
1220	}
1221	for i in (`0`..(haystack.len() - needle.len() + `1`)).rev() {
1222	if needle == &haystack[i..i + needle.len()] {
1223	return Some(i);
1224	}
1225	}
1226	None
1227	}
1228	}
1229
1230	/// This module defines some hand-written "simple" substring tests. It
1231	/// also provides routines for easily running them on any substring search
1232	/// implementation.
1233	#[cfg(test)]
1234	mod testsimples {
1235	define_memmem_simple_tests!(super::find, super::rfind);
1236
1237	/// Each test is a (needle, haystack, expected_fwd, expected_rev) tuple.
1238	type SearchTest =
1239	(&'static str, &'static str, Option<usize>, Option<usize>);
1240
1241	const SEARCH_TESTS: &'static [SearchTest] = &[
1242	("", "", Some(`0`), Some(`0`)),
1243	("", "a", Some(`0`), Some(`1`)),
1244	("", "ab", Some(`0`), Some(`2`)),
1245	("", "abc", Some(`0`), Some(`3`)),
1246	("a", "", None, None),
1247	("a", "a", Some(`0`), Some(`0`)),
1248	("a", "aa", Some(`0`), Some(`1`)),
1249	("a", "ba", Some(`1`), Some(`1`)),
1250	("a", "bba", Some(`2`), Some(`2`)),
1251	("a", "bbba", Some(`3`), Some(`3`)),
1252	("a", "bbbab", Some(`3`), Some(`3`)),
1253	("a", "bbbabb", Some(`3`), Some(`3`)),
1254	("a", "bbbabbb", Some(`3`), Some(`3`)),
1255	("a", "bbbbbb", None, None),
1256	("ab", "", None, None),
1257	("ab", "a", None, None),
1258	("ab", "b", None, None),
1259	("ab", "ab", Some(`0`), Some(`0`)),
1260	("ab", "aab", Some(`1`), Some(`1`)),
1261	("ab", "aaab", Some(`2`), Some(`2`)),
1262	("ab", "abaab", Some(`0`), Some(`3`)),
1263	("ab", "baaab", Some(`3`), Some(`3`)),
1264	("ab", "acb", None, None),
1265	("ab", "abba", Some(`0`), Some(`0`)),
1266	("abc", "ab", None, None),
1267	("abc", "abc", Some(`0`), Some(`0`)),
1268	("abc", "abcz", Some(`0`), Some(`0`)),
1269	("abc", "abczz", Some(`0`), Some(`0`)),
1270	("abc", "zabc", Some(`1`), Some(`1`)),
1271	("abc", "zzabc", Some(`2`), Some(`2`)),
1272	("abc", "azbc", None, None),
1273	("abc", "abzc", None, None),
1274	("abczdef", "abczdefzzzzzzzzzzzzzzzzzzzz", Some(`0`), Some(`0`)),
1275	("abczdef", "zzzzzzzzzzzzzzzzzzzzabczdef", Some(`20`), Some(`20`)),
1276	("xyz", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaxyz", Some(`32`), Some(`32`)),
1277	// Failures caught by quickcheck.
1278	("`\u{0}\u{15}`", "`\u{0}\u{15}\u{15}\u{0}`", Some(`0`), Some(`0`)),
1279	("`\u{0}\u{1e}`", "`\u{1e}\u{0}`", None, None),
1280	];
1281
1282	/// Run the substring search tests. `search` should be a closure that
1283	/// accepts a haystack and a needle and returns the starting position
1284	/// of the first occurrence of needle in the haystack, or `None` if one
1285	/// doesn't exist.
1286	pub(crate) fn run_search_tests_fwd(
1287	mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
1288	) {
1289	for &(needle, haystack, expected_fwd, _) in SEARCH_TESTS {
1290	let (n, h) = (needle.as_bytes(), haystack.as_bytes());
1291	assert_eq!(
1292	expected_fwd,
1293	search(h, n),
1294	"needle: {:?}, haystack: {:?}, expected: {:?}",
1295	n,
1296	h,
1297	expected_fwd
1298	);
1299	}
1300	}
1301
1302	/// Run the substring search tests. `search` should be a closure that
1303	/// accepts a haystack and a needle and returns the starting position of
1304	/// the last occurrence of needle in the haystack, or `None` if one doesn't
1305	/// exist.
1306	pub(crate) fn run_search_tests_rev(
1307	mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>,
1308	) {
1309	for &(needle, haystack, _, expected_rev) in SEARCH_TESTS {
1310	let (n, h) = (needle.as_bytes(), haystack.as_bytes());
1311	assert_eq!(
1312	expected_rev,
1313	search(h, n),
1314	"needle: {:?}, haystack: {:?}, expected: {:?}",
1315	n,
1316	h,
1317	expected_rev
1318	);
1319	}
1320	}
1321	}
1322

Provided by KDAB

Definitions