/*!
Generic crate-internal routines for the `memchr` family of functions.
*/

// What follows is a vector algorithm generic over the specific vector
// type to detect the position of one, two or three needles in a haystack.
// From what I know, this is a "classic" algorithm, although I don't
// believe it has been published in any peer reviewed journal. I believe
// it can be found in places like glibc and Go's standard library. It
// appears to be well known and is elaborated on in more detail here:
// https://gms.tf/stdfind-and-memchr-optimizations.html
//
// While the routine below is fairly long and perhaps intimidating, the basic
// idea is actually very simple and can be expressed straightforwardly in
// pseudo code. The pseudo code below is written for 128 bit vectors, but the
// actual code below works for anything that implements the Vector trait.
//
// needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1
// // Note: shift amount is in bytes
//
// i = 0
// while i <= haystack.len() - 16:
//   // A 16 byte vector. Each byte in chunk corresponds to a byte in
//   // the haystack.
//   chunk = haystack[i:i+16]
//   // Compare bytes in needle with bytes in chunk. The result is a 16
//   // byte chunk where each byte is 0xFF if the corresponding bytes
//   // in needle and chunk were equal, or 0x00 otherwise.
//   eqs = cmpeq(needle, chunk)
//   // Return a 32 bit integer where the most significant 16 bits
//   // are always 0 and the lower 16 bits correspond to whether the
//   // most significant bit in the corresponding byte in `eqs` is set.
//   // In other words, `mask as u16` has bit i set if and only if
//   // needle[i] == chunk[i].
//   mask = movemask(eqs)
//
//   // Mask is 0 if there is no match, and non-zero otherwise.
//   if mask != 0:
//     // trailing_zeros tells us the position of the least significant
//     // bit that is set.
//     return i + trailing_zeros(mask)
//   i += 16
//
// // haystack length may not be a multiple of 16, so search the rest.
// while i < haystack.len():
//   if haystack[i] == n1:
//     return i
//   i += 1
//
// // No match found.
// return NULL
//
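// As a concrete illustration of the pseudo code above (using a 16 byte
// vector and a needle of 'z' purely as an example): if the chunk is
// "abzcdefghijklmno", then cmpeq produces 0xFF only in the third byte,
// movemask produces 0b0000_0000_0000_0100, and trailing_zeros(mask) == 2,
// which is exactly the offset of 'z' within the chunk.
//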
// In fact, we could loosely translate the above code to Rust line-for-line
// and it would be a pretty fast algorithm. But, we pull out all the stops
// to go as fast as possible:
//
// 1. We use aligned loads. That is, we do some finagling to make sure our
//    primary loop not only proceeds in increments of 16 bytes, but that
//    the address of haystack's pointer that we dereference is aligned to
//    16 bytes. 16 is a magic number here because it is the size of an SSE2
//    128-bit vector. (For the AVX2 algorithm, 32 is the magic number.)
//    Therefore, to get aligned loads, our pointer's address must be evenly
//    divisible by 16.
// 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's
//    kind of like loop unrolling, but we combine the equality comparisons
//    using a vector OR such that we only need to extract a single mask to
//    determine whether a match exists or not. If so, then we do some
//    book-keeping to determine the precise location but otherwise mush on.
// 3. We use our "chunk" comparison routine in as many places as possible,
//    even if it means using unaligned loads. In particular, if haystack
//    starts with an unaligned address, then we do an unaligned load to
//    search the first 16 bytes. We then start our primary loop at the
//    smallest subsequent aligned address, which will actually overlap with
//    previously searched bytes. But we're OK with that. We do a similar
//    dance at the end of our primary loop. Finally, to avoid a
//    byte-at-a-time loop at the end, we do a final 16 byte unaligned load
//    that may overlap with a previous load. This is OK because it converts
//    a loop into a small number of very fast vector instructions. The
//    overlap is OK because we know the place where the overlap occurs does
//    not contain a match. (See the illustration just below this list.)
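//
// As a rough illustration of points 1 and 3 (the addresses here are purely
// hypothetical): with 16 byte vectors and a 37 byte haystack starting at
// address 0x1003, we do one unaligned load at 0x1003, continue with V-sized
// loads starting at the aligned address 0x1010 (re-searching the 3 bytes
// that overlap with the first load), and finish with one unaligned load
// covering the last 16 bytes of the haystack, which again overlaps bytes
// that were already searched. Every byte is covered by some vector load and
// no byte-at-a-time loop is needed.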
//
// And that's pretty much all there is to it. Note that since the below is
// generic and since it's meant to be inlined into routines with a
// `#[target_feature(enable = "...")]` annotation, we must mark all routines as
// both unsafe and `#[inline(always)]`.
//
// The fact that the code below is generic does somewhat inhibit us. For
// example, I've noticed that introducing an un-inlineable `#[cold]` function
// to handle the match case in the loop generates tighter assembly, but there
// is no way to do this in the generic code below because the generic code
// doesn't know what `target_feature` annotation to apply to the un-inlineable
// function. We could make such functions part of the `Vector` trait, but we
// instead live with the slightly sub-optimal codegen for now since it doesn't
// seem to have a noticeable perf difference.

use crate::{
    ext::Pointer,
    vector::{MoveMask, Vector},
};

/// Finds all occurrences of a single byte in a haystack.
#[derive(Clone, Copy, Debug)]
pub(crate) struct One<V> {
    s1: u8,
    v1: V,
}

impl<V: Vector> One<V> {
    /// The number of bytes we examine per each iteration of our search loop.
    const LOOP_SIZE: usize = 4 * V::BYTES;

    /// Create a new searcher that finds occurrences of the byte given.
    #[inline(always)]
    pub(crate) unsafe fn new(needle: u8) -> One<V> {
        One { s1: needle, v1: V::splat(needle) }
    }

    /// Returns the needle given to `One::new`.
    #[inline(always)]
    pub(crate) fn needle1(&self) -> u8 {
        self.s1
    }

    /// Return a pointer to the first occurrence of the needle in the given
    /// haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn find_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::first_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        // Search a possibly unaligned chunk at `start`. This covers any part
        // of the haystack prior to where aligned loads can start.
        if let Some(cur) = self.search_chunk(start, topos) {
            return Some(cur);
        }
        // Set `cur` to the first V-aligned pointer greater than `start`.
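        // As a concrete (hypothetical) example, and assuming `V::ALIGN` is
        // the alignment mask `V::BYTES - 1` as its use here suggests: if
        // `start` is 0x1003 and `V::BYTES` is 16, then `start & V::ALIGN`
        // is 3 and `cur` becomes 0x1003 + (16 - 3) == 0x1010, the next
        // 16 byte boundary. Note that `cur` is always strictly greater than
        // `start` (even when `start` is already aligned), which is fine
        // because the chunk at `start` was just searched above.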
        let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
        debug_assert!(cur > start && end.sub(V::BYTES) >= start);
        if len >= Self::LOOP_SIZE {
            while cur <= end.sub(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(1 * V::BYTES));
                let c = V::load_aligned(cur.add(2 * V::BYTES));
                let d = V::load_aligned(cur.add(3 * V::BYTES));
                let eqa = self.v1.cmpeq(a);
                let eqb = self.v1.cmpeq(b);
                let eqc = self.v1.cmpeq(c);
                let eqd = self.v1.cmpeq(d);
                let or1 = eqa.or(eqb);
                let or2 = eqc.or(eqd);
                let or3 = or1.or(or2);
                if or3.movemask_will_have_non_zero() {
                    let mask = eqa.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(topos(mask)));
                    }

                    let mask = eqb.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(1 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqc.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(2 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqd.movemask();
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(3 * V::BYTES).add(topos(mask)));
                }
                cur = cur.add(Self::LOOP_SIZE);
            }
        }
        // Handle any leftovers after the aligned loop above. We use unaligned
        // loads here, but I believe we are guaranteed that they are aligned
        // since `cur` is aligned.
        while cur <= end.sub(V::BYTES) {
            debug_assert!(end.distance(cur) >= V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
            cur = cur.add(V::BYTES);
        }
        // Finally handle any remaining bytes less than the size of V. In this
        // case, our pointer may indeed be unaligned and the load may overlap
        // with the previous one. But that's okay since we know the previous
        // load didn't lead to a match (otherwise we wouldn't be here).
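        // As a hypothetical example with 16 byte vectors: if 5 bytes remain,
        // we rewind `cur` by 11 bytes so the final unaligned load reads
        // exactly the last 16 bytes of the haystack, 11 of which were
        // already searched (and known not to match) by the loops above.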
        if cur < end {
            debug_assert!(end.distance(cur) < V::BYTES);
            cur = cur.sub(V::BYTES - end.distance(cur));
            debug_assert_eq!(end.distance(cur), V::BYTES);
            return self.search_chunk(cur, topos);
        }
        None
    }

    /// Return a pointer to the last occurrence of the needle in the given
    /// haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn rfind_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::last_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) {
            return Some(cur);
        }
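        // Align `cur` down to the greatest V-aligned address at or below
        // `end`. As a hypothetical example with 16 byte vectors: if `end` is
        // 0x1039, then `end & V::ALIGN` is 9 and `cur` becomes 0x1030. The
        // bytes from `cur` up to `end` were already covered by the unaligned
        // load of the last `V::BYTES` bytes above.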
        let mut cur = end.sub(end.as_usize() & V::ALIGN);
        debug_assert!(start <= cur && cur <= end);
        if len >= Self::LOOP_SIZE {
            while cur >= start.add(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                cur = cur.sub(Self::LOOP_SIZE);
                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(1 * V::BYTES));
                let c = V::load_aligned(cur.add(2 * V::BYTES));
                let d = V::load_aligned(cur.add(3 * V::BYTES));
                let eqa = self.v1.cmpeq(a);
                let eqb = self.v1.cmpeq(b);
                let eqc = self.v1.cmpeq(c);
                let eqd = self.v1.cmpeq(d);
                let or1 = eqa.or(eqb);
                let or2 = eqc.or(eqd);
                let or3 = or1.or(or2);
                if or3.movemask_will_have_non_zero() {
                    let mask = eqd.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(3 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqc.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(2 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqb.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(1 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqa.movemask();
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(topos(mask)));
                }
            }
        }
        while cur >= start.add(V::BYTES) {
            debug_assert!(cur.distance(start) >= V::BYTES);
            cur = cur.sub(V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
        }
        if cur > start {
            debug_assert!(cur.distance(start) < V::BYTES);
            return self.search_chunk(start, topos);
        }
        None
    }

    /// Return a count of all matching bytes in the given haystack.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn count_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> usize {
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let confirm = |b| b == self.needle1();
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        // Set `cur` to the first V-aligned pointer greater than `start`.
        let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
        // Count any matching bytes before we start our aligned loop.
        let mut count = count_byte_by_byte(start, cur, confirm);
        debug_assert!(cur > start && end.sub(V::BYTES) >= start);
        if len >= Self::LOOP_SIZE {
            while cur <= end.sub(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(1 * V::BYTES));
                let c = V::load_aligned(cur.add(2 * V::BYTES));
                let d = V::load_aligned(cur.add(3 * V::BYTES));
                let eqa = self.v1.cmpeq(a);
                let eqb = self.v1.cmpeq(b);
                let eqc = self.v1.cmpeq(c);
                let eqd = self.v1.cmpeq(d);
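                // Each set bit in a movemask corresponds to exactly one
                // matching byte in the corresponding vector, so summing
                // `count_ones` across all four masks counts every match in
                // this block of `LOOP_SIZE` bytes. For example, a mask of
                // 0b0100_0001 contributes 2 to the count.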
                count += eqa.movemask().count_ones();
                count += eqb.movemask().count_ones();
                count += eqc.movemask().count_ones();
                count += eqd.movemask().count_ones();
                cur = cur.add(Self::LOOP_SIZE);
            }
        }
        // Handle any leftovers after the aligned loop above. We use unaligned
        // loads here, but I believe we are guaranteed that they are aligned
        // since `cur` is aligned.
        while cur <= end.sub(V::BYTES) {
            debug_assert!(end.distance(cur) >= V::BYTES);
            let chunk = V::load_unaligned(cur);
            count += self.v1.cmpeq(chunk).movemask().count_ones();
            cur = cur.add(V::BYTES);
        }
        // And finally count any leftovers that weren't caught above.
        count += count_byte_by_byte(cur, end, confirm);
        count
    }

    /// Search the `V::BYTES` bytes starting at `cur` via an unaligned load.
    ///
    /// `mask_to_offset` should be a function that converts a `movemask` to
    /// an offset such that `cur.add(offset)` corresponds to a pointer to the
    /// match location if one is found. Generally it is expected to be either
    /// `V::Mask::first_offset` or `V::Mask::last_offset`, depending on
    /// whether one is implementing a forward or reverse search, respectively.
    ///
    /// # Safety
    ///
    /// `cur` must be a valid pointer and it must be valid to do an unaligned
    /// load of size `V::BYTES` at `cur`.
    #[inline(always)]
    unsafe fn search_chunk(
        &self,
        cur: *const u8,
        mask_to_offset: impl Fn(V::Mask) -> usize,
    ) -> Option<*const u8> {
        let chunk = V::load_unaligned(cur);
        let mask = self.v1.cmpeq(chunk).movemask();
        if mask.has_non_zero() {
            Some(cur.add(mask_to_offset(mask)))
        } else {
            None
        }
    }
}

/// Finds all occurrences of two bytes in a haystack.
///
/// That is, this reports matches of one of two possible bytes. For example,
/// searching for `a` or `b` in `afoobar` would report matches at offsets `0`,
/// `4` and `5`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Two<V> {
    s1: u8,
    s2: u8,
    v1: V,
    v2: V,
}

impl<V: Vector> Two<V> {
    /// The number of bytes we examine per each iteration of our search loop.
    const LOOP_SIZE: usize = 2 * V::BYTES;

    /// Create a new searcher that finds occurrences of the two bytes given.
    #[inline(always)]
    pub(crate) unsafe fn new(needle1: u8, needle2: u8) -> Two<V> {
        Two {
            s1: needle1,
            s2: needle2,
            v1: V::splat(needle1),
            v2: V::splat(needle2),
        }
    }

    /// Returns the first needle given to `Two::new`.
    #[inline(always)]
    pub(crate) fn needle1(&self) -> u8 {
        self.s1
    }

    /// Returns the second needle given to `Two::new`.
    #[inline(always)]
    pub(crate) fn needle2(&self) -> u8 {
        self.s2
    }

    /// Return a pointer to the first occurrence of one of the needles in the
    /// given haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn find_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::first_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        // Search a possibly unaligned chunk at `start`. This covers any part
        // of the haystack prior to where aligned loads can start.
        if let Some(cur) = self.search_chunk(start, topos) {
            return Some(cur);
        }
        // Set `cur` to the first V-aligned pointer greater than `start`.
        let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
        debug_assert!(cur > start && end.sub(V::BYTES) >= start);
        if len >= Self::LOOP_SIZE {
            while cur <= end.sub(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(V::BYTES));
                let eqa1 = self.v1.cmpeq(a);
                let eqb1 = self.v1.cmpeq(b);
                let eqa2 = self.v2.cmpeq(a);
                let eqb2 = self.v2.cmpeq(b);
                let or1 = eqa1.or(eqb1);
                let or2 = eqa2.or(eqb2);
                let or3 = or1.or(or2);
                if or3.movemask_will_have_non_zero() {
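                    // ORing the movemasks for the two needles merges their
                    // match positions into a single mask, so `first_offset`
                    // on the combined mask yields the leftmost occurrence of
                    // either needle within each chunk.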
                    let mask = eqa1.movemask().or(eqa2.movemask());
                    if mask.has_non_zero() {
                        return Some(cur.add(topos(mask)));
                    }

                    let mask = eqb1.movemask().or(eqb2.movemask());
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(V::BYTES).add(topos(mask)));
                }
                cur = cur.add(Self::LOOP_SIZE);
            }
        }
        // Handle any leftovers after the aligned loop above. We use unaligned
        // loads here, but I believe we are guaranteed that they are aligned
        // since `cur` is aligned.
        while cur <= end.sub(V::BYTES) {
            debug_assert!(end.distance(cur) >= V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
            cur = cur.add(V::BYTES);
        }
        // Finally handle any remaining bytes less than the size of V. In this
        // case, our pointer may indeed be unaligned and the load may overlap
        // with the previous one. But that's okay since we know the previous
        // load didn't lead to a match (otherwise we wouldn't be here).
        if cur < end {
            debug_assert!(end.distance(cur) < V::BYTES);
            cur = cur.sub(V::BYTES - end.distance(cur));
            debug_assert_eq!(end.distance(cur), V::BYTES);
            return self.search_chunk(cur, topos);
        }
        None
    }

    /// Return a pointer to the last occurrence of one of the needles in the
    /// given haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn rfind_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::last_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) {
            return Some(cur);
        }
        let mut cur = end.sub(end.as_usize() & V::ALIGN);
        debug_assert!(start <= cur && cur <= end);
        if len >= Self::LOOP_SIZE {
            while cur >= start.add(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                cur = cur.sub(Self::LOOP_SIZE);
                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(V::BYTES));
                let eqa1 = self.v1.cmpeq(a);
                let eqb1 = self.v1.cmpeq(b);
                let eqa2 = self.v2.cmpeq(a);
                let eqb2 = self.v2.cmpeq(b);
                let or1 = eqa1.or(eqb1);
                let or2 = eqa2.or(eqb2);
                let or3 = or1.or(or2);
                if or3.movemask_will_have_non_zero() {
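                    // Since this is a reverse search, the higher-addressed
                    // chunk (`b`) is checked first, and `last_offset` on the
                    // combined mask picks the match closest to `end` within
                    // this iteration's window.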
                    let mask = eqb1.movemask().or(eqb2.movemask());
                    if mask.has_non_zero() {
                        return Some(cur.add(V::BYTES).add(topos(mask)));
                    }

                    let mask = eqa1.movemask().or(eqa2.movemask());
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(topos(mask)));
                }
            }
        }
        while cur >= start.add(V::BYTES) {
            debug_assert!(cur.distance(start) >= V::BYTES);
            cur = cur.sub(V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
        }
        if cur > start {
            debug_assert!(cur.distance(start) < V::BYTES);
            return self.search_chunk(start, topos);
        }
        None
    }

    /// Search the `V::BYTES` bytes starting at `cur` via an unaligned load.
    ///
    /// `mask_to_offset` should be a function that converts a `movemask` to
    /// an offset such that `cur.add(offset)` corresponds to a pointer to the
    /// match location if one is found. Generally it is expected to be either
    /// `V::Mask::first_offset` or `V::Mask::last_offset`, depending on
    /// whether one is implementing a forward or reverse search, respectively.
    ///
    /// # Safety
    ///
    /// `cur` must be a valid pointer and it must be valid to do an unaligned
    /// load of size `V::BYTES` at `cur`.
    #[inline(always)]
    unsafe fn search_chunk(
        &self,
        cur: *const u8,
        mask_to_offset: impl Fn(V::Mask) -> usize,
    ) -> Option<*const u8> {
        let chunk = V::load_unaligned(cur);
        let eq1 = self.v1.cmpeq(chunk);
        let eq2 = self.v2.cmpeq(chunk);
        let mask = eq1.or(eq2).movemask();
        if mask.has_non_zero() {
            let mask1 = eq1.movemask();
            let mask2 = eq2.movemask();
            Some(cur.add(mask_to_offset(mask1.or(mask2))))
        } else {
            None
        }
    }
}

/// Finds all occurrences of three bytes in a haystack.
///
/// That is, this reports matches of one of three possible bytes. For
/// example, searching for `a`, `b` or `o` in `afoobar` would report matches
/// at offsets `0`, `2`, `3`, `4` and `5`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Three<V> {
    s1: u8,
    s2: u8,
    s3: u8,
    v1: V,
    v2: V,
    v3: V,
}

impl<V: Vector> Three<V> {
    /// The number of bytes we examine per each iteration of our search loop.
    const LOOP_SIZE: usize = 2 * V::BYTES;

    /// Create a new searcher that finds occurrences of the three bytes given.
    #[inline(always)]
    pub(crate) unsafe fn new(
        needle1: u8,
        needle2: u8,
        needle3: u8,
    ) -> Three<V> {
        Three {
            s1: needle1,
            s2: needle2,
            s3: needle3,
            v1: V::splat(needle1),
            v2: V::splat(needle2),
            v3: V::splat(needle3),
        }
    }

    /// Returns the first needle given to `Three::new`.
    #[inline(always)]
    pub(crate) fn needle1(&self) -> u8 {
        self.s1
    }

    /// Returns the second needle given to `Three::new`.
    #[inline(always)]
    pub(crate) fn needle2(&self) -> u8 {
        self.s2
    }

    /// Returns the third needle given to `Three::new`.
    #[inline(always)]
    pub(crate) fn needle3(&self) -> u8 {
        self.s3
    }

    /// Return a pointer to the first occurrence of one of the needles in the
    /// given haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn find_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::first_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        // Search a possibly unaligned chunk at `start`. This covers any part
        // of the haystack prior to where aligned loads can start.
        if let Some(cur) = self.search_chunk(start, topos) {
            return Some(cur);
        }
        // Set `cur` to the first V-aligned pointer greater than `start`.
        let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
        debug_assert!(cur > start && end.sub(V::BYTES) >= start);
        if len >= Self::LOOP_SIZE {
            while cur <= end.sub(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(V::BYTES));
                let eqa1 = self.v1.cmpeq(a);
                let eqb1 = self.v1.cmpeq(b);
                let eqa2 = self.v2.cmpeq(a);
                let eqb2 = self.v2.cmpeq(b);
                let eqa3 = self.v3.cmpeq(a);
                let eqb3 = self.v3.cmpeq(b);
                let or1 = eqa1.or(eqb1);
                let or2 = eqa2.or(eqb2);
                let or3 = eqa3.or(eqb3);
                let or4 = or1.or(or2);
                let or5 = or3.or(or4);
                if or5.movemask_will_have_non_zero() {
                    let mask = eqa1
                        .movemask()
                        .or(eqa2.movemask())
                        .or(eqa3.movemask());
                    if mask.has_non_zero() {
                        return Some(cur.add(topos(mask)));
                    }

                    let mask = eqb1
                        .movemask()
                        .or(eqb2.movemask())
                        .or(eqb3.movemask());
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(V::BYTES).add(topos(mask)));
                }
                cur = cur.add(Self::LOOP_SIZE);
            }
        }
        // Handle any leftovers after the aligned loop above. We use unaligned
        // loads here, but I believe we are guaranteed that they are aligned
        // since `cur` is aligned.
        while cur <= end.sub(V::BYTES) {
            debug_assert!(end.distance(cur) >= V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
            cur = cur.add(V::BYTES);
        }
        // Finally handle any remaining bytes less than the size of V. In this
        // case, our pointer may indeed be unaligned and the load may overlap
        // with the previous one. But that's okay since we know the previous
        // load didn't lead to a match (otherwise we wouldn't be here).
        if cur < end {
            debug_assert!(end.distance(cur) < V::BYTES);
            cur = cur.sub(V::BYTES - end.distance(cur));
            debug_assert_eq!(end.distance(cur), V::BYTES);
            return self.search_chunk(cur, topos);
        }
        None
    }

    /// Return a pointer to the last occurrence of one of the needles in the
    /// given haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn rfind_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::last_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) {
            return Some(cur);
        }
        let mut cur = end.sub(end.as_usize() & V::ALIGN);
        debug_assert!(start <= cur && cur <= end);
        if len >= Self::LOOP_SIZE {
            while cur >= start.add(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                cur = cur.sub(Self::LOOP_SIZE);
                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(V::BYTES));
                let eqa1 = self.v1.cmpeq(a);
                let eqb1 = self.v1.cmpeq(b);
                let eqa2 = self.v2.cmpeq(a);
                let eqb2 = self.v2.cmpeq(b);
                let eqa3 = self.v3.cmpeq(a);
                let eqb3 = self.v3.cmpeq(b);
                let or1 = eqa1.or(eqb1);
                let or2 = eqa2.or(eqb2);
                let or3 = eqa3.or(eqb3);
                let or4 = or1.or(or2);
                let or5 = or3.or(or4);
                if or5.movemask_will_have_non_zero() {
                    let mask = eqb1
                        .movemask()
                        .or(eqb2.movemask())
                        .or(eqb3.movemask());
                    if mask.has_non_zero() {
                        return Some(cur.add(V::BYTES).add(topos(mask)));
                    }

                    let mask = eqa1
                        .movemask()
                        .or(eqa2.movemask())
                        .or(eqa3.movemask());
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(topos(mask)));
                }
            }
        }
        while cur >= start.add(V::BYTES) {
            debug_assert!(cur.distance(start) >= V::BYTES);
            cur = cur.sub(V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
        }
        if cur > start {
            debug_assert!(cur.distance(start) < V::BYTES);
            return self.search_chunk(start, topos);
        }
        None
    }

    /// Search the `V::BYTES` bytes starting at `cur` via an unaligned load.
    ///
    /// `mask_to_offset` should be a function that converts a `movemask` to
    /// an offset such that `cur.add(offset)` corresponds to a pointer to the
    /// match location if one is found. Generally it is expected to be either
    /// `V::Mask::first_offset` or `V::Mask::last_offset`, depending on
    /// whether one is implementing a forward or reverse search, respectively.
    ///
    /// # Safety
    ///
    /// `cur` must be a valid pointer and it must be valid to do an unaligned
    /// load of size `V::BYTES` at `cur`.
    #[inline(always)]
    unsafe fn search_chunk(
        &self,
        cur: *const u8,
        mask_to_offset: impl Fn(V::Mask) -> usize,
    ) -> Option<*const u8> {
        let chunk = V::load_unaligned(cur);
        let eq1 = self.v1.cmpeq(chunk);
        let eq2 = self.v2.cmpeq(chunk);
        let eq3 = self.v3.cmpeq(chunk);
        let mask = eq1.or(eq2).or(eq3).movemask();
        if mask.has_non_zero() {
            let mask1 = eq1.movemask();
            let mask2 = eq2.movemask();
            let mask3 = eq3.movemask();
            Some(cur.add(mask_to_offset(mask1.or(mask2).or(mask3))))
        } else {
            None
        }
    }
}

/// An iterator over all occurrences of a set of bytes in a haystack.
///
/// This iterator implements the routines necessary to provide a
/// `DoubleEndedIterator` impl, which means it can also be used to find
/// occurrences in reverse order.
///
/// The lifetime parameters are as follows:
///
/// * `'h` refers to the lifetime of the haystack being searched.
///
/// This type is intended to be used to implement all iterators for the
/// `memchr` family of functions. It handles a tiny bit of marginally tricky
/// raw pointer math, but otherwise expects the caller to provide `find_raw`
/// and `rfind_raw` routines for each call of `next` and `next_back`,
/// respectively.
#[derive(Clone, Debug)]
pub(crate) struct Iter<'h> {
    /// The original starting point into the haystack. We use this to convert
    /// pointers to offsets.
    original_start: *const u8,
    /// The current starting point into the haystack. That is, where the next
    /// search will begin.
    start: *const u8,
    /// The current ending point into the haystack. That is, where the next
    /// reverse search will begin.
    end: *const u8,
    /// A marker for tracking the lifetime of the `original_start`, `start`
    /// and `end` pointers above, which all point into the haystack.
    haystack: core::marker::PhantomData<&'h [u8]>,
}

// SAFETY: Iter contains no shared references to anything that performs any
// interior mutations. Also, the lifetime guarantees that Iter will not outlive
// the haystack.
unsafe impl<'h> Send for Iter<'h> {}

// SAFETY: Iter performs no interior mutations, therefore no explicit
// synchronization is necessary. Also, the lifetime guarantees that Iter will
// not outlive the haystack.
unsafe impl<'h> Sync for Iter<'h> {}

impl<'h> Iter<'h> {
    /// Create a new generic memchr iterator.
    #[inline(always)]
    pub(crate) fn new(haystack: &'h [u8]) -> Iter<'h> {
        Iter {
            original_start: haystack.as_ptr(),
            start: haystack.as_ptr(),
            end: haystack.as_ptr().wrapping_add(haystack.len()),
            haystack: core::marker::PhantomData,
        }
    }

    /// Returns the next occurrence in the forward direction.
    ///
    /// # Safety
    ///
    /// Callers must ensure that if a pointer is returned from the closure
    /// provided, then it must be greater than or equal to the start pointer
    /// and less than the end pointer.
    #[inline(always)]
    pub(crate) unsafe fn next(
        &mut self,
        mut find_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>,
    ) -> Option<usize> {
        // SAFETY: Pointers are derived directly from the same &[u8] haystack.
        // We only ever modify start/end corresponding to a matching offset
        // found between start and end. Thus all changes to start/end maintain
        // our safety requirements.
        //
        // The only other assumption we rely on is that the pointer returned
        // by `find_raw` satisfies `self.start <= found < self.end`, and that
        // safety contract is forwarded to the caller.
        let found = find_raw(self.start, self.end)?;
        let result = found.distance(self.original_start);
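        // Move the start of the next search to one past the match we just
        // found. Since the safety contract guarantees `found < self.end`,
        // the new `start` is at most `end`, so the iterator always makes
        // progress and never searches out of bounds.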
        self.start = found.add(1);
        Some(result)
    }

    /// Returns the number of remaining elements in this iterator.
    #[inline(always)]
    pub(crate) fn count(
        self,
        mut count_raw: impl FnMut(*const u8, *const u8) -> usize,
    ) -> usize {
        // SAFETY: Pointers are derived directly from the same &[u8] haystack.
        // We only ever modify start/end corresponding to a matching offset
        // found between start and end. Thus all changes to start/end maintain
        // our safety requirements.
        count_raw(self.start, self.end)
    }

    /// Returns the next occurrence in reverse.
    ///
    /// # Safety
    ///
    /// Callers must ensure that if a pointer is returned from the closure
    /// provided, then it must be greater than or equal to the start pointer
    /// and less than the end pointer.
    #[inline(always)]
    pub(crate) unsafe fn next_back(
        &mut self,
        mut rfind_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>,
    ) -> Option<usize> {
        // SAFETY: Pointers are derived directly from the same &[u8] haystack.
        // We only ever modify start/end corresponding to a matching offset
        // found between start and end. Thus all changes to start/end maintain
        // our safety requirements.
        //
        // The only other assumption we rely on is that the pointer returned
        // by `rfind_raw` satisfies `self.start <= found < self.end`, and that
        // safety contract is forwarded to the caller.
        let found = rfind_raw(self.start, self.end)?;
        let result = found.distance(self.original_start);
        self.end = found;
        Some(result)
    }

    /// Provides an implementation of `Iterator::size_hint`.
    #[inline(always)]
    pub(crate) fn size_hint(&self) -> (usize, Option<usize>) {
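        // There may be no matches left, so the lower bound is 0. The upper
        // bound is the number of bytes remaining to be searched, since each
        // remaining byte can match at most once.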
        (0, Some(self.end.as_usize().saturating_sub(self.start.as_usize())))
    }
}

/// Search a slice using a function that operates on raw pointers.
///
/// Given a function to search a contiguous sequence of memory for the location
/// of a non-empty set of bytes, this will execute that search on a slice of
/// bytes. The pointer returned by the given function will be converted to an
/// offset relative to the starting point of the given slice. That is, if a
/// match is found, the offset returned by this routine is guaranteed to be a
/// valid index into `haystack`.
///
/// Callers may use this for a forward or reverse search.
///
/// # Safety
///
/// Callers must ensure that if a pointer is returned by `find_raw`, then the
/// pointer must be greater than or equal to the starting pointer and less than
/// the end pointer.
#[inline(always)]
pub(crate) unsafe fn search_slice_with_raw(
    haystack: &[u8],
    mut find_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>,
) -> Option<usize> {
    // SAFETY: We rely on `find_raw` to return a correct and valid pointer, but
    // otherwise, `start` and `end` are valid due to the guarantees provided by
    // a &[u8].
    let start = haystack.as_ptr();
    let end = start.add(haystack.len());
    let found = find_raw(start, end)?;
    Some(found.distance(start))
}

/// Performs a forward byte-at-a-time loop until either `ptr >= end` or until
/// `confirm(*ptr)` returns `true`. If the former occurs, then `None` is
/// returned. If the latter occurs, then the pointer at which `confirm` returns
/// `true` is returned.
///
/// # Safety
///
/// Callers must provide valid pointers and they must satisfy `start <= end`.
#[inline(always)]
pub(crate) unsafe fn fwd_byte_by_byte<F: Fn(u8) -> bool>(
    start: *const u8,
    end: *const u8,
    confirm: F,
) -> Option<*const u8> {
    debug_assert!(start <= end);
    let mut ptr = start;
    while ptr < end {
        if confirm(*ptr) {
            return Some(ptr);
        }
        ptr = ptr.offset(1);
    }
    None
}

/// Performs a reverse byte-at-a-time loop until either `ptr < start` or until
/// `confirm(*ptr)` returns `true`. If the former occurs, then `None` is
/// returned. If the latter occurs, then the pointer at which `confirm` returns
/// `true` is returned.
///
/// # Safety
///
/// Callers must provide valid pointers and they must satisfy `start <= end`.
#[inline(always)]
pub(crate) unsafe fn rev_byte_by_byte<F: Fn(u8) -> bool>(
    start: *const u8,
    end: *const u8,
    confirm: F,
) -> Option<*const u8> {
    debug_assert!(start <= end);

    let mut ptr = end;
    while ptr > start {
        ptr = ptr.offset(-1);
        if confirm(*ptr) {
            return Some(ptr);
        }
    }
    None
}

/// Performs a forward byte-at-a-time loop until `ptr >= end` and returns
/// the number of times `confirm(*ptr)` returns `true`.
///
/// # Safety
///
/// Callers must provide valid pointers and they must satisfy `start <= end`.
#[inline(always)]
pub(crate) unsafe fn count_byte_by_byte<F: Fn(u8) -> bool>(
    start: *const u8,
    end: *const u8,
    confirm: F,
) -> usize {
    debug_assert!(start <= end);
    let mut ptr = start;
    let mut count = 0;
    while ptr < end {
        if confirm(*ptr) {
            count += 1;
        }
        ptr = ptr.offset(1);
    }
    count
}