/*!
Generic crate-internal routines for the `memchr` family of functions.
*/

// What follows is a vector algorithm generic over the specific vector
// type to detect the position of one, two or three needles in a haystack.
// From what I know, this is a "classic" algorithm, although I don't
// believe it has been published in any peer reviewed journal. I believe
// it can be found in places like glibc and Go's standard library. It
// appears to be well known and is elaborated on in more detail here:
// https://gms.tf/stdfind-and-memchr-optimizations.html
//
// While the routine below is fairly long and perhaps intimidating, the basic
// idea is actually very simple and can be expressed straightforwardly in
// pseudo code. The pseudo code below is written for 128 bit vectors, but the
// actual code below works for anything that implements the Vector trait.
//
// needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1
// // Note: shift amount is in bytes
//
// i = 0
// while i <= haystack.len() - 16:
//   // A 16 byte vector. Each byte in chunk corresponds to a byte in
//   // the haystack.
//   chunk = haystack[i:i+16]
//   // Compare bytes in needle with bytes in chunk. The result is a 16
//   // byte chunk where each byte is 0xFF if the corresponding bytes
//   // in needle and chunk were equal, or 0x00 otherwise.
//   eqs = cmpeq(needle, chunk)
//   // Return a 32 bit integer where the most significant 16 bits
//   // are always 0 and the lower 16 bits correspond to whether the
//   // most significant bit in the corresponding byte in `eqs` is set.
//   // In other words, `mask as u16` has bit i set if and only if
//   // needle[i] == chunk[i].
//   mask = movemask(eqs)
//
//   // Mask is 0 if there is no match, and non-zero otherwise.
//   if mask != 0:
//     // trailing_zeros tells us the position of the least significant
//     // bit that is set.
//     return i + trailing_zeros(mask)
//   i += 16
//
// // haystack length may not be a multiple of 16, so search the rest.
// while i < haystack.len():
//   if haystack[i] == n1:
//     return i
//   i += 1
//
// // No match found.
// return NULL
//
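// As a concrete illustration of the pseudo code above (using a 16 byte
// vector and a needle of 'z' purely as an example): if the chunk is
// "abzcdefghijklmno", then cmpeq produces 0xFF only in the third byte,
// movemask produces 0b0000_0000_0000_0100, and trailing_zeros(mask) == 2,
// which is exactly the offset of 'z' within the chunk.
//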
// In fact, we could loosely translate the above code to Rust line-for-line
// and it would be a pretty fast algorithm. But, we pull out all the stops
// to go as fast as possible:
//
// 1. We use aligned loads. That is, we do some finagling to make sure our
//    primary loop not only proceeds in increments of 16 bytes, but that
//    the address of haystack's pointer that we dereference is aligned to
//    16 bytes. 16 is a magic number here because it is the size of an SSE2
//    128-bit vector. (For the AVX2 algorithm, 32 is the magic number.)
//    Therefore, to get aligned loads, our pointer's address must be evenly
//    divisible by 16.
// 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's
//    kind of like loop unrolling, but we combine the equality comparisons
//    using a vector OR such that we only need to extract a single mask to
//    determine whether a match exists or not. If so, then we do some
//    book-keeping to determine the precise location but otherwise mush on.
// 3. We use our "chunk" comparison routine in as many places as possible,
//    even if it means using unaligned loads. In particular, if haystack
//    starts with an unaligned address, then we do an unaligned load to
//    search the first 16 bytes. We then start our primary loop at the
//    smallest subsequent aligned address, which will actually overlap with
//    previously searched bytes. But we're OK with that. We do a similar
//    dance at the end of our primary loop. Finally, to avoid a
//    byte-at-a-time loop at the end, we do a final 16 byte unaligned load
//    that may overlap with a previous load. This is OK because it converts
//    a loop into a small number of very fast vector instructions. The
//    overlap is OK because we know the place where the overlap occurs does
//    not contain a match. (See the illustration just below this list.)
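//
// As a rough illustration of points 1 and 3 (the addresses here are purely
// hypothetical): with 16 byte vectors and a 37 byte haystack starting at
// address 0x1003, we do one unaligned load at 0x1003, continue with V-sized
// loads starting at the aligned address 0x1010 (re-searching the 3 bytes
// that overlap with the first load), and finish with one unaligned load
// covering the last 16 bytes of the haystack, which again overlaps bytes
// that were already searched. Every byte is covered by some vector load and
// no byte-at-a-time loop is needed.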
//
// And that's pretty much all there is to it. Note that since the below is
// generic and since it's meant to be inlined into routines with a
// `#[target_feature(enable = "...")]` annotation, we must mark all routines as
// both unsafe and `#[inline(always)]`.
//
// The fact that the code below is generic does somewhat inhibit us. For
// example, I've noticed that introducing an un-inlineable `#[cold]` function
// to handle the match case in the loop generates tighter assembly, but there
// is no way to do this in the generic code below because the generic code
// doesn't know what `target_feature` annotation to apply to the un-inlineable
// function. We could make such functions part of the `Vector` trait, but we
// instead live with the slightly sub-optimal codegen for now since it doesn't
// seem to have a noticeable perf difference.

use crate::{
    ext::Pointer,
    vector::{MoveMask, Vector},
};

/// Finds all occurrences of a single byte in a haystack.
#[derive(Clone, Copy, Debug)]
pub(crate) struct One<V> {
    s1: u8,
    v1: V,
}

impl<V: Vector> One<V> {
    /// The number of bytes we examine per each iteration of our search loop.
    const LOOP_SIZE: usize = 4 * V::BYTES;

    /// Create a new searcher that finds occurrences of the byte given.
    #[inline(always)]
    pub(crate) unsafe fn new(needle: u8) -> One<V> {
        One { s1: needle, v1: V::splat(needle) }
    }

    /// Returns the needle given to `One::new`.
    #[inline(always)]
    pub(crate) fn needle1(&self) -> u8 {
        self.s1
    }

    /// Return a pointer to the first occurrence of the needle in the given
    /// haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn find_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::first_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        // Search a possibly unaligned chunk at `start`. This covers any part
        // of the haystack prior to where aligned loads can start.
        if let Some(cur) = self.search_chunk(start, topos) {
            return Some(cur);
        }
        // Set `cur` to the first V-aligned pointer greater than `start`.
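        // As a concrete (hypothetical) example, and assuming `V::ALIGN` is
        // the alignment mask `V::BYTES - 1` as its use here suggests: if
        // `start` is 0x1003 and `V::BYTES` is 16, then `start & V::ALIGN`
        // is 3 and `cur` becomes 0x1003 + (16 - 3) == 0x1010, the next
        // 16 byte boundary. Note that `cur` is always strictly greater than
        // `start` (even when `start` is already aligned), which is fine
        // because the chunk at `start` was just searched above.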
        let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
        debug_assert!(cur > start && end.sub(V::BYTES) >= start);
        if len >= Self::LOOP_SIZE {
            while cur <= end.sub(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(1 * V::BYTES));
                let c = V::load_aligned(cur.add(2 * V::BYTES));
                let d = V::load_aligned(cur.add(3 * V::BYTES));
                let eqa = self.v1.cmpeq(a);
                let eqb = self.v1.cmpeq(b);
                let eqc = self.v1.cmpeq(c);
                let eqd = self.v1.cmpeq(d);
                let or1 = eqa.or(eqb);
                let or2 = eqc.or(eqd);
                let or3 = or1.or(or2);
                if or3.movemask_will_have_non_zero() {
                    let mask = eqa.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(topos(mask)));
                    }

                    let mask = eqb.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(1 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqc.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(2 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqd.movemask();
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(3 * V::BYTES).add(topos(mask)));
                }
                cur = cur.add(Self::LOOP_SIZE);
            }
        }
        // Handle any leftovers after the aligned loop above. We use unaligned
        // loads here, but I believe we are guaranteed that they are aligned
        // since `cur` is aligned.
        while cur <= end.sub(V::BYTES) {
            debug_assert!(end.distance(cur) >= V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
            cur = cur.add(V::BYTES);
        }
        // Finally handle any remaining bytes less than the size of V. In this
        // case, our pointer may indeed be unaligned and the load may overlap
        // with the previous one. But that's okay since we know the previous
        // load didn't lead to a match (otherwise we wouldn't be here).
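        // As a hypothetical example with 16 byte vectors: if 5 bytes remain,
        // we rewind `cur` by 11 bytes so the final unaligned load reads
        // exactly the last 16 bytes of the haystack, 11 of which were
        // already searched (and known not to match) by the loops above.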
        if cur < end {
            debug_assert!(end.distance(cur) < V::BYTES);
            cur = cur.sub(V::BYTES - end.distance(cur));
            debug_assert_eq!(end.distance(cur), V::BYTES);
            return self.search_chunk(cur, topos);
        }
        None
    }

    /// Return a pointer to the last occurrence of the needle in the given
    /// haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn rfind_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::last_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) {
            return Some(cur);
        }
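        // Align `cur` down to the greatest V-aligned address at or below
        // `end`. As a hypothetical example with 16 byte vectors: if `end` is
        // 0x1039, then `end & V::ALIGN` is 9 and `cur` becomes 0x1030. The
        // bytes from `cur` up to `end` were already covered by the unaligned
        // load of the last `V::BYTES` bytes above.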
        let mut cur = end.sub(end.as_usize() & V::ALIGN);
        debug_assert!(start <= cur && cur <= end);
        if len >= Self::LOOP_SIZE {
            while cur >= start.add(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                cur = cur.sub(Self::LOOP_SIZE);
                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(1 * V::BYTES));
                let c = V::load_aligned(cur.add(2 * V::BYTES));
                let d = V::load_aligned(cur.add(3 * V::BYTES));
                let eqa = self.v1.cmpeq(a);
                let eqb = self.v1.cmpeq(b);
                let eqc = self.v1.cmpeq(c);
                let eqd = self.v1.cmpeq(d);
                let or1 = eqa.or(eqb);
                let or2 = eqc.or(eqd);
                let or3 = or1.or(or2);
                if or3.movemask_will_have_non_zero() {
                    let mask = eqd.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(3 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqc.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(2 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqb.movemask();
                    if mask.has_non_zero() {
                        return Some(cur.add(1 * V::BYTES).add(topos(mask)));
                    }

                    let mask = eqa.movemask();
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(topos(mask)));
                }
            }
        }
        while cur >= start.add(V::BYTES) {
            debug_assert!(cur.distance(start) >= V::BYTES);
            cur = cur.sub(V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
        }
        if cur > start {
            debug_assert!(cur.distance(start) < V::BYTES);
            return self.search_chunk(start, topos);
        }
        None
    }

    /// Return a count of all matching bytes in the given haystack.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn count_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> usize {
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let confirm = |b| b == self.needle1();
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        // Set `cur` to the first V-aligned pointer greater than `start`.
        let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
        // Count any matching bytes before we start our aligned loop.
        let mut count = count_byte_by_byte(start, cur, confirm);
        debug_assert!(cur > start && end.sub(V::BYTES) >= start);
        if len >= Self::LOOP_SIZE {
            while cur <= end.sub(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(1 * V::BYTES));
                let c = V::load_aligned(cur.add(2 * V::BYTES));
                let d = V::load_aligned(cur.add(3 * V::BYTES));
                let eqa = self.v1.cmpeq(a);
                let eqb = self.v1.cmpeq(b);
                let eqc = self.v1.cmpeq(c);
                let eqd = self.v1.cmpeq(d);
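                // Each set bit in a movemask corresponds to exactly one
                // matching byte in the corresponding vector, so summing
                // `count_ones` across all four masks counts every match in
                // this block of `LOOP_SIZE` bytes. For example, a mask of
                // 0b0100_0001 contributes 2 to the count.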
                count += eqa.movemask().count_ones();
                count += eqb.movemask().count_ones();
                count += eqc.movemask().count_ones();
                count += eqd.movemask().count_ones();
                cur = cur.add(Self::LOOP_SIZE);
            }
        }
        // Handle any leftovers after the aligned loop above. We use unaligned
        // loads here, but I believe we are guaranteed that they are aligned
        // since `cur` is aligned.
        while cur <= end.sub(V::BYTES) {
            debug_assert!(end.distance(cur) >= V::BYTES);
            let chunk = V::load_unaligned(cur);
            count += self.v1.cmpeq(chunk).movemask().count_ones();
            cur = cur.add(V::BYTES);
        }
        // And finally count any leftovers that weren't caught above.
        count += count_byte_by_byte(cur, end, confirm);
        count
    }

    /// Search the `V::BYTES` bytes starting at `cur` via an unaligned load.
    ///
    /// `mask_to_offset` should be a function that converts a `movemask` to
    /// an offset such that `cur.add(offset)` corresponds to a pointer to the
    /// match location if one is found. Generally it is expected to be either
    /// `V::Mask::first_offset` or `V::Mask::last_offset`, depending on
    /// whether one is implementing a forward or reverse search, respectively.
    ///
    /// # Safety
    ///
    /// `cur` must be a valid pointer and it must be valid to do an unaligned
    /// load of size `V::BYTES` at `cur`.
    #[inline(always)]
    unsafe fn search_chunk(
        &self,
        cur: *const u8,
        mask_to_offset: impl Fn(V::Mask) -> usize,
    ) -> Option<*const u8> {
        let chunk = V::load_unaligned(cur);
        let mask = self.v1.cmpeq(chunk).movemask();
        if mask.has_non_zero() {
            Some(cur.add(mask_to_offset(mask)))
        } else {
            None
        }
    }
}

/// Finds all occurrences of two bytes in a haystack.
///
/// That is, this reports matches of one of two possible bytes. For example,
/// searching for `a` or `b` in `afoobar` would report matches at offsets `0`,
/// `4` and `5`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Two<V> {
    s1: u8,
    s2: u8,
    v1: V,
    v2: V,
}

impl<V: Vector> Two<V> {
    /// The number of bytes we examine per each iteration of our search loop.
    const LOOP_SIZE: usize = 2 * V::BYTES;

    /// Create a new searcher that finds occurrences of the two bytes given.
    #[inline(always)]
    pub(crate) unsafe fn new(needle1: u8, needle2: u8) -> Two<V> {
        Two {
            s1: needle1,
            s2: needle2,
            v1: V::splat(needle1),
            v2: V::splat(needle2),
        }
    }

    /// Returns the first needle given to `Two::new`.
    #[inline(always)]
    pub(crate) fn needle1(&self) -> u8 {
        self.s1
    }

    /// Returns the second needle given to `Two::new`.
    #[inline(always)]
    pub(crate) fn needle2(&self) -> u8 {
        self.s2
    }

    /// Return a pointer to the first occurrence of one of the needles in the
    /// given haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn find_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::first_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        // Search a possibly unaligned chunk at `start`. This covers any part
        // of the haystack prior to where aligned loads can start.
        if let Some(cur) = self.search_chunk(start, topos) {
            return Some(cur);
        }
        // Set `cur` to the first V-aligned pointer greater than `start`.
        let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
        debug_assert!(cur > start && end.sub(V::BYTES) >= start);
        if len >= Self::LOOP_SIZE {
            while cur <= end.sub(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(V::BYTES));
                let eqa1 = self.v1.cmpeq(a);
                let eqb1 = self.v1.cmpeq(b);
                let eqa2 = self.v2.cmpeq(a);
                let eqb2 = self.v2.cmpeq(b);
                let or1 = eqa1.or(eqb1);
                let or2 = eqa2.or(eqb2);
                let or3 = or1.or(or2);
                if or3.movemask_will_have_non_zero() {
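                    // ORing the movemasks for the two needles merges their
                    // match positions into a single mask, so `first_offset`
                    // on the combined mask yields the leftmost occurrence of
                    // either needle within each chunk.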
                    let mask = eqa1.movemask().or(eqa2.movemask());
                    if mask.has_non_zero() {
                        return Some(cur.add(topos(mask)));
                    }

                    let mask = eqb1.movemask().or(eqb2.movemask());
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(V::BYTES).add(topos(mask)));
                }
                cur = cur.add(Self::LOOP_SIZE);
            }
        }
        // Handle any leftovers after the aligned loop above. We use unaligned
        // loads here, but I believe we are guaranteed that they are aligned
        // since `cur` is aligned.
        while cur <= end.sub(V::BYTES) {
            debug_assert!(end.distance(cur) >= V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
            cur = cur.add(V::BYTES);
        }
        // Finally handle any remaining bytes less than the size of V. In this
        // case, our pointer may indeed be unaligned and the load may overlap
        // with the previous one. But that's okay since we know the previous
        // load didn't lead to a match (otherwise we wouldn't be here).
        if cur < end {
            debug_assert!(end.distance(cur) < V::BYTES);
            cur = cur.sub(V::BYTES - end.distance(cur));
            debug_assert_eq!(end.distance(cur), V::BYTES);
            return self.search_chunk(cur, topos);
        }
        None
    }

    /// Return a pointer to the last occurrence of one of the needles in the
    /// given haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn rfind_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::last_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) {
            return Some(cur);
        }
        let mut cur = end.sub(end.as_usize() & V::ALIGN);
        debug_assert!(start <= cur && cur <= end);
        if len >= Self::LOOP_SIZE {
            while cur >= start.add(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                cur = cur.sub(Self::LOOP_SIZE);
                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(V::BYTES));
                let eqa1 = self.v1.cmpeq(a);
                let eqb1 = self.v1.cmpeq(b);
                let eqa2 = self.v2.cmpeq(a);
                let eqb2 = self.v2.cmpeq(b);
                let or1 = eqa1.or(eqb1);
                let or2 = eqa2.or(eqb2);
                let or3 = or1.or(or2);
                if or3.movemask_will_have_non_zero() {
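                    // Since this is a reverse search, the higher-addressed
                    // chunk (`b`) is checked first, and `last_offset` on the
                    // combined mask picks the match closest to `end` within
                    // this iteration's window.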
                    let mask = eqb1.movemask().or(eqb2.movemask());
                    if mask.has_non_zero() {
                        return Some(cur.add(V::BYTES).add(topos(mask)));
                    }

                    let mask = eqa1.movemask().or(eqa2.movemask());
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(topos(mask)));
                }
            }
        }
        while cur >= start.add(V::BYTES) {
            debug_assert!(cur.distance(start) >= V::BYTES);
            cur = cur.sub(V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
        }
        if cur > start {
            debug_assert!(cur.distance(start) < V::BYTES);
            return self.search_chunk(start, topos);
        }
        None
    }

    /// Search the `V::BYTES` bytes starting at `cur` via an unaligned load.
    ///
    /// `mask_to_offset` should be a function that converts a `movemask` to
    /// an offset such that `cur.add(offset)` corresponds to a pointer to the
    /// match location if one is found. Generally it is expected to be either
    /// `V::Mask::first_offset` or `V::Mask::last_offset`, depending on
    /// whether one is implementing a forward or reverse search, respectively.
    ///
    /// # Safety
    ///
    /// `cur` must be a valid pointer and it must be valid to do an unaligned
    /// load of size `V::BYTES` at `cur`.
    #[inline(always)]
    unsafe fn search_chunk(
        &self,
        cur: *const u8,
        mask_to_offset: impl Fn(V::Mask) -> usize,
    ) -> Option<*const u8> {
        let chunk = V::load_unaligned(cur);
        let eq1 = self.v1.cmpeq(chunk);
        let eq2 = self.v2.cmpeq(chunk);
        let mask = eq1.or(eq2).movemask();
        if mask.has_non_zero() {
            let mask1 = eq1.movemask();
            let mask2 = eq2.movemask();
            Some(cur.add(mask_to_offset(mask1.or(mask2))))
        } else {
            None
        }
    }
}

/// Finds all occurrences of three bytes in a haystack.
///
/// That is, this reports matches of one of three possible bytes. For
/// example, searching for `a`, `b` or `o` in `afoobar` would report matches
/// at offsets `0`, `2`, `3`, `4` and `5`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Three<V> {
    s1: u8,
    s2: u8,
    s3: u8,
    v1: V,
    v2: V,
    v3: V,
}

impl<V: Vector> Three<V> {
    /// The number of bytes we examine per each iteration of our search loop.
    const LOOP_SIZE: usize = 2 * V::BYTES;

    /// Create a new searcher that finds occurrences of the three bytes given.
    #[inline(always)]
    pub(crate) unsafe fn new(
        needle1: u8,
        needle2: u8,
        needle3: u8,
    ) -> Three<V> {
        Three {
            s1: needle1,
            s2: needle2,
            s3: needle3,
            v1: V::splat(needle1),
            v2: V::splat(needle2),
            v3: V::splat(needle3),
        }
    }

    /// Returns the first needle given to `Three::new`.
    #[inline(always)]
    pub(crate) fn needle1(&self) -> u8 {
        self.s1
    }

    /// Returns the second needle given to `Three::new`.
    #[inline(always)]
    pub(crate) fn needle2(&self) -> u8 {
        self.s2
    }

    /// Returns the third needle given to `Three::new`.
    #[inline(always)]
    pub(crate) fn needle3(&self) -> u8 {
        self.s3
    }

    /// Return a pointer to the first occurrence of one of the needles in the
    /// given haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn find_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::first_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        // Search a possibly unaligned chunk at `start`. This covers any part
        // of the haystack prior to where aligned loads can start.
        if let Some(cur) = self.search_chunk(start, topos) {
            return Some(cur);
        }
        // Set `cur` to the first V-aligned pointer greater than `start`.
        let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
        debug_assert!(cur > start && end.sub(V::BYTES) >= start);
        if len >= Self::LOOP_SIZE {
            while cur <= end.sub(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(V::BYTES));
                let eqa1 = self.v1.cmpeq(a);
                let eqb1 = self.v1.cmpeq(b);
                let eqa2 = self.v2.cmpeq(a);
                let eqb2 = self.v2.cmpeq(b);
                let eqa3 = self.v3.cmpeq(a);
                let eqb3 = self.v3.cmpeq(b);
                let or1 = eqa1.or(eqb1);
                let or2 = eqa2.or(eqb2);
                let or3 = eqa3.or(eqb3);
                let or4 = or1.or(or2);
                let or5 = or3.or(or4);
                if or5.movemask_will_have_non_zero() {
                    let mask = eqa1
                        .movemask()
                        .or(eqa2.movemask())
                        .or(eqa3.movemask());
                    if mask.has_non_zero() {
                        return Some(cur.add(topos(mask)));
                    }

                    let mask = eqb1
                        .movemask()
                        .or(eqb2.movemask())
                        .or(eqb3.movemask());
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(V::BYTES).add(topos(mask)));
                }
                cur = cur.add(Self::LOOP_SIZE);
            }
        }
        // Handle any leftovers after the aligned loop above. We use unaligned
        // loads here, but I believe we are guaranteed that they are aligned
        // since `cur` is aligned.
        while cur <= end.sub(V::BYTES) {
            debug_assert!(end.distance(cur) >= V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
            cur = cur.add(V::BYTES);
        }
        // Finally handle any remaining bytes less than the size of V. In this
        // case, our pointer may indeed be unaligned and the load may overlap
        // with the previous one. But that's okay since we know the previous
        // load didn't lead to a match (otherwise we wouldn't be here).
        if cur < end {
            debug_assert!(end.distance(cur) < V::BYTES);
            cur = cur.sub(V::BYTES - end.distance(cur));
            debug_assert_eq!(end.distance(cur), V::BYTES);
            return self.search_chunk(cur, topos);
        }
        None
    }

    /// Return a pointer to the last occurrence of one of the needles in the
    /// given haystack. If no such occurrence exists, then `None` is returned.
    ///
    /// When a match is found, the pointer returned is guaranteed to be
    /// `>= start` and `< end`.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    ///   them is at least equal to `V::BYTES`. That is, it must always be valid
    ///   to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    #[inline(always)]
    pub(crate) unsafe fn rfind_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> Option<*const u8> {
        // If we want to support vectors bigger than 256 bits, we probably
        // need to move up to using a u64 for the masks used below. Currently
        // they are 32 bits, which means we're SOL for vectors that need masks
        // bigger than 32 bits. Overall unclear until there's a use case.
        debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");

        let topos = V::Mask::last_offset;
        let len = end.distance(start);
        debug_assert!(
            len >= V::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            V::BYTES
        );

        if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) {
            return Some(cur);
        }
        let mut cur = end.sub(end.as_usize() & V::ALIGN);
        debug_assert!(start <= cur && cur <= end);
        if len >= Self::LOOP_SIZE {
            while cur >= start.add(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.as_usize() % V::BYTES);

                cur = cur.sub(Self::LOOP_SIZE);
                let a = V::load_aligned(cur);
                let b = V::load_aligned(cur.add(V::BYTES));
                let eqa1 = self.v1.cmpeq(a);
                let eqb1 = self.v1.cmpeq(b);
                let eqa2 = self.v2.cmpeq(a);
                let eqb2 = self.v2.cmpeq(b);
                let eqa3 = self.v3.cmpeq(a);
                let eqb3 = self.v3.cmpeq(b);
                let or1 = eqa1.or(eqb1);
                let or2 = eqa2.or(eqb2);
                let or3 = eqa3.or(eqb3);
                let or4 = or1.or(or2);
                let or5 = or3.or(or4);
                if or5.movemask_will_have_non_zero() {
                    let mask = eqb1
                        .movemask()
                        .or(eqb2.movemask())
                        .or(eqb3.movemask());
                    if mask.has_non_zero() {
                        return Some(cur.add(V::BYTES).add(topos(mask)));
                    }

                    let mask = eqa1
                        .movemask()
                        .or(eqa2.movemask())
                        .or(eqa3.movemask());
                    debug_assert!(mask.has_non_zero());
                    return Some(cur.add(topos(mask)));
                }
            }
        }
        while cur >= start.add(V::BYTES) {
            debug_assert!(cur.distance(start) >= V::BYTES);
            cur = cur.sub(V::BYTES);
            if let Some(cur) = self.search_chunk(cur, topos) {
                return Some(cur);
            }
        }
        if cur > start {
            debug_assert!(cur.distance(start) < V::BYTES);
            return self.search_chunk(start, topos);
        }
        None
    }

    /// Search the `V::BYTES` bytes starting at `cur` via an unaligned load.
    ///
    /// `mask_to_offset` should be a function that converts a `movemask` to
    /// an offset such that `cur.add(offset)` corresponds to a pointer to the
    /// match location if one is found. Generally it is expected to be either
    /// `V::Mask::first_offset` or `V::Mask::last_offset`, depending on
    /// whether one is implementing a forward or reverse search, respectively.
    ///
    /// # Safety
    ///
    /// `cur` must be a valid pointer and it must be valid to do an unaligned
    /// load of size `V::BYTES` at `cur`.
    #[inline(always)]
    unsafe fn search_chunk(
        &self,
        cur: *const u8,
        mask_to_offset: impl Fn(V::Mask) -> usize,
    ) -> Option<*const u8> {
        let chunk = V::load_unaligned(cur);
        let eq1 = self.v1.cmpeq(chunk);
        let eq2 = self.v2.cmpeq(chunk);
        let eq3 = self.v3.cmpeq(chunk);
        let mask = eq1.or(eq2).or(eq3).movemask();
        if mask.has_non_zero() {
            let mask1 = eq1.movemask();
            let mask2 = eq2.movemask();
            let mask3 = eq3.movemask();
            Some(cur.add(mask_to_offset(mask1.or(mask2).or(mask3))))
        } else {
            None
        }
    }
}

/// An iterator over all occurrences of a set of bytes in a haystack.
///
/// This iterator implements the routines necessary to provide a
/// `DoubleEndedIterator` impl, which means it can also be used to find
/// occurrences in reverse order.
///
/// The lifetime parameters are as follows:
///
/// * `'h` refers to the lifetime of the haystack being searched.
///
/// This type is intended to be used to implement all iterators for the
/// `memchr` family of functions. It handles a tiny bit of marginally tricky
/// raw pointer math, but otherwise expects the caller to provide `find_raw`
/// and `rfind_raw` routines for each call of `next` and `next_back`,
/// respectively.
#[derive(Clone, Debug)]
pub(crate) struct Iter<'h> {
    /// The original starting point into the haystack. We use this to convert
    /// pointers to offsets.
    original_start: *const u8,
    /// The current starting point into the haystack. That is, where the next
    /// search will begin.
    start: *const u8,
    /// The current ending point into the haystack. That is, where the next
    /// reverse search will begin.
    end: *const u8,
    /// A marker for tracking the lifetime of the `original_start`, `start`
    /// and `end` pointers above, which all point into the haystack.
    haystack: core::marker::PhantomData<&'h [u8]>,
}

// SAFETY: Iter contains no shared references to anything that performs any
// interior mutations. Also, the lifetime guarantees that Iter will not outlive
// the haystack.
unsafe impl<'h> Send for Iter<'h> {}

// SAFETY: Iter performs no interior mutations, therefore no explicit
// synchronization is necessary. Also, the lifetime guarantees that Iter will
// not outlive the haystack.
unsafe impl<'h> Sync for Iter<'h> {}

impl<'h> Iter<'h> {
    /// Create a new generic memchr iterator.
    #[inline(always)]
    pub(crate) fn new(haystack: &'h [u8]) -> Iter<'h> {
        Iter {
            original_start: haystack.as_ptr(),
            start: haystack.as_ptr(),
            end: haystack.as_ptr().wrapping_add(haystack.len()),
            haystack: core::marker::PhantomData,
        }
    }

    /// Returns the next occurrence in the forward direction.
    ///
    /// # Safety
    ///
    /// Callers must ensure that if a pointer is returned from the closure
    /// provided, then it must be greater than or equal to the start pointer
    /// and less than the end pointer.
    #[inline(always)]
    pub(crate) unsafe fn next(
        &mut self,
        mut find_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>,
    ) -> Option<usize> {
        // SAFETY: Pointers are derived directly from the same &[u8] haystack.
        // We only ever modify start/end corresponding to a matching offset
        // found between start and end. Thus all changes to start/end maintain
        // our safety requirements.
        //
        // The only other assumption we rely on is that the pointer returned
        // by `find_raw` satisfies `self.start <= found < self.end`, and that
        // safety contract is forwarded to the caller.
        let found = find_raw(self.start, self.end)?;
        let result = found.distance(self.original_start);
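        // Move the start of the next search to one past the match we just
        // found. Since the safety contract guarantees `found < self.end`,
        // the new `start` is at most `end`, so the iterator always makes
        // progress and never searches out of bounds.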
        self.start = found.add(1);
        Some(result)
    }

    /// Returns the number of remaining elements in this iterator.
    #[inline(always)]
    pub(crate) fn count(
        self,
        mut count_raw: impl FnMut(*const u8, *const u8) -> usize,
    ) -> usize {
        // SAFETY: Pointers are derived directly from the same &[u8] haystack.
        // We only ever modify start/end corresponding to a matching offset
        // found between start and end. Thus all changes to start/end maintain
        // our safety requirements.
        count_raw(self.start, self.end)
    }

    /// Returns the next occurrence in reverse.
    ///
    /// # Safety
    ///
    /// Callers must ensure that if a pointer is returned from the closure
    /// provided, then it must be greater than or equal to the start pointer
    /// and less than the end pointer.
    #[inline(always)]
    pub(crate) unsafe fn next_back(
        &mut self,
        mut rfind_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>,
    ) -> Option<usize> {
        // SAFETY: Pointers are derived directly from the same &[u8] haystack.
        // We only ever modify start/end corresponding to a matching offset
        // found between start and end. Thus all changes to start/end maintain
        // our safety requirements.
        //
        // The only other assumption we rely on is that the pointer returned
        // by `rfind_raw` satisfies `self.start <= found < self.end`, and that
        // safety contract is forwarded to the caller.
        let found = rfind_raw(self.start, self.end)?;
        let result = found.distance(self.original_start);
        self.end = found;
        Some(result)
    }

    /// Provides an implementation of `Iterator::size_hint`.
    #[inline(always)]
    pub(crate) fn size_hint(&self) -> (usize, Option<usize>) {
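        // There may be no matches left, so the lower bound is 0. The upper
        // bound is the number of bytes remaining to be searched, since each
        // remaining byte can match at most once.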
        (0, Some(self.end.as_usize().saturating_sub(self.start.as_usize())))
    }
}

/// Search a slice using a function that operates on raw pointers.
///
/// Given a function to search a contiguous sequence of memory for the location
/// of a non-empty set of bytes, this will execute that search on a slice of
/// bytes. The pointer returned by the given function will be converted to an
/// offset relative to the starting point of the given slice. That is, if a
/// match is found, the offset returned by this routine is guaranteed to be a
/// valid index into `haystack`.
///
/// Callers may use this for a forward or reverse search.
///
/// # Safety
///
/// Callers must ensure that if a pointer is returned by `find_raw`, then the
/// pointer must be greater than or equal to the starting pointer and less than
/// the end pointer.
#[inline(always)]
pub(crate) unsafe fn search_slice_with_raw(
    haystack: &[u8],
    mut find_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>,
) -> Option<usize> {
    // SAFETY: We rely on `find_raw` to return a correct and valid pointer, but
    // otherwise, `start` and `end` are valid due to the guarantees provided by
    // a &[u8].
    let start = haystack.as_ptr();
    let end = start.add(haystack.len());
    let found = find_raw(start, end)?;
    Some(found.distance(start))
}

/// Performs a forward byte-at-a-time loop until either `ptr >= end` or until
/// `confirm(*ptr)` returns `true`. If the former occurs, then `None` is
/// returned. If the latter occurs, then the pointer at which `confirm` returns
/// `true` is returned.
///
/// # Safety
///
/// Callers must provide valid pointers and they must satisfy `start <= end`.
#[inline(always)]
pub(crate) unsafe fn fwd_byte_by_byte<F: Fn(u8) -> bool>(
    start: *const u8,
    end: *const u8,
    confirm: F,
) -> Option<*const u8> {
    debug_assert!(start <= end);
    let mut ptr = start;
    while ptr < end {
        if confirm(*ptr) {
            return Some(ptr);
        }
        ptr = ptr.offset(1);
    }
    None
}

/// Performs a reverse byte-at-a-time loop until either `ptr < start` or until
/// `confirm(*ptr)` returns `true`. If the former occurs, then `None` is
/// returned. If the latter occurs, then the pointer at which `confirm` returns
/// `true` is returned.
///
/// # Safety
///
/// Callers must provide valid pointers and they must satisfy `start <= end`.
#[inline(always)]
pub(crate) unsafe fn rev_byte_by_byte<F: Fn(u8) -> bool>(
    start: *const u8,
    end: *const u8,
    confirm: F,
) -> Option<*const u8> {
    debug_assert!(start <= end);

    let mut ptr = end;
    while ptr > start {
        ptr = ptr.offset(-1);
        if confirm(*ptr) {
            return Some(ptr);
        }
    }
    None
}

/// Performs a forward byte-at-a-time loop until `ptr >= end` and returns
/// the number of times `confirm(*ptr)` returns `true`.
///
/// # Safety
///
/// Callers must provide valid pointers and they must satisfy `start <= end`.
#[inline(always)]
pub(crate) unsafe fn count_byte_by_byte<F: Fn(u8) -> bool>(
    start: *const u8,
    end: *const u8,
    confirm: F,
) -> usize {
    debug_assert!(start <= end);
    let mut ptr = start;
    let mut count = 0;
    while ptr < end {
        if confirm(*ptr) {
            count += 1;
        }
        ptr = ptr.offset(1);
    }
    count
}