// See the README in this directory for an explanation of the Teddy algorithm.
// It is strongly recommended to peruse the README before trying to grok this
// code, as its use of SIMD is pretty opaque, although I tried to add comments
// where appropriate.
//
// Moreover, while there is a lot of code in this file, most of it is
// repeated variants of the same thing. Specifically, there are three Teddy
// variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets)
// and Fat 256-bit Teddy (16 buckets). For each variant, there are four
// implementations, corresponding to mask lengths of 1, 2, 3 and 4, bringing
// it to a total of twelve variants. Each one is structured roughly the same:
//
//     while at <= len(haystack) - CHUNK_SIZE:
//         let candidate = find_candidate_in_chunk(haystack, at)
//         if not all zeroes(candidate):
//             if match = verify(haystack, at, candidate):
//                 return match
//
// For the most part, this remains unchanged. The parts that vary are the
// verification routine (for slim vs fat Teddy) and the candidate extraction
// (based on the number of masks).
//
// In the code below, a "candidate" corresponds to a single vector with 8-bit
// lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the
// jth lane if and only if the byte occurring at position `j` is in the
// bucket `i` (where the `j`th position is the position in the current window
// of the haystack, which is always 16 or 32 bytes). Note that bit `i` and
// lane `j` are both counted from the least significant end of the vector. So
// when visualizing how the current window of bytes is stored in a vector, you
// often need to flip it around. For example, the text `abcd` in a 4-byte
// vector would look like this:
//
//     01100100 01100011 01100010 01100001
//            d        c        b        a
//
// When the mask length is 1, then finding the candidate is pretty
// straightforward: you just apply the shuffle indices (from the haystack
// window) to the masks, and then AND them together, as described in the
// README. But for masks of length 2, 3 and 4, you need to keep a little
// state. Specifically, you need to carry over the mask results for the final
// 1 (mask length 2), 2 (mask length 3) or 3 (mask length 4) bytes of each
// window for use when searching the next window. This is for handling
// matches that span two windows.
//
// With respect to the repeated code, it would likely be possible to reduce
// the number of copies of code below using polymorphism, but I find this
// formulation clearer instead of needing to reason through generics. However,
// I admit, there may be a simpler generic construction that I'm missing.
//
// All variants are fairly heavily tested in src/packed/tests.rs.

use core::{arch::x86_64::*, mem};

use alloc::vec::Vec;

use crate::{
    packed::{
        pattern::{PatternID, Patterns},
        teddy::compile,
        vector,
    },
    util::search::Match,
};

/// The Teddy runtime.
///
/// A Teddy runtime can be used to quickly search for occurrences of one or
/// more patterns. While it does not scale to an arbitrary number of patterns
/// like Aho-Corasick, it does find occurrences for a small set of patterns
/// much more quickly than Aho-Corasick.
///
/// Teddy cannot run on small haystacks below a certain size, which is
/// dependent on the type of matcher used. This size can be queried via the
/// `minimum_len` method. Violating this will result in a panic.
///
/// Finally, when callers use a Teddy runtime, they must provide precisely the
/// patterns used to construct the Teddy matcher. Violating this will result
/// in either a panic or incorrect results, but will never sacrifice memory
/// safety.
#[derive(Clone, Debug)]
pub struct Teddy {
    /// The allocation of patterns in buckets. This only contains the IDs of
    /// patterns. In order to do full verification, callers must provide the
    /// actual patterns when using Teddy.
    pub buckets: Vec<Vec<PatternID>>,
    /// The maximum identifier of a pattern. This is used as a sanity check to
    /// ensure that the patterns provided by the caller are the same as the
    /// patterns that were used to compile the matcher. This sanity check
    /// permits safely eliminating bounds checks regardless of what patterns
    /// are provided by the caller.
    ///
    /// Note that users of the aho-corasick crate cannot get this wrong. Only
    /// code internal to this crate can get it wrong, since neither the
    /// `Patterns` type nor the Teddy runtime is a public API item.
    pub max_pattern_id: PatternID,
    /// The actual runtime to use.
    pub exec: Exec,
}

impl Teddy {
    /// Return the first occurrence of a match in the given haystack after or
    /// starting at `at`.
    ///
    /// The patterns provided must be precisely the same patterns given to the
    /// Teddy builder, otherwise this may panic or produce incorrect results.
    ///
    /// All matches are consistent with the match semantics (leftmost-first or
    /// leftmost-longest) set on `pats`.
    pub fn find_at(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        // This assert is a bit subtle, but it's an important guarantee.
        // Namely, if the maximum pattern ID seen by Teddy is the same as the
        // one in the patterns given, then we are guaranteed that every
        // pattern ID in all Teddy buckets is a valid index into `pats`. While
        // this is nominally true, there is no guarantee that callers provide
        // the same `pats` to both the Teddy builder and the searcher, which
        // would otherwise make `find_at` unsafe to call. But this assert lets
        // us keep this routine safe and eliminate an important bounds check
        // in verification.
        assert_eq!(
            self.max_pattern_id,
            pats.max_pattern_id(),
            "teddy must be called with same patterns it was built with",
        );
        // SAFETY: The haystack must have at least a minimum number of bytes
        // for Teddy to be able to work. The minimum number varies depending
        // on which matcher is used below. If this is violated, then it's
        // possible for searching to do out-of-bounds reads.
        assert!(haystack[at..].len() >= self.minimum_len());
        // SAFETY: The various Teddy matchers are always safe to call because
        // the Teddy builder guarantees that a particular Exec variant is
        // built only when it can be run on the current CPU. That is, the
        // Teddy builder will not produce an Exec::TeddySlim1Mask256 unless
        // AVX2 is enabled. That is, our dynamic CPU feature detection is
        // performed once in the builder, and we rely on the type system to
        // avoid needing to do it again.
        unsafe {
            match self.exec {
                Exec::TeddySlim1Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim1Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat1Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim2Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim2Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat2Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim3Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim3Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat3Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim4Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim4Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat4Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
            }
        }
    }

    /// Returns the minimum length of a haystack that must be provided by
    /// callers to this Teddy searcher. Providing a haystack shorter than this
    /// will result in a panic, but will never violate memory safety.
    pub fn minimum_len(&self) -> usize {
        // SAFETY: These values must be correct in order to ensure safety.
        // The Teddy runtime assumes its haystacks have at least these
        // lengths. Violating this will sacrifice memory safety.
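        // N.B. Each minimum below is the number of bytes scanned per
        // iteration (16 for the 128-bit and Fat variants, 32 for the Slim
        // 256-bit variants) plus the number of masks minus one, since each
        // searcher starts `masks - 1` bytes into the haystack.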
        match self.exec {
            Exec::TeddySlim1Mask128(_) => 16,
            Exec::TeddySlim1Mask256(_) => 32,
            Exec::TeddyFat1Mask256(_) => 16,
            Exec::TeddySlim2Mask128(_) => 17,
            Exec::TeddySlim2Mask256(_) => 33,
            Exec::TeddyFat2Mask256(_) => 17,
            Exec::TeddySlim3Mask128(_) => 18,
            Exec::TeddySlim3Mask256(_) => 34,
            Exec::TeddyFat3Mask256(_) => 18,
            Exec::TeddySlim4Mask128(_) => 19,
            Exec::TeddySlim4Mask256(_) => 35,
            Exec::TeddyFat4Mask256(_) => 19,
        }
    }

    /// Returns the approximate total amount of heap used by this searcher, in
    /// units of bytes.
    pub fn memory_usage(&self) -> usize {
        let num_patterns = self.max_pattern_id as usize + 1;
        self.buckets.len() * mem::size_of::<Vec<PatternID>>()
            + num_patterns * mem::size_of::<PatternID>()
    }

    /// Runs the verification routine for Slim 128-bit Teddy.
    ///
    /// The candidate given should be a collection of 8-bit bitsets (one bitset
    /// per lane), where the ith bit is set in the jth lane if and only if the
    /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
    ///
    /// This is not safe to call unless the SSSE3 target feature is enabled.
    /// The `target_feature` attribute is not applied since this function is
    /// always forcefully inlined.
    #[inline(always)]
    unsafe fn verify128(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m128i,
    ) -> Option<Match> {
        debug_assert!(!vector::is_all_zeroes128(cand));
        debug_assert_eq!(8, self.buckets.len());

        // Convert the candidate into 64-bit chunks, and then verify each of
        // those chunks.
        let parts = vector::unpack64x128(cand);
        for (i, &part) in parts.iter().enumerate() {
            let pos = at + i * 8;
            if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
                return Some(m);
            }
        }
        None
    }

    /// Runs the verification routine for Slim 256-bit Teddy.
    ///
    /// The candidate given should be a collection of 8-bit bitsets (one bitset
    /// per lane), where the ith bit is set in the jth lane if and only if the
    /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
    ///
    /// This is not safe to call unless the AVX2 target feature is enabled.
    /// The `target_feature` attribute is not applied since this function is
    /// always forcefully inlined.
    #[inline(always)]
    unsafe fn verify256(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m256i,
    ) -> Option<Match> {
        debug_assert!(!vector::is_all_zeroes256(cand));
        debug_assert_eq!(8, self.buckets.len());

        // Convert the candidate into 64-bit chunks, and then verify each of
        // those chunks.
        let parts = vector::unpack64x256(cand);
        let mut pos = at;
        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[0]) {
            return Some(m);
        }
        pos += 8;
        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[1]) {
            return Some(m);
        }
        pos += 8;
        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[2]) {
            return Some(m);
        }
        pos += 8;
        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[3]) {
            return Some(m);
        }
        None
    }

    /// Runs the verification routine for Fat 256-bit Teddy.
    ///
    /// The candidate given should be a collection of 8-bit bitsets (one bitset
    /// per lane), where the ith bit is set in the jth lane if and only if the
    /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the
    /// bucket `j < 16 ? i : i + 8`.
    ///
    /// This is not safe to call unless the AVX2 target feature is enabled.
    /// The `target_feature` attribute is not applied since this function is
    /// always forcefully inlined.
    #[inline(always)]
    unsafe fn verify_fat256(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m256i,
    ) -> Option<Match> {
        debug_assert!(!vector::is_all_zeroes256(cand));
        debug_assert_eq!(16, self.buckets.len());

        // This is a bit tricky, but we basically want to convert our
        // candidate, which looks like this
        //
        //     a31 a30 ... a17 a16 a15 a14 ... a01 a00
        //
        // where each a(i) is an 8-bit bitset corresponding to the activated
        // buckets, to this
        //
        //     a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00
        //
        // Namely, for Fat Teddy, the high 128-bits of the candidate correspond
        // to the same bytes in the haystack in the low 128-bits (so we only
        // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7.
        //
        // The verification routine wants to look at all potentially matching
        // buckets before moving on to the next lane. So for example, a16 and
        // a00 both correspond to the first byte in our window; a00 contains
        // buckets 0-7 and a16 contains buckets 8-15. Specifically, a16 should
        // be checked before a01. So the transformation shown above allows us
        // to use our normal verification procedure with one small change: we
        // treat each bitset as 16 bits instead of 8 bits.
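        //
        // As a concrete example, suppose the byte at position 0 of the window
        // is in buckets 1 and 9. Then a00 = 00000010 (bucket 1) and a16 =
        // 00000010 (bucket 9, expressed relative to buckets 8-15). After the
        // transformation, position 0 is represented by the 16-bit bitset
        // a16 a00 = 00000010 00000010, and verify64 (run with a bucket count
        // of 16) decodes its set bits as bucket 1 and bucket 9, both at
        // offset 0.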

        // Swap the 128-bit lanes in the candidate vector.
        let swap = _mm256_permute4x64_epi64(cand, 0x4E);
        // Interleave the bytes from the low 128-bit lanes, starting with
        // cand first.
        let r1 = _mm256_unpacklo_epi8(cand, swap);
        // Interleave the bytes from the high 128-bit lanes, starting with
        // cand first.
        let r2 = _mm256_unpackhi_epi8(cand, swap);
        // Now just take the 2 low 64-bit integers from both r1 and r2. We
        // can drop the high 64-bit integers because they are a mirror image
        // of the low 64-bit integers. All we care about are the low 128-bit
        // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets
        // laid out in the desired order, as described above.
        let parts = vector::unpacklo64x256(r1, r2);
        for (i, &part) in parts.iter().enumerate() {
            let pos = at + i * 4;
            if let Some(m) = self.verify64(pats, 16, haystack, pos, part) {
                return Some(m);
            }
        }
        None
    }

    /// Verify whether there are any matches starting at or after `at` in the
    /// given `haystack`. The candidate given should correspond to either 8-bit
    /// (for 8 buckets) or 16-bit (16 buckets) bitsets.
    #[inline(always)]
    fn verify64(
        &self,
        pats: &Patterns,
        bucket_count: usize,
        haystack: &[u8],
        at: usize,
        mut cand: u64,
    ) -> Option<Match> {
        // N.B. While the bucket count is known from self.buckets.len(),
        // requiring it as a parameter makes it easier for the optimizer to
        // know its value, and thus produce more efficient codegen.
        debug_assert!(bucket_count == 8 || bucket_count == 16);
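        // For example, with 8 buckets, a set bit at index 19 of `cand`
        // decodes to offset 19 / 8 = 2 and bucket 19 % 8 = 3 below, i.e.,
        // some pattern in bucket 3 may start at `haystack[at + 2]`.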
        while cand != 0 {
            let bit = cand.trailing_zeros() as usize;
            cand &= !(1 << bit);

            let at = at + (bit / bucket_count);
            let bucket = bit % bucket_count;
            if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) {
                return Some(m);
            }
        }
        None
    }

    /// Verify whether there are any matches starting at `at` in the given
    /// `haystack` corresponding only to patterns in the given bucket.
    #[inline(always)]
    fn verify_bucket(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        bucket: usize,
        at: usize,
    ) -> Option<Match> {
        // Forcing this function to not inline and be "cold" seems to help
        // the codegen for Teddy overall. Interestingly, this is good for a
        // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among
        // others). Overall, this seems like a problem with codegen, since
        // creating the Match itself is a very small amount of code.
        #[cold]
        #[inline(never)]
        fn match_from_span(
            pati: PatternID,
            start: usize,
            end: usize,
        ) -> Match {
            Match::must(pati as usize, start..end)
        }

        // N.B. The bounds check for this bucket lookup *should* be elided
        // since we assert the number of buckets in each `find_at` routine,
        // and the compiler can prove that the `% 8` (or `% 16`) in callers
        // of this routine will always be in bounds.
        for &pati in &self.buckets[bucket] {
            // SAFETY: This is safe because we are guaranteed that every
            // index in a Teddy bucket is a valid index into `pats`. This
            // guarantee is upheld by the assert checking `max_pattern_id` in
            // the beginning of `find_at` above.
            //
            // This explicit bounds check elision is (amazingly) good for a
            // 25-50% boost in some benchmarks, particularly ones with a lot
            // of short literals.
            let pat = unsafe { pats.get_unchecked(pati) };
            if pat.is_prefix(&haystack[at..]) {
                return Some(match_from_span(pati, at, at + pat.len()));
            }
        }
        None
    }
}

/// Exec represents the different search strategies supported by the Teddy
/// runtime.
///
/// This enum is an important safety abstraction. Namely, callers should only
/// construct a variant in this enum if it is safe to execute its corresponding
/// target features on the current CPU. The 128-bit searchers require SSSE3,
/// while the 256-bit searchers require AVX2.
#[derive(Clone, Debug)]
pub enum Exec {
    TeddySlim1Mask128(TeddySlim1Mask128),
    TeddySlim1Mask256(TeddySlim1Mask256),
    TeddyFat1Mask256(TeddyFat1Mask256),
    TeddySlim2Mask128(TeddySlim2Mask128),
    TeddySlim2Mask256(TeddySlim2Mask256),
    TeddyFat2Mask256(TeddyFat2Mask256),
    TeddySlim3Mask128(TeddySlim3Mask128),
    TeddySlim3Mask256(TeddySlim3Mask256),
    TeddyFat3Mask256(TeddyFat3Mask256),
    TeddySlim4Mask128(TeddySlim4Mask128),
    TeddySlim4Mask256(TeddySlim4Mask256),
    TeddyFat4Mask256(TeddyFat4Mask256),
}

// Most of the code below remains undocumented because it consists of
// repeated variants of the same routines. The general structure is described
// in the README and in the comments above.

#[derive(Clone, Debug)]
pub struct TeddySlim1Mask128 {
    pub mask1: Mask128,
}

impl TeddySlim1Mask128 {
    #[target_feature(enable = "ssse3")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        let len = haystack.len();
        while at <= len - 16 {
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
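        // If a partial window remains, shift back so that the final 16 bytes
        // are scanned as one last full window. Positions already covered by
        // the loop above are merely re-verified, which is redundant but
        // harmless since no match was found at them.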
        if at < len {
            at = len - 16;
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline(always)]
    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk = vector::loadu128(haystack, at);
        members1m128(chunk, self.mask1)
    }
}

#[derive(Clone, Debug)]
pub struct TeddySlim1Mask256 {
    pub mask1: Mask256,
}

impl TeddySlim1Mask256 {
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        let len = haystack.len();
        while at <= len - 32 {
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        if at < len {
            at = len - 32;
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline(always)]
    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = vector::loadu256(haystack, at);
        members1m256(chunk, self.mask1)
    }
}

#[derive(Clone, Debug)]
pub struct TeddyFat1Mask256 {
    pub mask1: Mask256,
}

impl TeddyFat1Mask256 {
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        let len = haystack.len();
        while at <= len - 16 {
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            at = len - 16;
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline(always)]
    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

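        // Broadcast the 16-byte window into both 128-bit halves of a 256-bit
        // vector. The same haystack bytes then index the low halves of the
        // masks (buckets 0-7) and the high halves (buckets 8-15) at the same
        // time.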
        let chunk =
            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
        members1m256(chunk, self.mask1)
    }
}

#[derive(Clone, Debug)]
pub struct TeddySlim2Mask128 {
    pub mask1: Mask128,
    pub mask2: Mask128,
}

impl TeddySlim2Mask128 {
    #[target_feature(enable = "ssse3")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

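        // Start one byte in: each candidate below combines the mask 1 result
        // at position `p - 1` with the mask 2 result at position `p`, so
        // candidates (and matches) are reported relative to `at - 1`. Seeding
        // `prev0` with all ones treats the position before the first window
        // as if it matched every bucket for mask 1, which can only produce
        // false positive candidates that verification rules out.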
        at += 1;
        let len = haystack.len();
        let mut prev0 = vector::ones128();
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            at = len - 16;
            prev0 = vector::ones128();

            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m128i,
    ) -> __m128i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk = vector::loadu128(haystack, at);
        let (res0, res1) = members2m128(chunk, self.mask1, self.mask2);
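        // `_mm_alignr_epi8(res0, *prev0, 15)` concatenates `prev0` (low) and
        // `res0` (high) and shifts right by 15 bytes, so lane 0 receives the
        // last lane of the previous window's mask 1 result and lanes 1-15
        // receive lanes 0-14 of `res0`. That lines the mask 1 result for
        // position `p - 1` up with the mask 2 result for position `p`, and
        // the AND below yields candidates anchored at `p - 1`.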
        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15);
        let res = _mm_and_si128(res0prev0, res1);
        *prev0 = res0;
        res
    }
}

#[derive(Clone, Debug)]
pub struct TeddySlim2Mask256 {
    pub mask1: Mask256,
    pub mask2: Mask256,
}

impl TeddySlim2Mask256 {
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        at += 1;
        let len = haystack.len();
        let mut prev0 = vector::ones256();
        while at <= len - 32 {
            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        if at < len {
            at = len - 32;
            prev0 = vector::ones256();

            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = vector::loadu256(haystack, at);
        let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
        let res0prev0 = vector::alignr256_15(res0, *prev0);
        let res = _mm256_and_si256(res0prev0, res1);
        *prev0 = res0;
        res
    }
}

#[derive(Clone, Debug)]
pub struct TeddyFat2Mask256 {
    pub mask1: Mask256,
    pub mask2: Mask256,
}

impl TeddyFat2Mask256 {
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        at += 1;
        let len = haystack.len();
        let mut prev0 = vector::ones256();
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
                {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            at = len - 16;
            prev0 = vector::ones256();

            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
                {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk =
            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
        let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15);
        let res = _mm256_and_si256(res0prev0, res1);
        *prev0 = res0;
        res
    }
}

#[derive(Clone, Debug)]
pub struct TeddySlim3Mask128 {
    pub mask1: Mask128,
    pub mask2: Mask128,
    pub mask3: Mask128,
}

impl TeddySlim3Mask128 {
    #[target_feature(enable = "ssse3")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        at += 2;
        let len = haystack.len();
        let (mut prev0, mut prev1) = (vector::ones128(), vector::ones128());
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            at = len - 16;
            prev0 = vector::ones128();
            prev1 = vector::ones128();

            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m128i,
        prev1: &mut __m128i,
    ) -> __m128i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk = vector::loadu128(haystack, at);
        let (res0, res1, res2) =
            members3m128(chunk, self.mask1, self.mask2, self.mask3);
        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14);
        let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15);
        let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2);
        *prev0 = res0;
        *prev1 = res1;
        res
    }
}

#[derive(Clone, Debug)]
pub struct TeddySlim3Mask256 {
    pub mask1: Mask256,
    pub mask2: Mask256,
    pub mask3: Mask256,
}

impl TeddySlim3Mask256 {
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        at += 2;
        let len = haystack.len();
        let (mut prev0, mut prev1) = (vector::ones256(), vector::ones256());
        while at <= len - 32 {
            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        if at < len {
            at = len - 32;
            prev0 = vector::ones256();
            prev1 = vector::ones256();

            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = vector::loadu256(haystack, at);
        let (res0, res1, res2) =
            members3m256(chunk, self.mask1, self.mask2, self.mask3);
        let res0prev0 = vector::alignr256_14(res0, *prev0);
        let res1prev1 = vector::alignr256_15(res1, *prev1);
        let res =
            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
        *prev0 = res0;
        *prev1 = res1;
        res
    }
}

#[derive(Clone, Debug)]
pub struct TeddyFat3Mask256 {
    pub mask1: Mask256,
    pub mask2: Mask256,
    pub mask3: Mask256,
}

impl TeddyFat3Mask256 {
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        at += 2;
        let len = haystack.len();
        let (mut prev0, mut prev1) = (vector::ones256(), vector::ones256());
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
                {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            at = len - 16;
            prev0 = vector::ones256();
            prev1 = vector::ones256();

            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
                {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk =
            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
        let (res0, res1, res2) =
            members3m256(chunk, self.mask1, self.mask2, self.mask3);
        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14);
        let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15);
        let res =
            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
        *prev0 = res0;
        *prev1 = res1;
        res
    }
}

#[derive(Clone, Debug)]
pub struct TeddySlim4Mask128 {
    pub mask1: Mask128,
    pub mask2: Mask128,
    pub mask3: Mask128,
    pub mask4: Mask128,
}

impl TeddySlim4Mask128 {
    #[target_feature(enable = "ssse3")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        at += 3;
        let len = haystack.len();
        let mut prev0 = vector::ones128();
        let mut prev1 = vector::ones128();
        let mut prev2 = vector::ones128();
        while at <= len - 16 {
            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 3, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            at = len - 16;
            prev0 = vector::ones128();
            prev1 = vector::ones128();
            prev2 = vector::ones128();

            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 3, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m128i,
        prev1: &mut __m128i,
        prev2: &mut __m128i,
    ) -> __m128i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk = vector::loadu128(haystack, at);
        let (res0, res1, res2, res3) = members4m128(
            chunk, self.mask1, self.mask2, self.mask3, self.mask4,
        );
        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 13);
        let res1prev1 = _mm_alignr_epi8(res1, *prev1, 14);
        let res2prev2 = _mm_alignr_epi8(res2, *prev2, 15);
        let res = _mm_and_si128(
            _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2prev2),
            res3,
        );
        *prev0 = res0;
        *prev1 = res1;
        *prev2 = res2;
        res
    }
}

#[derive(Clone, Debug)]
pub struct TeddySlim4Mask256 {
    pub mask1: Mask256,
    pub mask2: Mask256,
    pub mask3: Mask256,
    pub mask4: Mask256,
}

impl TeddySlim4Mask256 {
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        at += 3;
        let len = haystack.len();
        let mut prev0 = vector::ones256();
        let mut prev1 = vector::ones256();
        let mut prev2 = vector::ones256();
        while at <= len - 32 {
            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 3, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        if at < len {
            at = len - 32;
            prev0 = vector::ones256();
            prev1 = vector::ones256();
            prev2 = vector::ones256();

            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 3, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
        prev2: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = vector::loadu256(haystack, at);
        let (res0, res1, res2, res3) = members4m256(
            chunk, self.mask1, self.mask2, self.mask3, self.mask4,
        );
        let res0prev0 = vector::alignr256_13(res0, *prev0);
        let res1prev1 = vector::alignr256_14(res1, *prev1);
        let res2prev2 = vector::alignr256_15(res2, *prev2);
        let res = _mm256_and_si256(
            _mm256_and_si256(
                _mm256_and_si256(res0prev0, res1prev1),
                res2prev2,
            ),
            res3,
        );
        *prev0 = res0;
        *prev1 = res1;
        *prev2 = res2;
        res
    }
}

#[derive(Clone, Debug)]
pub struct TeddyFat4Mask256 {
    pub mask1: Mask256,
    pub mask2: Mask256,
    pub mask3: Mask256,
    pub mask4: Mask256,
}

impl TeddyFat4Mask256 {
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        at += 3;
        let len = haystack.len();
        let mut prev0 = vector::ones256();
        let mut prev1 = vector::ones256();
        let mut prev2 = vector::ones256();
        while at <= len - 16 {
            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 3, c)
                {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            at = len - 16;
            prev0 = vector::ones256();
            prev1 = vector::ones256();
            prev2 = vector::ones256();

            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 3, c)
                {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
        prev2: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk =
            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
        let (res0, res1, res2, res3) = members4m256(
            chunk, self.mask1, self.mask2, self.mask3, self.mask4,
        );
        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 13);
        let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 14);
        let res2prev2 = _mm256_alignr_epi8(res2, *prev2, 15);
        let res = _mm256_and_si256(
            _mm256_and_si256(
                _mm256_and_si256(res0prev0, res1prev1),
                res2prev2,
            ),
            res3,
        );
        *prev0 = res0;
        *prev1 = res1;
        *prev2 = res2;
        res
    }
}

/// A 128-bit mask for the low and high nybbles in a set of patterns. Each
/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
/// the nybble `j` is in the bucket `i` at a particular position.
#[derive(Clone, Copy, Debug)]
pub struct Mask128 {
    lo: __m128i,
    hi: __m128i,
}

impl Mask128 {
    /// Create a new SIMD mask from the mask produced by the Teddy builder.
    pub fn new(mask: compile::Mask) -> Mask128 {
        // SAFETY: This is safe since [u8; 16] has the same representation
        // as __m128i.
        unsafe {
            Mask128 {
                lo: mem::transmute(mask.lo128()),
                hi: mem::transmute(mask.hi128()),
            }
        }
    }
}

/// A 256-bit mask for the low and high nybbles in a set of patterns. Each
/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
/// the nybble `j` is in the bucket `i` at a particular position.
///
/// This is slightly tweaked depending on whether Slim or Fat Teddy is being
/// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as
/// the bitsets in the higher 128-bits, so that we can search 32 bytes at a
/// time. (Remember, the nybbles in the haystack are used as indices into
/// these masks, and 256-bit shuffles only operate on 128-bit lanes.)
///
/// For Fat Teddy, the bitsets are not repeated, but instead, the high 128
/// bits correspond to buckets 8-15. So a bitset `00100010` has buckets 1 and
/// 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set if it's
/// in the higher 128 bits.
#[derive(Clone, Copy, Debug)]
pub struct Mask256 {
    lo: __m256i,
    hi: __m256i,
}

impl Mask256 {
    /// Create a new SIMD mask from the mask produced by the Teddy builder.
    pub fn new(mask: compile::Mask) -> Mask256 {
        // SAFETY: This is safe since [u8; 32] has the same representation
        // as __m256i.
        unsafe {
            Mask256 {
                lo: mem::transmute(mask.lo256()),
                hi: mem::transmute(mask.hi256()),
            }
        }
    }
}

// The "members" routines below are responsible for taking a chunk of bytes,
// a number of nybble masks and returning the result of using the masks to
// lookup bytes in the chunk. The results of the high and low nybble masks are
// AND'ed together, such that each candidate returned is a vector, with byte
// sized lanes, and where each lane is an 8-bit bitset corresponding to the
// buckets that contain the corresponding byte.
//
// In the case of masks of length greater than 1, callers will need to keep
// the results from the previous haystack's window, and then shift the vectors
// so that they all line up. Then they can be AND'ed together.

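// What follows is a scalar (non-SIMD) sketch of the nybble-lookup scheme
// described above, included purely as an illustration of how the "members"
// routines work. The bucket assignment used in the test is made up for the
// example; it is not how the Teddy builder actually assigns buckets.
#[cfg(test)]
mod members_model {
    /// A scalar model of `members1m128`: for each byte in a 16-byte window,
    /// look up its low and high nybbles in two 16-entry tables of bucket
    /// bitsets and AND the results together.
    fn candidates(
        window: &[u8; 16],
        lo: &[u8; 16],
        hi: &[u8; 16],
    ) -> [u8; 16] {
        let mut out = [0u8; 16];
        for (i, &byte) in window.iter().enumerate() {
            out[i] = lo[usize::from(byte & 0xF)] & hi[usize::from(byte >> 4)];
        }
        out
    }

    #[test]
    fn single_byte_pattern() {
        // Pretend a pattern starting with `a` (0x61) was assigned to bucket
        // 3. Then the low nybble table has bit 3 set at index 0x1 and the
        // high nybble table has bit 3 set at index 0x6.
        let (mut lo, mut hi) = ([0u8; 16], [0u8; 16]);
        lo[0x1] |= 1 << 3;
        hi[0x6] |= 1 << 3;

        let window = *b"xxxaxxxxxxxxxxxx";
        let cand = candidates(&window, &lo, &hi);
        // Only lane 3 (the position of `a` in the window) reports bucket 3.
        assert_eq!(cand[3], 1 << 3);
        assert!(cand.iter().enumerate().all(|(i, &c)| i == 3 || c == 0));
    }
}
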
/// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a
/// 16-byte window of the haystack (where the least significant byte
/// corresponds to the start of the window), and `mask1` corresponds to a
/// low/high mask for the first byte of all patterns that are being searched.
#[target_feature(enable = "ssse3")]
unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i {
    let lomask = _mm_set1_epi8(0xF);
    let hlo = _mm_and_si128(chunk, lomask);
    let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
    _mm_and_si128(
        _mm_shuffle_epi8(mask1.lo, hlo),
        _mm_shuffle_epi8(mask1.hi, hhi),
    )
}

/// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a
/// 32-byte window of the haystack (where the least significant byte
/// corresponds to the start of the window), and `mask1` corresponds to a
/// low/high mask for the first byte of all patterns that are being searched.
///
/// Note that this can also be used for Fat Teddy, where the high 128 bits in
/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
/// window in the haystack.
#[target_feature(enable = "avx2")]
unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i {
    let lomask = _mm256_set1_epi8(0xF);
    let hlo = _mm256_and_si256(chunk, lomask);
    let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
    _mm256_and_si256(
        _mm256_shuffle_epi8(mask1.lo, hlo),
        _mm256_shuffle_epi8(mask1.hi, hhi),
    )
}

/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
/// to a 16-byte window of the haystack (where the least significant byte
/// corresponds to the start of the window), and the masks correspond to a
/// low/high mask for the first and second bytes of all patterns that are being
/// searched. The vectors returned correspond to candidates for the first and
/// second bytes in the patterns represented by the masks.
#[target_feature(enable = "ssse3")]
unsafe fn members2m128(
    chunk: __m128i,
    mask1: Mask128,
    mask2: Mask128,
) -> (__m128i, __m128i) {
    let lomask = _mm_set1_epi8(0xF);
    let hlo = _mm_and_si128(chunk, lomask);
    let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
    let res0 = _mm_and_si128(
        _mm_shuffle_epi8(mask1.lo, hlo),
        _mm_shuffle_epi8(mask1.hi, hhi),
    );
    let res1 = _mm_and_si128(
        _mm_shuffle_epi8(mask2.lo, hlo),
        _mm_shuffle_epi8(mask2.hi, hhi),
    );
    (res0, res1)
}

/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
/// to a 32-byte window of the haystack (where the least significant byte
/// corresponds to the start of the window), and the masks correspond to a
/// low/high mask for the first and second bytes of all patterns that are being
/// searched. The vectors returned correspond to candidates for the first and
/// second bytes in the patterns represented by the masks.
///
/// Note that this can also be used for Fat Teddy, where the high 128 bits in
/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
/// window in the haystack.
#[target_feature(enable = "avx2")]
unsafe fn members2m256(
    chunk: __m256i,
    mask1: Mask256,
    mask2: Mask256,
) -> (__m256i, __m256i) {
    let lomask = _mm256_set1_epi8(0xF);
    let hlo = _mm256_and_si256(chunk, lomask);
    let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
    let res0 = _mm256_and_si256(
        _mm256_shuffle_epi8(mask1.lo, hlo),
        _mm256_shuffle_epi8(mask1.hi, hhi),
    );
    let res1 = _mm256_and_si256(
        _mm256_shuffle_epi8(mask2.lo, hlo),
        _mm256_shuffle_epi8(mask2.hi, hhi),
    );
    (res0, res1)
}

/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
/// to a 16-byte window of the haystack (where the least significant byte
/// corresponds to the start of the window), and the masks correspond to a
/// low/high mask for the first, second and third bytes of all patterns that
/// are being searched. The vectors returned correspond to candidates for the
/// first, second and third bytes in the patterns represented by the masks.
#[target_feature(enable = "ssse3")]
unsafe fn members3m128(
    chunk: __m128i,
    mask1: Mask128,
    mask2: Mask128,
    mask3: Mask128,
) -> (__m128i, __m128i, __m128i) {
    let lomask = _mm_set1_epi8(0xF);
    let hlo = _mm_and_si128(chunk, lomask);
    let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
    let res0 = _mm_and_si128(
        _mm_shuffle_epi8(mask1.lo, hlo),
        _mm_shuffle_epi8(mask1.hi, hhi),
    );
    let res1 = _mm_and_si128(
        _mm_shuffle_epi8(mask2.lo, hlo),
        _mm_shuffle_epi8(mask2.hi, hhi),
    );
    let res2 = _mm_and_si128(
        _mm_shuffle_epi8(mask3.lo, hlo),
        _mm_shuffle_epi8(mask3.hi, hhi),
    );
    (res0, res1, res2)
}

/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
/// to a 32-byte window of the haystack (where the least significant byte
/// corresponds to the start of the window), and the masks correspond to a
/// low/high mask for the first, second and third bytes of all patterns that
/// are being searched. The vectors returned correspond to candidates for the
/// first, second and third bytes in the patterns represented by the masks.
///
/// Note that this can also be used for Fat Teddy, where the high 128 bits in
/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
/// window in the haystack.
#[target_feature(enable = "avx2")]
unsafe fn members3m256(
    chunk: __m256i,
    mask1: Mask256,
    mask2: Mask256,
    mask3: Mask256,
) -> (__m256i, __m256i, __m256i) {
    let lomask = _mm256_set1_epi8(0xF);
    let hlo = _mm256_and_si256(chunk, lomask);
    let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
    let res0 = _mm256_and_si256(
        _mm256_shuffle_epi8(mask1.lo, hlo),
        _mm256_shuffle_epi8(mask1.hi, hhi),
    );
    let res1 = _mm256_and_si256(
        _mm256_shuffle_epi8(mask2.lo, hlo),
        _mm256_shuffle_epi8(mask2.hi, hhi),
    );
    let res2 = _mm256_and_si256(
        _mm256_shuffle_epi8(mask3.lo, hlo),
        _mm256_shuffle_epi8(mask3.hi, hhi),
    );
    (res0, res1, res2)
}

/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
/// to a 16-byte window of the haystack (where the least significant byte
/// corresponds to the start of the window), and the masks correspond to a
/// low/high mask for the first, second, third and fourth bytes of all patterns
/// that are being searched. The vectors returned correspond to candidates for
/// the first, second, third and fourth bytes in the patterns represented by
/// the masks.
#[target_feature(enable = "ssse3")]
unsafe fn members4m128(
    chunk: __m128i,
    mask1: Mask128,
    mask2: Mask128,
    mask3: Mask128,
    mask4: Mask128,
) -> (__m128i, __m128i, __m128i, __m128i) {
    let lomask = _mm_set1_epi8(0xF);
    let hlo = _mm_and_si128(chunk, lomask);
    let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
    let res0 = _mm_and_si128(
        _mm_shuffle_epi8(mask1.lo, hlo),
        _mm_shuffle_epi8(mask1.hi, hhi),
    );
    let res1 = _mm_and_si128(
        _mm_shuffle_epi8(mask2.lo, hlo),
        _mm_shuffle_epi8(mask2.hi, hhi),
    );
    let res2 = _mm_and_si128(
        _mm_shuffle_epi8(mask3.lo, hlo),
        _mm_shuffle_epi8(mask3.hi, hhi),
    );
    let res3 = _mm_and_si128(
        _mm_shuffle_epi8(mask4.lo, hlo),
        _mm_shuffle_epi8(mask4.hi, hhi),
    );
    (res0, res1, res2, res3)
}

/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
/// to a 32-byte window of the haystack (where the least significant byte
/// corresponds to the start of the window), and the masks correspond to a
/// low/high mask for the first, second, third and fourth bytes of all patterns
/// that are being searched. The vectors returned correspond to candidates for
/// the first, second, third and fourth bytes in the patterns represented by
/// the masks.
///
/// Note that this can also be used for Fat Teddy, where the high 128 bits in
/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
/// window in the haystack.
#[target_feature(enable = "avx2")]
unsafe fn members4m256(
    chunk: __m256i,
    mask1: Mask256,
    mask2: Mask256,
    mask3: Mask256,
    mask4: Mask256,
) -> (__m256i, __m256i, __m256i, __m256i) {
    let lomask = _mm256_set1_epi8(0xF);
    let hlo = _mm256_and_si256(chunk, lomask);
    let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
    let res0 = _mm256_and_si256(
        _mm256_shuffle_epi8(mask1.lo, hlo),
        _mm256_shuffle_epi8(mask1.hi, hhi),
    );
    let res1 = _mm256_and_si256(
        _mm256_shuffle_epi8(mask2.lo, hlo),
        _mm256_shuffle_epi8(mask2.hi, hhi),
    );
    let res2 = _mm256_and_si256(
        _mm256_shuffle_epi8(mask3.lo, hlo),
        _mm256_shuffle_epi8(mask3.hi, hhi),
    );
    let res3 = _mm256_and_si256(
        _mm256_shuffle_epi8(mask4.lo, hlo),
        _mm256_shuffle_epi8(mask4.hi, hhi),
    );
    (res0, res1, res2, res3)
}