1// See the README in this directory for an explanation of the Teddy algorithm.
2// It is strongly recommended to peruse the README before trying to grok this
3// code, as its use of SIMD is pretty opaque, although I tried to add comments
4// where appropriate.
5//
6// Moreover, while there is a lot of code in this file, most of it is
7// repeated variants of the same thing. Specifically, there are three Teddy
8// variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets)
9// and Fat 256-bit Teddy (16 buckets). For each variant, there are three
10// implementations, corresponding to mask lengths of 1, 2 and 3. Bringing it to
11// a total of nine variants. Each one is structured roughly the same:
12//
13// while at <= len(haystack) - CHUNK_SIZE:
14// let candidate = find_candidate_in_chunk(haystack, at)
15// if not all zeroes(candidate):
16// if match = verify(haystack, at, candidate):
17// return match
18//
19// For the most part, this remains unchanged. The parts that vary are the
20// verification routine (for slim vs fat Teddy) and the candidate extraction
21// (based on the number of masks).
22//
23// In the code below, a "candidate" corresponds to a single vector with 8-bit
24// lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the
25// jth lane if and only if the byte occurring at position `j` is in the
26// bucket `i` (where the `j`th position is the position in the current window
27// of the haystack, which is always 16 or 32 bytes). Note to be careful here:
28// the ith bit and the jth lane correspond to the least significant bits of the
29// vector. So when visualizing how the current window of bytes is stored in a
30// vector, you often need to flip it around. For example, the text `abcd` in a
31// 4-byte vector would look like this:
32//
33// 01100100 01100011 01100010 01100001
34// d c b a
35//
36// When the mask length is 1, then finding the candidate is pretty straight
37// forward: you just apply the shuffle indices (from the haystack window) to
38// the masks, and then AND them together, as described in the README. But for
39// masks of length 2 and 3, you need to keep a little state. Specifically,
40// you need to store the final 1 (for mask length 2) or 2 (for mask length 3)
41// bytes of the candidate for use when searching the next window. This is for
42// handling matches that span two windows.
43//
44// With respect to the repeated code, it would likely be possible to reduce
45// the number of copies of code below using polymorphism, but I find this
46// formulation clearer instead of needing to reason through generics. However,
47// I admit, there may be a simpler generic construction that I'm missing.
48//
49// All variants are fairly heavily tested in src/packed/tests.rs.
50
51use std::arch::x86_64::*;
52use std::mem;
53
54use crate::packed::pattern::{PatternID, Patterns};
55use crate::packed::teddy::compile;
56use crate::packed::vector::*;
57use crate::Match;
58
59/// The Teddy runtime.
60///
61/// A Teddy runtime can be used to quickly search for occurrences of one or
62/// more patterns. While it does not scale to an arbitrary number of patterns
63/// like Aho-Corasick, it does find occurrences for a small set of patterns
64/// much more quickly than Aho-Corasick.
65///
66/// Teddy cannot run on small haystacks below a certain size, which is
67/// dependent on the type of matcher used. This size can be queried via the
68/// `minimum_len` method. Violating this will result in a panic.
69///
70/// Finally, when callers use a Teddy runtime, they must provide precisely the
71/// patterns used to construct the Teddy matcher. Violating this will result
72/// in either a panic or incorrect results, but will never sacrifice memory
73/// safety.
74#[derive(Clone, Debug)]
75pub struct Teddy {
76 /// The allocation of patterns in buckets. This only contains the IDs of
77 /// patterns. In order to do full verification, callers must provide the
78 /// actual patterns when using Teddy.
79 pub buckets: Vec<Vec<PatternID>>,
80 /// The maximum identifier of a pattern. This is used as a sanity check to
81 /// ensure that the patterns provided by the caller are the same as the
82 /// patterns that were used to compile the matcher. This sanity check
83 /// permits safely eliminating bounds checks regardless of what patterns
84 /// are provided by the caller.
85 ///
86 /// Note that users of the aho-corasick crate cannot get this wrong. Only
87 /// code internal to this crate can get it wrong, since neither `Patterns`
88 /// type nor the Teddy runtime are public API items.
89 pub max_pattern_id: PatternID,
90 /// The actual runtime to use.
91 pub exec: Exec,
92}
93
94impl Teddy {
95 /// Return the first occurrence of a match in the given haystack after or
96 /// starting at `at`.
97 ///
98 /// The patterns provided must be precisely the same patterns given to the
99 /// Teddy builder, otherwise this may panic or produce incorrect results.
100 ///
101 /// All matches are consistent with the match semantics (leftmost-first or
102 /// leftmost-longest) set on `pats`.
103 pub fn find_at(
104 &self,
105 pats: &Patterns,
106 haystack: &[u8],
107 at: usize,
108 ) -> Option<Match> {
109 // This assert is a bit subtle, but it's an important guarantee.
110 // Namely, if the maximum pattern ID seen by Teddy is the same as the
111 // one in the patterns given, then we are guaranteed that every pattern
112 // ID in all Teddy buckets are valid indices into `pats`. While this
113 // is nominally true, there is no guarantee that callers provide the
114 // same `pats` to both the Teddy builder and the searcher, which would
115 // otherwise make `find_at` unsafe to call. But this assert lets us
116 // keep this routine safe and eliminate an important bounds check in
117 // verification.
118 assert_eq!(
119 self.max_pattern_id,
120 pats.max_pattern_id(),
121 "teddy must be called with same patterns it was built with",
122 );
123 // SAFETY: The haystack must have at least a minimum number of bytes
124 // for Teddy to be able to work. The minimum number varies depending on
125 // which matcher is used below. If this is violated, then it's possible
126 // for searching to do out-of-bounds writes.
127 assert!(haystack[at..].len() >= self.minimum_len());
128 // SAFETY: The various Teddy matchers are always safe to call because
129 // the Teddy builder guarantees that a particular Exec variant is
130 // built only when it can be run the current CPU. That is, the Teddy
131 // builder will not produce a Exec::TeddySlim1Mask256 unless AVX2 is
132 // enabled. That is, our dynamic CPU feature detection is performed
133 // once in the builder, and we rely on the type system to avoid needing
134 // to do it again.
135 unsafe {
136 match self.exec {
137 Exec::TeddySlim1Mask128(ref e) => {
138 e.find_at(pats, self, haystack, at)
139 }
140 Exec::TeddySlim1Mask256(ref e) => {
141 e.find_at(pats, self, haystack, at)
142 }
143 Exec::TeddyFat1Mask256(ref e) => {
144 e.find_at(pats, self, haystack, at)
145 }
146 Exec::TeddySlim2Mask128(ref e) => {
147 e.find_at(pats, self, haystack, at)
148 }
149 Exec::TeddySlim2Mask256(ref e) => {
150 e.find_at(pats, self, haystack, at)
151 }
152 Exec::TeddyFat2Mask256(ref e) => {
153 e.find_at(pats, self, haystack, at)
154 }
155 Exec::TeddySlim3Mask128(ref e) => {
156 e.find_at(pats, self, haystack, at)
157 }
158 Exec::TeddySlim3Mask256(ref e) => {
159 e.find_at(pats, self, haystack, at)
160 }
161 Exec::TeddyFat3Mask256(ref e) => {
162 e.find_at(pats, self, haystack, at)
163 }
164 }
165 }
166 }
167
168 /// Returns the minimum length of a haystack that must be provided by
169 /// callers to this Teddy searcher. Providing a haystack shorter than this
170 /// will result in a panic, but will never violate memory safety.
171 pub fn minimum_len(&self) -> usize {
172 // SAFETY: These values must be correct in order to ensure safety.
173 // The Teddy runtime assumes their haystacks have at least these
174 // lengths. Violating this will sacrifice memory safety.
175 match self.exec {
176 Exec::TeddySlim1Mask128(_) => 16,
177 Exec::TeddySlim1Mask256(_) => 32,
178 Exec::TeddyFat1Mask256(_) => 16,
179 Exec::TeddySlim2Mask128(_) => 17,
180 Exec::TeddySlim2Mask256(_) => 33,
181 Exec::TeddyFat2Mask256(_) => 17,
182 Exec::TeddySlim3Mask128(_) => 18,
183 Exec::TeddySlim3Mask256(_) => 34,
184 Exec::TeddyFat3Mask256(_) => 34,
185 }
186 }
187
188 /// Returns the approximate total amount of heap used by this searcher, in
189 /// units of bytes.
190 pub fn heap_bytes(&self) -> usize {
191 let num_patterns = self.max_pattern_id as usize + 1;
192 self.buckets.len() * mem::size_of::<Vec<PatternID>>()
193 + num_patterns * mem::size_of::<PatternID>()
194 }
195
196 /// Runs the verification routine for Slim 128-bit Teddy.
197 ///
198 /// The candidate given should be a collection of 8-bit bitsets (one bitset
199 /// per lane), where the ith bit is set in the jth lane if and only if the
200 /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
201 ///
202 /// This is not safe to call unless the SSSE3 target feature is enabled.
203 /// The `target_feature` attribute is not applied since this function is
204 /// always forcefully inlined.
205 #[inline(always)]
206 unsafe fn verify128(
207 &self,
208 pats: &Patterns,
209 haystack: &[u8],
210 at: usize,
211 cand: __m128i,
212 ) -> Option<Match> {
213 debug_assert!(!is_all_zeroes128(cand));
214 debug_assert_eq!(8, self.buckets.len());
215
216 // Convert the candidate into 64-bit chunks, and then verify each of
217 // those chunks.
218 let parts = unpack64x128(cand);
219 for (i, &part) in parts.iter().enumerate() {
220 let pos = at + i * 8;
221 if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
222 return Some(m);
223 }
224 }
225 None
226 }
227
228 /// Runs the verification routine for Slim 256-bit Teddy.
229 ///
230 /// The candidate given should be a collection of 8-bit bitsets (one bitset
231 /// per lane), where the ith bit is set in the jth lane if and only if the
232 /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
233 ///
234 /// This is not safe to call unless the AVX2 target feature is enabled.
235 /// The `target_feature` attribute is not applied since this function is
236 /// always forcefully inlined.
237 #[inline(always)]
238 unsafe fn verify256(
239 &self,
240 pats: &Patterns,
241 haystack: &[u8],
242 at: usize,
243 cand: __m256i,
244 ) -> Option<Match> {
245 debug_assert!(!is_all_zeroes256(cand));
246 debug_assert_eq!(8, self.buckets.len());
247
248 // Convert the candidate into 64-bit chunks, and then verify each of
249 // those chunks.
250 let parts = unpack64x256(cand);
251 for (i, &part) in parts.iter().enumerate() {
252 let pos = at + i * 8;
253 if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
254 return Some(m);
255 }
256 }
257 None
258 }
259
260 /// Runs the verification routine for Fat 256-bit Teddy.
261 ///
262 /// The candidate given should be a collection of 8-bit bitsets (one bitset
263 /// per lane), where the ith bit is set in the jth lane if and only if the
264 /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the
265 /// bucket `j < 16 ? i : i + 8`.
266 ///
267 /// This is not safe to call unless the AVX2 target feature is enabled.
268 /// The `target_feature` attribute is not applied since this function is
269 /// always forcefully inlined.
270 #[inline(always)]
271 unsafe fn verify_fat256(
272 &self,
273 pats: &Patterns,
274 haystack: &[u8],
275 at: usize,
276 cand: __m256i,
277 ) -> Option<Match> {
278 debug_assert!(!is_all_zeroes256(cand));
279 debug_assert_eq!(16, self.buckets.len());
280
281 // This is a bit tricky, but we basically want to convert our
282 // candidate, which looks like this
283 //
284 // a31 a30 ... a17 a16 a15 a14 ... a01 a00
285 //
286 // where each a(i) is an 8-bit bitset corresponding to the activated
287 // buckets, to this
288 //
289 // a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00
290 //
291 // Namely, for Fat Teddy, the high 128-bits of the candidate correspond
292 // to the same bytes in the haystack in the low 128-bits (so we only
293 // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7.
294 //
295 // The verification routine wants to look at all potentially matching
296 // buckets before moving on to the next lane. So for example, both
297 // a16 and a00 both correspond to the first byte in our window; a00
298 // contains buckets 0-7 and a16 contains buckets 8-15. Specifically,
299 // a16 should be checked before a01. So the transformation shown above
300 // allows us to use our normal verification procedure with one small
301 // change: we treat each bitset as 16 bits instead of 8 bits.
302
303 // Swap the 128-bit lanes in the candidate vector.
304 let swap = _mm256_permute4x64_epi64(cand, 0x4E);
305 // Interleave the bytes from the low 128-bit lanes, starting with
306 // cand first.
307 let r1 = _mm256_unpacklo_epi8(cand, swap);
308 // Interleave the bytes from the high 128-bit lanes, starting with
309 // cand first.
310 let r2 = _mm256_unpackhi_epi8(cand, swap);
311 // Now just take the 2 low 64-bit integers from both r1 and r2. We
312 // can drop the high 64-bit integers because they are a mirror image
313 // of the low 64-bit integers. All we care about are the low 128-bit
314 // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets
315 // laid out in the desired order, as described above.
316 let parts = unpacklo64x256(r1, r2);
317 for (i, &part) in parts.iter().enumerate() {
318 let pos = at + i * 4;
319 if let Some(m) = self.verify64(pats, 16, haystack, pos, part) {
320 return Some(m);
321 }
322 }
323 None
324 }
325
326 /// Verify whether there are any matches starting at or after `at` in the
327 /// given `haystack`. The candidate given should correspond to either 8-bit
328 /// (for 8 buckets) or 16-bit (16 buckets) bitsets.
329 #[inline(always)]
330 fn verify64(
331 &self,
332 pats: &Patterns,
333 bucket_count: usize,
334 haystack: &[u8],
335 at: usize,
336 mut cand: u64,
337 ) -> Option<Match> {
338 // N.B. While the bucket count is known from self.buckets.len(),
339 // requiring it as a parameter makes it easier for the optimizer to
340 // know its value, and thus produce more efficient codegen.
341 debug_assert!(bucket_count == 8 || bucket_count == 16);
342 while cand != 0 {
343 let bit = cand.trailing_zeros() as usize;
344 cand &= !(1 << bit);
345
346 let at = at + (bit / bucket_count);
347 let bucket = bit % bucket_count;
348 if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) {
349 return Some(m);
350 }
351 }
352 None
353 }
354
355 /// Verify whether there are any matches starting at `at` in the given
356 /// `haystack` corresponding only to patterns in the given bucket.
357 #[inline(always)]
358 fn verify_bucket(
359 &self,
360 pats: &Patterns,
361 haystack: &[u8],
362 bucket: usize,
363 at: usize,
364 ) -> Option<Match> {
365 // Forcing this function to not inline and be "cold" seems to help
366 // the codegen for Teddy overall. Interestingly, this is good for a
367 // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among
368 // others). Overall, this seems like a problem with codegen, since
369 // creating the Match itself is a very small amount of code.
370 #[cold]
371 #[inline(never)]
372 fn match_from_span(
373 pati: PatternID,
374 start: usize,
375 end: usize,
376 ) -> Match {
377 Match::from_span(pati as usize, start, end)
378 }
379
380 // N.B. The bounds check for this bucket lookup *should* be elided
381 // since we assert the number of buckets in each `find_at` routine,
382 // and the compiler can prove that the `% 8` (or `% 16`) in callers
383 // of this routine will always be in bounds.
384 for &pati in &self.buckets[bucket] {
385 // SAFETY: This is safe because we are guaranteed that every
386 // index in a Teddy bucket is a valid index into `pats`. This
387 // guarantee is upheld by the assert checking `max_pattern_id` in
388 // the beginning of `find_at` above.
389 //
390 // This explicit bounds check elision is (amazingly) good for a
391 // 25-50% boost in some benchmarks, particularly ones with a lot
392 // of short literals.
393 let pat = unsafe { pats.get_unchecked(pati) };
394 if pat.is_prefix(&haystack[at..]) {
395 return Some(match_from_span(pati, at, at + pat.len()));
396 }
397 }
398 None
399 }
400}
401
402/// Exec represents the different search strategies supported by the Teddy
403/// runtime.
404///
405/// This enum is an important safety abstraction. Namely, callers should only
406/// construct a variant in this enum if it is safe to execute its corresponding
407/// target features on the current CPU. The 128-bit searchers require SSSE3,
408/// while the 256-bit searchers require AVX2.
409#[derive(Clone, Debug)]
410pub enum Exec {
411 TeddySlim1Mask128(TeddySlim1Mask128),
412 TeddySlim1Mask256(TeddySlim1Mask256),
413 TeddyFat1Mask256(TeddyFat1Mask256),
414 TeddySlim2Mask128(TeddySlim2Mask128),
415 TeddySlim2Mask256(TeddySlim2Mask256),
416 TeddyFat2Mask256(TeddyFat2Mask256),
417 TeddySlim3Mask128(TeddySlim3Mask128),
418 TeddySlim3Mask256(TeddySlim3Mask256),
419 TeddyFat3Mask256(TeddyFat3Mask256),
420}
421
422// Most of the code below remains undocumented because they are effectively
423// repeated versions of themselves. The general structure is described in the
424// README and in the comments above.
425
426#[derive(Clone, Debug)]
427pub struct TeddySlim1Mask128 {
428 pub mask1: Mask128,
429}
430
431impl TeddySlim1Mask128 {
432 #[target_feature(enable = "ssse3")]
433 unsafe fn find_at(
434 &self,
435 pats: &Patterns,
436 teddy: &Teddy,
437 haystack: &[u8],
438 mut at: usize,
439 ) -> Option<Match> {
440 debug_assert!(haystack[at..].len() >= teddy.minimum_len());
441 // This assert helps eliminate bounds checks for bucket lookups in
442 // Teddy::verify_bucket, which has a small (3-4%) performance boost.
443 assert_eq!(8, teddy.buckets.len());
444
445 let len = haystack.len();
446 while at <= len - 16 {
447 let c = self.candidate(haystack, at);
448 if !is_all_zeroes128(c) {
449 if let Some(m) = teddy.verify128(pats, haystack, at, c) {
450 return Some(m);
451 }
452 }
453 at += 16;
454 }
455 if at < len {
456 at = len - 16;
457 let c = self.candidate(haystack, at);
458 if !is_all_zeroes128(c) {
459 if let Some(m) = teddy.verify128(pats, haystack, at, c) {
460 return Some(m);
461 }
462 }
463 }
464 None
465 }
466
467 #[inline(always)]
468 unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i {
469 debug_assert!(haystack[at..].len() >= 16);
470
471 let chunk = loadu128(haystack, at);
472 members1m128(chunk, self.mask1)
473 }
474}
475
476#[derive(Clone, Debug)]
477pub struct TeddySlim1Mask256 {
478 pub mask1: Mask256,
479}
480
481impl TeddySlim1Mask256 {
482 #[target_feature(enable = "avx2")]
483 unsafe fn find_at(
484 &self,
485 pats: &Patterns,
486 teddy: &Teddy,
487 haystack: &[u8],
488 mut at: usize,
489 ) -> Option<Match> {
490 debug_assert!(haystack[at..].len() >= teddy.minimum_len());
491 // This assert helps eliminate bounds checks for bucket lookups in
492 // Teddy::verify_bucket, which has a small (3-4%) performance boost.
493 assert_eq!(8, teddy.buckets.len());
494
495 let len = haystack.len();
496 while at <= len - 32 {
497 let c = self.candidate(haystack, at);
498 if !is_all_zeroes256(c) {
499 if let Some(m) = teddy.verify256(pats, haystack, at, c) {
500 return Some(m);
501 }
502 }
503 at += 32;
504 }
505 if at < len {
506 at = len - 32;
507 let c = self.candidate(haystack, at);
508 if !is_all_zeroes256(c) {
509 if let Some(m) = teddy.verify256(pats, haystack, at, c) {
510 return Some(m);
511 }
512 }
513 }
514 None
515 }
516
517 #[inline(always)]
518 unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
519 debug_assert!(haystack[at..].len() >= 32);
520
521 let chunk = loadu256(haystack, at);
522 members1m256(chunk, self.mask1)
523 }
524}
525
526#[derive(Clone, Debug)]
527pub struct TeddyFat1Mask256 {
528 pub mask1: Mask256,
529}
530
531impl TeddyFat1Mask256 {
532 #[target_feature(enable = "avx2")]
533 unsafe fn find_at(
534 &self,
535 pats: &Patterns,
536 teddy: &Teddy,
537 haystack: &[u8],
538 mut at: usize,
539 ) -> Option<Match> {
540 debug_assert!(haystack[at..].len() >= teddy.minimum_len());
541 // This assert helps eliminate bounds checks for bucket lookups in
542 // Teddy::verify_bucket, which has a small (3-4%) performance boost.
543 assert_eq!(16, teddy.buckets.len());
544
545 let len = haystack.len();
546 while at <= len - 16 {
547 let c = self.candidate(haystack, at);
548 if !is_all_zeroes256(c) {
549 if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
550 return Some(m);
551 }
552 }
553 at += 16;
554 }
555 if at < len {
556 at = len - 16;
557 let c = self.candidate(haystack, at);
558 if !is_all_zeroes256(c) {
559 if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
560 return Some(m);
561 }
562 }
563 }
564 None
565 }
566
567 #[inline(always)]
568 unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
569 debug_assert!(haystack[at..].len() >= 16);
570
571 let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
572 members1m256(chunk, self.mask1)
573 }
574}
575
576#[derive(Clone, Debug)]
577pub struct TeddySlim2Mask128 {
578 pub mask1: Mask128,
579 pub mask2: Mask128,
580}
581
582impl TeddySlim2Mask128 {
583 #[target_feature(enable = "ssse3")]
584 unsafe fn find_at(
585 &self,
586 pats: &Patterns,
587 teddy: &Teddy,
588 haystack: &[u8],
589 mut at: usize,
590 ) -> Option<Match> {
591 debug_assert!(haystack[at..].len() >= teddy.minimum_len());
592 // This assert helps eliminate bounds checks for bucket lookups in
593 // Teddy::verify_bucket, which has a small (3-4%) performance boost.
594 assert_eq!(8, teddy.buckets.len());
595
596 at += 1;
597 let len = haystack.len();
598 let mut prev0 = ones128();
599 while at <= len - 16 {
600 let c = self.candidate(haystack, at, &mut prev0);
601 if !is_all_zeroes128(c) {
602 if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
603 return Some(m);
604 }
605 }
606 at += 16;
607 }
608 if at < len {
609 at = len - 16;
610 prev0 = ones128();
611
612 let c = self.candidate(haystack, at, &mut prev0);
613 if !is_all_zeroes128(c) {
614 if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
615 return Some(m);
616 }
617 }
618 }
619 None
620 }
621
622 #[inline(always)]
623 unsafe fn candidate(
624 &self,
625 haystack: &[u8],
626 at: usize,
627 prev0: &mut __m128i,
628 ) -> __m128i {
629 debug_assert!(haystack[at..].len() >= 16);
630
631 let chunk = loadu128(haystack, at);
632 let (res0, res1) = members2m128(chunk, self.mask1, self.mask2);
633 let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15);
634 _mm_and_si128(res0prev0, res1)
635 }
636}
637
638#[derive(Clone, Debug)]
639pub struct TeddySlim2Mask256 {
640 pub mask1: Mask256,
641 pub mask2: Mask256,
642}
643
644impl TeddySlim2Mask256 {
645 #[target_feature(enable = "avx2")]
646 unsafe fn find_at(
647 &self,
648 pats: &Patterns,
649 teddy: &Teddy,
650 haystack: &[u8],
651 mut at: usize,
652 ) -> Option<Match> {
653 debug_assert!(haystack[at..].len() >= teddy.minimum_len());
654 // This assert helps eliminate bounds checks for bucket lookups in
655 // Teddy::verify_bucket, which has a small (3-4%) performance boost.
656 assert_eq!(8, teddy.buckets.len());
657
658 at += 1;
659 let len = haystack.len();
660 let mut prev0 = ones256();
661 while at <= len - 32 {
662 let c = self.candidate(haystack, at, &mut prev0);
663 if !is_all_zeroes256(c) {
664 if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
665 return Some(m);
666 }
667 }
668 at += 32;
669 }
670 if at < len {
671 at = len - 32;
672 prev0 = ones256();
673
674 let c = self.candidate(haystack, at, &mut prev0);
675 if !is_all_zeroes256(c) {
676 if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
677 return Some(m);
678 }
679 }
680 }
681 None
682 }
683
684 #[inline(always)]
685 unsafe fn candidate(
686 &self,
687 haystack: &[u8],
688 at: usize,
689 prev0: &mut __m256i,
690 ) -> __m256i {
691 debug_assert!(haystack[at..].len() >= 32);
692
693 let chunk = loadu256(haystack, at);
694 let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
695 let res0prev0 = alignr256_15(res0, *prev0);
696 let res = _mm256_and_si256(res0prev0, res1);
697 *prev0 = res0;
698 res
699 }
700}
701
702#[derive(Clone, Debug)]
703pub struct TeddyFat2Mask256 {
704 pub mask1: Mask256,
705 pub mask2: Mask256,
706}
707
708impl TeddyFat2Mask256 {
709 #[target_feature(enable = "avx2")]
710 unsafe fn find_at(
711 &self,
712 pats: &Patterns,
713 teddy: &Teddy,
714 haystack: &[u8],
715 mut at: usize,
716 ) -> Option<Match> {
717 debug_assert!(haystack[at..].len() >= teddy.minimum_len());
718 // This assert helps eliminate bounds checks for bucket lookups in
719 // Teddy::verify_bucket, which has a small (3-4%) performance boost.
720 assert_eq!(16, teddy.buckets.len());
721
722 at += 1;
723 let len = haystack.len();
724 let mut prev0 = ones256();
725 while at <= len - 16 {
726 let c = self.candidate(haystack, at, &mut prev0);
727 if !is_all_zeroes256(c) {
728 if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
729 {
730 return Some(m);
731 }
732 }
733 at += 16;
734 }
735 if at < len {
736 at = len - 16;
737 prev0 = ones256();
738
739 let c = self.candidate(haystack, at, &mut prev0);
740 if !is_all_zeroes256(c) {
741 if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
742 {
743 return Some(m);
744 }
745 }
746 }
747 None
748 }
749
750 #[inline(always)]
751 unsafe fn candidate(
752 &self,
753 haystack: &[u8],
754 at: usize,
755 prev0: &mut __m256i,
756 ) -> __m256i {
757 debug_assert!(haystack[at..].len() >= 16);
758
759 let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
760 let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
761 let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15);
762 let res = _mm256_and_si256(res0prev0, res1);
763 *prev0 = res0;
764 res
765 }
766}
767
768#[derive(Clone, Debug)]
769pub struct TeddySlim3Mask128 {
770 pub mask1: Mask128,
771 pub mask2: Mask128,
772 pub mask3: Mask128,
773}
774
775impl TeddySlim3Mask128 {
776 #[target_feature(enable = "ssse3")]
777 unsafe fn find_at(
778 &self,
779 pats: &Patterns,
780 teddy: &Teddy,
781 haystack: &[u8],
782 mut at: usize,
783 ) -> Option<Match> {
784 debug_assert!(haystack[at..].len() >= teddy.minimum_len());
785 // This assert helps eliminate bounds checks for bucket lookups in
786 // Teddy::verify_bucket, which has a small (3-4%) performance boost.
787 assert_eq!(8, teddy.buckets.len());
788
789 at += 2;
790 let len = haystack.len();
791 let (mut prev0, mut prev1) = (ones128(), ones128());
792 while at <= len - 16 {
793 let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
794 if !is_all_zeroes128(c) {
795 if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
796 return Some(m);
797 }
798 }
799 at += 16;
800 }
801 if at < len {
802 at = len - 16;
803 prev0 = ones128();
804 prev1 = ones128();
805
806 let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
807 if !is_all_zeroes128(c) {
808 if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
809 return Some(m);
810 }
811 }
812 }
813 None
814 }
815
816 #[inline(always)]
817 unsafe fn candidate(
818 &self,
819 haystack: &[u8],
820 at: usize,
821 prev0: &mut __m128i,
822 prev1: &mut __m128i,
823 ) -> __m128i {
824 debug_assert!(haystack[at..].len() >= 16);
825
826 let chunk = loadu128(haystack, at);
827 let (res0, res1, res2) =
828 members3m128(chunk, self.mask1, self.mask2, self.mask3);
829 let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14);
830 let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15);
831 let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2);
832 *prev0 = res0;
833 *prev1 = res1;
834 res
835 }
836}
837
838#[derive(Clone, Debug)]
839pub struct TeddySlim3Mask256 {
840 pub mask1: Mask256,
841 pub mask2: Mask256,
842 pub mask3: Mask256,
843}
844
845impl TeddySlim3Mask256 {
846 #[target_feature(enable = "avx2")]
847 unsafe fn find_at(
848 &self,
849 pats: &Patterns,
850 teddy: &Teddy,
851 haystack: &[u8],
852 mut at: usize,
853 ) -> Option<Match> {
854 debug_assert!(haystack[at..].len() >= teddy.minimum_len());
855 // This assert helps eliminate bounds checks for bucket lookups in
856 // Teddy::verify_bucket, which has a small (3-4%) performance boost.
857 assert_eq!(8, teddy.buckets.len());
858
859 at += 2;
860 let len = haystack.len();
861 let (mut prev0, mut prev1) = (ones256(), ones256());
862 while at <= len - 32 {
863 let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
864 if !is_all_zeroes256(c) {
865 if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
866 return Some(m);
867 }
868 }
869 at += 32;
870 }
871 if at < len {
872 at = len - 32;
873 prev0 = ones256();
874 prev1 = ones256();
875
876 let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
877 if !is_all_zeroes256(c) {
878 if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
879 return Some(m);
880 }
881 }
882 }
883 None
884 }
885
886 #[inline(always)]
887 unsafe fn candidate(
888 &self,
889 haystack: &[u8],
890 at: usize,
891 prev0: &mut __m256i,
892 prev1: &mut __m256i,
893 ) -> __m256i {
894 debug_assert!(haystack[at..].len() >= 32);
895
896 let chunk = loadu256(haystack, at);
897 let (res0, res1, res2) =
898 members3m256(chunk, self.mask1, self.mask2, self.mask3);
899 let res0prev0 = alignr256_14(res0, *prev0);
900 let res1prev1 = alignr256_15(res1, *prev1);
901 let res =
902 _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
903 *prev0 = res0;
904 *prev1 = res1;
905 res
906 }
907}
908
909#[derive(Clone, Debug)]
910pub struct TeddyFat3Mask256 {
911 pub mask1: Mask256,
912 pub mask2: Mask256,
913 pub mask3: Mask256,
914}
915
916impl TeddyFat3Mask256 {
917 #[target_feature(enable = "avx2")]
918 unsafe fn find_at(
919 &self,
920 pats: &Patterns,
921 teddy: &Teddy,
922 haystack: &[u8],
923 mut at: usize,
924 ) -> Option<Match> {
925 debug_assert!(haystack[at..].len() >= teddy.minimum_len());
926 // This assert helps eliminate bounds checks for bucket lookups in
927 // Teddy::verify_bucket, which has a small (3-4%) performance boost.
928 assert_eq!(16, teddy.buckets.len());
929
930 at += 2;
931 let len = haystack.len();
932 let (mut prev0, mut prev1) = (ones256(), ones256());
933 while at <= len - 16 {
934 let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
935 if !is_all_zeroes256(c) {
936 if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
937 {
938 return Some(m);
939 }
940 }
941 at += 16;
942 }
943 if at < len {
944 at = len - 16;
945 prev0 = ones256();
946 prev1 = ones256();
947
948 let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
949 if !is_all_zeroes256(c) {
950 if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
951 {
952 return Some(m);
953 }
954 }
955 }
956 None
957 }
958
959 #[inline(always)]
960 unsafe fn candidate(
961 &self,
962 haystack: &[u8],
963 at: usize,
964 prev0: &mut __m256i,
965 prev1: &mut __m256i,
966 ) -> __m256i {
967 debug_assert!(haystack[at..].len() >= 16);
968
969 let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
970 let (res0, res1, res2) =
971 members3m256(chunk, self.mask1, self.mask2, self.mask3);
972 let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14);
973 let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15);
974 let res =
975 _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
976 *prev0 = res0;
977 *prev1 = res1;
978 res
979 }
980}
981
982/// A 128-bit mask for the low and high nybbles in a set of patterns. Each
983/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
984/// the nybble `j` is in the bucket `i` at a particular position.
985#[derive(Clone, Copy, Debug)]
986pub struct Mask128 {
987 lo: __m128i,
988 hi: __m128i,
989}
990
991impl Mask128 {
992 /// Create a new SIMD mask from the mask produced by the Teddy builder.
993 pub fn new(mask: compile::Mask) -> Mask128 {
994 // SAFETY: This is safe since [u8; 16] has the same representation
995 // as __m128i.
996 unsafe {
997 Mask128 {
998 lo: mem::transmute(src:mask.lo128()),
999 hi: mem::transmute(src:mask.hi128()),
1000 }
1001 }
1002 }
1003}
1004
1005/// A 256-bit mask for the low and high nybbles in a set of patterns. Each
1006/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
1007/// the nybble `j` is in the bucket `i` at a particular position.
1008///
1009/// This is slightly tweaked dependending on whether Slim or Fat Teddy is being
1010/// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as
1011/// the bitsets in the higher 128-bits, so that we can search 32 bytes at a
1012/// time. (Remember, the nybbles in the haystack are used as indices into these
1013/// masks, and 256-bit shuffles only operate on 128-bit lanes.)
1014///
1015/// For Fat Teddy, the bitsets are not repeated, but instead, the high 128
1016/// bits correspond to buckets 8-15. So that a bitset `00100010` has buckets
1017/// 1 and 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set
1018/// if it's in the higher 128 bits.
1019#[derive(Clone, Copy, Debug)]
1020pub struct Mask256 {
1021 lo: __m256i,
1022 hi: __m256i,
1023}
1024
1025impl Mask256 {
1026 /// Create a new SIMD mask from the mask produced by the Teddy builder.
1027 pub fn new(mask: compile::Mask) -> Mask256 {
1028 // SAFETY: This is safe since [u8; 32] has the same representation
1029 // as __m256i.
1030 unsafe {
1031 Mask256 {
1032 lo: mem::transmute(src:mask.lo256()),
1033 hi: mem::transmute(src:mask.hi256()),
1034 }
1035 }
1036 }
1037}
1038
1039// The "members" routines below are responsible for taking a chunk of bytes,
1040// a number of nybble masks and returning the result of using the masks to
1041// lookup bytes in the chunk. The results of the high and low nybble masks are
1042// AND'ed together, such that each candidate returned is a vector, with byte
1043// sized lanes, and where each lane is an 8-bit bitset corresponding to the
1044// buckets that contain the corresponding byte.
1045//
1046// In the case of masks of length greater than 1, callers will need to keep
1047// the results from the previous haystack's window, and then shift the vectors
1048// so that they all line up. Then they can be AND'ed together.
1049
1050/// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a
1051/// 16-byte window of the haystack (where the least significant byte
1052/// corresponds to the start of the window), and `mask1` corresponds to a
1053/// low/high mask for the first byte of all patterns that are being searched.
1054#[target_feature(enable = "ssse3")]
1055unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i {
1056 let lomask: __m128i = _mm_set1_epi8(0xF);
1057 let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask);
1058 let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask);
1059 _mm_and_si128(
1060 a:_mm_shuffle_epi8(mask1.lo, hlo),
1061 b:_mm_shuffle_epi8(a:mask1.hi, b:hhi),
1062 )
1063}
1064
1065/// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a
1066/// 32-byte window of the haystack (where the least significant byte
1067/// corresponds to the start of the window), and `mask1` corresponds to a
1068/// low/high mask for the first byte of all patterns that are being searched.
1069///
1070/// Note that this can also be used for Fat Teddy, where the high 128 bits in
1071/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
1072/// window in the haystack.
1073#[target_feature(enable = "avx2")]
1074unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i {
1075 let lomask: __m256i = _mm256_set1_epi8(0xF);
1076 let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask);
1077 let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask);
1078 _mm256_and_si256(
1079 a:_mm256_shuffle_epi8(mask1.lo, hlo),
1080 b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi),
1081 )
1082}
1083
1084/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
1085/// to a 16-byte window of the haystack (where the least significant byte
1086/// corresponds to the start of the window), and the masks correspond to a
1087/// low/high mask for the first and second bytes of all patterns that are being
1088/// searched. The vectors returned correspond to candidates for the first and
1089/// second bytes in the patterns represented by the masks.
1090#[target_feature(enable = "ssse3")]
1091unsafe fn members2m128(
1092 chunk: __m128i,
1093 mask1: Mask128,
1094 mask2: Mask128,
1095) -> (__m128i, __m128i) {
1096 let lomask: __m128i = _mm_set1_epi8(0xF);
1097 let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask);
1098 let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask);
1099 let res0: __m128i = _mm_and_si128(
1100 a:_mm_shuffle_epi8(mask1.lo, hlo),
1101 b:_mm_shuffle_epi8(a:mask1.hi, b:hhi),
1102 );
1103 let res1: __m128i = _mm_and_si128(
1104 a:_mm_shuffle_epi8(mask2.lo, hlo),
1105 b:_mm_shuffle_epi8(a:mask2.hi, b:hhi),
1106 );
1107 (res0, res1)
1108}
1109
1110/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
1111/// to a 32-byte window of the haystack (where the least significant byte
1112/// corresponds to the start of the window), and the masks correspond to a
1113/// low/high mask for the first and second bytes of all patterns that are being
1114/// searched. The vectors returned correspond to candidates for the first and
1115/// second bytes in the patterns represented by the masks.
1116///
1117/// Note that this can also be used for Fat Teddy, where the high 128 bits in
1118/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
1119/// window in the haystack.
1120#[target_feature(enable = "avx2")]
1121unsafe fn members2m256(
1122 chunk: __m256i,
1123 mask1: Mask256,
1124 mask2: Mask256,
1125) -> (__m256i, __m256i) {
1126 let lomask: __m256i = _mm256_set1_epi8(0xF);
1127 let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask);
1128 let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask);
1129 let res0: __m256i = _mm256_and_si256(
1130 a:_mm256_shuffle_epi8(mask1.lo, hlo),
1131 b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi),
1132 );
1133 let res1: __m256i = _mm256_and_si256(
1134 a:_mm256_shuffle_epi8(mask2.lo, hlo),
1135 b:_mm256_shuffle_epi8(a:mask2.hi, b:hhi),
1136 );
1137 (res0, res1)
1138}
1139
1140/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
1141/// to a 16-byte window of the haystack (where the least significant byte
1142/// corresponds to the start of the window), and the masks correspond to a
1143/// low/high mask for the first, second and third bytes of all patterns that
1144/// are being searched. The vectors returned correspond to candidates for the
1145/// first, second and third bytes in the patterns represented by the masks.
1146#[target_feature(enable = "ssse3")]
1147unsafe fn members3m128(
1148 chunk: __m128i,
1149 mask1: Mask128,
1150 mask2: Mask128,
1151 mask3: Mask128,
1152) -> (__m128i, __m128i, __m128i) {
1153 let lomask: __m128i = _mm_set1_epi8(0xF);
1154 let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask);
1155 let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask);
1156 let res0: __m128i = _mm_and_si128(
1157 a:_mm_shuffle_epi8(mask1.lo, hlo),
1158 b:_mm_shuffle_epi8(a:mask1.hi, b:hhi),
1159 );
1160 let res1: __m128i = _mm_and_si128(
1161 a:_mm_shuffle_epi8(mask2.lo, hlo),
1162 b:_mm_shuffle_epi8(a:mask2.hi, b:hhi),
1163 );
1164 let res2: __m128i = _mm_and_si128(
1165 a:_mm_shuffle_epi8(mask3.lo, hlo),
1166 b:_mm_shuffle_epi8(a:mask3.hi, b:hhi),
1167 );
1168 (res0, res1, res2)
1169}
1170
1171/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
1172/// to a 32-byte window of the haystack (where the least significant byte
1173/// corresponds to the start of the window), and the masks correspond to a
1174/// low/high mask for the first, second and third bytes of all patterns that
1175/// are being searched. The vectors returned correspond to candidates for the
1176/// first, second and third bytes in the patterns represented by the masks.
1177///
1178/// Note that this can also be used for Fat Teddy, where the high 128 bits in
1179/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
1180/// window in the haystack.
1181#[target_feature(enable = "avx2")]
1182unsafe fn members3m256(
1183 chunk: __m256i,
1184 mask1: Mask256,
1185 mask2: Mask256,
1186 mask3: Mask256,
1187) -> (__m256i, __m256i, __m256i) {
1188 let lomask: __m256i = _mm256_set1_epi8(0xF);
1189 let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask);
1190 let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask);
1191 let res0: __m256i = _mm256_and_si256(
1192 a:_mm256_shuffle_epi8(mask1.lo, hlo),
1193 b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi),
1194 );
1195 let res1: __m256i = _mm256_and_si256(
1196 a:_mm256_shuffle_epi8(mask2.lo, hlo),
1197 b:_mm256_shuffle_epi8(a:mask2.hi, b:hhi),
1198 );
1199 let res2: __m256i = _mm256_and_si256(
1200 a:_mm256_shuffle_epi8(mask3.lo, hlo),
1201 b:_mm256_shuffle_epi8(a:mask3.hi, b:hhi),
1202 );
1203 (res0, res1, res2)
1204}
1205