1 | // See the README in this directory for an explanation of the Teddy algorithm. |
2 | // It is strongly recommended to peruse the README before trying to grok this |
3 | // code, as its use of SIMD is pretty opaque, although I tried to add comments |
4 | // where appropriate. |
5 | // |
6 | // Moreover, while there is a lot of code in this file, most of it is |
7 | // repeated variants of the same thing. Specifically, there are three Teddy |
8 | // variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets) |
9 | // and Fat 256-bit Teddy (16 buckets). For each variant, there are three |
10 | // implementations, corresponding to mask lengths of 1, 2 and 3. Bringing it to |
11 | // a total of nine variants. Each one is structured roughly the same: |
12 | // |
13 | // while at <= len(haystack) - CHUNK_SIZE: |
14 | // let candidate = find_candidate_in_chunk(haystack, at) |
15 | // if not all zeroes(candidate): |
16 | // if match = verify(haystack, at, candidate): |
17 | // return match |
18 | // |
19 | // For the most part, this remains unchanged. The parts that vary are the |
20 | // verification routine (for slim vs fat Teddy) and the candidate extraction |
21 | // (based on the number of masks). |
22 | // |
23 | // In the code below, a "candidate" corresponds to a single vector with 8-bit |
24 | // lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the |
25 | // jth lane if and only if the byte occurring at position `j` is in the |
26 | // bucket `i` (where the `j`th position is the position in the current window |
27 | // of the haystack, which is always 16 or 32 bytes). Note to be careful here: |
28 | // the ith bit and the jth lane correspond to the least significant bits of the |
29 | // vector. So when visualizing how the current window of bytes is stored in a |
30 | // vector, you often need to flip it around. For example, the text `abcd` in a |
31 | // 4-byte vector would look like this: |
32 | // |
33 | // 01100100 01100011 01100010 01100001 |
34 | // d c b a |
35 | // |
// When the mask length is 1, then finding the candidate is pretty
// straightforward: you just apply the shuffle indices (from the haystack window) to
38 | // the masks, and then AND them together, as described in the README. But for |
39 | // masks of length 2 and 3, you need to keep a little state. Specifically, |
40 | // you need to store the final 1 (for mask length 2) or 2 (for mask length 3) |
41 | // bytes of the candidate for use when searching the next window. This is for |
42 | // handling matches that span two windows. |
43 | // |
44 | // With respect to the repeated code, it would likely be possible to reduce |
45 | // the number of copies of code below using polymorphism, but I find this |
46 | // formulation clearer instead of needing to reason through generics. However, |
47 | // I admit, there may be a simpler generic construction that I'm missing. |
48 | // |
49 | // All variants are fairly heavily tested in src/packed/tests.rs. |
50 | |
51 | use std::arch::x86_64::*; |
52 | use std::mem; |
53 | |
54 | use crate::packed::pattern::{PatternID, Patterns}; |
55 | use crate::packed::teddy::compile; |
56 | use crate::packed::vector::*; |
57 | use crate::Match; |
58 | |
59 | /// The Teddy runtime. |
60 | /// |
61 | /// A Teddy runtime can be used to quickly search for occurrences of one or |
62 | /// more patterns. While it does not scale to an arbitrary number of patterns |
63 | /// like Aho-Corasick, it does find occurrences for a small set of patterns |
64 | /// much more quickly than Aho-Corasick. |
65 | /// |
66 | /// Teddy cannot run on small haystacks below a certain size, which is |
67 | /// dependent on the type of matcher used. This size can be queried via the |
68 | /// `minimum_len` method. Violating this will result in a panic. |
69 | /// |
70 | /// Finally, when callers use a Teddy runtime, they must provide precisely the |
71 | /// patterns used to construct the Teddy matcher. Violating this will result |
72 | /// in either a panic or incorrect results, but will never sacrifice memory |
73 | /// safety. |
#[derive (Clone, Debug)]
pub struct Teddy {
    /// The allocation of patterns in buckets. This only contains the IDs of
    /// patterns. In order to do full verification, callers must provide the
    /// actual patterns when using Teddy.
    ///
    /// The number of buckets is either 8 (slim Teddy) or 16 (fat Teddy); the
    /// verification routines below debug-assert this invariant.
    pub buckets: Vec<Vec<PatternID>>,
    /// The maximum identifier of a pattern. This is used as a sanity check to
    /// ensure that the patterns provided by the caller are the same as the
    /// patterns that were used to compile the matcher. This sanity check
    /// permits safely eliminating bounds checks regardless of what patterns
    /// are provided by the caller.
    ///
    /// Note that users of the aho-corasick crate cannot get this wrong. Only
    /// code internal to this crate can get it wrong, since neither `Patterns`
    /// type nor the Teddy runtime are public API items.
    pub max_pattern_id: PatternID,
    /// The actual runtime to use. Constructing a variant implies the
    /// corresponding CPU features (SSSE3 or AVX2) are available; see `Exec`.
    pub exec: Exec,
}
93 | |
94 | impl Teddy { |
    /// Return the first occurrence of a match in the given haystack after or
    /// starting at `at`.
    ///
    /// The patterns provided must be precisely the same patterns given to the
    /// Teddy builder, otherwise this may panic or produce incorrect results.
    ///
    /// All matches are consistent with the match semantics (leftmost-first or
    /// leftmost-longest) set on `pats`.
    ///
    /// # Panics
    ///
    /// Panics if `pats` does not report the same maximum pattern ID as the
    /// patterns this matcher was built with, or if `haystack[at..]` is
    /// shorter than `minimum_len()`.
    pub fn find_at(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        // This assert is a bit subtle, but it's an important guarantee.
        // Namely, if the maximum pattern ID seen by Teddy is the same as the
        // one in the patterns given, then we are guaranteed that every pattern
        // ID in all Teddy buckets are valid indices into `pats`. While this
        // is nominally true, there is no guarantee that callers provide the
        // same `pats` to both the Teddy builder and the searcher, which would
        // otherwise make `find_at` unsafe to call. But this assert lets us
        // keep this routine safe and eliminate an important bounds check in
        // verification.
        assert_eq!(
            self.max_pattern_id,
            pats.max_pattern_id(),
            "teddy must be called with same patterns it was built with" ,
        );
        // SAFETY: The haystack must have at least a minimum number of bytes
        // for Teddy to be able to work. The minimum number varies depending on
        // which matcher is used below. If this is violated, then it's possible
        // for searching to do out-of-bounds writes.
        assert!(haystack[at..].len() >= self.minimum_len());
        // SAFETY: The various Teddy matchers are always safe to call because
        // the Teddy builder guarantees that a particular Exec variant is
        // built only when it can be run the current CPU. That is, the Teddy
        // builder will not produce a Exec::TeddySlim1Mask256 unless AVX2 is
        // enabled. That is, our dynamic CPU feature detection is performed
        // once in the builder, and we rely on the type system to avoid needing
        // to do it again.
        unsafe {
            match self.exec {
                Exec::TeddySlim1Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim1Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat1Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim2Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim2Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat2Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim3Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim3Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat3Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
            }
        }
    }
167 | |
168 | /// Returns the minimum length of a haystack that must be provided by |
169 | /// callers to this Teddy searcher. Providing a haystack shorter than this |
170 | /// will result in a panic, but will never violate memory safety. |
171 | pub fn minimum_len(&self) -> usize { |
172 | // SAFETY: These values must be correct in order to ensure safety. |
173 | // The Teddy runtime assumes their haystacks have at least these |
174 | // lengths. Violating this will sacrifice memory safety. |
175 | match self.exec { |
176 | Exec::TeddySlim1Mask128(_) => 16, |
177 | Exec::TeddySlim1Mask256(_) => 32, |
178 | Exec::TeddyFat1Mask256(_) => 16, |
179 | Exec::TeddySlim2Mask128(_) => 17, |
180 | Exec::TeddySlim2Mask256(_) => 33, |
181 | Exec::TeddyFat2Mask256(_) => 17, |
182 | Exec::TeddySlim3Mask128(_) => 18, |
183 | Exec::TeddySlim3Mask256(_) => 34, |
184 | Exec::TeddyFat3Mask256(_) => 34, |
185 | } |
186 | } |
187 | |
188 | /// Returns the approximate total amount of heap used by this searcher, in |
189 | /// units of bytes. |
190 | pub fn heap_bytes(&self) -> usize { |
191 | let num_patterns = self.max_pattern_id as usize + 1; |
192 | self.buckets.len() * mem::size_of::<Vec<PatternID>>() |
193 | + num_patterns * mem::size_of::<PatternID>() |
194 | } |
195 | |
196 | /// Runs the verification routine for Slim 128-bit Teddy. |
197 | /// |
198 | /// The candidate given should be a collection of 8-bit bitsets (one bitset |
199 | /// per lane), where the ith bit is set in the jth lane if and only if the |
200 | /// byte occurring at `at + j` in `haystack` is in the bucket `i`. |
201 | /// |
202 | /// This is not safe to call unless the SSSE3 target feature is enabled. |
203 | /// The `target_feature` attribute is not applied since this function is |
204 | /// always forcefully inlined. |
205 | #[inline (always)] |
206 | unsafe fn verify128( |
207 | &self, |
208 | pats: &Patterns, |
209 | haystack: &[u8], |
210 | at: usize, |
211 | cand: __m128i, |
212 | ) -> Option<Match> { |
213 | debug_assert!(!is_all_zeroes128(cand)); |
214 | debug_assert_eq!(8, self.buckets.len()); |
215 | |
216 | // Convert the candidate into 64-bit chunks, and then verify each of |
217 | // those chunks. |
218 | let parts = unpack64x128(cand); |
219 | for (i, &part) in parts.iter().enumerate() { |
220 | let pos = at + i * 8; |
221 | if let Some(m) = self.verify64(pats, 8, haystack, pos, part) { |
222 | return Some(m); |
223 | } |
224 | } |
225 | None |
226 | } |
227 | |
228 | /// Runs the verification routine for Slim 256-bit Teddy. |
229 | /// |
230 | /// The candidate given should be a collection of 8-bit bitsets (one bitset |
231 | /// per lane), where the ith bit is set in the jth lane if and only if the |
232 | /// byte occurring at `at + j` in `haystack` is in the bucket `i`. |
233 | /// |
234 | /// This is not safe to call unless the AVX2 target feature is enabled. |
235 | /// The `target_feature` attribute is not applied since this function is |
236 | /// always forcefully inlined. |
237 | #[inline (always)] |
238 | unsafe fn verify256( |
239 | &self, |
240 | pats: &Patterns, |
241 | haystack: &[u8], |
242 | at: usize, |
243 | cand: __m256i, |
244 | ) -> Option<Match> { |
245 | debug_assert!(!is_all_zeroes256(cand)); |
246 | debug_assert_eq!(8, self.buckets.len()); |
247 | |
248 | // Convert the candidate into 64-bit chunks, and then verify each of |
249 | // those chunks. |
250 | let parts = unpack64x256(cand); |
251 | for (i, &part) in parts.iter().enumerate() { |
252 | let pos = at + i * 8; |
253 | if let Some(m) = self.verify64(pats, 8, haystack, pos, part) { |
254 | return Some(m); |
255 | } |
256 | } |
257 | None |
258 | } |
259 | |
    /// Runs the verification routine for Fat 256-bit Teddy.
    ///
    /// The candidate given should be a collection of 8-bit bitsets (one bitset
    /// per lane), where the ith bit is set in the jth lane if and only if the
    /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the
    /// bucket `j < 16 ? i : i + 8`.
    ///
    /// This is not safe to call unless the AVX2 target feature is enabled.
    /// The `target_feature` attribute is not applied since this function is
    /// always forcefully inlined.
    #[inline (always)]
    unsafe fn verify_fat256(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m256i,
    ) -> Option<Match> {
        debug_assert!(!is_all_zeroes256(cand));
        debug_assert_eq!(16, self.buckets.len());

        // This is a bit tricky, but we basically want to convert our
        // candidate, which looks like this
        //
        //     a31 a30 ... a17 a16 a15 a14 ... a01 a00
        //
        // where each a(i) is an 8-bit bitset corresponding to the activated
        // buckets, to this
        //
        //     a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00
        //
        // Namely, for Fat Teddy, the high 128-bits of the candidate correspond
        // to the same bytes in the haystack in the low 128-bits (so we only
        // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7.
        //
        // The verification routine wants to look at all potentially matching
        // buckets before moving on to the next lane. So for example, both
        // a16 and a00 both correspond to the first byte in our window; a00
        // contains buckets 0-7 and a16 contains buckets 8-15. Specifically,
        // a16 should be checked before a01. So the transformation shown above
        // allows us to use our normal verification procedure with one small
        // change: we treat each bitset as 16 bits instead of 8 bits.

        // Swap the 128-bit lanes in the candidate vector.
        let swap = _mm256_permute4x64_epi64(cand, 0x4E);
        // Interleave the bytes from the low 128-bit lanes, starting with
        // cand first.
        let r1 = _mm256_unpacklo_epi8(cand, swap);
        // Interleave the bytes from the high 128-bit lanes, starting with
        // cand first.
        let r2 = _mm256_unpackhi_epi8(cand, swap);
        // Now just take the 2 low 64-bit integers from both r1 and r2. We
        // can drop the high 64-bit integers because they are a mirror image
        // of the low 64-bit integers. All we care about are the low 128-bit
        // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets
        // laid out in the desired order, as described above.
        let parts = unpacklo64x256(r1, r2);
        for (i, &part) in parts.iter().enumerate() {
            // Each 64-bit part holds four 16-bit bitsets, so each part
            // accounts for 4 haystack positions. (Contrast with the slim
            // variants, which step by 8 positions per 64-bit part.)
            let pos = at + i * 4;
            if let Some(m) = self.verify64(pats, 16, haystack, pos, part) {
                return Some(m);
            }
        }
        None
    }
325 | |
326 | /// Verify whether there are any matches starting at or after `at` in the |
327 | /// given `haystack`. The candidate given should correspond to either 8-bit |
328 | /// (for 8 buckets) or 16-bit (16 buckets) bitsets. |
329 | #[inline (always)] |
330 | fn verify64( |
331 | &self, |
332 | pats: &Patterns, |
333 | bucket_count: usize, |
334 | haystack: &[u8], |
335 | at: usize, |
336 | mut cand: u64, |
337 | ) -> Option<Match> { |
338 | // N.B. While the bucket count is known from self.buckets.len(), |
339 | // requiring it as a parameter makes it easier for the optimizer to |
340 | // know its value, and thus produce more efficient codegen. |
341 | debug_assert!(bucket_count == 8 || bucket_count == 16); |
342 | while cand != 0 { |
343 | let bit = cand.trailing_zeros() as usize; |
344 | cand &= !(1 << bit); |
345 | |
346 | let at = at + (bit / bucket_count); |
347 | let bucket = bit % bucket_count; |
348 | if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) { |
349 | return Some(m); |
350 | } |
351 | } |
352 | None |
353 | } |
354 | |
    /// Verify whether there are any matches starting at `at` in the given
    /// `haystack` corresponding only to patterns in the given bucket.
    ///
    /// `bucket` must be a valid index into `self.buckets`; callers uphold
    /// this via the `% bucket_count` reduction in `verify64`.
    #[inline (always)]
    fn verify_bucket(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        bucket: usize,
        at: usize,
    ) -> Option<Match> {
        // Forcing this function to not inline and be "cold" seems to help
        // the codegen for Teddy overall. Interestingly, this is good for a
        // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among
        // others). Overall, this seems like a problem with codegen, since
        // creating the Match itself is a very small amount of code.
        #[cold ]
        #[inline (never)]
        fn match_from_span(
            pati: PatternID,
            start: usize,
            end: usize,
        ) -> Match {
            Match::from_span(pati as usize, start, end)
        }

        // N.B. The bounds check for this bucket lookup *should* be elided
        // since we assert the number of buckets in each `find_at` routine,
        // and the compiler can prove that the `% 8` (or `% 16`) in callers
        // of this routine will always be in bounds.
        for &pati in &self.buckets[bucket] {
            // SAFETY: This is safe because we are guaranteed that every
            // index in a Teddy bucket is a valid index into `pats`. This
            // guarantee is upheld by the assert checking `max_pattern_id` in
            // the beginning of `find_at` above.
            //
            // This explicit bounds check elision is (amazingly) good for a
            // 25-50% boost in some benchmarks, particularly ones with a lot
            // of short literals.
            let pat = unsafe { pats.get_unchecked(pati) };
            if pat.is_prefix(&haystack[at..]) {
                return Some(match_from_span(pati, at, at + pat.len()));
            }
        }
        None
    }
400 | } |
401 | |
/// Exec represents the different search strategies supported by the Teddy
/// runtime.
///
/// This enum is an important safety abstraction. Namely, callers should only
/// construct a variant in this enum if it is safe to execute its corresponding
/// target features on the current CPU. The 128-bit searchers require SSSE3,
/// while the 256-bit searchers require AVX2.
#[derive (Clone, Debug)]
pub enum Exec {
    // One mask: Slim 128-bit, Slim 256-bit and Fat 256-bit.
    TeddySlim1Mask128(TeddySlim1Mask128),
    TeddySlim1Mask256(TeddySlim1Mask256),
    TeddyFat1Mask256(TeddyFat1Mask256),
    // Two masks, same three widths.
    TeddySlim2Mask128(TeddySlim2Mask128),
    TeddySlim2Mask256(TeddySlim2Mask256),
    TeddyFat2Mask256(TeddyFat2Mask256),
    // Three masks, same three widths.
    TeddySlim3Mask128(TeddySlim3Mask128),
    TeddySlim3Mask256(TeddySlim3Mask256),
    TeddyFat3Mask256(TeddyFat3Mask256),
}
421 | |
422 | // Most of the code below remains undocumented because they are effectively |
423 | // repeated versions of themselves. The general structure is described in the |
424 | // README and in the comments above. |
425 | |
#[derive (Clone, Debug)]
pub struct TeddySlim1Mask128 {
    /// The single mask used to compute candidates from each 16 byte window.
    /// (See the README in this directory for how masks are constructed.)
    pub mask1: Mask128,
}
430 | |
431 | impl TeddySlim1Mask128 { |
432 | #[target_feature (enable = "ssse3" )] |
433 | unsafe fn find_at( |
434 | &self, |
435 | pats: &Patterns, |
436 | teddy: &Teddy, |
437 | haystack: &[u8], |
438 | mut at: usize, |
439 | ) -> Option<Match> { |
440 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
441 | // This assert helps eliminate bounds checks for bucket lookups in |
442 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
443 | assert_eq!(8, teddy.buckets.len()); |
444 | |
445 | let len = haystack.len(); |
446 | while at <= len - 16 { |
447 | let c = self.candidate(haystack, at); |
448 | if !is_all_zeroes128(c) { |
449 | if let Some(m) = teddy.verify128(pats, haystack, at, c) { |
450 | return Some(m); |
451 | } |
452 | } |
453 | at += 16; |
454 | } |
455 | if at < len { |
456 | at = len - 16; |
457 | let c = self.candidate(haystack, at); |
458 | if !is_all_zeroes128(c) { |
459 | if let Some(m) = teddy.verify128(pats, haystack, at, c) { |
460 | return Some(m); |
461 | } |
462 | } |
463 | } |
464 | None |
465 | } |
466 | |
467 | #[inline (always)] |
468 | unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i { |
469 | debug_assert!(haystack[at..].len() >= 16); |
470 | |
471 | let chunk = loadu128(haystack, at); |
472 | members1m128(chunk, self.mask1) |
473 | } |
474 | } |
475 | |
#[derive (Clone, Debug)]
pub struct TeddySlim1Mask256 {
    /// The single mask used to compute candidates from each 32 byte window.
    /// (See the README in this directory for how masks are constructed.)
    pub mask1: Mask256,
}
480 | |
481 | impl TeddySlim1Mask256 { |
482 | #[target_feature (enable = "avx2" )] |
483 | unsafe fn find_at( |
484 | &self, |
485 | pats: &Patterns, |
486 | teddy: &Teddy, |
487 | haystack: &[u8], |
488 | mut at: usize, |
489 | ) -> Option<Match> { |
490 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
491 | // This assert helps eliminate bounds checks for bucket lookups in |
492 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
493 | assert_eq!(8, teddy.buckets.len()); |
494 | |
495 | let len = haystack.len(); |
496 | while at <= len - 32 { |
497 | let c = self.candidate(haystack, at); |
498 | if !is_all_zeroes256(c) { |
499 | if let Some(m) = teddy.verify256(pats, haystack, at, c) { |
500 | return Some(m); |
501 | } |
502 | } |
503 | at += 32; |
504 | } |
505 | if at < len { |
506 | at = len - 32; |
507 | let c = self.candidate(haystack, at); |
508 | if !is_all_zeroes256(c) { |
509 | if let Some(m) = teddy.verify256(pats, haystack, at, c) { |
510 | return Some(m); |
511 | } |
512 | } |
513 | } |
514 | None |
515 | } |
516 | |
517 | #[inline (always)] |
518 | unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i { |
519 | debug_assert!(haystack[at..].len() >= 32); |
520 | |
521 | let chunk = loadu256(haystack, at); |
522 | members1m256(chunk, self.mask1) |
523 | } |
524 | } |
525 | |
#[derive (Clone, Debug)]
pub struct TeddyFat1Mask256 {
    /// The single mask used to compute candidates. For Fat Teddy, the 16
    /// byte haystack window is broadcast to both 128-bit lanes so that 16
    /// buckets can be tracked at once.
    pub mask1: Mask256,
}
530 | |
531 | impl TeddyFat1Mask256 { |
532 | #[target_feature (enable = "avx2" )] |
533 | unsafe fn find_at( |
534 | &self, |
535 | pats: &Patterns, |
536 | teddy: &Teddy, |
537 | haystack: &[u8], |
538 | mut at: usize, |
539 | ) -> Option<Match> { |
540 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
541 | // This assert helps eliminate bounds checks for bucket lookups in |
542 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
543 | assert_eq!(16, teddy.buckets.len()); |
544 | |
545 | let len = haystack.len(); |
546 | while at <= len - 16 { |
547 | let c = self.candidate(haystack, at); |
548 | if !is_all_zeroes256(c) { |
549 | if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) { |
550 | return Some(m); |
551 | } |
552 | } |
553 | at += 16; |
554 | } |
555 | if at < len { |
556 | at = len - 16; |
557 | let c = self.candidate(haystack, at); |
558 | if !is_all_zeroes256(c) { |
559 | if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) { |
560 | return Some(m); |
561 | } |
562 | } |
563 | } |
564 | None |
565 | } |
566 | |
567 | #[inline (always)] |
568 | unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i { |
569 | debug_assert!(haystack[at..].len() >= 16); |
570 | |
571 | let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at)); |
572 | members1m256(chunk, self.mask1) |
573 | } |
574 | } |
575 | |
#[derive (Clone, Debug)]
pub struct TeddySlim2Mask128 {
    /// Mask for the first candidate byte position.
    pub mask1: Mask128,
    /// Mask for the second candidate byte position.
    pub mask2: Mask128,
}
581 | |
582 | impl TeddySlim2Mask128 { |
583 | #[target_feature (enable = "ssse3" )] |
584 | unsafe fn find_at( |
585 | &self, |
586 | pats: &Patterns, |
587 | teddy: &Teddy, |
588 | haystack: &[u8], |
589 | mut at: usize, |
590 | ) -> Option<Match> { |
591 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
592 | // This assert helps eliminate bounds checks for bucket lookups in |
593 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
594 | assert_eq!(8, teddy.buckets.len()); |
595 | |
596 | at += 1; |
597 | let len = haystack.len(); |
598 | let mut prev0 = ones128(); |
599 | while at <= len - 16 { |
600 | let c = self.candidate(haystack, at, &mut prev0); |
601 | if !is_all_zeroes128(c) { |
602 | if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { |
603 | return Some(m); |
604 | } |
605 | } |
606 | at += 16; |
607 | } |
608 | if at < len { |
609 | at = len - 16; |
610 | prev0 = ones128(); |
611 | |
612 | let c = self.candidate(haystack, at, &mut prev0); |
613 | if !is_all_zeroes128(c) { |
614 | if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { |
615 | return Some(m); |
616 | } |
617 | } |
618 | } |
619 | None |
620 | } |
621 | |
622 | #[inline (always)] |
623 | unsafe fn candidate( |
624 | &self, |
625 | haystack: &[u8], |
626 | at: usize, |
627 | prev0: &mut __m128i, |
628 | ) -> __m128i { |
629 | debug_assert!(haystack[at..].len() >= 16); |
630 | |
631 | let chunk = loadu128(haystack, at); |
632 | let (res0, res1) = members2m128(chunk, self.mask1, self.mask2); |
633 | let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15); |
634 | _mm_and_si128(res0prev0, res1) |
635 | } |
636 | } |
637 | |
#[derive (Clone, Debug)]
pub struct TeddySlim2Mask256 {
    /// Mask for the first candidate byte position.
    pub mask1: Mask256,
    /// Mask for the second candidate byte position.
    pub mask2: Mask256,
}
643 | |
644 | impl TeddySlim2Mask256 { |
645 | #[target_feature (enable = "avx2" )] |
646 | unsafe fn find_at( |
647 | &self, |
648 | pats: &Patterns, |
649 | teddy: &Teddy, |
650 | haystack: &[u8], |
651 | mut at: usize, |
652 | ) -> Option<Match> { |
653 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
654 | // This assert helps eliminate bounds checks for bucket lookups in |
655 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
656 | assert_eq!(8, teddy.buckets.len()); |
657 | |
658 | at += 1; |
659 | let len = haystack.len(); |
660 | let mut prev0 = ones256(); |
661 | while at <= len - 32 { |
662 | let c = self.candidate(haystack, at, &mut prev0); |
663 | if !is_all_zeroes256(c) { |
664 | if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) { |
665 | return Some(m); |
666 | } |
667 | } |
668 | at += 32; |
669 | } |
670 | if at < len { |
671 | at = len - 32; |
672 | prev0 = ones256(); |
673 | |
674 | let c = self.candidate(haystack, at, &mut prev0); |
675 | if !is_all_zeroes256(c) { |
676 | if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) { |
677 | return Some(m); |
678 | } |
679 | } |
680 | } |
681 | None |
682 | } |
683 | |
684 | #[inline (always)] |
685 | unsafe fn candidate( |
686 | &self, |
687 | haystack: &[u8], |
688 | at: usize, |
689 | prev0: &mut __m256i, |
690 | ) -> __m256i { |
691 | debug_assert!(haystack[at..].len() >= 32); |
692 | |
693 | let chunk = loadu256(haystack, at); |
694 | let (res0, res1) = members2m256(chunk, self.mask1, self.mask2); |
695 | let res0prev0 = alignr256_15(res0, *prev0); |
696 | let res = _mm256_and_si256(res0prev0, res1); |
697 | *prev0 = res0; |
698 | res |
699 | } |
700 | } |
701 | |
#[derive (Clone, Debug)]
pub struct TeddyFat2Mask256 {
    /// Mask for the first candidate byte position (16 buckets, with the 16
    /// byte window broadcast to both 128-bit lanes).
    pub mask1: Mask256,
    /// Mask for the second candidate byte position.
    pub mask2: Mask256,
}
707 | |
impl TeddyFat2Mask256 {
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        // With two masks, candidates are reported relative to `at - 1`.
        at += 1;
        let len = haystack.len();
        // Cross-window state: the previous window's mask1 results, seeded
        // with the permissive all-ones value.
        let mut prev0 = ones256();
        // Fat Teddy advances 16 bytes per iteration; the 256-bit candidate
        // holds buckets 0-7 and 8-15 for the same 16 byte window.
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0);
            if !is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
                {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            at = len - 16;
            // The final window overlaps the previous one; reset the carried
            // state to the permissive all-ones value.
            prev0 = ones256();

            let c = self.candidate(haystack, at, &mut prev0);
            if !is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
                {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
        let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
        // _mm256_alignr_epi8 operates on each 128-bit lane independently,
        // which is exactly right here: both lanes hold the same 16 byte
        // window (for different bucket ranges), so each lane shifts in the
        // matching trailing lane of the previous window's res0.
        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15);
        let res = _mm256_and_si256(res0prev0, res1);
        // Carry this window's mask1 results into the next call.
        *prev0 = res0;
        res
    }
}
767 | |
#[derive (Clone, Debug)]
pub struct TeddySlim3Mask128 {
    /// Mask for the first candidate byte position.
    pub mask1: Mask128,
    /// Mask for the second candidate byte position.
    pub mask2: Mask128,
    /// Mask for the third candidate byte position.
    pub mask3: Mask128,
}
774 | |
775 | impl TeddySlim3Mask128 { |
776 | #[target_feature (enable = "ssse3" )] |
777 | unsafe fn find_at( |
778 | &self, |
779 | pats: &Patterns, |
780 | teddy: &Teddy, |
781 | haystack: &[u8], |
782 | mut at: usize, |
783 | ) -> Option<Match> { |
784 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
785 | // This assert helps eliminate bounds checks for bucket lookups in |
786 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
787 | assert_eq!(8, teddy.buckets.len()); |
788 | |
789 | at += 2; |
790 | let len = haystack.len(); |
791 | let (mut prev0, mut prev1) = (ones128(), ones128()); |
792 | while at <= len - 16 { |
793 | let c = self.candidate(haystack, at, &mut prev0, &mut prev1); |
794 | if !is_all_zeroes128(c) { |
795 | if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) { |
796 | return Some(m); |
797 | } |
798 | } |
799 | at += 16; |
800 | } |
801 | if at < len { |
802 | at = len - 16; |
803 | prev0 = ones128(); |
804 | prev1 = ones128(); |
805 | |
806 | let c = self.candidate(haystack, at, &mut prev0, &mut prev1); |
807 | if !is_all_zeroes128(c) { |
808 | if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) { |
809 | return Some(m); |
810 | } |
811 | } |
812 | } |
813 | None |
814 | } |
815 | |
816 | #[inline (always)] |
817 | unsafe fn candidate( |
818 | &self, |
819 | haystack: &[u8], |
820 | at: usize, |
821 | prev0: &mut __m128i, |
822 | prev1: &mut __m128i, |
823 | ) -> __m128i { |
824 | debug_assert!(haystack[at..].len() >= 16); |
825 | |
826 | let chunk = loadu128(haystack, at); |
827 | let (res0, res1, res2) = |
828 | members3m128(chunk, self.mask1, self.mask2, self.mask3); |
829 | let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14); |
830 | let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15); |
831 | let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2); |
832 | *prev0 = res0; |
833 | *prev1 = res1; |
834 | res |
835 | } |
836 | } |
837 | |
#[derive (Clone, Debug)]
pub struct TeddySlim3Mask256 {
    /// Mask for the first candidate byte position.
    pub mask1: Mask256,
    /// Mask for the second candidate byte position.
    pub mask2: Mask256,
    /// Mask for the third candidate byte position.
    pub mask3: Mask256,
}
844 | |
impl TeddySlim3Mask256 {
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        // With three masks, candidates are reported relative to `at - 2`.
        at += 2;
        let len = haystack.len();
        // Cross-window state for masks 1 and 2, seeded with the permissive
        // all-ones value for the first window.
        let (mut prev0, mut prev1) = (ones256(), ones256());
        while at <= len - 32 {
            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        if at < len {
            at = len - 32;
            // The final window overlaps the previous one; reset the carried
            // state to the permissive all-ones value.
            prev0 = ones256();
            prev1 = ones256();

            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = loadu256(haystack, at);
        let (res0, res1, res2) =
            members3m256(chunk, self.mask1, self.mask2, self.mask3);
        // Shift the mask1 results back two lanes and the mask2 results back
        // one lane (via the cross-lane alignr helpers), so each lane ANDs
        // together three consecutive mask positions.
        let res0prev0 = alignr256_14(res0, *prev0);
        let res1prev1 = alignr256_15(res1, *prev1);
        let res =
            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
        // Carry this window's raw mask1/mask2 results into the next call.
        *prev0 = res0;
        *prev1 = res1;
        res
    }
}
908 | |
/// Fat 256-bit Teddy (16 buckets) using three nybble masks, i.e., matching
/// on the first three bytes of every pattern. The high 128 bits of each mask
/// correspond to buckets 8-15.
#[derive (Clone, Debug)]
pub struct TeddyFat3Mask256 {
    /// Low/high nybble mask for the first byte of every pattern.
    pub mask1: Mask256,
    /// Low/high nybble mask for the second byte of every pattern.
    pub mask2: Mask256,
    /// Low/high nybble mask for the third byte of every pattern.
    pub mask3: Mask256,
}
915 | |
impl TeddyFat3Mask256 {
    /// Search for the leftmost-first match in `haystack` starting at `at`.
    /// Fat Teddy scans 16 bytes per iteration (the 16-byte window is
    /// broadcast into both 128-bit halves, which index buckets 0-7 and
    /// 8-15 respectively).
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        // With three masks, a candidate at `at` corresponds to a pattern
        // starting at `at - 2`; the verify calls below compensate.
        at += 2;
        let len = haystack.len();
        // "Previous window" state starts as all ones (conservative: every
        // bucket) for bytes before the start of the scan.
        let (mut prev0, mut prev1) = (ones256(), ones256());
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !is_all_zeroes256(c) {
                // Some lane matched all three byte masks; run full
                // verification over the candidate buckets.
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
                {
                    return Some(m);
                }
            }
            at += 16;
        }
        // Handle the final, possibly partial chunk by re-reading the last
        // full 16-byte window (which may overlap bytes already scanned).
        if at < len {
            at = len - 16;
            // Carried state lines up with the old position; reset to the
            // conservative all-ones value.
            prev0 = ones256();
            prev1 = ones256();

            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
                {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Compute the candidate bitsets for the 16-byte window starting at
    /// `at`, duplicated across both 128-bit halves so the low half holds
    /// buckets 0-7 and the high half buckets 8-15. `prev0`/`prev1` carry the
    /// first/second byte mask results from the previous window and are
    /// updated in place.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> __m256i {
        // We need a full 16-byte window to load.
        debug_assert!(haystack[at..].len() >= 16);

        // Broadcast the 16-byte window into both 128-bit lanes.
        let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
        let (res0, res1, res2) =
            members3m256(chunk, self.mask1, self.mask2, self.mask3);
        // Shift res0 up by 2 lanes and res1 up by 1 lane within each
        // 128-bit half (alignr operates per 128-bit lane, which is exactly
        // right here since both halves hold the same 16-byte window), so
        // all three results for one occurrence share a lane.
        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14);
        let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15);
        // Candidate lanes must match at all three byte positions.
        let res =
            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
        *prev0 = res0;
        *prev1 = res1;
        res
    }
}
981 | |
/// A 128-bit mask for the low and high nybbles in a set of patterns. Each
/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
/// the nybble `j` is in the bucket `i` at a particular position.
#[derive (Clone, Copy, Debug)]
pub struct Mask128 {
    /// Bucket bitsets indexed by the low nybble of a haystack byte.
    lo: __m128i,
    /// Bucket bitsets indexed by the high nybble of a haystack byte.
    hi: __m128i,
}
990 | |
991 | impl Mask128 { |
992 | /// Create a new SIMD mask from the mask produced by the Teddy builder. |
993 | pub fn new(mask: compile::Mask) -> Mask128 { |
994 | // SAFETY: This is safe since [u8; 16] has the same representation |
995 | // as __m128i. |
996 | unsafe { |
997 | Mask128 { |
998 | lo: mem::transmute(src:mask.lo128()), |
999 | hi: mem::transmute(src:mask.hi128()), |
1000 | } |
1001 | } |
1002 | } |
1003 | } |
1004 | |
/// A 256-bit mask for the low and high nybbles in a set of patterns. Each
/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
/// the nybble `j` is in the bucket `i` at a particular position.
///
/// This is slightly tweaked dependending on whether Slim or Fat Teddy is being
/// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as
/// the bitsets in the higher 128-bits, so that we can search 32 bytes at a
/// time. (Remember, the nybbles in the haystack are used as indices into these
/// masks, and 256-bit shuffles only operate on 128-bit lanes.)
///
/// For Fat Teddy, the bitsets are not repeated, but instead, the high 128
/// bits correspond to buckets 8-15. So that a bitset `00100010` has buckets
/// 1 and 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set
/// if it's in the higher 128 bits.
#[derive (Clone, Copy, Debug)]
pub struct Mask256 {
    /// Bucket bitsets indexed by the low nybble of a haystack byte.
    lo: __m256i,
    /// Bucket bitsets indexed by the high nybble of a haystack byte.
    hi: __m256i,
}
1024 | |
1025 | impl Mask256 { |
1026 | /// Create a new SIMD mask from the mask produced by the Teddy builder. |
1027 | pub fn new(mask: compile::Mask) -> Mask256 { |
1028 | // SAFETY: This is safe since [u8; 32] has the same representation |
1029 | // as __m256i. |
1030 | unsafe { |
1031 | Mask256 { |
1032 | lo: mem::transmute(src:mask.lo256()), |
1033 | hi: mem::transmute(src:mask.hi256()), |
1034 | } |
1035 | } |
1036 | } |
1037 | } |
1038 | |
1039 | // The "members" routines below are responsible for taking a chunk of bytes, |
1040 | // a number of nybble masks and returning the result of using the masks to |
1041 | // lookup bytes in the chunk. The results of the high and low nybble masks are |
1042 | // AND'ed together, such that each candidate returned is a vector, with byte |
1043 | // sized lanes, and where each lane is an 8-bit bitset corresponding to the |
1044 | // buckets that contain the corresponding byte. |
1045 | // |
1046 | // In the case of masks of length greater than 1, callers will need to keep |
1047 | // the results from the previous haystack's window, and then shift the vectors |
1048 | // so that they all line up. Then they can be AND'ed together. |
1049 | |
1050 | /// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a |
1051 | /// 16-byte window of the haystack (where the least significant byte |
1052 | /// corresponds to the start of the window), and `mask1` corresponds to a |
1053 | /// low/high mask for the first byte of all patterns that are being searched. |
1054 | #[target_feature (enable = "ssse3" )] |
1055 | unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i { |
1056 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1057 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1058 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1059 | _mm_and_si128( |
1060 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1061 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1062 | ) |
1063 | } |
1064 | |
1065 | /// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a |
1066 | /// 32-byte window of the haystack (where the least significant byte |
1067 | /// corresponds to the start of the window), and `mask1` corresponds to a |
1068 | /// low/high mask for the first byte of all patterns that are being searched. |
1069 | /// |
1070 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1071 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1072 | /// window in the haystack. |
1073 | #[target_feature (enable = "avx2" )] |
1074 | unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i { |
1075 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1076 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1077 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1078 | _mm256_and_si256( |
1079 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1080 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1081 | ) |
1082 | } |
1083 | |
1084 | /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds |
1085 | /// to a 16-byte window of the haystack (where the least significant byte |
1086 | /// corresponds to the start of the window), and the masks correspond to a |
1087 | /// low/high mask for the first and second bytes of all patterns that are being |
1088 | /// searched. The vectors returned correspond to candidates for the first and |
1089 | /// second bytes in the patterns represented by the masks. |
1090 | #[target_feature (enable = "ssse3" )] |
1091 | unsafe fn members2m128( |
1092 | chunk: __m128i, |
1093 | mask1: Mask128, |
1094 | mask2: Mask128, |
1095 | ) -> (__m128i, __m128i) { |
1096 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1097 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1098 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1099 | let res0: __m128i = _mm_and_si128( |
1100 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1101 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1102 | ); |
1103 | let res1: __m128i = _mm_and_si128( |
1104 | a:_mm_shuffle_epi8(mask2.lo, hlo), |
1105 | b:_mm_shuffle_epi8(a:mask2.hi, b:hhi), |
1106 | ); |
1107 | (res0, res1) |
1108 | } |
1109 | |
1110 | /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds |
1111 | /// to a 32-byte window of the haystack (where the least significant byte |
1112 | /// corresponds to the start of the window), and the masks correspond to a |
1113 | /// low/high mask for the first and second bytes of all patterns that are being |
1114 | /// searched. The vectors returned correspond to candidates for the first and |
1115 | /// second bytes in the patterns represented by the masks. |
1116 | /// |
1117 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1118 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1119 | /// window in the haystack. |
1120 | #[target_feature (enable = "avx2" )] |
1121 | unsafe fn members2m256( |
1122 | chunk: __m256i, |
1123 | mask1: Mask256, |
1124 | mask2: Mask256, |
1125 | ) -> (__m256i, __m256i) { |
1126 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1127 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1128 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1129 | let res0: __m256i = _mm256_and_si256( |
1130 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1131 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1132 | ); |
1133 | let res1: __m256i = _mm256_and_si256( |
1134 | a:_mm256_shuffle_epi8(mask2.lo, hlo), |
1135 | b:_mm256_shuffle_epi8(a:mask2.hi, b:hhi), |
1136 | ); |
1137 | (res0, res1) |
1138 | } |
1139 | |
1140 | /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds |
1141 | /// to a 16-byte window of the haystack (where the least significant byte |
1142 | /// corresponds to the start of the window), and the masks correspond to a |
1143 | /// low/high mask for the first, second and third bytes of all patterns that |
1144 | /// are being searched. The vectors returned correspond to candidates for the |
1145 | /// first, second and third bytes in the patterns represented by the masks. |
1146 | #[target_feature (enable = "ssse3" )] |
1147 | unsafe fn members3m128( |
1148 | chunk: __m128i, |
1149 | mask1: Mask128, |
1150 | mask2: Mask128, |
1151 | mask3: Mask128, |
1152 | ) -> (__m128i, __m128i, __m128i) { |
1153 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1154 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1155 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1156 | let res0: __m128i = _mm_and_si128( |
1157 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1158 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1159 | ); |
1160 | let res1: __m128i = _mm_and_si128( |
1161 | a:_mm_shuffle_epi8(mask2.lo, hlo), |
1162 | b:_mm_shuffle_epi8(a:mask2.hi, b:hhi), |
1163 | ); |
1164 | let res2: __m128i = _mm_and_si128( |
1165 | a:_mm_shuffle_epi8(mask3.lo, hlo), |
1166 | b:_mm_shuffle_epi8(a:mask3.hi, b:hhi), |
1167 | ); |
1168 | (res0, res1, res2) |
1169 | } |
1170 | |
1171 | /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds |
1172 | /// to a 32-byte window of the haystack (where the least significant byte |
1173 | /// corresponds to the start of the window), and the masks correspond to a |
1174 | /// low/high mask for the first, second and third bytes of all patterns that |
1175 | /// are being searched. The vectors returned correspond to candidates for the |
1176 | /// first, second and third bytes in the patterns represented by the masks. |
1177 | /// |
1178 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1179 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1180 | /// window in the haystack. |
1181 | #[target_feature (enable = "avx2" )] |
1182 | unsafe fn members3m256( |
1183 | chunk: __m256i, |
1184 | mask1: Mask256, |
1185 | mask2: Mask256, |
1186 | mask3: Mask256, |
1187 | ) -> (__m256i, __m256i, __m256i) { |
1188 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1189 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1190 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1191 | let res0: __m256i = _mm256_and_si256( |
1192 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1193 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1194 | ); |
1195 | let res1: __m256i = _mm256_and_si256( |
1196 | a:_mm256_shuffle_epi8(mask2.lo, hlo), |
1197 | b:_mm256_shuffle_epi8(a:mask2.hi, b:hhi), |
1198 | ); |
1199 | let res2: __m256i = _mm256_and_si256( |
1200 | a:_mm256_shuffle_epi8(mask3.lo, hlo), |
1201 | b:_mm256_shuffle_epi8(a:mask3.hi, b:hhi), |
1202 | ); |
1203 | (res0, res1, res2) |
1204 | } |
1205 | |