1 | // See the README in this directory for an explanation of the Teddy algorithm. |
2 | // It is strongly recommended to peruse the README before trying to grok this |
3 | // code, as its use of SIMD is pretty opaque, although I tried to add comments |
4 | // where appropriate. |
5 | // |
6 | // Moreover, while there is a lot of code in this file, most of it is |
7 | // repeated variants of the same thing. Specifically, there are three Teddy |
8 | // variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets) |
9 | // and Fat 256-bit Teddy (16 buckets). For each variant, there are three |
10 | // implementations, corresponding to mask lengths of 1, 2 and 3. Bringing it to |
11 | // a total of nine variants. Each one is structured roughly the same: |
12 | // |
13 | // while at <= len(haystack) - CHUNK_SIZE: |
14 | // let candidate = find_candidate_in_chunk(haystack, at) |
15 | // if not all zeroes(candidate): |
16 | // if match = verify(haystack, at, candidate): |
17 | // return match |
18 | // |
19 | // For the most part, this remains unchanged. The parts that vary are the |
20 | // verification routine (for slim vs fat Teddy) and the candidate extraction |
21 | // (based on the number of masks). |
22 | // |
23 | // In the code below, a "candidate" corresponds to a single vector with 8-bit |
24 | // lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the |
25 | // jth lane if and only if the byte occurring at position `j` is in the |
26 | // bucket `i` (where the `j`th position is the position in the current window |
27 | // of the haystack, which is always 16 or 32 bytes). Note to be careful here: |
28 | // the ith bit and the jth lane correspond to the least significant bits of the |
29 | // vector. So when visualizing how the current window of bytes is stored in a |
30 | // vector, you often need to flip it around. For example, the text `abcd` in a |
31 | // 4-byte vector would look like this: |
32 | // |
33 | // 01100100 01100011 01100010 01100001 |
34 | // d c b a |
35 | // |
36 | // When the mask length is 1, then finding the candidate is pretty straight |
37 | // forward: you just apply the shuffle indices (from the haystack window) to |
38 | // the masks, and then AND them together, as described in the README. But for |
39 | // masks of length 2 and 3, you need to keep a little state. Specifically, |
40 | // you need to store the final 1 (for mask length 2) or 2 (for mask length 3) |
41 | // bytes of the candidate for use when searching the next window. This is for |
42 | // handling matches that span two windows. |
43 | // |
44 | // With respect to the repeated code, it would likely be possible to reduce |
45 | // the number of copies of code below using polymorphism, but I find this |
46 | // formulation clearer instead of needing to reason through generics. However, |
47 | // I admit, there may be a simpler generic construction that I'm missing. |
48 | // |
49 | // All variants are fairly heavily tested in src/packed/tests.rs. |
50 | |
51 | use core::{arch::x86_64::*, mem}; |
52 | |
53 | use alloc::vec::Vec; |
54 | |
55 | use crate::{ |
56 | packed::{ |
57 | pattern::{PatternID, Patterns}, |
58 | teddy::compile, |
59 | vector, |
60 | }, |
61 | util::search::Match, |
62 | }; |
63 | |
/// The Teddy runtime.
///
/// A Teddy runtime can be used to quickly search for occurrences of one or
/// more patterns. While it does not scale to an arbitrary number of patterns
/// like Aho-Corasick, it does find occurrences for a small set of patterns
/// much more quickly than Aho-Corasick.
///
/// Teddy cannot run on small haystacks below a certain size, which is
/// dependent on the type of matcher used. This size can be queried via the
/// `minimum_len` method. Violating this will result in a panic.
///
/// Finally, when callers use a Teddy runtime, they must provide precisely the
/// patterns used to construct the Teddy matcher. Violating this will result
/// in either a panic or incorrect results, but will never sacrifice memory
/// safety.
#[derive (Clone, Debug)]
pub struct Teddy {
    /// The allocation of patterns in buckets. This only contains the IDs of
    /// patterns. In order to do full verification, callers must provide the
    /// actual patterns when using Teddy.
    pub buckets: Vec<Vec<PatternID>>,
    /// The maximum identifier of a pattern. This is used as a sanity check to
    /// ensure that the patterns provided by the caller are the same as the
    /// patterns that were used to compile the matcher. This sanity check
    /// permits safely eliminating bounds checks regardless of what patterns
    /// are provided by the caller.
    ///
    /// Note that users of the aho-corasick crate cannot get this wrong. Only
    /// code internal to this crate can get it wrong, since neither `Patterns`
    /// type nor the Teddy runtime are public API items.
    pub max_pattern_id: PatternID,
    /// The actual runtime to use. The variant chosen encodes both the CPU
    /// feature requirement (SSSE3 for 128-bit, AVX2 for 256-bit) and the
    /// mask/bucket configuration. See `Exec` for details.
    pub exec: Exec,
}
98 | |
impl Teddy {
    /// Return the first occurrence of a match in the given haystack after or
    /// starting at `at`.
    ///
    /// The patterns provided must be precisely the same patterns given to the
    /// Teddy builder, otherwise this may panic or produce incorrect results.
    ///
    /// All matches are consistent with the match semantics (leftmost-first or
    /// leftmost-longest) set on `pats`.
    ///
    /// # Panics
    ///
    /// Panics if `pats` does not have the same maximum pattern ID as the
    /// patterns this matcher was built with, or if `haystack[at..]` is
    /// shorter than `self.minimum_len()`.
    pub fn find_at(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        // This assert is a bit subtle, but it's an important guarantee.
        // Namely, if the maximum pattern ID seen by Teddy is the same as the
        // one in the patterns given, then we are guaranteed that every pattern
        // ID in all Teddy buckets are valid indices into `pats`. While this
        // is nominally true, there is no guarantee that callers provide the
        // same `pats` to both the Teddy builder and the searcher, which would
        // otherwise make `find_at` unsafe to call. But this assert lets us
        // keep this routine safe and eliminate an important bounds check in
        // verification.
        assert_eq!(
            self.max_pattern_id,
            pats.max_pattern_id(),
            "teddy must be called with same patterns it was built with" ,
        );
        // SAFETY: The haystack must have at least a minimum number of bytes
        // for Teddy to be able to work. The minimum number varies depending on
        // which matcher is used below. If this is violated, then it's possible
        // for searching to do out-of-bounds writes.
        assert!(haystack[at..].len() >= self.minimum_len());
        // SAFETY: The various Teddy matchers are always safe to call because
        // the Teddy builder guarantees that a particular Exec variant is
        // built only when it can be run the current CPU. That is, the Teddy
        // builder will not produce a Exec::TeddySlim1Mask256 unless AVX2 is
        // enabled. That is, our dynamic CPU feature detection is performed
        // once in the builder, and we rely on the type system to avoid needing
        // to do it again.
        unsafe {
            match self.exec {
                Exec::TeddySlim1Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim1Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat1Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim2Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim2Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat2Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim3Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim3Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat3Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim4Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim4Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat4Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
            }
        }
    }

    /// Returns the minimum length of a haystack that must be provided by
    /// callers to this Teddy searcher. Providing a haystack shorter than this
    /// will result in a panic, but will never violate memory safety.
    ///
    /// The values below follow a pattern: the window size of the searcher
    /// (16 bytes for 128-bit and Fat 256-bit searchers, 32 bytes for Slim
    /// 256-bit searchers) plus one extra byte per mask beyond the first,
    /// since a searcher with k masks begins its scan k-1 bytes past `at`.
    pub fn minimum_len(&self) -> usize {
        // SAFETY: These values must be correct in order to ensure safety.
        // The Teddy runtime assumes their haystacks have at least these
        // lengths. Violating this will sacrifice memory safety.
        match self.exec {
            Exec::TeddySlim1Mask128(_) => 16,
            Exec::TeddySlim1Mask256(_) => 32,
            Exec::TeddyFat1Mask256(_) => 16,
            Exec::TeddySlim2Mask128(_) => 17,
            Exec::TeddySlim2Mask256(_) => 33,
            Exec::TeddyFat2Mask256(_) => 17,
            Exec::TeddySlim3Mask128(_) => 18,
            Exec::TeddySlim3Mask256(_) => 34,
            Exec::TeddyFat3Mask256(_) => 18,
            Exec::TeddySlim4Mask128(_) => 19,
            Exec::TeddySlim4Mask256(_) => 35,
            Exec::TeddyFat4Mask256(_) => 19,
        }
    }

    /// Returns the approximate total amount of heap used by this searcher, in
    /// units of bytes.
    pub fn memory_usage(&self) -> usize {
        let num_patterns = self.max_pattern_id as usize + 1;
        // Count one Vec header per bucket plus one PatternID per pattern
        // (assumes each pattern ID is assigned to exactly one bucket). This
        // is an approximation: per-bucket excess capacity is not counted.
        self.buckets.len() * mem::size_of::<Vec<PatternID>>()
            + num_patterns * mem::size_of::<PatternID>()
    }

    /// Runs the verification routine for Slim 128-bit Teddy.
    ///
    /// The candidate given should be a collection of 8-bit bitsets (one bitset
    /// per lane), where the ith bit is set in the jth lane if and only if the
    /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
    ///
    /// This is not safe to call unless the SSSE3 target feature is enabled.
    /// The `target_feature` attribute is not applied since this function is
    /// always forcefully inlined.
    #[inline (always)]
    unsafe fn verify128(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m128i,
    ) -> Option<Match> {
        debug_assert!(!vector::is_all_zeroes128(cand));
        debug_assert_eq!(8, self.buckets.len());

        // Convert the candidate into 64-bit chunks, and then verify each of
        // those chunks. Each 64-bit chunk covers 8 haystack positions (one
        // byte-sized bucket set per position), hence the `i * 8` offset.
        let parts = vector::unpack64x128(cand);
        for (i, &part) in parts.iter().enumerate() {
            let pos = at + i * 8;
            if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
                return Some(m);
            }
        }
        None
    }

    /// Runs the verification routine for Slim 256-bit Teddy.
    ///
    /// The candidate given should be a collection of 8-bit bitsets (one bitset
    /// per lane), where the ith bit is set in the jth lane if and only if the
    /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
    ///
    /// This is not safe to call unless the AVX2 target feature is enabled.
    /// The `target_feature` attribute is not applied since this function is
    /// always forcefully inlined.
    #[inline (always)]
    unsafe fn verify256(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m256i,
    ) -> Option<Match> {
        debug_assert!(!vector::is_all_zeroes256(cand));
        debug_assert_eq!(8, self.buckets.len());

        // Convert the candidate into 64-bit chunks, and then verify each of
        // those chunks. As with verify128, each chunk covers 8 haystack
        // positions; the checks are unrolled manually here.
        let parts = vector::unpack64x256(cand);
        let mut pos = at;
        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[0]) {
            return Some(m);
        }
        pos += 8;
        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[1]) {
            return Some(m);
        }
        pos += 8;
        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[2]) {
            return Some(m);
        }
        pos += 8;
        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[3]) {
            return Some(m);
        }
        None
    }

    /// Runs the verification routine for Fat 256-bit Teddy.
    ///
    /// The candidate given should be a collection of 8-bit bitsets (one bitset
    /// per lane), where the ith bit is set in the jth lane if and only if the
    /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the
    /// bucket `j < 16 ? i : i + 8`.
    ///
    /// This is not safe to call unless the AVX2 target feature is enabled.
    /// The `target_feature` attribute is not applied since this function is
    /// always forcefully inlined.
    #[inline (always)]
    unsafe fn verify_fat256(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m256i,
    ) -> Option<Match> {
        debug_assert!(!vector::is_all_zeroes256(cand));
        debug_assert_eq!(16, self.buckets.len());

        // This is a bit tricky, but we basically want to convert our
        // candidate, which looks like this
        //
        //     a31 a30 ... a17 a16 a15 a14 ... a01 a00
        //
        // where each a(i) is an 8-bit bitset corresponding to the activated
        // buckets, to this
        //
        //     a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00
        //
        // Namely, for Fat Teddy, the high 128-bits of the candidate correspond
        // to the same bytes in the haystack in the low 128-bits (so we only
        // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7.
        //
        // The verification routine wants to look at all potentially matching
        // buckets before moving on to the next lane. So for example, both
        // a16 and a00 both correspond to the first byte in our window; a00
        // contains buckets 0-7 and a16 contains buckets 8-15. Specifically,
        // a16 should be checked before a01. So the transformation shown above
        // allows us to use our normal verification procedure with one small
        // change: we treat each bitset as 16 bits instead of 8 bits.

        // Swap the 128-bit lanes in the candidate vector.
        let swap = _mm256_permute4x64_epi64(cand, 0x4E);
        // Interleave the bytes from the low 128-bit lanes, starting with
        // cand first.
        let r1 = _mm256_unpacklo_epi8(cand, swap);
        // Interleave the bytes from the high 128-bit lanes, starting with
        // cand first.
        let r2 = _mm256_unpackhi_epi8(cand, swap);
        // Now just take the 2 low 64-bit integers from both r1 and r2. We
        // can drop the high 64-bit integers because they are a mirror image
        // of the low 64-bit integers. All we care about are the low 128-bit
        // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets
        // laid out in the desired order, as described above.
        //
        // Each 64-bit chunk now covers 4 haystack positions (one 16-bit
        // bucket set per position), hence the `i * 4` offset below.
        let parts = vector::unpacklo64x256(r1, r2);
        for (i, &part) in parts.iter().enumerate() {
            let pos = at + i * 4;
            if let Some(m) = self.verify64(pats, 16, haystack, pos, part) {
                return Some(m);
            }
        }
        None
    }

    /// Verify whether there are any matches starting at or after `at` in the
    /// given `haystack`. The candidate given should correspond to either 8-bit
    /// (for 8 buckets) or 16-bit (16 buckets) bitsets.
    #[inline (always)]
    fn verify64(
        &self,
        pats: &Patterns,
        bucket_count: usize,
        haystack: &[u8],
        at: usize,
        mut cand: u64,
    ) -> Option<Match> {
        // N.B. While the bucket count is known from self.buckets.len(),
        // requiring it as a parameter makes it easier for the optimizer to
        // know its value, and thus produce more efficient codegen.
        debug_assert!(bucket_count == 8 || bucket_count == 16);
        // Iterate over each set bit in `cand`, clearing it as we go. Each
        // group of `bucket_count` bits corresponds to one haystack position,
        // so the position offset is `bit / bucket_count` and the bucket is
        // `bit % bucket_count`.
        while cand != 0 {
            let bit = cand.trailing_zeros() as usize;
            cand &= !(1 << bit);

            let at = at + (bit / bucket_count);
            let bucket = bit % bucket_count;
            if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) {
                return Some(m);
            }
        }
        None
    }

    /// Verify whether there are any matches starting at `at` in the given
    /// `haystack` corresponding only to patterns in the given bucket.
    #[inline (always)]
    fn verify_bucket(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        bucket: usize,
        at: usize,
    ) -> Option<Match> {
        // Forcing this function to not inline and be "cold" seems to help
        // the codegen for Teddy overall. Interestingly, this is good for a
        // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among
        // others). Overall, this seems like a problem with codegen, since
        // creating the Match itself is a very small amount of code.
        #[cold ]
        #[inline (never)]
        fn match_from_span(
            pati: PatternID,
            start: usize,
            end: usize,
        ) -> Match {
            Match::must(pati as usize, start..end)
        }

        // N.B. The bounds check for this bucket lookup *should* be elided
        // since we assert the number of buckets in each `find_at` routine,
        // and the compiler can prove that the `% 8` (or `% 16`) in callers
        // of this routine will always be in bounds.
        for &pati in &self.buckets[bucket] {
            // SAFETY: This is safe because we are guaranteed that every
            // index in a Teddy bucket is a valid index into `pats`. This
            // guarantee is upheld by the assert checking `max_pattern_id` in
            // the beginning of `find_at` above.
            //
            // This explicit bounds check elision is (amazingly) good for a
            // 25-50% boost in some benchmarks, particularly ones with a lot
            // of short literals.
            let pat = unsafe { pats.get_unchecked(pati) };
            if pat.is_prefix(&haystack[at..]) {
                return Some(match_from_span(pati, at, at + pat.len()));
            }
        }
        None
    }
}
428 | |
/// Exec represents the different search strategies supported by the Teddy
/// runtime.
///
/// This enum is an important safety abstraction. Namely, callers should only
/// construct a variant in this enum if it is safe to execute its corresponding
/// target features on the current CPU. The 128-bit searchers require SSSE3,
/// while the 256-bit searchers require AVX2.
#[derive (Clone, Debug)]
pub enum Exec {
    // Searchers using a single mask.
    TeddySlim1Mask128(TeddySlim1Mask128),
    TeddySlim1Mask256(TeddySlim1Mask256),
    TeddyFat1Mask256(TeddyFat1Mask256),
    // Searchers using two masks.
    TeddySlim2Mask128(TeddySlim2Mask128),
    TeddySlim2Mask256(TeddySlim2Mask256),
    TeddyFat2Mask256(TeddyFat2Mask256),
    // Searchers using three masks.
    TeddySlim3Mask128(TeddySlim3Mask128),
    TeddySlim3Mask256(TeddySlim3Mask256),
    TeddyFat3Mask256(TeddyFat3Mask256),
    // Searchers using four masks.
    TeddySlim4Mask128(TeddySlim4Mask128),
    TeddySlim4Mask256(TeddySlim4Mask256),
    TeddyFat4Mask256(TeddyFat4Mask256),
}
451 | |
452 | // Most of the code below remains undocumented because they are effectively |
453 | // repeated versions of themselves. The general structure is described in the |
454 | // README and in the comments above. |
455 | |
#[derive (Clone, Debug)]
pub struct TeddySlim1Mask128 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    pub mask1: Mask128,
}
460 | |
impl TeddySlim1Mask128 {
    /// Searches `haystack` starting at `at` in 16-byte windows. Any bytes
    /// remaining after the last full window are handled by re-running the
    /// search over the final 16 bytes of the haystack (which may overlap
    /// bytes already scanned).
    ///
    /// # Safety
    ///
    /// Callers must only invoke this on CPUs supporting SSSE3, and must
    /// provide at least `teddy.minimum_len()` bytes of haystack at `at`.
    #[target_feature (enable = "ssse3" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        let len = haystack.len();
        while at <= len - 16 {
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            // Fewer than 16 bytes remain, so back up and scan the final
            // 16-byte window of the haystack.
            at = len - 16;
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Computes the candidate bitset vector for the 16-byte window of
    /// `haystack` beginning at `at`, as described in the comments at the top
    /// of this file.
    ///
    /// # Safety
    ///
    /// Callers must ensure SSSE3 is available and that at least 16 bytes of
    /// haystack exist starting at `at`.
    #[inline (always)]
    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk = vector::loadu128(haystack, at);
        members1m128(chunk, self.mask1)
    }
}
505 | |
#[derive (Clone, Debug)]
pub struct TeddySlim1Mask256 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    pub mask1: Mask256,
}
510 | |
impl TeddySlim1Mask256 {
    /// Searches `haystack` starting at `at` in 32-byte windows. Any bytes
    /// remaining after the last full window are handled by re-running the
    /// search over the final 32 bytes of the haystack (which may overlap
    /// bytes already scanned).
    ///
    /// # Safety
    ///
    /// Callers must only invoke this on CPUs supporting AVX2, and must
    /// provide at least `teddy.minimum_len()` bytes of haystack at `at`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        let len = haystack.len();
        while at <= len - 32 {
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        if at < len {
            // Fewer than 32 bytes remain, so back up and scan the final
            // 32-byte window of the haystack.
            at = len - 32;
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Computes the candidate bitset vector for the 32-byte window of
    /// `haystack` beginning at `at`, as described in the comments at the top
    /// of this file.
    ///
    /// # Safety
    ///
    /// Callers must ensure AVX2 is available and that at least 32 bytes of
    /// haystack exist starting at `at`.
    #[inline (always)]
    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = vector::loadu256(haystack, at);
        members1m256(chunk, self.mask1)
    }
}
555 | |
#[derive (Clone, Debug)]
pub struct TeddyFat1Mask256 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    /// For Fat Teddy, this covers 16 buckets instead of 8.
    pub mask1: Mask256,
}
560 | |
impl TeddyFat1Mask256 {
    /// Searches `haystack` starting at `at` in 16-byte windows. Fat Teddy
    /// scans 16 bytes per 256-bit vector: the low and high 128-bit lanes
    /// cover the same window bytes, but for buckets 0-7 and 8-15
    /// respectively. A final window over the last 16 bytes handles any
    /// remainder.
    ///
    /// # Safety
    ///
    /// Callers must only invoke this on CPUs supporting AVX2, and must
    /// provide at least `teddy.minimum_len()` bytes of haystack at `at`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        let len = haystack.len();
        while at <= len - 16 {
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            // Fewer than 16 bytes remain, so back up and scan the final
            // 16-byte window of the haystack.
            at = len - 16;
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Computes the Fat Teddy candidate for the 16-byte window of `haystack`
    /// beginning at `at`. The 16-byte window is broadcast to both 128-bit
    /// lanes so a single shuffle covers all 16 buckets.
    ///
    /// # Safety
    ///
    /// Callers must ensure AVX2 is available and that at least 16 bytes of
    /// haystack exist starting at `at`.
    #[inline (always)]
    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk =
            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
        members1m256(chunk, self.mask1)
    }
}
606 | |
#[derive (Clone, Debug)]
pub struct TeddySlim2Mask128 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    pub mask1: Mask128,
    /// The mask corresponding to the second byte of every pattern's prefix.
    pub mask2: Mask128,
}
612 | |
613 | impl TeddySlim2Mask128 { |
614 | #[target_feature (enable = "ssse3" )] |
615 | unsafe fn find_at( |
616 | &self, |
617 | pats: &Patterns, |
618 | teddy: &Teddy, |
619 | haystack: &[u8], |
620 | mut at: usize, |
621 | ) -> Option<Match> { |
622 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
623 | // This assert helps eliminate bounds checks for bucket lookups in |
624 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
625 | assert_eq!(8, teddy.buckets.len()); |
626 | |
627 | at += 1; |
628 | let len = haystack.len(); |
629 | let mut prev0 = vector::ones128(); |
630 | while at <= len - 16 { |
631 | let c = self.candidate(haystack, at, &mut prev0); |
632 | if !vector::is_all_zeroes128(c) { |
633 | if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { |
634 | return Some(m); |
635 | } |
636 | } |
637 | at += 16; |
638 | } |
639 | if at < len { |
640 | at = len - 16; |
641 | prev0 = vector::ones128(); |
642 | |
643 | let c = self.candidate(haystack, at, &mut prev0); |
644 | if !vector::is_all_zeroes128(c) { |
645 | if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { |
646 | return Some(m); |
647 | } |
648 | } |
649 | } |
650 | None |
651 | } |
652 | |
653 | #[inline (always)] |
654 | unsafe fn candidate( |
655 | &self, |
656 | haystack: &[u8], |
657 | at: usize, |
658 | prev0: &mut __m128i, |
659 | ) -> __m128i { |
660 | debug_assert!(haystack[at..].len() >= 16); |
661 | |
662 | let chunk = vector::loadu128(haystack, at); |
663 | let (res0, res1) = members2m128(chunk, self.mask1, self.mask2); |
664 | let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15); |
665 | _mm_and_si128(res0prev0, res1) |
666 | } |
667 | } |
668 | |
#[derive (Clone, Debug)]
pub struct TeddySlim2Mask256 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    pub mask1: Mask256,
    /// The mask corresponding to the second byte of every pattern's prefix.
    pub mask2: Mask256,
}
674 | |
impl TeddySlim2Mask256 {
    /// Searches `haystack` starting at `at + 1` in 32-byte windows. `prev0`
    /// carries the mask-1 result of the previous window so that matches
    /// spanning two windows are found; it starts as all-ones so the first
    /// byte of the first window is never filtered out. A final window over
    /// the last 32 bytes handles any remainder (resetting `prev0`).
    ///
    /// # Safety
    ///
    /// Callers must only invoke this on CPUs supporting AVX2, and must
    /// provide at least `teddy.minimum_len()` bytes of haystack at `at`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        at += 1;
        let len = haystack.len();
        let mut prev0 = vector::ones256();
        while at <= len - 32 {
            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        if at < len {
            at = len - 32;
            prev0 = vector::ones256();

            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Computes the candidate for the 32-byte window at `at` by combining
    /// the previous window's mask-1 result (shifted in via a cross-lane
    /// `alignr`) with this window's mask-2 result, then saves this window's
    /// mask-1 result into `prev0` for the next call.
    ///
    /// # Safety
    ///
    /// Callers must ensure AVX2 is available and that at least 32 bytes of
    /// haystack exist starting at `at`.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = vector::loadu256(haystack, at);
        let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
        let res0prev0 = vector::alignr256_15(res0, *prev0);
        let res = _mm256_and_si256(res0prev0, res1);
        // Save this window's mask-1 result for the next window, so that
        // matches spanning the window boundary are detected.
        *prev0 = res0;
        res
    }
}
732 | |
#[derive (Clone, Debug)]
pub struct TeddyFat2Mask256 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    /// For Fat Teddy, this covers 16 buckets instead of 8.
    pub mask1: Mask256,
    /// The mask corresponding to the second byte of every pattern's prefix.
    /// For Fat Teddy, this covers 16 buckets instead of 8.
    pub mask2: Mask256,
}
738 | |
impl TeddyFat2Mask256 {
    /// Searches `haystack` starting at `at + 1` in 16-byte windows (Fat
    /// Teddy scans 16 bytes per 256-bit vector, covering 16 buckets).
    /// `prev0` carries the mask-1 result of the previous window so that
    /// matches spanning two windows are found; it starts as all-ones so the
    /// first byte of the first window is never filtered out.
    ///
    /// # Safety
    ///
    /// Callers must only invoke this on CPUs supporting AVX2, and must
    /// provide at least `teddy.minimum_len()` bytes of haystack at `at`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        at += 1;
        let len = haystack.len();
        let mut prev0 = vector::ones256();
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
                {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            at = len - 16;
            prev0 = vector::ones256();

            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
                {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Computes the Fat Teddy candidate for the 16-byte window at `at`.
    /// The window is broadcast to both 128-bit lanes; since both lanes hold
    /// the same window bytes, the per-lane `_mm256_alignr_epi8` shift
    /// behaves like the 128-bit case in each lane. This window's mask-1
    /// result is saved into `prev0` for the next call.
    ///
    /// # Safety
    ///
    /// Callers must ensure AVX2 is available and that at least 16 bytes of
    /// haystack exist starting at `at`.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk =
            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
        let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15);
        let res = _mm256_and_si256(res0prev0, res1);
        // Save this window's mask-1 result for the next window, so that
        // matches spanning the window boundary are detected.
        *prev0 = res0;
        res
    }
}
799 | |
#[derive (Clone, Debug)]
pub struct TeddySlim3Mask128 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    pub mask1: Mask128,
    /// The mask corresponding to the second byte of every pattern's prefix.
    pub mask2: Mask128,
    /// The mask corresponding to the third byte of every pattern's prefix.
    pub mask3: Mask128,
}
806 | |
impl TeddySlim3Mask128 {
    /// Searches `haystack` starting at `at + 2` in 16-byte windows. `prev0`
    /// and `prev1` carry the mask-1 and mask-2 results of the previous
    /// window so that matches spanning two windows are found; both start as
    /// all-ones so the leading bytes of the first window are never filtered
    /// out. A final window over the last 16 bytes handles any remainder
    /// (resetting both).
    ///
    /// # Safety
    ///
    /// Callers must only invoke this on CPUs supporting SSSE3, and must
    /// provide at least `teddy.minimum_len()` bytes of haystack at `at`.
    #[target_feature (enable = "ssse3" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        at += 2;
        let len = haystack.len();
        let (mut prev0, mut prev1) = (vector::ones128(), vector::ones128());
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            at = len - 16;
            prev0 = vector::ones128();
            prev1 = vector::ones128();

            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Computes the candidate for the 16-byte window at `at`. The mask-1
    /// result is shifted by 2 bytes (`alignr` with 14) and the mask-2 result
    /// by 1 byte (`alignr` with 15), pulling in the previous window's
    /// results, then all three are ANDed together. This window's mask-1 and
    /// mask-2 results are saved into `prev0`/`prev1` for the next call.
    ///
    /// # Safety
    ///
    /// Callers must ensure SSSE3 is available and that at least 16 bytes of
    /// haystack exist starting at `at`.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m128i,
        prev1: &mut __m128i,
    ) -> __m128i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk = vector::loadu128(haystack, at);
        let (res0, res1, res2) =
            members3m128(chunk, self.mask1, self.mask2, self.mask3);
        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14);
        let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15);
        let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2);
        *prev0 = res0;
        *prev1 = res1;
        res
    }
}
869 | |
/// Slim 256-bit Teddy with three nybble masks, covering the first three
/// bytes of every pattern. Used with 8 pattern buckets (see the assert in
/// `find_at`).
#[derive (Clone, Debug)]
pub struct TeddySlim3Mask256 {
    /// Low/high nybble mask for the first byte of each pattern.
    pub mask1: Mask256,
    /// Low/high nybble mask for the second byte of each pattern.
    pub mask2: Mask256,
    /// Low/high nybble mask for the third byte of each pattern.
    pub mask3: Mask256,
}
876 | |
impl TeddySlim3Mask256 {
    /// Search `haystack` for the first match at or after `at`, scanning
    /// 32 bytes per iteration with three nybble masks.
    ///
    /// Callers must guarantee `haystack[at..].len() >= teddy.minimum_len()`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        // With three masks, candidates are aligned to the *third* pattern
        // byte, so the scan starts two bytes in and verification rewinds
        // with `at - 2`.
        at += 2;
        let len = haystack.len();
        // Previous-window results start as all-ones (per the `ones256`
        // name), which conservatively leaves every bucket as a candidate at
        // the boundary; verification rejects any false positives.
        let (mut prev0, mut prev1) = (vector::ones256(), vector::ones256());
        while at <= len - 32 {
            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        // Handle leftover bytes by re-scanning the final (overlapping)
        // 32-byte window. The prev state is reset since it no longer lines
        // up with this window.
        if at < len {
            at = len - 32;
            prev0 = vector::ones256();
            prev1 = vector::ones256();

            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Compute the candidate bitsets for the 32-byte window starting at
    /// `at`. `prev0`/`prev1` carry the first- and second-byte results from
    /// the previous window and are updated for the next call.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = vector::loadu256(haystack, at);
        let (res0, res1, res2) =
            members3m256(chunk, self.mask1, self.mask2, self.mask3);
        // Align the first-byte results forward by two lanes and the
        // second-byte results by one lane (the `alignr256_*` helpers handle
        // the AVX2 cross-128-bit-lane fixup) so all three results refer to
        // the same position.
        let res0prev0 = vector::alignr256_14(res0, *prev0);
        let res1prev1 = vector::alignr256_15(res1, *prev1);
        let res =
            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
        *prev0 = res0;
        *prev1 = res1;
        res
    }
}
940 | |
/// Fat 256-bit Teddy with three nybble masks, covering the first three
/// bytes of every pattern. Used with 16 pattern buckets (see the assert in
/// `find_at`): the high 128 bits of each mask correspond to buckets 8-15.
#[derive (Clone, Debug)]
pub struct TeddyFat3Mask256 {
    /// Low/high nybble mask for the first byte of each pattern.
    pub mask1: Mask256,
    /// Low/high nybble mask for the second byte of each pattern.
    pub mask2: Mask256,
    /// Low/high nybble mask for the third byte of each pattern.
    pub mask3: Mask256,
}
947 | |
impl TeddyFat3Mask256 {
    /// Search `haystack` for the first match at or after `at`, scanning
    /// 16 bytes per iteration (broadcast into both 128-bit halves) with
    /// three nybble masks over 16 buckets.
    ///
    /// Callers must guarantee `haystack[at..].len() >= teddy.minimum_len()`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        // With three masks, candidates are aligned to the *third* pattern
        // byte, so the scan starts two bytes in and verification rewinds
        // with `at - 2`.
        at += 2;
        let len = haystack.len();
        // Previous-window results start as all-ones (per the `ones256`
        // name), which conservatively leaves every bucket as a candidate at
        // the boundary; verification rejects any false positives.
        let (mut prev0, mut prev1) = (vector::ones256(), vector::ones256());
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
                {
                    return Some(m);
                }
            }
            at += 16;
        }
        // Handle leftover bytes by re-scanning the final (overlapping)
        // 16-byte window. The prev state is reset since it no longer lines
        // up with this window.
        if at < len {
            at = len - 16;
            prev0 = vector::ones256();
            prev1 = vector::ones256();

            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
                {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Compute the candidate bitsets for the 16-byte window starting at
    /// `at`. The window is broadcast to both 128-bit halves so the low
    /// half looks up buckets 0-7 and the high half buckets 8-15.
    /// `prev0`/`prev1` carry the first- and second-byte results from the
    /// previous window and are updated for the next call.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk =
            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
        let (res0, res1, res2) =
            members3m256(chunk, self.mask1, self.mask2, self.mask3);
        // Align the first-byte results forward by two lanes and the
        // second-byte results by one lane. A plain alignr per 128-bit lane
        // is correct here because both halves hold the same window.
        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14);
        let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15);
        let res =
            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
        *prev0 = res0;
        *prev1 = res1;
        res
    }
}
1014 | |
/// Slim 128-bit Teddy with four nybble masks, covering the first four
/// bytes of every pattern. Used with 8 pattern buckets (see the assert in
/// `find_at`).
#[derive (Clone, Debug)]
pub struct TeddySlim4Mask128 {
    /// Low/high nybble mask for the first byte of each pattern.
    pub mask1: Mask128,
    /// Low/high nybble mask for the second byte of each pattern.
    pub mask2: Mask128,
    /// Low/high nybble mask for the third byte of each pattern.
    pub mask3: Mask128,
    /// Low/high nybble mask for the fourth byte of each pattern.
    pub mask4: Mask128,
}
1022 | |
impl TeddySlim4Mask128 {
    /// Search `haystack` for the first match at or after `at`, scanning
    /// 16 bytes per iteration with four nybble masks.
    ///
    /// Callers must guarantee `haystack[at..].len() >= teddy.minimum_len()`.
    #[target_feature (enable = "ssse3" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        // With four masks, candidates are aligned to the *fourth* pattern
        // byte, so the scan starts three bytes in and verification rewinds
        // with `at - 3`.
        at += 3;
        let len = haystack.len();
        // Previous-window results start as all-ones (per the `ones128`
        // name), which conservatively leaves every bucket as a candidate at
        // the boundary; verification rejects any false positives.
        let mut prev0 = vector::ones128();
        let mut prev1 = vector::ones128();
        let mut prev2 = vector::ones128();
        while at <= len - 16 {
            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 3, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
        // Handle leftover bytes by re-scanning the final (overlapping)
        // 16-byte window. The prev state is reset since it no longer lines
        // up with this window.
        if at < len {
            at = len - 16;
            prev0 = vector::ones128();
            prev1 = vector::ones128();
            prev2 = vector::ones128();

            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 3, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Compute the candidate bitsets for the 16-byte window starting at
    /// `at`. `prev0`/`prev1`/`prev2` carry the first-, second- and
    /// third-byte results from the previous window and are updated for the
    /// next call.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m128i,
        prev1: &mut __m128i,
        prev2: &mut __m128i,
    ) -> __m128i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk = vector::loadu128(haystack, at);
        let (res0, res1, res2, res3) = members4m128(
            chunk, self.mask1, self.mask2, self.mask3, self.mask4,
        );
        // Align the first-, second- and third-byte results forward by 3, 2
        // and 1 lanes respectively (filling from the previous window's
        // tail) so all four results refer to the same position.
        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 13);
        let res1prev1 = _mm_alignr_epi8(res1, *prev1, 14);
        let res2prev2 = _mm_alignr_epi8(res2, *prev2, 15);
        let res = _mm_and_si128(
            _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2prev2),
            res3,
        );
        *prev0 = res0;
        *prev1 = res1;
        *prev2 = res2;
        res
    }
}
1097 | |
/// Slim 256-bit Teddy with four nybble masks, covering the first four
/// bytes of every pattern. Used with 8 pattern buckets (see the assert in
/// `find_at`).
#[derive (Clone, Debug)]
pub struct TeddySlim4Mask256 {
    /// Low/high nybble mask for the first byte of each pattern.
    pub mask1: Mask256,
    /// Low/high nybble mask for the second byte of each pattern.
    pub mask2: Mask256,
    /// Low/high nybble mask for the third byte of each pattern.
    pub mask3: Mask256,
    /// Low/high nybble mask for the fourth byte of each pattern.
    pub mask4: Mask256,
}
1105 | |
impl TeddySlim4Mask256 {
    /// Search `haystack` for the first match at or after `at`, scanning
    /// 32 bytes per iteration with four nybble masks.
    ///
    /// Callers must guarantee `haystack[at..].len() >= teddy.minimum_len()`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        // With four masks, candidates are aligned to the *fourth* pattern
        // byte, so the scan starts three bytes in and verification rewinds
        // with `at - 3`.
        at += 3;
        let len = haystack.len();
        // Previous-window results start as all-ones (per the `ones256`
        // name), which conservatively leaves every bucket as a candidate at
        // the boundary; verification rejects any false positives.
        let mut prev0 = vector::ones256();
        let mut prev1 = vector::ones256();
        let mut prev2 = vector::ones256();
        while at <= len - 32 {
            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 3, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        // Handle leftover bytes by re-scanning the final (overlapping)
        // 32-byte window. The prev state is reset since it no longer lines
        // up with this window.
        if at < len {
            at = len - 32;
            prev0 = vector::ones256();
            prev1 = vector::ones256();
            prev2 = vector::ones256();

            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 3, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Compute the candidate bitsets for the 32-byte window starting at
    /// `at`. `prev0`/`prev1`/`prev2` carry the first-, second- and
    /// third-byte results from the previous window and are updated for the
    /// next call.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
        prev2: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = vector::loadu256(haystack, at);
        let (res0, res1, res2, res3) = members4m256(
            chunk, self.mask1, self.mask2, self.mask3, self.mask4,
        );
        // Align the first-, second- and third-byte results forward by 3, 2
        // and 1 lanes respectively (the `alignr256_*` helpers handle the
        // AVX2 cross-128-bit-lane fixup) so all four results refer to the
        // same position.
        let res0prev0 = vector::alignr256_13(res0, *prev0);
        let res1prev1 = vector::alignr256_14(res1, *prev1);
        let res2prev2 = vector::alignr256_15(res2, *prev2);
        let res = _mm256_and_si256(
            _mm256_and_si256(
                _mm256_and_si256(res0prev0, res1prev1),
                res2prev2,
            ),
            res3,
        );
        *prev0 = res0;
        *prev1 = res1;
        *prev2 = res2;
        res
    }
}
1183 | |
/// Fat 256-bit Teddy with four nybble masks, covering the first four
/// bytes of every pattern. Used with 16 pattern buckets (see the assert in
/// `find_at`): the high 128 bits of each mask correspond to buckets 8-15.
#[derive (Clone, Debug)]
pub struct TeddyFat4Mask256 {
    /// Low/high nybble mask for the first byte of each pattern.
    pub mask1: Mask256,
    /// Low/high nybble mask for the second byte of each pattern.
    pub mask2: Mask256,
    /// Low/high nybble mask for the third byte of each pattern.
    pub mask3: Mask256,
    /// Low/high nybble mask for the fourth byte of each pattern.
    pub mask4: Mask256,
}
1191 | |
impl TeddyFat4Mask256 {
    /// Search `haystack` for the first match at or after `at`, scanning
    /// 16 bytes per iteration (broadcast into both 128-bit halves) with
    /// four nybble masks over 16 buckets.
    ///
    /// Callers must guarantee `haystack[at..].len() >= teddy.minimum_len()`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        // With four masks, candidates are aligned to the *fourth* pattern
        // byte, so the scan starts three bytes in and verification rewinds
        // with `at - 3`.
        at += 3;
        let len = haystack.len();
        // Previous-window results start as all-ones (per the `ones256`
        // name), which conservatively leaves every bucket as a candidate at
        // the boundary; verification rejects any false positives.
        let mut prev0 = vector::ones256();
        let mut prev1 = vector::ones256();
        let mut prev2 = vector::ones256();
        while at <= len - 16 {
            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 3, c)
                {
                    return Some(m);
                }
            }
            at += 16;
        }
        // Handle leftover bytes by re-scanning the final (overlapping)
        // 16-byte window. The prev state is reset since it no longer lines
        // up with this window.
        if at < len {
            at = len - 16;
            prev0 = vector::ones256();
            prev1 = vector::ones256();
            prev2 = vector::ones256();

            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 3, c)
                {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Compute the candidate bitsets for the 16-byte window starting at
    /// `at`. The window is broadcast to both 128-bit halves so the low
    /// half looks up buckets 0-7 and the high half buckets 8-15.
    /// `prev0`/`prev1`/`prev2` carry the first-, second- and third-byte
    /// results from the previous window and are updated for the next call.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
        prev2: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk =
            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
        let (res0, res1, res2, res3) = members4m256(
            chunk, self.mask1, self.mask2, self.mask3, self.mask4,
        );
        // Align the first-, second- and third-byte results forward by 3, 2
        // and 1 lanes respectively. A plain alignr per 128-bit lane is
        // correct here because both halves hold the same window.
        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 13);
        let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 14);
        let res2prev2 = _mm256_alignr_epi8(res2, *prev2, 15);
        let res = _mm256_and_si256(
            _mm256_and_si256(
                _mm256_and_si256(res0prev0, res1prev1),
                res2prev2,
            ),
            res3,
        );
        *prev0 = res0;
        *prev1 = res1;
        *prev2 = res2;
        res
    }
}
1272 | |
/// A 128-bit mask for the low and high nybbles in a set of patterns. Each
/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
/// the nybble `j` is in the bucket `i` at a particular position.
#[derive (Clone, Copy, Debug)]
pub struct Mask128 {
    // Bucket bitsets indexed by a byte's low nybble (via shuffle lookup).
    lo: __m128i,
    // Bucket bitsets indexed by a byte's high nybble (via shuffle lookup).
    hi: __m128i,
}
1281 | |
1282 | impl Mask128 { |
1283 | /// Create a new SIMD mask from the mask produced by the Teddy builder. |
1284 | pub fn new(mask: compile::Mask) -> Mask128 { |
1285 | // SAFETY: This is safe since [u8; 16] has the same representation |
1286 | // as __m128i. |
1287 | unsafe { |
1288 | Mask128 { |
1289 | lo: mem::transmute(src:mask.lo128()), |
1290 | hi: mem::transmute(src:mask.hi128()), |
1291 | } |
1292 | } |
1293 | } |
1294 | } |
1295 | |
/// A 256-bit mask for the low and high nybbles in a set of patterns. Each
/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
/// the nybble `j` is in the bucket `i` at a particular position.
///
/// This is slightly tweaked dependending on whether Slim or Fat Teddy is being
/// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as
/// the bitsets in the higher 128-bits, so that we can search 32 bytes at a
/// time. (Remember, the nybbles in the haystack are used as indices into these
/// masks, and 256-bit shuffles only operate on 128-bit lanes.)
///
/// For Fat Teddy, the bitsets are not repeated, but instead, the high 128
/// bits correspond to buckets 8-15. So that a bitset `00100010` has buckets
/// 1 and 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set
/// if it's in the higher 128 bits.
#[derive (Clone, Copy, Debug)]
pub struct Mask256 {
    // Bucket bitsets indexed by a byte's low nybble (via shuffle lookup).
    lo: __m256i,
    // Bucket bitsets indexed by a byte's high nybble (via shuffle lookup).
    hi: __m256i,
}
1315 | |
1316 | impl Mask256 { |
1317 | /// Create a new SIMD mask from the mask produced by the Teddy builder. |
1318 | pub fn new(mask: compile::Mask) -> Mask256 { |
1319 | // SAFETY: This is safe since [u8; 32] has the same representation |
1320 | // as __m256i. |
1321 | unsafe { |
1322 | Mask256 { |
1323 | lo: mem::transmute(src:mask.lo256()), |
1324 | hi: mem::transmute(src:mask.hi256()), |
1325 | } |
1326 | } |
1327 | } |
1328 | } |
1329 | |
1330 | // The "members" routines below are responsible for taking a chunk of bytes, |
1331 | // a number of nybble masks and returning the result of using the masks to |
1332 | // lookup bytes in the chunk. The results of the high and low nybble masks are |
1333 | // AND'ed together, such that each candidate returned is a vector, with byte |
1334 | // sized lanes, and where each lane is an 8-bit bitset corresponding to the |
1335 | // buckets that contain the corresponding byte. |
1336 | // |
1337 | // In the case of masks of length greater than 1, callers will need to keep |
1338 | // the results from the previous haystack's window, and then shift the vectors |
1339 | // so that they all line up. Then they can be AND'ed together. |
1340 | |
1341 | /// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a |
1342 | /// 16-byte window of the haystack (where the least significant byte |
1343 | /// corresponds to the start of the window), and `mask1` corresponds to a |
1344 | /// low/high mask for the first byte of all patterns that are being searched. |
1345 | #[target_feature (enable = "ssse3" )] |
1346 | unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i { |
1347 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1348 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1349 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1350 | _mm_and_si128( |
1351 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1352 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1353 | ) |
1354 | } |
1355 | |
1356 | /// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a |
1357 | /// 32-byte window of the haystack (where the least significant byte |
1358 | /// corresponds to the start of the window), and `mask1` corresponds to a |
1359 | /// low/high mask for the first byte of all patterns that are being searched. |
1360 | /// |
1361 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1362 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1363 | /// window in the haystack. |
1364 | #[target_feature (enable = "avx2" )] |
1365 | unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i { |
1366 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1367 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1368 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1369 | _mm256_and_si256( |
1370 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1371 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1372 | ) |
1373 | } |
1374 | |
1375 | /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds |
1376 | /// to a 16-byte window of the haystack (where the least significant byte |
1377 | /// corresponds to the start of the window), and the masks correspond to a |
1378 | /// low/high mask for the first and second bytes of all patterns that are being |
1379 | /// searched. The vectors returned correspond to candidates for the first and |
1380 | /// second bytes in the patterns represented by the masks. |
1381 | #[target_feature (enable = "ssse3" )] |
1382 | unsafe fn members2m128( |
1383 | chunk: __m128i, |
1384 | mask1: Mask128, |
1385 | mask2: Mask128, |
1386 | ) -> (__m128i, __m128i) { |
1387 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1388 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1389 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1390 | let res0: __m128i = _mm_and_si128( |
1391 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1392 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1393 | ); |
1394 | let res1: __m128i = _mm_and_si128( |
1395 | a:_mm_shuffle_epi8(mask2.lo, hlo), |
1396 | b:_mm_shuffle_epi8(a:mask2.hi, b:hhi), |
1397 | ); |
1398 | (res0, res1) |
1399 | } |
1400 | |
1401 | /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds |
1402 | /// to a 32-byte window of the haystack (where the least significant byte |
1403 | /// corresponds to the start of the window), and the masks correspond to a |
1404 | /// low/high mask for the first and second bytes of all patterns that are being |
1405 | /// searched. The vectors returned correspond to candidates for the first and |
1406 | /// second bytes in the patterns represented by the masks. |
1407 | /// |
1408 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1409 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1410 | /// window in the haystack. |
1411 | #[target_feature (enable = "avx2" )] |
1412 | unsafe fn members2m256( |
1413 | chunk: __m256i, |
1414 | mask1: Mask256, |
1415 | mask2: Mask256, |
1416 | ) -> (__m256i, __m256i) { |
1417 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1418 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1419 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1420 | let res0: __m256i = _mm256_and_si256( |
1421 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1422 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1423 | ); |
1424 | let res1: __m256i = _mm256_and_si256( |
1425 | a:_mm256_shuffle_epi8(mask2.lo, hlo), |
1426 | b:_mm256_shuffle_epi8(a:mask2.hi, b:hhi), |
1427 | ); |
1428 | (res0, res1) |
1429 | } |
1430 | |
1431 | /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds |
1432 | /// to a 16-byte window of the haystack (where the least significant byte |
1433 | /// corresponds to the start of the window), and the masks correspond to a |
1434 | /// low/high mask for the first, second and third bytes of all patterns that |
1435 | /// are being searched. The vectors returned correspond to candidates for the |
1436 | /// first, second and third bytes in the patterns represented by the masks. |
1437 | #[target_feature (enable = "ssse3" )] |
1438 | unsafe fn members3m128( |
1439 | chunk: __m128i, |
1440 | mask1: Mask128, |
1441 | mask2: Mask128, |
1442 | mask3: Mask128, |
1443 | ) -> (__m128i, __m128i, __m128i) { |
1444 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1445 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1446 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1447 | let res0: __m128i = _mm_and_si128( |
1448 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1449 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1450 | ); |
1451 | let res1: __m128i = _mm_and_si128( |
1452 | a:_mm_shuffle_epi8(mask2.lo, hlo), |
1453 | b:_mm_shuffle_epi8(a:mask2.hi, b:hhi), |
1454 | ); |
1455 | let res2: __m128i = _mm_and_si128( |
1456 | a:_mm_shuffle_epi8(mask3.lo, hlo), |
1457 | b:_mm_shuffle_epi8(a:mask3.hi, b:hhi), |
1458 | ); |
1459 | (res0, res1, res2) |
1460 | } |
1461 | |
1462 | /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds |
1463 | /// to a 32-byte window of the haystack (where the least significant byte |
1464 | /// corresponds to the start of the window), and the masks correspond to a |
1465 | /// low/high mask for the first, second and third bytes of all patterns that |
1466 | /// are being searched. The vectors returned correspond to candidates for the |
1467 | /// first, second and third bytes in the patterns represented by the masks. |
1468 | /// |
1469 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1470 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1471 | /// window in the haystack. |
1472 | #[target_feature (enable = "avx2" )] |
1473 | unsafe fn members3m256( |
1474 | chunk: __m256i, |
1475 | mask1: Mask256, |
1476 | mask2: Mask256, |
1477 | mask3: Mask256, |
1478 | ) -> (__m256i, __m256i, __m256i) { |
1479 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1480 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1481 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1482 | let res0: __m256i = _mm256_and_si256( |
1483 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1484 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1485 | ); |
1486 | let res1: __m256i = _mm256_and_si256( |
1487 | a:_mm256_shuffle_epi8(mask2.lo, hlo), |
1488 | b:_mm256_shuffle_epi8(a:mask2.hi, b:hhi), |
1489 | ); |
1490 | let res2: __m256i = _mm256_and_si256( |
1491 | a:_mm256_shuffle_epi8(mask3.lo, hlo), |
1492 | b:_mm256_shuffle_epi8(a:mask3.hi, b:hhi), |
1493 | ); |
1494 | (res0, res1, res2) |
1495 | } |
1496 | |
1497 | /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds |
1498 | /// to a 16-byte window of the haystack (where the least significant byte |
1499 | /// corresponds to the start of the window), and the masks correspond to a |
1500 | /// low/high mask for the first, second, third and fourth bytes of all patterns |
1501 | /// that are being searched. The vectors returned correspond to candidates for |
1502 | /// the first, second, third and fourth bytes in the patterns represented by |
1503 | /// the masks. |
1504 | #[target_feature (enable = "ssse3" )] |
1505 | unsafe fn members4m128( |
1506 | chunk: __m128i, |
1507 | mask1: Mask128, |
1508 | mask2: Mask128, |
1509 | mask3: Mask128, |
1510 | mask4: Mask128, |
1511 | ) -> (__m128i, __m128i, __m128i, __m128i) { |
1512 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1513 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1514 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1515 | let res0: __m128i = _mm_and_si128( |
1516 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1517 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1518 | ); |
1519 | let res1: __m128i = _mm_and_si128( |
1520 | a:_mm_shuffle_epi8(mask2.lo, hlo), |
1521 | b:_mm_shuffle_epi8(a:mask2.hi, b:hhi), |
1522 | ); |
1523 | let res2: __m128i = _mm_and_si128( |
1524 | a:_mm_shuffle_epi8(mask3.lo, hlo), |
1525 | b:_mm_shuffle_epi8(a:mask3.hi, b:hhi), |
1526 | ); |
1527 | let res3: __m128i = _mm_and_si128( |
1528 | a:_mm_shuffle_epi8(mask4.lo, hlo), |
1529 | b:_mm_shuffle_epi8(a:mask4.hi, b:hhi), |
1530 | ); |
1531 | (res0, res1, res2, res3) |
1532 | } |
1533 | |
1534 | /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds |
1535 | /// to a 32-byte window of the haystack (where the least significant byte |
1536 | /// corresponds to the start of the window), and the masks correspond to a |
1537 | /// low/high mask for the first, second, third and fourth bytes of all patterns |
1538 | /// that are being searched. The vectors returned correspond to candidates for |
1539 | /// the first, second, third and fourth bytes in the patterns represented by |
1540 | /// the masks. |
1541 | /// |
1542 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1543 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1544 | /// window in the haystack. |
1545 | #[target_feature (enable = "avx2" )] |
1546 | unsafe fn members4m256( |
1547 | chunk: __m256i, |
1548 | mask1: Mask256, |
1549 | mask2: Mask256, |
1550 | mask3: Mask256, |
1551 | mask4: Mask256, |
1552 | ) -> (__m256i, __m256i, __m256i, __m256i) { |
1553 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1554 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1555 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1556 | let res0: __m256i = _mm256_and_si256( |
1557 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1558 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1559 | ); |
1560 | let res1: __m256i = _mm256_and_si256( |
1561 | a:_mm256_shuffle_epi8(mask2.lo, hlo), |
1562 | b:_mm256_shuffle_epi8(a:mask2.hi, b:hhi), |
1563 | ); |
1564 | let res2: __m256i = _mm256_and_si256( |
1565 | a:_mm256_shuffle_epi8(mask3.lo, hlo), |
1566 | b:_mm256_shuffle_epi8(a:mask3.hi, b:hhi), |
1567 | ); |
1568 | let res3: __m256i = _mm256_and_si256( |
1569 | a:_mm256_shuffle_epi8(mask4.lo, hlo), |
1570 | b:_mm256_shuffle_epi8(a:mask4.hi, b:hhi), |
1571 | ); |
1572 | (res0, res1, res2, res3) |
1573 | } |
1574 | |