1 | use core::fmt::Debug; |
2 | |
3 | use alloc::{ |
4 | boxed::Box, collections::BTreeMap, format, sync::Arc, vec, vec::Vec, |
5 | }; |
6 | |
7 | use crate::{ |
8 | packed::{ |
9 | ext::Pointer, |
10 | pattern::Patterns, |
11 | vector::{FatVector, Vector}, |
12 | }, |
13 | util::int::U32, |
14 | PatternID, |
15 | }; |
16 | |
17 | /// A match type specialized to the Teddy implementations below. |
18 | /// |
19 | /// Essentially, instead of representing a match at byte offsets, we use |
20 | /// raw pointers. This is because the implementations below operate on raw |
21 | /// pointers, and so this is a more natural return type based on how the |
22 | /// implementation works. |
23 | /// |
24 | /// Also, the `PatternID` used here is a `u16`. |
25 | #[derive (Clone, Copy, Debug)] |
26 | pub(crate) struct Match { |
27 | pid: PatternID, |
28 | start: *const u8, |
29 | end: *const u8, |
30 | } |
31 | |
32 | impl Match { |
33 | /// Returns the ID of the pattern that matched. |
34 | pub(crate) fn pattern(&self) -> PatternID { |
35 | self.pid |
36 | } |
37 | |
38 | /// Returns a pointer into the haystack at which the match starts. |
39 | pub(crate) fn start(&self) -> *const u8 { |
40 | self.start |
41 | } |
42 | |
43 | /// Returns a pointer into the haystack at which the match ends. |
44 | pub(crate) fn end(&self) -> *const u8 { |
45 | self.end |
46 | } |
47 | } |
48 | |
49 | /// A "slim" Teddy implementation that is generic over both the vector type |
50 | /// and the minimum length of the patterns being searched for. |
51 | /// |
52 | /// Only 1, 2, 3 and 4 bytes are supported as minimum lengths. |
53 | #[derive (Clone, Debug)] |
54 | pub(crate) struct Slim<V, const BYTES: usize> { |
55 | /// A generic data structure for doing "slim" Teddy verification. |
56 | teddy: Teddy<8>, |
57 | /// The masks used as inputs to the shuffle operation to generate |
58 | /// candidates (which are fed into the verification routines). |
59 | masks: [Mask<V>; BYTES], |
60 | } |
61 | |
62 | impl<V: Vector, const BYTES: usize> Slim<V, BYTES> { |
63 | /// Create a new "slim" Teddy searcher for the given patterns. |
64 | /// |
65 | /// # Panics |
66 | /// |
67 | /// This panics when `BYTES` is any value other than 1, 2, 3 or 4. |
68 | /// |
69 | /// # Safety |
70 | /// |
71 | /// Callers must ensure that this is okay to call in the current target for |
72 | /// the current CPU. |
73 | #[inline (always)] |
74 | pub(crate) unsafe fn new(patterns: Arc<Patterns>) -> Slim<V, BYTES> { |
75 | assert!( |
76 | 1 <= BYTES && BYTES <= 4, |
77 | "only 1, 2, 3 or 4 bytes are supported" |
78 | ); |
79 | let teddy = Teddy::new(patterns); |
80 | let masks = SlimMaskBuilder::from_teddy(&teddy); |
81 | Slim { teddy, masks } |
82 | } |
83 | |
84 | /// Returns the approximate total amount of heap used by this type, in |
85 | /// units of bytes. |
86 | #[inline (always)] |
87 | pub(crate) fn memory_usage(&self) -> usize { |
88 | self.teddy.memory_usage() |
89 | } |
90 | |
91 | /// Returns the minimum length, in bytes, that a haystack must be in order |
92 | /// to use it with this searcher. |
93 | #[inline (always)] |
94 | pub(crate) fn minimum_len(&self) -> usize { |
95 | V::BYTES + (BYTES - 1) |
96 | } |
97 | } |
98 | |
99 | impl<V: Vector> Slim<V, 1> { |
100 | /// Look for an occurrences of the patterns in this finder in the haystack |
101 | /// given by the `start` and `end` pointers. |
102 | /// |
103 | /// If no match could be found, then `None` is returned. |
104 | /// |
105 | /// # Safety |
106 | /// |
107 | /// The given pointers representing the haystack must be valid to read |
108 | /// from. They must also point to a region of memory that is at least the |
109 | /// minimum length required by this searcher. |
110 | /// |
111 | /// Callers must ensure that this is okay to call in the current target for |
112 | /// the current CPU. |
113 | #[inline (always)] |
114 | pub(crate) unsafe fn find( |
115 | &self, |
116 | start: *const u8, |
117 | end: *const u8, |
118 | ) -> Option<Match> { |
119 | let len = end.distance(start); |
120 | debug_assert!(len >= self.minimum_len()); |
121 | let mut cur = start; |
122 | while cur <= end.sub(V::BYTES) { |
123 | if let Some(m) = self.find_one(cur, end) { |
124 | return Some(m); |
125 | } |
126 | cur = cur.add(V::BYTES); |
127 | } |
128 | if cur < end { |
129 | cur = end.sub(V::BYTES); |
130 | if let Some(m) = self.find_one(cur, end) { |
131 | return Some(m); |
132 | } |
133 | } |
134 | None |
135 | } |
136 | |
137 | /// Look for a match starting at the `V::BYTES` at and after `cur`. If |
138 | /// there isn't one, then `None` is returned. |
139 | /// |
140 | /// # Safety |
141 | /// |
142 | /// The given pointers representing the haystack must be valid to read |
143 | /// from. They must also point to a region of memory that is at least the |
144 | /// minimum length required by this searcher. |
145 | /// |
146 | /// Callers must ensure that this is okay to call in the current target for |
147 | /// the current CPU. |
148 | #[inline (always)] |
149 | unsafe fn find_one( |
150 | &self, |
151 | cur: *const u8, |
152 | end: *const u8, |
153 | ) -> Option<Match> { |
154 | let c = self.candidate(cur); |
155 | if !c.is_zero() { |
156 | if let Some(m) = self.teddy.verify(cur, end, c) { |
157 | return Some(m); |
158 | } |
159 | } |
160 | None |
161 | } |
162 | |
163 | /// Look for a candidate match (represented as a vector) starting at the |
164 | /// `V::BYTES` at and after `cur`. If there isn't one, then a vector with |
165 | /// all bits set to zero is returned. |
166 | /// |
167 | /// # Safety |
168 | /// |
169 | /// The given pointer representing the haystack must be valid to read |
170 | /// from. |
171 | /// |
172 | /// Callers must ensure that this is okay to call in the current target for |
173 | /// the current CPU. |
174 | #[inline (always)] |
175 | unsafe fn candidate(&self, cur: *const u8) -> V { |
176 | let chunk = V::load_unaligned(cur); |
177 | Mask::members1(chunk, self.masks) |
178 | } |
179 | } |
180 | |
181 | impl<V: Vector> Slim<V, 2> { |
182 | /// See Slim<V, 1>::find. |
183 | #[inline (always)] |
184 | pub(crate) unsafe fn find( |
185 | &self, |
186 | start: *const u8, |
187 | end: *const u8, |
188 | ) -> Option<Match> { |
189 | let len = end.distance(start); |
190 | debug_assert!(len >= self.minimum_len()); |
191 | let mut cur = start.add(1); |
192 | let mut prev0 = V::splat(0xFF); |
193 | while cur <= end.sub(V::BYTES) { |
194 | if let Some(m) = self.find_one(cur, end, &mut prev0) { |
195 | return Some(m); |
196 | } |
197 | cur = cur.add(V::BYTES); |
198 | } |
199 | if cur < end { |
200 | cur = end.sub(V::BYTES); |
201 | prev0 = V::splat(0xFF); |
202 | if let Some(m) = self.find_one(cur, end, &mut prev0) { |
203 | return Some(m); |
204 | } |
205 | } |
206 | None |
207 | } |
208 | |
209 | /// See Slim<V, 1>::find_one. |
210 | #[inline (always)] |
211 | unsafe fn find_one( |
212 | &self, |
213 | cur: *const u8, |
214 | end: *const u8, |
215 | prev0: &mut V, |
216 | ) -> Option<Match> { |
217 | let c = self.candidate(cur, prev0); |
218 | if !c.is_zero() { |
219 | if let Some(m) = self.teddy.verify(cur.sub(1), end, c) { |
220 | return Some(m); |
221 | } |
222 | } |
223 | None |
224 | } |
225 | |
226 | /// See Slim<V, 1>::candidate. |
227 | #[inline (always)] |
228 | unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V { |
229 | let chunk = V::load_unaligned(cur); |
230 | let (res0, res1) = Mask::members2(chunk, self.masks); |
231 | let res0prev0 = res0.shift_in_one_byte(*prev0); |
232 | let res = res0prev0.and(res1); |
233 | *prev0 = res0; |
234 | res |
235 | } |
236 | } |
237 | |
238 | impl<V: Vector> Slim<V, 3> { |
239 | /// See Slim<V, 1>::find. |
240 | #[inline (always)] |
241 | pub(crate) unsafe fn find( |
242 | &self, |
243 | start: *const u8, |
244 | end: *const u8, |
245 | ) -> Option<Match> { |
246 | let len = end.distance(start); |
247 | debug_assert!(len >= self.minimum_len()); |
248 | let mut cur = start.add(2); |
249 | let mut prev0 = V::splat(0xFF); |
250 | let mut prev1 = V::splat(0xFF); |
251 | while cur <= end.sub(V::BYTES) { |
252 | if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { |
253 | return Some(m); |
254 | } |
255 | cur = cur.add(V::BYTES); |
256 | } |
257 | if cur < end { |
258 | cur = end.sub(V::BYTES); |
259 | prev0 = V::splat(0xFF); |
260 | prev1 = V::splat(0xFF); |
261 | if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { |
262 | return Some(m); |
263 | } |
264 | } |
265 | None |
266 | } |
267 | |
268 | /// See Slim<V, 1>::find_one. |
269 | #[inline (always)] |
270 | unsafe fn find_one( |
271 | &self, |
272 | cur: *const u8, |
273 | end: *const u8, |
274 | prev0: &mut V, |
275 | prev1: &mut V, |
276 | ) -> Option<Match> { |
277 | let c = self.candidate(cur, prev0, prev1); |
278 | if !c.is_zero() { |
279 | if let Some(m) = self.teddy.verify(cur.sub(2), end, c) { |
280 | return Some(m); |
281 | } |
282 | } |
283 | None |
284 | } |
285 | |
286 | /// See Slim<V, 1>::candidate. |
287 | #[inline (always)] |
288 | unsafe fn candidate( |
289 | &self, |
290 | cur: *const u8, |
291 | prev0: &mut V, |
292 | prev1: &mut V, |
293 | ) -> V { |
294 | let chunk = V::load_unaligned(cur); |
295 | let (res0, res1, res2) = Mask::members3(chunk, self.masks); |
296 | let res0prev0 = res0.shift_in_two_bytes(*prev0); |
297 | let res1prev1 = res1.shift_in_one_byte(*prev1); |
298 | let res = res0prev0.and(res1prev1).and(res2); |
299 | *prev0 = res0; |
300 | *prev1 = res1; |
301 | res |
302 | } |
303 | } |
304 | |
305 | impl<V: Vector> Slim<V, 4> { |
306 | /// See Slim<V, 1>::find. |
307 | #[inline (always)] |
308 | pub(crate) unsafe fn find( |
309 | &self, |
310 | start: *const u8, |
311 | end: *const u8, |
312 | ) -> Option<Match> { |
313 | let len = end.distance(start); |
314 | debug_assert!(len >= self.minimum_len()); |
315 | let mut cur = start.add(3); |
316 | let mut prev0 = V::splat(0xFF); |
317 | let mut prev1 = V::splat(0xFF); |
318 | let mut prev2 = V::splat(0xFF); |
319 | while cur <= end.sub(V::BYTES) { |
320 | if let Some(m) = |
321 | self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) |
322 | { |
323 | return Some(m); |
324 | } |
325 | cur = cur.add(V::BYTES); |
326 | } |
327 | if cur < end { |
328 | cur = end.sub(V::BYTES); |
329 | prev0 = V::splat(0xFF); |
330 | prev1 = V::splat(0xFF); |
331 | prev2 = V::splat(0xFF); |
332 | if let Some(m) = |
333 | self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) |
334 | { |
335 | return Some(m); |
336 | } |
337 | } |
338 | None |
339 | } |
340 | |
341 | /// See Slim<V, 1>::find_one. |
342 | #[inline (always)] |
343 | unsafe fn find_one( |
344 | &self, |
345 | cur: *const u8, |
346 | end: *const u8, |
347 | prev0: &mut V, |
348 | prev1: &mut V, |
349 | prev2: &mut V, |
350 | ) -> Option<Match> { |
351 | let c = self.candidate(cur, prev0, prev1, prev2); |
352 | if !c.is_zero() { |
353 | if let Some(m) = self.teddy.verify(cur.sub(3), end, c) { |
354 | return Some(m); |
355 | } |
356 | } |
357 | None |
358 | } |
359 | |
360 | /// See Slim<V, 1>::candidate. |
361 | #[inline (always)] |
362 | unsafe fn candidate( |
363 | &self, |
364 | cur: *const u8, |
365 | prev0: &mut V, |
366 | prev1: &mut V, |
367 | prev2: &mut V, |
368 | ) -> V { |
369 | let chunk = V::load_unaligned(cur); |
370 | let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks); |
371 | let res0prev0 = res0.shift_in_three_bytes(*prev0); |
372 | let res1prev1 = res1.shift_in_two_bytes(*prev1); |
373 | let res2prev2 = res2.shift_in_one_byte(*prev2); |
374 | let res = res0prev0.and(res1prev1).and(res2prev2).and(res3); |
375 | *prev0 = res0; |
376 | *prev1 = res1; |
377 | *prev2 = res2; |
378 | res |
379 | } |
380 | } |
381 | |
382 | /// A "fat" Teddy implementation that is generic over both the vector type |
383 | /// and the minimum length of the patterns being searched for. |
384 | /// |
385 | /// Only 1, 2, 3 and 4 bytes are supported as minimum lengths. |
386 | #[derive (Clone, Debug)] |
387 | pub(crate) struct Fat<V, const BYTES: usize> { |
388 | /// A generic data structure for doing "fat" Teddy verification. |
389 | teddy: Teddy<16>, |
390 | /// The masks used as inputs to the shuffle operation to generate |
391 | /// candidates (which are fed into the verification routines). |
392 | masks: [Mask<V>; BYTES], |
393 | } |
394 | |
395 | impl<V: FatVector, const BYTES: usize> Fat<V, BYTES> { |
396 | /// Create a new "fat" Teddy searcher for the given patterns. |
397 | /// |
398 | /// # Panics |
399 | /// |
400 | /// This panics when `BYTES` is any value other than 1, 2, 3 or 4. |
401 | /// |
402 | /// # Safety |
403 | /// |
404 | /// Callers must ensure that this is okay to call in the current target for |
405 | /// the current CPU. |
406 | #[inline (always)] |
407 | pub(crate) unsafe fn new(patterns: Arc<Patterns>) -> Fat<V, BYTES> { |
408 | assert!( |
409 | 1 <= BYTES && BYTES <= 4, |
410 | "only 1, 2, 3 or 4 bytes are supported" |
411 | ); |
412 | let teddy = Teddy::new(patterns); |
413 | let masks = FatMaskBuilder::from_teddy(&teddy); |
414 | Fat { teddy, masks } |
415 | } |
416 | |
417 | /// Returns the approximate total amount of heap used by this type, in |
418 | /// units of bytes. |
419 | #[inline (always)] |
420 | pub(crate) fn memory_usage(&self) -> usize { |
421 | self.teddy.memory_usage() |
422 | } |
423 | |
424 | /// Returns the minimum length, in bytes, that a haystack must be in order |
425 | /// to use it with this searcher. |
426 | #[inline (always)] |
427 | pub(crate) fn minimum_len(&self) -> usize { |
428 | V::Half::BYTES + (BYTES - 1) |
429 | } |
430 | } |
431 | |
432 | impl<V: FatVector> Fat<V, 1> { |
433 | /// Look for an occurrences of the patterns in this finder in the haystack |
434 | /// given by the `start` and `end` pointers. |
435 | /// |
436 | /// If no match could be found, then `None` is returned. |
437 | /// |
438 | /// # Safety |
439 | /// |
440 | /// The given pointers representing the haystack must be valid to read |
441 | /// from. They must also point to a region of memory that is at least the |
442 | /// minimum length required by this searcher. |
443 | /// |
444 | /// Callers must ensure that this is okay to call in the current target for |
445 | /// the current CPU. |
446 | #[inline (always)] |
447 | pub(crate) unsafe fn find( |
448 | &self, |
449 | start: *const u8, |
450 | end: *const u8, |
451 | ) -> Option<Match> { |
452 | let len = end.distance(start); |
453 | debug_assert!(len >= self.minimum_len()); |
454 | let mut cur = start; |
455 | while cur <= end.sub(V::Half::BYTES) { |
456 | if let Some(m) = self.find_one(cur, end) { |
457 | return Some(m); |
458 | } |
459 | cur = cur.add(V::Half::BYTES); |
460 | } |
461 | if cur < end { |
462 | cur = end.sub(V::Half::BYTES); |
463 | if let Some(m) = self.find_one(cur, end) { |
464 | return Some(m); |
465 | } |
466 | } |
467 | None |
468 | } |
469 | |
470 | /// Look for a match starting at the `V::BYTES` at and after `cur`. If |
471 | /// there isn't one, then `None` is returned. |
472 | /// |
473 | /// # Safety |
474 | /// |
475 | /// The given pointers representing the haystack must be valid to read |
476 | /// from. They must also point to a region of memory that is at least the |
477 | /// minimum length required by this searcher. |
478 | /// |
479 | /// Callers must ensure that this is okay to call in the current target for |
480 | /// the current CPU. |
481 | #[inline (always)] |
482 | unsafe fn find_one( |
483 | &self, |
484 | cur: *const u8, |
485 | end: *const u8, |
486 | ) -> Option<Match> { |
487 | let c = self.candidate(cur); |
488 | if !c.is_zero() { |
489 | if let Some(m) = self.teddy.verify(cur, end, c) { |
490 | return Some(m); |
491 | } |
492 | } |
493 | None |
494 | } |
495 | |
496 | /// Look for a candidate match (represented as a vector) starting at the |
497 | /// `V::BYTES` at and after `cur`. If there isn't one, then a vector with |
498 | /// all bits set to zero is returned. |
499 | /// |
500 | /// # Safety |
501 | /// |
502 | /// The given pointer representing the haystack must be valid to read |
503 | /// from. |
504 | /// |
505 | /// Callers must ensure that this is okay to call in the current target for |
506 | /// the current CPU. |
507 | #[inline (always)] |
508 | unsafe fn candidate(&self, cur: *const u8) -> V { |
509 | let chunk = V::load_half_unaligned(cur); |
510 | Mask::members1(chunk, self.masks) |
511 | } |
512 | } |
513 | |
514 | impl<V: FatVector> Fat<V, 2> { |
515 | /// See `Fat<V, 1>::find`. |
516 | #[inline (always)] |
517 | pub(crate) unsafe fn find( |
518 | &self, |
519 | start: *const u8, |
520 | end: *const u8, |
521 | ) -> Option<Match> { |
522 | let len = end.distance(start); |
523 | debug_assert!(len >= self.minimum_len()); |
524 | let mut cur = start.add(1); |
525 | let mut prev0 = V::splat(0xFF); |
526 | while cur <= end.sub(V::Half::BYTES) { |
527 | if let Some(m) = self.find_one(cur, end, &mut prev0) { |
528 | return Some(m); |
529 | } |
530 | cur = cur.add(V::Half::BYTES); |
531 | } |
532 | if cur < end { |
533 | cur = end.sub(V::Half::BYTES); |
534 | prev0 = V::splat(0xFF); |
535 | if let Some(m) = self.find_one(cur, end, &mut prev0) { |
536 | return Some(m); |
537 | } |
538 | } |
539 | None |
540 | } |
541 | |
542 | /// See `Fat<V, 1>::find_one`. |
543 | #[inline (always)] |
544 | unsafe fn find_one( |
545 | &self, |
546 | cur: *const u8, |
547 | end: *const u8, |
548 | prev0: &mut V, |
549 | ) -> Option<Match> { |
550 | let c = self.candidate(cur, prev0); |
551 | if !c.is_zero() { |
552 | if let Some(m) = self.teddy.verify(cur.sub(1), end, c) { |
553 | return Some(m); |
554 | } |
555 | } |
556 | None |
557 | } |
558 | |
559 | /// See `Fat<V, 1>::candidate`. |
560 | #[inline (always)] |
561 | unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V { |
562 | let chunk = V::load_half_unaligned(cur); |
563 | let (res0, res1) = Mask::members2(chunk, self.masks); |
564 | let res0prev0 = res0.half_shift_in_one_byte(*prev0); |
565 | let res = res0prev0.and(res1); |
566 | *prev0 = res0; |
567 | res |
568 | } |
569 | } |
570 | |
571 | impl<V: FatVector> Fat<V, 3> { |
572 | /// See `Fat<V, 1>::find`. |
573 | #[inline (always)] |
574 | pub(crate) unsafe fn find( |
575 | &self, |
576 | start: *const u8, |
577 | end: *const u8, |
578 | ) -> Option<Match> { |
579 | let len = end.distance(start); |
580 | debug_assert!(len >= self.minimum_len()); |
581 | let mut cur = start.add(2); |
582 | let mut prev0 = V::splat(0xFF); |
583 | let mut prev1 = V::splat(0xFF); |
584 | while cur <= end.sub(V::Half::BYTES) { |
585 | if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { |
586 | return Some(m); |
587 | } |
588 | cur = cur.add(V::Half::BYTES); |
589 | } |
590 | if cur < end { |
591 | cur = end.sub(V::Half::BYTES); |
592 | prev0 = V::splat(0xFF); |
593 | prev1 = V::splat(0xFF); |
594 | if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { |
595 | return Some(m); |
596 | } |
597 | } |
598 | None |
599 | } |
600 | |
601 | /// See `Fat<V, 1>::find_one`. |
602 | #[inline (always)] |
603 | unsafe fn find_one( |
604 | &self, |
605 | cur: *const u8, |
606 | end: *const u8, |
607 | prev0: &mut V, |
608 | prev1: &mut V, |
609 | ) -> Option<Match> { |
610 | let c = self.candidate(cur, prev0, prev1); |
611 | if !c.is_zero() { |
612 | if let Some(m) = self.teddy.verify(cur.sub(2), end, c) { |
613 | return Some(m); |
614 | } |
615 | } |
616 | None |
617 | } |
618 | |
619 | /// See `Fat<V, 1>::candidate`. |
620 | #[inline (always)] |
621 | unsafe fn candidate( |
622 | &self, |
623 | cur: *const u8, |
624 | prev0: &mut V, |
625 | prev1: &mut V, |
626 | ) -> V { |
627 | let chunk = V::load_half_unaligned(cur); |
628 | let (res0, res1, res2) = Mask::members3(chunk, self.masks); |
629 | let res0prev0 = res0.half_shift_in_two_bytes(*prev0); |
630 | let res1prev1 = res1.half_shift_in_one_byte(*prev1); |
631 | let res = res0prev0.and(res1prev1).and(res2); |
632 | *prev0 = res0; |
633 | *prev1 = res1; |
634 | res |
635 | } |
636 | } |
637 | |
638 | impl<V: FatVector> Fat<V, 4> { |
639 | /// See `Fat<V, 1>::find`. |
640 | #[inline (always)] |
641 | pub(crate) unsafe fn find( |
642 | &self, |
643 | start: *const u8, |
644 | end: *const u8, |
645 | ) -> Option<Match> { |
646 | let len = end.distance(start); |
647 | debug_assert!(len >= self.minimum_len()); |
648 | let mut cur = start.add(3); |
649 | let mut prev0 = V::splat(0xFF); |
650 | let mut prev1 = V::splat(0xFF); |
651 | let mut prev2 = V::splat(0xFF); |
652 | while cur <= end.sub(V::Half::BYTES) { |
653 | if let Some(m) = |
654 | self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) |
655 | { |
656 | return Some(m); |
657 | } |
658 | cur = cur.add(V::Half::BYTES); |
659 | } |
660 | if cur < end { |
661 | cur = end.sub(V::Half::BYTES); |
662 | prev0 = V::splat(0xFF); |
663 | prev1 = V::splat(0xFF); |
664 | prev2 = V::splat(0xFF); |
665 | if let Some(m) = |
666 | self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) |
667 | { |
668 | return Some(m); |
669 | } |
670 | } |
671 | None |
672 | } |
673 | |
674 | /// See `Fat<V, 1>::find_one`. |
675 | #[inline (always)] |
676 | unsafe fn find_one( |
677 | &self, |
678 | cur: *const u8, |
679 | end: *const u8, |
680 | prev0: &mut V, |
681 | prev1: &mut V, |
682 | prev2: &mut V, |
683 | ) -> Option<Match> { |
684 | let c = self.candidate(cur, prev0, prev1, prev2); |
685 | if !c.is_zero() { |
686 | if let Some(m) = self.teddy.verify(cur.sub(3), end, c) { |
687 | return Some(m); |
688 | } |
689 | } |
690 | None |
691 | } |
692 | |
693 | /// See `Fat<V, 1>::candidate`. |
694 | #[inline (always)] |
695 | unsafe fn candidate( |
696 | &self, |
697 | cur: *const u8, |
698 | prev0: &mut V, |
699 | prev1: &mut V, |
700 | prev2: &mut V, |
701 | ) -> V { |
702 | let chunk = V::load_half_unaligned(cur); |
703 | let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks); |
704 | let res0prev0 = res0.half_shift_in_three_bytes(*prev0); |
705 | let res1prev1 = res1.half_shift_in_two_bytes(*prev1); |
706 | let res2prev2 = res2.half_shift_in_one_byte(*prev2); |
707 | let res = res0prev0.and(res1prev1).and(res2prev2).and(res3); |
708 | *prev0 = res0; |
709 | *prev1 = res1; |
710 | *prev2 = res2; |
711 | res |
712 | } |
713 | } |
714 | |
715 | /// The common elements of all "slim" and "fat" Teddy search implementations. |
716 | /// |
717 | /// Essentially, this contains the patterns and the buckets. Namely, it |
718 | /// contains enough to implement the verification step after candidates are |
719 | /// identified via the shuffle masks. |
720 | /// |
721 | /// It is generic over the number of buckets used. In general, the number of |
722 | /// buckets is either 8 (for "slim" Teddy) or 16 (for "fat" Teddy). The generic |
723 | /// parameter isn't really meant to be instantiated for any value other than |
724 | /// 8 or 16, although it is technically possible. The main hiccup is that there |
725 | /// is some bit-shifting done in the critical part of verification that could |
726 | /// be quite expensive if `N` is not a multiple of 2. |
727 | #[derive (Clone, Debug)] |
728 | struct Teddy<const BUCKETS: usize> { |
729 | /// The patterns we are searching for. |
730 | /// |
731 | /// A pattern string can be found by its `PatternID`. |
732 | patterns: Arc<Patterns>, |
733 | /// The allocation of patterns in buckets. This only contains the IDs of |
734 | /// patterns. In order to do full verification, callers must provide the |
735 | /// actual patterns when using Teddy. |
736 | buckets: [Vec<PatternID>; BUCKETS], |
737 | // N.B. The above representation is very simple, but it definitely results |
738 | // in ping-ponging between different allocations during verification. I've |
739 | // tried experimenting with other representations that flatten the pattern |
740 | // strings into a single allocation, but it doesn't seem to help much. |
741 | // Probably everything is small enough to fit into cache anyway, and so the |
742 | // pointer chasing isn't a big deal? |
743 | // |
744 | // One other avenue I haven't explored is some kind of hashing trick |
745 | // that let's us do another high-confidence check before launching into |
746 | // `memcmp`. |
747 | } |
748 | |
749 | impl<const BUCKETS: usize> Teddy<BUCKETS> { |
750 | /// Create a new generic data structure for Teddy verification. |
751 | fn new(patterns: Arc<Patterns>) -> Teddy<BUCKETS> { |
752 | assert_ne!(0, patterns.len(), "Teddy requires at least one pattern" ); |
753 | assert_ne!( |
754 | 0, |
755 | patterns.minimum_len(), |
756 | "Teddy does not support zero-length patterns" |
757 | ); |
758 | assert!( |
759 | BUCKETS == 8 || BUCKETS == 16, |
760 | "Teddy only supports 8 or 16 buckets" |
761 | ); |
762 | // MSRV(1.63): Use core::array::from_fn below instead of allocating a |
763 | // superfluous outer Vec. Not a big deal (especially given the BTreeMap |
764 | // allocation below), but nice to not do it. |
765 | let buckets = |
766 | <[Vec<PatternID>; BUCKETS]>::try_from(vec![vec![]; BUCKETS]) |
767 | .unwrap(); |
768 | let mut t = Teddy { patterns, buckets }; |
769 | |
770 | let mut map: BTreeMap<Box<[u8]>, usize> = BTreeMap::new(); |
771 | for (id, pattern) in t.patterns.iter() { |
772 | // We try to be slightly clever in how we assign patterns into |
773 | // buckets. Generally speaking, we want patterns with the same |
774 | // prefix to be in the same bucket, since it minimizes the amount |
775 | // of time we spend churning through buckets in the verification |
776 | // step. |
777 | // |
778 | // So we could assign patterns with the same N-prefix (where N is |
779 | // the size of the mask, which is one of {1, 2, 3}) to the same |
780 | // bucket. However, case insensitive searches are fairly common, so |
781 | // we'd for example, ideally want to treat `abc` and `ABC` as if |
782 | // they shared the same prefix. ASCII has the nice property that |
783 | // the lower 4 bits of A and a are the same, so we therefore group |
784 | // patterns with the same low-nybble-N-prefix into the same bucket. |
785 | // |
786 | // MOREOVER, this is actually necessary for correctness! In |
787 | // particular, by grouping patterns with the same prefix into the |
788 | // same bucket, we ensure that we preserve correct leftmost-first |
789 | // and leftmost-longest match semantics. In addition to the fact |
790 | // that `patterns.iter()` iterates in the correct order, this |
791 | // guarantees that all possible ambiguous matches will occur in |
792 | // the same bucket. The verification routine could be adjusted to |
793 | // support correct leftmost match semantics regardless of bucket |
794 | // allocation, but that results in a performance hit. It's much |
795 | // nicer to be able to just stop as soon as a match is found. |
796 | let lonybs = pattern.low_nybbles(t.mask_len()); |
797 | if let Some(&bucket) = map.get(&lonybs) { |
798 | t.buckets[bucket].push(id); |
799 | } else { |
800 | // N.B. We assign buckets in reverse because it shouldn't have |
801 | // any influence on performance, but it does make it harder to |
802 | // get leftmost match semantics accidentally correct. |
803 | let bucket = (BUCKETS - 1) - (id.as_usize() % BUCKETS); |
804 | t.buckets[bucket].push(id); |
805 | map.insert(lonybs, bucket); |
806 | } |
807 | } |
808 | t |
809 | } |
810 | |
811 | /// Verify whether there are any matches starting at or after `cur` in the |
812 | /// haystack. The candidate chunk given should correspond to 8-bit bitsets |
813 | /// for N buckets. |
814 | /// |
815 | /// # Safety |
816 | /// |
817 | /// The given pointers representing the haystack must be valid to read |
818 | /// from. |
819 | #[inline (always)] |
820 | unsafe fn verify64( |
821 | &self, |
822 | cur: *const u8, |
823 | end: *const u8, |
824 | mut candidate_chunk: u64, |
825 | ) -> Option<Match> { |
826 | while candidate_chunk != 0 { |
827 | let bit = candidate_chunk.trailing_zeros().as_usize(); |
828 | candidate_chunk &= !(1 << bit); |
829 | |
830 | let cur = cur.add(bit / BUCKETS); |
831 | let bucket = bit % BUCKETS; |
832 | if let Some(m) = self.verify_bucket(cur, end, bucket) { |
833 | return Some(m); |
834 | } |
835 | } |
836 | None |
837 | } |
838 | |
839 | /// Verify whether there are any matches starting at `at` in the given |
840 | /// `haystack` corresponding only to patterns in the given bucket. |
841 | /// |
842 | /// # Safety |
843 | /// |
844 | /// The given pointers representing the haystack must be valid to read |
845 | /// from. |
846 | /// |
847 | /// The bucket index must be less than or equal to `self.buckets.len()`. |
848 | #[inline (always)] |
849 | unsafe fn verify_bucket( |
850 | &self, |
851 | cur: *const u8, |
852 | end: *const u8, |
853 | bucket: usize, |
854 | ) -> Option<Match> { |
855 | debug_assert!(bucket < self.buckets.len()); |
856 | // SAFETY: The caller must ensure that the bucket index is correct. |
857 | for pid in self.buckets.get_unchecked(bucket).iter().copied() { |
858 | // SAFETY: This is safe because we are guaranteed that every |
859 | // index in a Teddy bucket is a valid index into `pats`, by |
860 | // construction. |
861 | debug_assert!(pid.as_usize() < self.patterns.len()); |
862 | let pat = self.patterns.get_unchecked(pid); |
863 | if pat.is_prefix_raw(cur, end) { |
864 | let start = cur; |
865 | let end = start.add(pat.len()); |
866 | return Some(Match { pid, start, end }); |
867 | } |
868 | } |
869 | None |
870 | } |
871 | |
872 | /// Returns the total number of masks required by the patterns in this |
873 | /// Teddy searcher. |
874 | /// |
875 | /// Basically, the mask length corresponds to the type of Teddy searcher |
876 | /// to use: a 1-byte, 2-byte, 3-byte or 4-byte searcher. The bigger the |
877 | /// better, typically, since searching for longer substrings usually |
878 | /// decreases the rate of false positives. Therefore, the number of masks |
879 | /// needed is the length of the shortest pattern in this searcher. If the |
880 | /// length of the shortest pattern (in bytes) is bigger than 4, then the |
881 | /// mask length is 4 since there are no Teddy searchers for more than 4 |
882 | /// bytes. |
883 | fn mask_len(&self) -> usize { |
884 | core::cmp::min(4, self.patterns.minimum_len()) |
885 | } |
886 | |
887 | /// Returns the approximate total amount of heap used by this type, in |
888 | /// units of bytes. |
889 | fn memory_usage(&self) -> usize { |
890 | // This is an upper bound rather than a precise accounting. No |
891 | // particular reason, other than it's probably very close to actual |
892 | // memory usage in practice. |
893 | self.patterns.len() * core::mem::size_of::<PatternID>() |
894 | } |
895 | } |
896 | |
897 | impl Teddy<8> { |
898 | /// Runs the verification routine for "slim" Teddy. |
899 | /// |
900 | /// The candidate given should be a collection of 8-bit bitsets (one bitset |
901 | /// per lane), where the ith bit is set in the jth lane if and only if the |
902 | /// byte occurring at `at + j` in `cur` is in the bucket `i`. |
903 | /// |
904 | /// # Safety |
905 | /// |
906 | /// Callers must ensure that this is okay to call in the current target for |
907 | /// the current CPU. |
908 | /// |
909 | /// The given pointers must be valid to read from. |
910 | #[inline (always)] |
911 | unsafe fn verify<V: Vector>( |
912 | &self, |
913 | mut cur: *const u8, |
914 | end: *const u8, |
915 | candidate: V, |
916 | ) -> Option<Match> { |
917 | debug_assert!(!candidate.is_zero()); |
918 | // Convert the candidate into 64-bit chunks, and then verify each of |
919 | // those chunks. |
920 | candidate.for_each_64bit_lane( |
921 | #[inline (always)] |
922 | |_, chunk| { |
923 | let result = self.verify64(cur, end, chunk); |
924 | cur = cur.add(8); |
925 | result |
926 | }, |
927 | ) |
928 | } |
929 | } |
930 | |
931 | impl Teddy<16> { |
932 | /// Runs the verification routine for "fat" Teddy. |
933 | /// |
934 | /// The candidate given should be a collection of 8-bit bitsets (one bitset |
935 | /// per lane), where the ith bit is set in the jth lane if and only if the |
936 | /// byte occurring at `at + (j < 16 ? j : j - 16)` in `cur` is in the |
937 | /// bucket `j < 16 ? i : i + 8`. |
938 | /// |
939 | /// # Safety |
940 | /// |
941 | /// Callers must ensure that this is okay to call in the current target for |
942 | /// the current CPU. |
943 | /// |
944 | /// The given pointers must be valid to read from. |
945 | #[inline (always)] |
946 | unsafe fn verify<V: FatVector>( |
947 | &self, |
948 | mut cur: *const u8, |
949 | end: *const u8, |
950 | candidate: V, |
951 | ) -> Option<Match> { |
952 | // This is a bit tricky, but we basically want to convert our |
953 | // candidate, which looks like this (assuming a 256-bit vector): |
954 | // |
955 | // a31 a30 ... a17 a16 a15 a14 ... a01 a00 |
956 | // |
957 | // where each a(i) is an 8-bit bitset corresponding to the activated |
958 | // buckets, to this |
959 | // |
960 | // a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00 |
961 | // |
962 | // Namely, for Fat Teddy, the high 128-bits of the candidate correspond |
963 | // to the same bytes in the haystack in the low 128-bits (so we only |
964 | // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7. |
965 | // |
966 | // The verification routine wants to look at all potentially matching |
967 | // buckets before moving on to the next lane. So for example, both |
968 | // a16 and a00 both correspond to the first byte in our window; a00 |
969 | // contains buckets 0-7 and a16 contains buckets 8-15. Specifically, |
970 | // a16 should be checked before a01. So the transformation shown above |
971 | // allows us to use our normal verification procedure with one small |
972 | // change: we treat each bitset as 16 bits instead of 8 bits. |
973 | debug_assert!(!candidate.is_zero()); |
974 | |
975 | // Swap the 128-bit lanes in the candidate vector. |
976 | let swapped = candidate.swap_halves(); |
977 | // Interleave the bytes from the low 128-bit lanes, starting with |
978 | // cand first. |
979 | let r1 = candidate.interleave_low_8bit_lanes(swapped); |
980 | // Interleave the bytes from the high 128-bit lanes, starting with |
981 | // cand first. |
982 | let r2 = candidate.interleave_high_8bit_lanes(swapped); |
983 | // Now just take the 2 low 64-bit integers from both r1 and r2. We |
984 | // can drop the high 64-bit integers because they are a mirror image |
985 | // of the low 64-bit integers. All we care about are the low 128-bit |
986 | // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets |
987 | // laid out in the desired order, as described above. |
988 | r1.for_each_low_64bit_lane( |
989 | r2, |
990 | #[inline (always)] |
991 | |_, chunk| { |
992 | let result = self.verify64(cur, end, chunk); |
993 | cur = cur.add(4); |
994 | result |
995 | }, |
996 | ) |
997 | } |
998 | } |
999 | |
1000 | /// A vector generic mask for the low and high nybbles in a set of patterns. |
1001 | /// Each 8-bit lane `j` in a vector corresponds to a bitset where the `i`th bit |
1002 | /// is set if and only if the nybble `j` is in the bucket `i` at a particular |
1003 | /// position. |
1004 | /// |
1005 | /// This is slightly tweaked dependending on whether Slim or Fat Teddy is being |
1006 | /// used. For Slim Teddy, the bitsets in the lower half are the same as the |
1007 | /// bitsets in the higher half, so that we can search `V::BYTES` bytes at a |
1008 | /// time. (Remember, the nybbles in the haystack are used as indices into these |
1009 | /// masks, and 256-bit shuffles only operate on 128-bit lanes.) |
1010 | /// |
1011 | /// For Fat Teddy, the bitsets are not repeated, but instead, the high half |
1012 | /// bits correspond to an addition 8 buckets. So that a bitset `00100010` has |
1013 | /// buckets 1 and 5 set if it's in the lower half, but has buckets 9 and 13 set |
1014 | /// if it's in the higher half. |
1015 | #[derive (Clone, Copy, Debug)] |
1016 | struct Mask<V> { |
1017 | lo: V, |
1018 | hi: V, |
1019 | } |
1020 | |
1021 | impl<V: Vector> Mask<V> { |
1022 | /// Return a candidate for Teddy (fat or slim) that is searching for 1-byte |
1023 | /// candidates. |
1024 | /// |
1025 | /// If a candidate is returned, it will be a collection of 8-bit bitsets |
1026 | /// (one bitset per lane), where the ith bit is set in the jth lane if and |
1027 | /// only if the byte occurring at the jth lane in `chunk` is in the bucket |
1028 | /// `i`. If no candidate is found, then the vector returned will have all |
1029 | /// lanes set to zero. |
1030 | /// |
1031 | /// `chunk` should correspond to a `V::BYTES` window of the haystack (where |
1032 | /// the least significant byte corresponds to the start of the window). For |
1033 | /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with |
1034 | /// the window repeated in each half of the vector. |
1035 | /// |
1036 | /// `mask1` should correspond to a low/high mask for the first byte of all |
1037 | /// patterns that are being searched. |
1038 | #[inline (always)] |
1039 | unsafe fn members1(chunk: V, masks: [Mask<V>; 1]) -> V { |
1040 | let lomask = V::splat(0xF); |
1041 | let hlo = chunk.and(lomask); |
1042 | let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); |
1043 | let locand = masks[0].lo.shuffle_bytes(hlo); |
1044 | let hicand = masks[0].hi.shuffle_bytes(hhi); |
1045 | locand.and(hicand) |
1046 | } |
1047 | |
1048 | /// Return a candidate for Teddy (fat or slim) that is searching for 2-byte |
1049 | /// candidates. |
1050 | /// |
1051 | /// If candidates are returned, each will be a collection of 8-bit bitsets |
1052 | /// (one bitset per lane), where the ith bit is set in the jth lane if and |
1053 | /// only if the byte occurring at the jth lane in `chunk` is in the bucket |
1054 | /// `i`. Each candidate returned corresponds to the first and second bytes |
1055 | /// of the patterns being searched. If no candidate is found, then all of |
1056 | /// the lanes will be set to zero in at least one of the vectors returned. |
1057 | /// |
1058 | /// `chunk` should correspond to a `V::BYTES` window of the haystack (where |
1059 | /// the least significant byte corresponds to the start of the window). For |
1060 | /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with |
1061 | /// the window repeated in each half of the vector. |
1062 | /// |
1063 | /// The masks should correspond to the masks computed for the first and |
1064 | /// second bytes of all patterns that are being searched. |
1065 | #[inline (always)] |
1066 | unsafe fn members2(chunk: V, masks: [Mask<V>; 2]) -> (V, V) { |
1067 | let lomask = V::splat(0xF); |
1068 | let hlo = chunk.and(lomask); |
1069 | let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); |
1070 | |
1071 | let locand1 = masks[0].lo.shuffle_bytes(hlo); |
1072 | let hicand1 = masks[0].hi.shuffle_bytes(hhi); |
1073 | let cand1 = locand1.and(hicand1); |
1074 | |
1075 | let locand2 = masks[1].lo.shuffle_bytes(hlo); |
1076 | let hicand2 = masks[1].hi.shuffle_bytes(hhi); |
1077 | let cand2 = locand2.and(hicand2); |
1078 | |
1079 | (cand1, cand2) |
1080 | } |
1081 | |
1082 | /// Return a candidate for Teddy (fat or slim) that is searching for 3-byte |
1083 | /// candidates. |
1084 | /// |
1085 | /// If candidates are returned, each will be a collection of 8-bit bitsets |
1086 | /// (one bitset per lane), where the ith bit is set in the jth lane if and |
1087 | /// only if the byte occurring at the jth lane in `chunk` is in the bucket |
1088 | /// `i`. Each candidate returned corresponds to the first, second and third |
1089 | /// bytes of the patterns being searched. If no candidate is found, then |
1090 | /// all of the lanes will be set to zero in at least one of the vectors |
1091 | /// returned. |
1092 | /// |
1093 | /// `chunk` should correspond to a `V::BYTES` window of the haystack (where |
1094 | /// the least significant byte corresponds to the start of the window). For |
1095 | /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with |
1096 | /// the window repeated in each half of the vector. |
1097 | /// |
1098 | /// The masks should correspond to the masks computed for the first, second |
1099 | /// and third bytes of all patterns that are being searched. |
1100 | #[inline (always)] |
1101 | unsafe fn members3(chunk: V, masks: [Mask<V>; 3]) -> (V, V, V) { |
1102 | let lomask = V::splat(0xF); |
1103 | let hlo = chunk.and(lomask); |
1104 | let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); |
1105 | |
1106 | let locand1 = masks[0].lo.shuffle_bytes(hlo); |
1107 | let hicand1 = masks[0].hi.shuffle_bytes(hhi); |
1108 | let cand1 = locand1.and(hicand1); |
1109 | |
1110 | let locand2 = masks[1].lo.shuffle_bytes(hlo); |
1111 | let hicand2 = masks[1].hi.shuffle_bytes(hhi); |
1112 | let cand2 = locand2.and(hicand2); |
1113 | |
1114 | let locand3 = masks[2].lo.shuffle_bytes(hlo); |
1115 | let hicand3 = masks[2].hi.shuffle_bytes(hhi); |
1116 | let cand3 = locand3.and(hicand3); |
1117 | |
1118 | (cand1, cand2, cand3) |
1119 | } |
1120 | |
1121 | /// Return a candidate for Teddy (fat or slim) that is searching for 4-byte |
1122 | /// candidates. |
1123 | /// |
1124 | /// If candidates are returned, each will be a collection of 8-bit bitsets |
1125 | /// (one bitset per lane), where the ith bit is set in the jth lane if and |
1126 | /// only if the byte occurring at the jth lane in `chunk` is in the bucket |
1127 | /// `i`. Each candidate returned corresponds to the first, second, third |
1128 | /// and fourth bytes of the patterns being searched. If no candidate is |
1129 | /// found, then all of the lanes will be set to zero in at least one of the |
1130 | /// vectors returned. |
1131 | /// |
1132 | /// `chunk` should correspond to a `V::BYTES` window of the haystack (where |
1133 | /// the least significant byte corresponds to the start of the window). For |
1134 | /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with |
1135 | /// the window repeated in each half of the vector. |
1136 | /// |
1137 | /// The masks should correspond to the masks computed for the first, |
1138 | /// second, third and fourth bytes of all patterns that are being searched. |
1139 | #[inline (always)] |
1140 | unsafe fn members4(chunk: V, masks: [Mask<V>; 4]) -> (V, V, V, V) { |
1141 | let lomask = V::splat(0xF); |
1142 | let hlo = chunk.and(lomask); |
1143 | let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); |
1144 | |
1145 | let locand1 = masks[0].lo.shuffle_bytes(hlo); |
1146 | let hicand1 = masks[0].hi.shuffle_bytes(hhi); |
1147 | let cand1 = locand1.and(hicand1); |
1148 | |
1149 | let locand2 = masks[1].lo.shuffle_bytes(hlo); |
1150 | let hicand2 = masks[1].hi.shuffle_bytes(hhi); |
1151 | let cand2 = locand2.and(hicand2); |
1152 | |
1153 | let locand3 = masks[2].lo.shuffle_bytes(hlo); |
1154 | let hicand3 = masks[2].hi.shuffle_bytes(hhi); |
1155 | let cand3 = locand3.and(hicand3); |
1156 | |
1157 | let locand4 = masks[3].lo.shuffle_bytes(hlo); |
1158 | let hicand4 = masks[3].hi.shuffle_bytes(hhi); |
1159 | let cand4 = locand4.and(hicand4); |
1160 | |
1161 | (cand1, cand2, cand3, cand4) |
1162 | } |
1163 | } |
1164 | |
1165 | /// Represents the low and high nybble masks that will be used during |
1166 | /// search. Each mask is 32 bytes wide, although only the first 16 bytes are |
1167 | /// used for 128-bit vectors. |
1168 | /// |
1169 | /// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set |
1170 | /// if and only if the corresponding nybble is in the ith bucket. The index of |
1171 | /// the byte (0-15, inclusive) corresponds to the nybble. |
1172 | /// |
1173 | /// Each mask is used as the target of a shuffle, where the indices for the |
1174 | /// shuffle are taken from the haystack. AND'ing the shuffles for both the |
1175 | /// low and high masks together also results in 8-bit bitsets, but where bit |
1176 | /// `i` is set if and only if the correspond *byte* is in the ith bucket. |
1177 | #[derive (Clone, Default)] |
1178 | struct SlimMaskBuilder { |
1179 | lo: [u8; 32], |
1180 | hi: [u8; 32], |
1181 | } |
1182 | |
1183 | impl SlimMaskBuilder { |
1184 | /// Update this mask by adding the given byte to the given bucket. The |
1185 | /// given bucket must be in the range 0-7. |
1186 | /// |
1187 | /// # Panics |
1188 | /// |
1189 | /// When `bucket >= 8`. |
1190 | fn add(&mut self, bucket: usize, byte: u8) { |
1191 | assert!(bucket < 8); |
1192 | |
1193 | let bucket = u8::try_from(bucket).unwrap(); |
1194 | let byte_lo = usize::from(byte & 0xF); |
1195 | let byte_hi = usize::from((byte >> 4) & 0xF); |
1196 | // When using 256-bit vectors, we need to set this bucket assignment in |
1197 | // the low and high 128-bit portions of the mask. This allows us to |
1198 | // process 32 bytes at a time. Namely, AVX2 shuffles operate on each |
1199 | // of the 128-bit lanes, rather than the full 256-bit vector at once. |
1200 | self.lo[byte_lo] |= 1 << bucket; |
1201 | self.lo[byte_lo + 16] |= 1 << bucket; |
1202 | self.hi[byte_hi] |= 1 << bucket; |
1203 | self.hi[byte_hi + 16] |= 1 << bucket; |
1204 | } |
1205 | |
1206 | /// Turn this builder into a vector mask. |
1207 | /// |
1208 | /// # Panics |
1209 | /// |
1210 | /// When `V` represents a vector bigger than what `MaskBytes` can contain. |
1211 | /// |
1212 | /// # Safety |
1213 | /// |
1214 | /// Callers must ensure that this is okay to call in the current target for |
1215 | /// the current CPU. |
1216 | #[inline (always)] |
1217 | unsafe fn build<V: Vector>(&self) -> Mask<V> { |
1218 | assert!(V::BYTES <= self.lo.len()); |
1219 | assert!(V::BYTES <= self.hi.len()); |
1220 | Mask { |
1221 | lo: V::load_unaligned(self.lo[..].as_ptr()), |
1222 | hi: V::load_unaligned(self.hi[..].as_ptr()), |
1223 | } |
1224 | } |
1225 | |
1226 | /// A convenience function for building `N` vector masks from a slim |
1227 | /// `Teddy` value. |
1228 | /// |
1229 | /// # Panics |
1230 | /// |
1231 | /// When `V` represents a vector bigger than what `MaskBytes` can contain. |
1232 | /// |
1233 | /// # Safety |
1234 | /// |
1235 | /// Callers must ensure that this is okay to call in the current target for |
1236 | /// the current CPU. |
1237 | #[inline (always)] |
1238 | unsafe fn from_teddy<const BYTES: usize, V: Vector>( |
1239 | teddy: &Teddy<8>, |
1240 | ) -> [Mask<V>; BYTES] { |
1241 | // MSRV(1.63): Use core::array::from_fn to just build the array here |
1242 | // instead of creating a vector and turning it into an array. |
1243 | let mut mask_builders = vec![SlimMaskBuilder::default(); BYTES]; |
1244 | for (bucket_index, bucket) in teddy.buckets.iter().enumerate() { |
1245 | for pid in bucket.iter().copied() { |
1246 | let pat = teddy.patterns.get(pid); |
1247 | for (i, builder) in mask_builders.iter_mut().enumerate() { |
1248 | builder.add(bucket_index, pat.bytes()[i]); |
1249 | } |
1250 | } |
1251 | } |
1252 | let array = |
1253 | <[SlimMaskBuilder; BYTES]>::try_from(mask_builders).unwrap(); |
1254 | array.map(|builder| builder.build()) |
1255 | } |
1256 | } |
1257 | |
1258 | impl Debug for SlimMaskBuilder { |
1259 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
1260 | let (mut parts_lo: Vec, mut parts_hi: Vec) = (vec![], vec![]); |
1261 | for i: usize in 0..32 { |
1262 | parts_lo.push(format!(" {:02}: {:08b}" , i, self.lo[i])); |
1263 | parts_hi.push(format!(" {:02}: {:08b}" , i, self.hi[i])); |
1264 | } |
1265 | f&mut DebugStruct<'_, '_>.debug_struct("SlimMaskBuilder" ) |
1266 | .field("lo" , &parts_lo) |
1267 | .field(name:"hi" , &parts_hi) |
1268 | .finish() |
1269 | } |
1270 | } |
1271 | |
1272 | /// Represents the low and high nybble masks that will be used during "fat" |
1273 | /// Teddy search. |
1274 | /// |
1275 | /// Each mask is 32 bytes wide, and at the time of writing, only 256-bit vectors |
1276 | /// support fat Teddy. |
1277 | /// |
1278 | /// A fat Teddy mask is like a slim Teddy mask, except that instead of |
1279 | /// repeating the bitsets in the high and low 128-bits in 256-bit vectors, the |
1280 | /// high and low 128-bit halves each represent distinct buckets. (Bringing the |
1281 | /// total to 16 instead of 8.) This permits spreading the patterns out a bit |
1282 | /// more and thus putting less pressure on verification to be fast. |
1283 | /// |
1284 | /// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set |
1285 | /// if and only if the corresponding nybble is in the ith bucket. The index of |
1286 | /// the byte (0-15, inclusive) corresponds to the nybble. |
1287 | #[derive (Clone, Copy, Default)] |
1288 | struct FatMaskBuilder { |
1289 | lo: [u8; 32], |
1290 | hi: [u8; 32], |
1291 | } |
1292 | |
1293 | impl FatMaskBuilder { |
1294 | /// Update this mask by adding the given byte to the given bucket. The |
1295 | /// given bucket must be in the range 0-15. |
1296 | /// |
1297 | /// # Panics |
1298 | /// |
1299 | /// When `bucket >= 16`. |
1300 | fn add(&mut self, bucket: usize, byte: u8) { |
1301 | assert!(bucket < 16); |
1302 | |
1303 | let bucket = u8::try_from(bucket).unwrap(); |
1304 | let byte_lo = usize::from(byte & 0xF); |
1305 | let byte_hi = usize::from((byte >> 4) & 0xF); |
1306 | // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy, |
1307 | // the high 128 bits of our mask correspond to buckets 8-15, while the |
1308 | // low 128 bits correspond to buckets 0-7. |
1309 | if bucket < 8 { |
1310 | self.lo[byte_lo] |= 1 << bucket; |
1311 | self.hi[byte_hi] |= 1 << bucket; |
1312 | } else { |
1313 | self.lo[byte_lo + 16] |= 1 << (bucket % 8); |
1314 | self.hi[byte_hi + 16] |= 1 << (bucket % 8); |
1315 | } |
1316 | } |
1317 | |
1318 | /// Turn this builder into a vector mask. |
1319 | /// |
1320 | /// # Panics |
1321 | /// |
1322 | /// When `V` represents a vector bigger than what `MaskBytes` can contain. |
1323 | /// |
1324 | /// # Safety |
1325 | /// |
1326 | /// Callers must ensure that this is okay to call in the current target for |
1327 | /// the current CPU. |
1328 | #[inline (always)] |
1329 | unsafe fn build<V: Vector>(&self) -> Mask<V> { |
1330 | assert!(V::BYTES <= self.lo.len()); |
1331 | assert!(V::BYTES <= self.hi.len()); |
1332 | Mask { |
1333 | lo: V::load_unaligned(self.lo[..].as_ptr()), |
1334 | hi: V::load_unaligned(self.hi[..].as_ptr()), |
1335 | } |
1336 | } |
1337 | |
1338 | /// A convenience function for building `N` vector masks from a fat |
1339 | /// `Teddy` value. |
1340 | /// |
1341 | /// # Panics |
1342 | /// |
1343 | /// When `V` represents a vector bigger than what `MaskBytes` can contain. |
1344 | /// |
1345 | /// # Safety |
1346 | /// |
1347 | /// Callers must ensure that this is okay to call in the current target for |
1348 | /// the current CPU. |
1349 | #[inline (always)] |
1350 | unsafe fn from_teddy<const BYTES: usize, V: Vector>( |
1351 | teddy: &Teddy<16>, |
1352 | ) -> [Mask<V>; BYTES] { |
1353 | // MSRV(1.63): Use core::array::from_fn to just build the array here |
1354 | // instead of creating a vector and turning it into an array. |
1355 | let mut mask_builders = vec![FatMaskBuilder::default(); BYTES]; |
1356 | for (bucket_index, bucket) in teddy.buckets.iter().enumerate() { |
1357 | for pid in bucket.iter().copied() { |
1358 | let pat = teddy.patterns.get(pid); |
1359 | for (i, builder) in mask_builders.iter_mut().enumerate() { |
1360 | builder.add(bucket_index, pat.bytes()[i]); |
1361 | } |
1362 | } |
1363 | } |
1364 | let array = |
1365 | <[FatMaskBuilder; BYTES]>::try_from(mask_builders).unwrap(); |
1366 | array.map(|builder| builder.build()) |
1367 | } |
1368 | } |
1369 | |
1370 | impl Debug for FatMaskBuilder { |
1371 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
1372 | let (mut parts_lo: Vec, mut parts_hi: Vec) = (vec![], vec![]); |
1373 | for i: usize in 0..32 { |
1374 | parts_lo.push(format!(" {:02}: {:08b}" , i, self.lo[i])); |
1375 | parts_hi.push(format!(" {:02}: {:08b}" , i, self.hi[i])); |
1376 | } |
1377 | f&mut DebugStruct<'_, '_>.debug_struct("FatMaskBuilder" ) |
1378 | .field("lo" , &parts_lo) |
1379 | .field(name:"hi" , &parts_hi) |
1380 | .finish() |
1381 | } |
1382 | } |
1383 | |