1 | use core::{ |
2 | fmt::Debug, |
3 | panic::{RefUnwindSafe, UnwindSafe}, |
4 | }; |
5 | |
6 | use alloc::sync::Arc; |
7 | |
8 | use crate::packed::{ext::Pointer, pattern::Patterns, teddy::generic::Match}; |
9 | |
10 | /// A builder for constructing a Teddy matcher. |
11 | /// |
12 | /// The builder primarily permits fine grained configuration of the Teddy |
13 | /// matcher. Most options are made only available for testing/benchmarking |
14 | /// purposes. In reality, options are automatically determined by the nature |
15 | /// and number of patterns given to the builder. |
16 | #[derive (Clone, Debug)] |
17 | pub(crate) struct Builder { |
18 | /// When none, this is automatically determined. Otherwise, `false` means |
19 | /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used |
20 | /// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't |
21 | /// available and Fat Teddy was requested, no matcher will be built. |
22 | only_fat: Option<bool>, |
23 | /// When none, this is automatically determined. Otherwise, `false` means |
24 | /// that 128-bit vectors will be used (up to SSSE3 instructions) where as |
25 | /// `true` means that 256-bit vectors will be used. As with `fat`, if |
26 | /// 256-bit vectors are requested and they aren't available, then a |
27 | /// searcher will not be built. |
28 | only_256bit: Option<bool>, |
29 | /// When true (the default), the number of patterns will be used as a |
30 | /// heuristic for refusing construction of a Teddy searcher. The point here |
31 | /// is that too many patterns can overwhelm Teddy. But this can be disabled |
32 | /// in cases where the caller knows better. |
33 | heuristic_pattern_limits: bool, |
34 | } |
35 | |
36 | impl Default for Builder { |
37 | fn default() -> Builder { |
38 | Builder::new() |
39 | } |
40 | } |
41 | |
42 | impl Builder { |
43 | /// Create a new builder for configuring a Teddy matcher. |
44 | pub(crate) fn new() -> Builder { |
45 | Builder { |
46 | only_fat: None, |
47 | only_256bit: None, |
48 | heuristic_pattern_limits: true, |
49 | } |
50 | } |
51 | |
52 | /// Build a matcher for the set of patterns given. If a matcher could not |
53 | /// be built, then `None` is returned. |
54 | /// |
55 | /// Generally, a matcher isn't built if the necessary CPU features aren't |
56 | /// available, an unsupported target or if the searcher is believed to be |
57 | /// slower than standard techniques (i.e., if there are too many literals). |
58 | pub(crate) fn build(&self, patterns: Arc<Patterns>) -> Option<Searcher> { |
59 | self.build_imp(patterns) |
60 | } |
61 | |
62 | /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses |
63 | /// 16 buckets where as Slim Teddy uses 8 buckets. More buckets are useful |
64 | /// for a larger set of literals. |
65 | /// |
66 | /// `None` is the default, which results in an automatic selection based |
67 | /// on the number of literals and available CPU features. |
68 | pub(crate) fn only_fat(&mut self, yes: Option<bool>) -> &mut Builder { |
69 | self.only_fat = yes; |
70 | self |
71 | } |
72 | |
73 | /// Request the use of 256-bit vectors (true) or 128-bit vectors (false). |
74 | /// Generally, a larger vector size is better since it either permits |
75 | /// matching more patterns or matching more bytes in the haystack at once. |
76 | /// |
77 | /// `None` is the default, which results in an automatic selection based on |
78 | /// the number of literals and available CPU features. |
79 | pub(crate) fn only_256bit(&mut self, yes: Option<bool>) -> &mut Builder { |
80 | self.only_256bit = yes; |
81 | self |
82 | } |
83 | |
84 | /// Request that heuristic limitations on the number of patterns be |
85 | /// employed. This useful to disable for benchmarking where one wants to |
86 | /// explore how Teddy performs on large number of patterns even if the |
87 | /// heuristics would otherwise refuse construction. |
88 | /// |
89 | /// This is enabled by default. |
90 | pub(crate) fn heuristic_pattern_limits( |
91 | &mut self, |
92 | yes: bool, |
93 | ) -> &mut Builder { |
94 | self.heuristic_pattern_limits = yes; |
95 | self |
96 | } |
97 | |
98 | fn build_imp(&self, patterns: Arc<Patterns>) -> Option<Searcher> { |
99 | let patlimit = self.heuristic_pattern_limits; |
100 | // There's no particular reason why we limit ourselves to little endian |
101 | // here, but it seems likely that some parts of Teddy as they are |
102 | // currently written (e.g., the uses of `trailing_zeros`) are likely |
103 | // wrong on non-little-endian targets. Such things are likely easy to |
104 | // fix, but at the time of writing (2023/09/18), I actually do not know |
105 | // how to test this code on a big-endian target. So for now, we're |
106 | // conservative and just bail out. |
107 | if !cfg!(target_endian = "little" ) { |
108 | debug!("skipping Teddy because target isn't little endian" ); |
109 | return None; |
110 | } |
111 | // Too many patterns will overwhelm Teddy and likely lead to slow |
112 | // downs, typically in the verification step. |
113 | if patlimit && patterns.len() > 64 { |
114 | debug!("skipping Teddy because of too many patterns" ); |
115 | return None; |
116 | } |
117 | |
118 | #[cfg (all(target_arch = "x86_64" , target_feature = "sse2" ))] |
119 | { |
120 | use self::x86_64::{FatAVX2, SlimAVX2, SlimSSSE3}; |
121 | |
122 | let mask_len = core::cmp::min(4, patterns.minimum_len()); |
123 | let beefy = patterns.len() > 32; |
124 | let has_avx2 = self::x86_64::is_available_avx2(); |
125 | let has_ssse3 = has_avx2 || self::x86_64::is_available_ssse3(); |
126 | let use_avx2 = if self.only_256bit == Some(true) { |
127 | if !has_avx2 { |
128 | debug!( |
129 | "skipping Teddy because avx2 was demanded but unavailable" |
130 | ); |
131 | return None; |
132 | } |
133 | true |
134 | } else if self.only_256bit == Some(false) { |
135 | if !has_ssse3 { |
136 | debug!( |
137 | "skipping Teddy because ssse3 was demanded but unavailable" |
138 | ); |
139 | return None; |
140 | } |
141 | false |
142 | } else if !has_ssse3 && !has_avx2 { |
143 | debug!( |
144 | "skipping Teddy because ssse3 and avx2 are unavailable" |
145 | ); |
146 | return None; |
147 | } else { |
148 | has_avx2 |
149 | }; |
150 | let fat = match self.only_fat { |
151 | None => use_avx2 && beefy, |
152 | Some(false) => false, |
153 | Some(true) if !use_avx2 => { |
154 | debug!( |
155 | "skipping Teddy because fat was demanded, but fat \ |
156 | Teddy requires avx2 which is unavailable" |
157 | ); |
158 | return None; |
159 | } |
160 | Some(true) => true, |
161 | }; |
162 | // Just like for aarch64, it's possible that too many patterns will |
163 | // overhwelm Teddy. Unlike aarch64 though, we have Fat teddy which |
164 | // helps things scale a bit more by spreading patterns over more |
165 | // buckets. |
166 | // |
167 | // These thresholds were determined by looking at the measurements |
168 | // for the rust/aho-corasick/packed/leftmost-first and |
169 | // rust/aho-corasick/dfa/leftmost-first engines on the `teddy/` |
170 | // benchmarks. |
171 | if patlimit && mask_len == 1 && patterns.len() > 16 { |
172 | debug!( |
173 | "skipping Teddy (mask len: 1) because there are \ |
174 | too many patterns" , |
175 | ); |
176 | return None; |
177 | } |
178 | match (mask_len, use_avx2, fat) { |
179 | (1, false, _) => { |
180 | debug!("Teddy choice: 128-bit slim, 1 byte" ); |
181 | SlimSSSE3::<1>::new(&patterns) |
182 | } |
183 | (1, true, false) => { |
184 | debug!("Teddy choice: 256-bit slim, 1 byte" ); |
185 | SlimAVX2::<1>::new(&patterns) |
186 | } |
187 | (1, true, true) => { |
188 | debug!("Teddy choice: 256-bit fat, 1 byte" ); |
189 | FatAVX2::<1>::new(&patterns) |
190 | } |
191 | (2, false, _) => { |
192 | debug!("Teddy choice: 128-bit slim, 2 bytes" ); |
193 | SlimSSSE3::<2>::new(&patterns) |
194 | } |
195 | (2, true, false) => { |
196 | debug!("Teddy choice: 256-bit slim, 2 bytes" ); |
197 | SlimAVX2::<2>::new(&patterns) |
198 | } |
199 | (2, true, true) => { |
200 | debug!("Teddy choice: 256-bit fat, 2 bytes" ); |
201 | FatAVX2::<2>::new(&patterns) |
202 | } |
203 | (3, false, _) => { |
204 | debug!("Teddy choice: 128-bit slim, 3 bytes" ); |
205 | SlimSSSE3::<3>::new(&patterns) |
206 | } |
207 | (3, true, false) => { |
208 | debug!("Teddy choice: 256-bit slim, 3 bytes" ); |
209 | SlimAVX2::<3>::new(&patterns) |
210 | } |
211 | (3, true, true) => { |
212 | debug!("Teddy choice: 256-bit fat, 3 bytes" ); |
213 | FatAVX2::<3>::new(&patterns) |
214 | } |
215 | (4, false, _) => { |
216 | debug!("Teddy choice: 128-bit slim, 4 bytes" ); |
217 | SlimSSSE3::<4>::new(&patterns) |
218 | } |
219 | (4, true, false) => { |
220 | debug!("Teddy choice: 256-bit slim, 4 bytes" ); |
221 | SlimAVX2::<4>::new(&patterns) |
222 | } |
223 | (4, true, true) => { |
224 | debug!("Teddy choice: 256-bit fat, 4 bytes" ); |
225 | FatAVX2::<4>::new(&patterns) |
226 | } |
227 | _ => { |
228 | debug!("no supported Teddy configuration found" ); |
229 | None |
230 | } |
231 | } |
232 | } |
233 | #[cfg (all( |
234 | target_arch = "aarch64" , |
235 | target_feature = "neon" , |
236 | target_endian = "little" |
237 | ))] |
238 | { |
239 | use self::aarch64::SlimNeon; |
240 | |
241 | let mask_len = core::cmp::min(4, patterns.minimum_len()); |
242 | if self.only_256bit == Some(true) { |
243 | debug!( |
244 | "skipping Teddy because 256-bits were demanded \ |
245 | but unavailable" |
246 | ); |
247 | return None; |
248 | } |
249 | if self.only_fat == Some(true) { |
250 | debug!( |
251 | "skipping Teddy because fat was demanded but unavailable" |
252 | ); |
253 | } |
254 | // Since we don't have Fat teddy in aarch64 (I think we'd want at |
255 | // least 256-bit vectors for that), we need to be careful not to |
256 | // allow too many patterns as it might overwhelm Teddy. Generally |
257 | // speaking, as the mask length goes up, the more patterns we can |
258 | // handle because the mask length results in fewer candidates |
259 | // generated. |
260 | // |
261 | // These thresholds were determined by looking at the measurements |
262 | // for the rust/aho-corasick/packed/leftmost-first and |
263 | // rust/aho-corasick/dfa/leftmost-first engines on the `teddy/` |
264 | // benchmarks. |
265 | match mask_len { |
266 | 1 => { |
267 | if patlimit && patterns.len() > 16 { |
268 | debug!( |
269 | "skipping Teddy (mask len: 1) because there are \ |
270 | too many patterns" , |
271 | ); |
272 | } |
273 | debug!("Teddy choice: 128-bit slim, 1 byte" ); |
274 | SlimNeon::<1>::new(&patterns) |
275 | } |
276 | 2 => { |
277 | if patlimit && patterns.len() > 32 { |
278 | debug!( |
279 | "skipping Teddy (mask len: 2) because there are \ |
280 | too many patterns" , |
281 | ); |
282 | } |
283 | debug!("Teddy choice: 128-bit slim, 2 bytes" ); |
284 | SlimNeon::<2>::new(&patterns) |
285 | } |
286 | 3 => { |
287 | if patlimit && patterns.len() > 48 { |
288 | debug!( |
289 | "skipping Teddy (mask len: 3) because there are \ |
290 | too many patterns" , |
291 | ); |
292 | } |
293 | debug!("Teddy choice: 128-bit slim, 3 bytes" ); |
294 | SlimNeon::<3>::new(&patterns) |
295 | } |
296 | 4 => { |
297 | debug!("Teddy choice: 128-bit slim, 4 bytes" ); |
298 | SlimNeon::<4>::new(&patterns) |
299 | } |
300 | _ => { |
301 | debug!("no supported Teddy configuration found" ); |
302 | None |
303 | } |
304 | } |
305 | } |
306 | #[cfg (not(any( |
307 | all(target_arch = "x86_64" , target_feature = "sse2" ), |
308 | all( |
309 | target_arch = "aarch64" , |
310 | target_feature = "neon" , |
311 | target_endian = "little" |
312 | ) |
313 | )))] |
314 | { |
315 | None |
316 | } |
317 | } |
318 | } |
319 | |
320 | /// A searcher that dispatches to one of several possible Teddy variants. |
321 | #[derive (Clone, Debug)] |
322 | pub(crate) struct Searcher { |
323 | /// The Teddy variant we use. We use dynamic dispatch under the theory that |
324 | /// it results in better codegen then a enum, although this is a specious |
325 | /// claim. |
326 | /// |
327 | /// This `Searcher` is essentially a wrapper for a `SearcherT` trait |
328 | /// object. We just make `memory_usage` and `minimum_len` available without |
329 | /// going through dynamic dispatch. |
330 | imp: Arc<dyn SearcherT>, |
331 | /// Total heap memory used by the Teddy variant. |
332 | memory_usage: usize, |
333 | /// The minimum haystack length this searcher can handle. It is intended |
334 | /// for callers to use some other search routine (such as Rabin-Karp) in |
335 | /// cases where the haystack (or remainer of the haystack) is too short. |
336 | minimum_len: usize, |
337 | } |
338 | |
339 | impl Searcher { |
340 | /// Look for the leftmost occurrence of any pattern in this search in the |
341 | /// given haystack starting at the given position. |
342 | /// |
343 | /// # Panics |
344 | /// |
345 | /// This panics when `haystack[at..].len()` is less than the minimum length |
346 | /// for this haystack. |
347 | #[inline (always)] |
348 | pub(crate) fn find( |
349 | &self, |
350 | haystack: &[u8], |
351 | at: usize, |
352 | ) -> Option<crate::Match> { |
353 | // SAFETY: The Teddy implementations all require a minimum haystack |
354 | // length, and this is required for safety. Therefore, we assert it |
355 | // here in order to make this method sound. |
356 | assert!(haystack[at..].len() >= self.minimum_len); |
357 | let hayptr = haystack.as_ptr(); |
358 | // SAFETY: Construction of the searcher guarantees that we are able |
359 | // to run it in the current environment (i.e., we won't get an AVX2 |
360 | // searcher on a x86-64 CPU without AVX2 support). Also, the pointers |
361 | // are valid as they are derived directly from a borrowed slice. |
362 | let teddym = unsafe { |
363 | self.imp.find(hayptr.add(at), hayptr.add(haystack.len()))? |
364 | }; |
365 | let start = teddym.start().as_usize().wrapping_sub(hayptr.as_usize()); |
366 | let end = teddym.end().as_usize().wrapping_sub(hayptr.as_usize()); |
367 | let span = crate::Span { start, end }; |
368 | // OK because we won't permit the construction of a searcher that |
369 | // could report a pattern ID bigger than what can fit in the crate-wide |
370 | // PatternID type. |
371 | let pid = crate::PatternID::new_unchecked(teddym.pattern().as_usize()); |
372 | let m = crate::Match::new(pid, span); |
373 | Some(m) |
374 | } |
375 | |
376 | /// Returns the approximate total amount of heap used by this type, in |
377 | /// units of bytes. |
378 | #[inline (always)] |
379 | pub(crate) fn memory_usage(&self) -> usize { |
380 | self.memory_usage |
381 | } |
382 | |
383 | /// Returns the minimum length, in bytes, that a haystack must be in order |
384 | /// to use it with this searcher. |
385 | #[inline (always)] |
386 | pub(crate) fn minimum_len(&self) -> usize { |
387 | self.minimum_len |
388 | } |
389 | } |
390 | |
391 | /// A trait that provides dynamic dispatch over the different possible Teddy |
392 | /// variants on the same algorithm. |
393 | /// |
394 | /// On `x86_64` for example, it isn't known until runtime which of 12 possible |
395 | /// variants will be used. One might use one of the four slim 128-bit vector |
396 | /// variants, or one of the four 256-bit vector variants or even one of the |
397 | /// four fat 256-bit vector variants. |
398 | /// |
399 | /// Since this choice is generally made when the Teddy searcher is constructed |
400 | /// and this choice is based on the patterns given and what the current CPU |
401 | /// supports, it follows that there must be some kind of indirection at search |
402 | /// time that "selects" the variant chosen at build time. |
403 | /// |
404 | /// There are a few different ways to go about this. One approach is to use an |
405 | /// enum. It works fine, but in my experiments, this generally results in worse |
406 | /// codegen. Another approach, which is what we use here, is dynamic dispatch |
407 | /// via a trait object. We basically implement this trait for each possible |
408 | /// variant, select the variant we want at build time and convert it to a |
409 | /// trait object for use at search time. |
410 | /// |
411 | /// Another approach is to use function pointers and stick each of the possible |
412 | /// variants into a union. This is essentially isomorphic to the dynamic |
413 | /// dispatch approach, but doesn't require any allocations. Since this crate |
414 | /// requires `alloc`, there's no real reason (AFAIK) to go down this path. (The |
415 | /// `memchr` crate does this.) |
416 | trait SearcherT: |
417 | Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static |
418 | { |
419 | /// Execute a search on the given haystack (identified by `start` and `end` |
420 | /// raw pointers). |
421 | /// |
422 | /// # Safety |
423 | /// |
424 | /// Essentially, the `start` and `end` pointers must be valid and point |
425 | /// to a haystack one can read. As long as you derive them from, for |
426 | /// example, a `&[u8]`, they should automatically satisfy all of the safety |
427 | /// obligations: |
428 | /// |
429 | /// * Both `start` and `end` must be valid for reads. |
430 | /// * Both `start` and `end` must point to an initialized value. |
431 | /// * Both `start` and `end` must point to the same allocated object and |
432 | /// must either be in bounds or at most one byte past the end of the |
433 | /// allocated object. |
434 | /// * Both `start` and `end` must be _derived from_ a pointer to the same |
435 | /// object. |
436 | /// * The distance between `start` and `end` must not overflow `isize`. |
437 | /// * The distance being in bounds must not rely on "wrapping around" the |
438 | /// address space. |
439 | /// * It must be the case that `start <= end`. |
440 | /// * `end - start` must be greater than the minimum length for this |
441 | /// searcher. |
442 | /// |
443 | /// Also, it is expected that implementations of this trait will tag this |
444 | /// method with a `target_feature` attribute. Callers must ensure that |
445 | /// they are executing this method in an environment where that attribute |
446 | /// is valid. |
447 | unsafe fn find(&self, start: *const u8, end: *const u8) -> Option<Match>; |
448 | } |
449 | |
450 | #[cfg (all(target_arch = "x86_64" , target_feature = "sse2" ))] |
451 | mod x86_64 { |
452 | use core::arch::x86_64::{__m128i, __m256i}; |
453 | |
454 | use alloc::sync::Arc; |
455 | |
456 | use crate::packed::{ |
457 | ext::Pointer, |
458 | pattern::Patterns, |
459 | teddy::generic::{self, Match}, |
460 | }; |
461 | |
462 | use super::{Searcher, SearcherT}; |
463 | |
464 | #[derive (Clone, Debug)] |
465 | pub(super) struct SlimSSSE3<const BYTES: usize> { |
466 | slim128: generic::Slim<__m128i, BYTES>, |
467 | } |
468 | |
469 | // Defines SlimSSSE3 wrapper functions for 1, 2, 3 and 4 bytes. |
470 | macro_rules! slim_ssse3 { |
471 | ($len:expr) => { |
472 | impl SlimSSSE3<$len> { |
473 | /// Creates a new searcher using "slim" Teddy with 128-bit |
474 | /// vectors. If SSSE3 is not available in the current |
475 | /// environment, then this returns `None`. |
476 | pub(super) fn new( |
477 | patterns: &Arc<Patterns>, |
478 | ) -> Option<Searcher> { |
479 | if !is_available_ssse3() { |
480 | return None; |
481 | } |
482 | Some(unsafe { SlimSSSE3::<$len>::new_unchecked(patterns) }) |
483 | } |
484 | |
485 | /// Creates a new searcher using "slim" Teddy with 256-bit |
486 | /// vectors without checking whether SSSE3 is available or not. |
487 | /// |
488 | /// # Safety |
489 | /// |
490 | /// Callers must ensure that SSSE3 is available in the current |
491 | /// environment. |
492 | #[target_feature(enable = "ssse3" )] |
493 | unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher { |
494 | let slim128 = generic::Slim::<__m128i, $len>::new( |
495 | Arc::clone(patterns), |
496 | ); |
497 | let memory_usage = slim128.memory_usage(); |
498 | let minimum_len = slim128.minimum_len(); |
499 | let imp = Arc::new(SlimSSSE3 { slim128 }); |
500 | Searcher { imp, memory_usage, minimum_len } |
501 | } |
502 | } |
503 | |
504 | impl SearcherT for SlimSSSE3<$len> { |
505 | #[target_feature(enable = "ssse3" )] |
506 | #[inline] |
507 | unsafe fn find( |
508 | &self, |
509 | start: *const u8, |
510 | end: *const u8, |
511 | ) -> Option<Match> { |
512 | // SAFETY: All obligations except for `target_feature` are |
513 | // passed to the caller. Our use of `target_feature` is |
514 | // safe because construction of this type requires that the |
515 | // requisite target features are available. |
516 | self.slim128.find(start, end) |
517 | } |
518 | } |
519 | }; |
520 | } |
521 | |
522 | slim_ssse3!(1); |
523 | slim_ssse3!(2); |
524 | slim_ssse3!(3); |
525 | slim_ssse3!(4); |
526 | |
527 | #[derive (Clone, Debug)] |
528 | pub(super) struct SlimAVX2<const BYTES: usize> { |
529 | slim128: generic::Slim<__m128i, BYTES>, |
530 | slim256: generic::Slim<__m256i, BYTES>, |
531 | } |
532 | |
533 | // Defines SlimAVX2 wrapper functions for 1, 2, 3 and 4 bytes. |
534 | macro_rules! slim_avx2 { |
535 | ($len:expr) => { |
536 | impl SlimAVX2<$len> { |
537 | /// Creates a new searcher using "slim" Teddy with 256-bit |
538 | /// vectors. If AVX2 is not available in the current |
539 | /// environment, then this returns `None`. |
540 | pub(super) fn new( |
541 | patterns: &Arc<Patterns>, |
542 | ) -> Option<Searcher> { |
543 | if !is_available_avx2() { |
544 | return None; |
545 | } |
546 | Some(unsafe { SlimAVX2::<$len>::new_unchecked(patterns) }) |
547 | } |
548 | |
549 | /// Creates a new searcher using "slim" Teddy with 256-bit |
550 | /// vectors without checking whether AVX2 is available or not. |
551 | /// |
552 | /// # Safety |
553 | /// |
554 | /// Callers must ensure that AVX2 is available in the current |
555 | /// environment. |
556 | #[target_feature(enable = "avx2" )] |
557 | unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher { |
558 | let slim128 = generic::Slim::<__m128i, $len>::new( |
559 | Arc::clone(&patterns), |
560 | ); |
561 | let slim256 = generic::Slim::<__m256i, $len>::new( |
562 | Arc::clone(&patterns), |
563 | ); |
564 | let memory_usage = |
565 | slim128.memory_usage() + slim256.memory_usage(); |
566 | let minimum_len = slim128.minimum_len(); |
567 | let imp = Arc::new(SlimAVX2 { slim128, slim256 }); |
568 | Searcher { imp, memory_usage, minimum_len } |
569 | } |
570 | } |
571 | |
572 | impl SearcherT for SlimAVX2<$len> { |
573 | #[target_feature(enable = "avx2" )] |
574 | #[inline] |
575 | unsafe fn find( |
576 | &self, |
577 | start: *const u8, |
578 | end: *const u8, |
579 | ) -> Option<Match> { |
580 | // SAFETY: All obligations except for `target_feature` are |
581 | // passed to the caller. Our use of `target_feature` is |
582 | // safe because construction of this type requires that the |
583 | // requisite target features are available. |
584 | let len = end.distance(start); |
585 | if len < self.slim256.minimum_len() { |
586 | self.slim128.find(start, end) |
587 | } else { |
588 | self.slim256.find(start, end) |
589 | } |
590 | } |
591 | } |
592 | }; |
593 | } |
594 | |
595 | slim_avx2!(1); |
596 | slim_avx2!(2); |
597 | slim_avx2!(3); |
598 | slim_avx2!(4); |
599 | |
600 | #[derive (Clone, Debug)] |
601 | pub(super) struct FatAVX2<const BYTES: usize> { |
602 | fat256: generic::Fat<__m256i, BYTES>, |
603 | } |
604 | |
605 | // Defines SlimAVX2 wrapper functions for 1, 2, 3 and 4 bytes. |
606 | macro_rules! fat_avx2 { |
607 | ($len:expr) => { |
608 | impl FatAVX2<$len> { |
609 | /// Creates a new searcher using "slim" Teddy with 256-bit |
610 | /// vectors. If AVX2 is not available in the current |
611 | /// environment, then this returns `None`. |
612 | pub(super) fn new( |
613 | patterns: &Arc<Patterns>, |
614 | ) -> Option<Searcher> { |
615 | if !is_available_avx2() { |
616 | return None; |
617 | } |
618 | Some(unsafe { FatAVX2::<$len>::new_unchecked(patterns) }) |
619 | } |
620 | |
621 | /// Creates a new searcher using "slim" Teddy with 256-bit |
622 | /// vectors without checking whether AVX2 is available or not. |
623 | /// |
624 | /// # Safety |
625 | /// |
626 | /// Callers must ensure that AVX2 is available in the current |
627 | /// environment. |
628 | #[target_feature(enable = "avx2" )] |
629 | unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher { |
630 | let fat256 = generic::Fat::<__m256i, $len>::new( |
631 | Arc::clone(&patterns), |
632 | ); |
633 | let memory_usage = fat256.memory_usage(); |
634 | let minimum_len = fat256.minimum_len(); |
635 | let imp = Arc::new(FatAVX2 { fat256 }); |
636 | Searcher { imp, memory_usage, minimum_len } |
637 | } |
638 | } |
639 | |
640 | impl SearcherT for FatAVX2<$len> { |
641 | #[target_feature(enable = "avx2" )] |
642 | #[inline] |
643 | unsafe fn find( |
644 | &self, |
645 | start: *const u8, |
646 | end: *const u8, |
647 | ) -> Option<Match> { |
648 | // SAFETY: All obligations except for `target_feature` are |
649 | // passed to the caller. Our use of `target_feature` is |
650 | // safe because construction of this type requires that the |
651 | // requisite target features are available. |
652 | self.fat256.find(start, end) |
653 | } |
654 | } |
655 | }; |
656 | } |
657 | |
658 | fat_avx2!(1); |
659 | fat_avx2!(2); |
660 | fat_avx2!(3); |
661 | fat_avx2!(4); |
662 | |
663 | #[inline ] |
664 | pub(super) fn is_available_ssse3() -> bool { |
665 | #[cfg (not(target_feature = "sse2" ))] |
666 | { |
667 | false |
668 | } |
669 | #[cfg (target_feature = "sse2" )] |
670 | { |
671 | #[cfg (target_feature = "ssse3" )] |
672 | { |
673 | true |
674 | } |
675 | #[cfg (not(target_feature = "ssse3" ))] |
676 | { |
677 | #[cfg (feature = "std" )] |
678 | { |
679 | std::is_x86_feature_detected!("ssse3" ) |
680 | } |
681 | #[cfg (not(feature = "std" ))] |
682 | { |
683 | false |
684 | } |
685 | } |
686 | } |
687 | } |
688 | |
689 | #[inline ] |
690 | pub(super) fn is_available_avx2() -> bool { |
691 | #[cfg (not(target_feature = "sse2" ))] |
692 | { |
693 | false |
694 | } |
695 | #[cfg (target_feature = "sse2" )] |
696 | { |
697 | #[cfg (target_feature = "avx2" )] |
698 | { |
699 | true |
700 | } |
701 | #[cfg (not(target_feature = "avx2" ))] |
702 | { |
703 | #[cfg (feature = "std" )] |
704 | { |
705 | std::is_x86_feature_detected!("avx2" ) |
706 | } |
707 | #[cfg (not(feature = "std" ))] |
708 | { |
709 | false |
710 | } |
711 | } |
712 | } |
713 | } |
714 | } |
715 | |
716 | #[cfg (all( |
717 | target_arch = "aarch64" , |
718 | target_feature = "neon" , |
719 | target_endian = "little" |
720 | ))] |
721 | mod aarch64 { |
722 | use core::arch::aarch64::uint8x16_t; |
723 | |
724 | use alloc::sync::Arc; |
725 | |
726 | use crate::packed::{ |
727 | pattern::Patterns, |
728 | teddy::generic::{self, Match}, |
729 | }; |
730 | |
731 | use super::{Searcher, SearcherT}; |
732 | |
733 | #[derive (Clone, Debug)] |
734 | pub(super) struct SlimNeon<const BYTES: usize> { |
735 | slim128: generic::Slim<uint8x16_t, BYTES>, |
736 | } |
737 | |
738 | // Defines SlimSSSE3 wrapper functions for 1, 2, 3 and 4 bytes. |
739 | macro_rules! slim_neon { |
740 | ($len:expr) => { |
741 | impl SlimNeon<$len> { |
742 | /// Creates a new searcher using "slim" Teddy with 128-bit |
743 | /// vectors. If SSSE3 is not available in the current |
744 | /// environment, then this returns `None`. |
745 | pub(super) fn new( |
746 | patterns: &Arc<Patterns>, |
747 | ) -> Option<Searcher> { |
748 | Some(unsafe { SlimNeon::<$len>::new_unchecked(patterns) }) |
749 | } |
750 | |
751 | /// Creates a new searcher using "slim" Teddy with 256-bit |
752 | /// vectors without checking whether SSSE3 is available or not. |
753 | /// |
754 | /// # Safety |
755 | /// |
756 | /// Callers must ensure that SSSE3 is available in the current |
757 | /// environment. |
758 | #[target_feature(enable = "neon" )] |
759 | unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher { |
760 | let slim128 = generic::Slim::<uint8x16_t, $len>::new( |
761 | Arc::clone(patterns), |
762 | ); |
763 | let memory_usage = slim128.memory_usage(); |
764 | let minimum_len = slim128.minimum_len(); |
765 | let imp = Arc::new(SlimNeon { slim128 }); |
766 | Searcher { imp, memory_usage, minimum_len } |
767 | } |
768 | } |
769 | |
770 | impl SearcherT for SlimNeon<$len> { |
771 | #[target_feature(enable = "neon" )] |
772 | #[inline] |
773 | unsafe fn find( |
774 | &self, |
775 | start: *const u8, |
776 | end: *const u8, |
777 | ) -> Option<Match> { |
778 | // SAFETY: All obligations except for `target_feature` are |
779 | // passed to the caller. Our use of `target_feature` is |
780 | // safe because construction of this type requires that the |
781 | // requisite target features are available. |
782 | self.slim128.find(start, end) |
783 | } |
784 | } |
785 | }; |
786 | } |
787 | |
788 | slim_neon!(1); |
789 | slim_neon!(2); |
790 | slim_neon!(3); |
791 | slim_neon!(4); |
792 | } |
793 | |