1 | use core::{ |
2 | fmt::Debug, |
3 | panic::{RefUnwindSafe, UnwindSafe}, |
4 | }; |
5 | |
6 | use alloc::sync::Arc; |
7 | |
8 | use crate::packed::{ext::Pointer, pattern::Patterns, teddy::generic::Match}; |
9 | |
10 | /// A builder for constructing a Teddy matcher. |
11 | /// |
12 | /// The builder primarily permits fine grained configuration of the Teddy |
13 | /// matcher. Most options are made only available for testing/benchmarking |
14 | /// purposes. In reality, options are automatically determined by the nature |
15 | /// and number of patterns given to the builder. |
16 | #[derive(Clone, Debug)] |
17 | pub(crate) struct Builder { |
18 | /// When none, this is automatically determined. Otherwise, `false` means |
19 | /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used |
20 | /// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't |
21 | /// available and Fat Teddy was requested, no matcher will be built. |
22 | only_fat: Option<bool>, |
23 | /// When none, this is automatically determined. Otherwise, `false` means |
24 | /// that 128-bit vectors will be used (up to SSSE3 instructions) where as |
25 | /// `true` means that 256-bit vectors will be used. As with `fat`, if |
26 | /// 256-bit vectors are requested and they aren't available, then a |
27 | /// searcher will not be built. |
28 | only_256bit: Option<bool>, |
29 | /// When true (the default), the number of patterns will be used as a |
30 | /// heuristic for refusing construction of a Teddy searcher. The point here |
31 | /// is that too many patterns can overwhelm Teddy. But this can be disabled |
32 | /// in cases where the caller knows better. |
33 | heuristic_pattern_limits: bool, |
34 | } |
35 | |
36 | impl Default for Builder { |
37 | fn default() -> Builder { |
38 | Builder::new() |
39 | } |
40 | } |
41 | |
42 | impl Builder { |
43 | /// Create a new builder for configuring a Teddy matcher. |
44 | pub(crate) fn new() -> Builder { |
45 | Builder { |
46 | only_fat: None, |
47 | only_256bit: None, |
48 | heuristic_pattern_limits: true, |
49 | } |
50 | } |
51 | |
52 | /// Build a matcher for the set of patterns given. If a matcher could not |
53 | /// be built, then `None` is returned. |
54 | /// |
55 | /// Generally, a matcher isn't built if the necessary CPU features aren't |
56 | /// available, an unsupported target or if the searcher is believed to be |
57 | /// slower than standard techniques (i.e., if there are too many literals). |
58 | pub(crate) fn build(&self, patterns: Arc<Patterns>) -> Option<Searcher> { |
59 | self.build_imp(patterns) |
60 | } |
61 | |
62 | /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses |
63 | /// 16 buckets where as Slim Teddy uses 8 buckets. More buckets are useful |
64 | /// for a larger set of literals. |
65 | /// |
66 | /// `None` is the default, which results in an automatic selection based |
67 | /// on the number of literals and available CPU features. |
68 | pub(crate) fn only_fat(&mut self, yes: Option<bool>) -> &mut Builder { |
69 | self.only_fat = yes; |
70 | self |
71 | } |
72 | |
73 | /// Request the use of 256-bit vectors (true) or 128-bit vectors (false). |
74 | /// Generally, a larger vector size is better since it either permits |
75 | /// matching more patterns or matching more bytes in the haystack at once. |
76 | /// |
77 | /// `None` is the default, which results in an automatic selection based on |
78 | /// the number of literals and available CPU features. |
79 | pub(crate) fn only_256bit(&mut self, yes: Option<bool>) -> &mut Builder { |
80 | self.only_256bit = yes; |
81 | self |
82 | } |
83 | |
84 | /// Request that heuristic limitations on the number of patterns be |
85 | /// employed. This useful to disable for benchmarking where one wants to |
86 | /// explore how Teddy performs on large number of patterns even if the |
87 | /// heuristics would otherwise refuse construction. |
88 | /// |
89 | /// This is enabled by default. |
90 | pub(crate) fn heuristic_pattern_limits( |
91 | &mut self, |
92 | yes: bool, |
93 | ) -> &mut Builder { |
94 | self.heuristic_pattern_limits = yes; |
95 | self |
96 | } |
97 | |
98 | fn build_imp(&self, patterns: Arc<Patterns>) -> Option<Searcher> { |
99 | let patlimit = self.heuristic_pattern_limits; |
100 | // There's no particular reason why we limit ourselves to little endian |
101 | // here, but it seems likely that some parts of Teddy as they are |
102 | // currently written (e.g., the uses of `trailing_zeros`) are likely |
103 | // wrong on non-little-endian targets. Such things are likely easy to |
104 | // fix, but at the time of writing (2023/09/18), I actually do not know |
105 | // how to test this code on a big-endian target. So for now, we're |
106 | // conservative and just bail out. |
107 | if !cfg!(target_endian = "little" ) { |
108 | debug!("skipping Teddy because target isn't little endian" ); |
109 | return None; |
110 | } |
111 | // Too many patterns will overwhelm Teddy and likely lead to slow |
112 | // downs, typically in the verification step. |
113 | if patlimit && patterns.len() > 64 { |
114 | debug!("skipping Teddy because of too many patterns" ); |
115 | return None; |
116 | } |
117 | |
118 | #[cfg (all(target_arch = "x86_64" , target_feature = "sse2" ))] |
119 | { |
120 | use self::x86_64::{FatAVX2, SlimAVX2, SlimSSSE3}; |
121 | |
122 | let mask_len = core::cmp::min(4, patterns.minimum_len()); |
123 | let beefy = patterns.len() > 32; |
124 | let has_avx2 = self::x86_64::is_available_avx2(); |
125 | let has_ssse3 = has_avx2 || self::x86_64::is_available_ssse3(); |
126 | let use_avx2 = if self.only_256bit == Some(true) { |
127 | if !has_avx2 { |
128 | debug!( |
129 | "skipping Teddy because avx2 was demanded but unavailable" |
130 | ); |
131 | return None; |
132 | } |
133 | true |
134 | } else if self.only_256bit == Some(false) { |
135 | if !has_ssse3 { |
136 | debug!( |
137 | "skipping Teddy because ssse3 was demanded but unavailable" |
138 | ); |
139 | return None; |
140 | } |
141 | false |
142 | } else if !has_ssse3 && !has_avx2 { |
143 | debug!( |
144 | "skipping Teddy because ssse3 and avx2 are unavailable" |
145 | ); |
146 | return None; |
147 | } else { |
148 | has_avx2 |
149 | }; |
150 | let fat = match self.only_fat { |
151 | None => use_avx2 && beefy, |
152 | Some(false) => false, |
153 | Some(true) if !use_avx2 => { |
154 | debug!( |
155 | "skipping Teddy because fat was demanded, but fat \ |
156 | Teddy requires avx2 which is unavailable" |
157 | ); |
158 | return None; |
159 | } |
160 | Some(true) => true, |
161 | }; |
162 | // Just like for aarch64, it's possible that too many patterns will |
163 | // overhwelm Teddy. Unlike aarch64 though, we have Fat teddy which |
164 | // helps things scale a bit more by spreading patterns over more |
165 | // buckets. |
166 | // |
167 | // These thresholds were determined by looking at the measurements |
168 | // for the rust/aho-corasick/packed/leftmost-first and |
169 | // rust/aho-corasick/dfa/leftmost-first engines on the `teddy/` |
170 | // benchmarks. |
171 | if patlimit && mask_len == 1 && patterns.len() > 16 { |
172 | debug!( |
173 | "skipping Teddy (mask len: 1) because there are \ |
174 | too many patterns" , |
175 | ); |
176 | return None; |
177 | } |
178 | match (mask_len, use_avx2, fat) { |
179 | (1, false, _) => { |
180 | debug!("Teddy choice: 128-bit slim, 1 byte" ); |
181 | SlimSSSE3::<1>::new(&patterns) |
182 | } |
183 | (1, true, false) => { |
184 | debug!("Teddy choice: 256-bit slim, 1 byte" ); |
185 | SlimAVX2::<1>::new(&patterns) |
186 | } |
187 | (1, true, true) => { |
188 | debug!("Teddy choice: 256-bit fat, 1 byte" ); |
189 | FatAVX2::<1>::new(&patterns) |
190 | } |
191 | (2, false, _) => { |
192 | debug!("Teddy choice: 128-bit slim, 2 bytes" ); |
193 | SlimSSSE3::<2>::new(&patterns) |
194 | } |
195 | (2, true, false) => { |
196 | debug!("Teddy choice: 256-bit slim, 2 bytes" ); |
197 | SlimAVX2::<2>::new(&patterns) |
198 | } |
199 | (2, true, true) => { |
200 | debug!("Teddy choice: 256-bit fat, 2 bytes" ); |
201 | FatAVX2::<2>::new(&patterns) |
202 | } |
203 | (3, false, _) => { |
204 | debug!("Teddy choice: 128-bit slim, 3 bytes" ); |
205 | SlimSSSE3::<3>::new(&patterns) |
206 | } |
207 | (3, true, false) => { |
208 | debug!("Teddy choice: 256-bit slim, 3 bytes" ); |
209 | SlimAVX2::<3>::new(&patterns) |
210 | } |
211 | (3, true, true) => { |
212 | debug!("Teddy choice: 256-bit fat, 3 bytes" ); |
213 | FatAVX2::<3>::new(&patterns) |
214 | } |
215 | (4, false, _) => { |
216 | debug!("Teddy choice: 128-bit slim, 4 bytes" ); |
217 | SlimSSSE3::<4>::new(&patterns) |
218 | } |
219 | (4, true, false) => { |
220 | debug!("Teddy choice: 256-bit slim, 4 bytes" ); |
221 | SlimAVX2::<4>::new(&patterns) |
222 | } |
223 | (4, true, true) => { |
224 | debug!("Teddy choice: 256-bit fat, 4 bytes" ); |
225 | FatAVX2::<4>::new(&patterns) |
226 | } |
227 | _ => { |
228 | debug!("no supported Teddy configuration found" ); |
229 | None |
230 | } |
231 | } |
232 | } |
233 | #[cfg (target_arch = "aarch64" )] |
234 | { |
235 | use self::aarch64::SlimNeon; |
236 | |
237 | let mask_len = core::cmp::min(4, patterns.minimum_len()); |
238 | if self.only_256bit == Some(true) { |
239 | debug!( |
240 | "skipping Teddy because 256-bits were demanded \ |
241 | but unavailable" |
242 | ); |
243 | return None; |
244 | } |
245 | if self.only_fat == Some(true) { |
246 | debug!( |
247 | "skipping Teddy because fat was demanded but unavailable" |
248 | ); |
249 | } |
250 | // Since we don't have Fat teddy in aarch64 (I think we'd want at |
251 | // least 256-bit vectors for that), we need to be careful not to |
252 | // allow too many patterns as it might overwhelm Teddy. Generally |
253 | // speaking, as the mask length goes up, the more patterns we can |
254 | // handle because the mask length results in fewer candidates |
255 | // generated. |
256 | // |
257 | // These thresholds were determined by looking at the measurements |
258 | // for the rust/aho-corasick/packed/leftmost-first and |
259 | // rust/aho-corasick/dfa/leftmost-first engines on the `teddy/` |
260 | // benchmarks. |
261 | match mask_len { |
262 | 1 => { |
263 | if patlimit && patterns.len() > 16 { |
264 | debug!( |
265 | "skipping Teddy (mask len: 1) because there are \ |
266 | too many patterns" , |
267 | ); |
268 | } |
269 | debug!("Teddy choice: 128-bit slim, 1 byte" ); |
270 | SlimNeon::<1>::new(&patterns) |
271 | } |
272 | 2 => { |
273 | if patlimit && patterns.len() > 32 { |
274 | debug!( |
275 | "skipping Teddy (mask len: 2) because there are \ |
276 | too many patterns" , |
277 | ); |
278 | } |
279 | debug!("Teddy choice: 128-bit slim, 2 bytes" ); |
280 | SlimNeon::<2>::new(&patterns) |
281 | } |
282 | 3 => { |
283 | if patlimit && patterns.len() > 48 { |
284 | debug!( |
285 | "skipping Teddy (mask len: 3) because there are \ |
286 | too many patterns" , |
287 | ); |
288 | } |
289 | debug!("Teddy choice: 128-bit slim, 3 bytes" ); |
290 | SlimNeon::<3>::new(&patterns) |
291 | } |
292 | 4 => { |
293 | debug!("Teddy choice: 128-bit slim, 4 bytes" ); |
294 | SlimNeon::<4>::new(&patterns) |
295 | } |
296 | _ => { |
297 | debug!("no supported Teddy configuration found" ); |
298 | None |
299 | } |
300 | } |
301 | } |
302 | #[cfg (not(any( |
303 | all(target_arch = "x86_64" , target_feature = "sse2" ), |
304 | target_arch = "aarch64" |
305 | )))] |
306 | { |
307 | None |
308 | } |
309 | } |
310 | } |
311 | |
312 | /// A searcher that dispatches to one of several possible Teddy variants. |
313 | #[derive(Clone, Debug)] |
314 | pub(crate) struct Searcher { |
315 | /// The Teddy variant we use. We use dynamic dispatch under the theory that |
316 | /// it results in better codegen then a enum, although this is a specious |
317 | /// claim. |
318 | /// |
319 | /// This `Searcher` is essentially a wrapper for a `SearcherT` trait |
320 | /// object. We just make `memory_usage` and `minimum_len` available without |
321 | /// going through dynamic dispatch. |
322 | imp: Arc<dyn SearcherT>, |
323 | /// Total heap memory used by the Teddy variant. |
324 | memory_usage: usize, |
325 | /// The minimum haystack length this searcher can handle. It is intended |
326 | /// for callers to use some other search routine (such as Rabin-Karp) in |
327 | /// cases where the haystack (or remainer of the haystack) is too short. |
328 | minimum_len: usize, |
329 | } |
330 | |
331 | impl Searcher { |
332 | /// Look for the leftmost occurrence of any pattern in this search in the |
333 | /// given haystack starting at the given position. |
334 | /// |
335 | /// # Panics |
336 | /// |
337 | /// This panics when `haystack[at..].len()` is less than the minimum length |
338 | /// for this haystack. |
339 | #[inline (always)] |
340 | pub(crate) fn find( |
341 | &self, |
342 | haystack: &[u8], |
343 | at: usize, |
344 | ) -> Option<crate::Match> { |
345 | // SAFETY: The Teddy implementations all require a minimum haystack |
346 | // length, and this is required for safety. Therefore, we assert it |
347 | // here in order to make this method sound. |
348 | assert!(haystack[at..].len() >= self.minimum_len); |
349 | let hayptr = haystack.as_ptr(); |
350 | // SAFETY: Construction of the searcher guarantees that we are able |
351 | // to run it in the current environment (i.e., we won't get an AVX2 |
352 | // searcher on a x86-64 CPU without AVX2 support). Also, the pointers |
353 | // are valid as they are derived directly from a borrowed slice. |
354 | let teddym = unsafe { |
355 | self.imp.find(hayptr.add(at), hayptr.add(haystack.len()))? |
356 | }; |
357 | let start = teddym.start().as_usize().wrapping_sub(hayptr.as_usize()); |
358 | let end = teddym.end().as_usize().wrapping_sub(hayptr.as_usize()); |
359 | let span = crate::Span { start, end }; |
360 | // OK because we won't permit the construction of a searcher that |
361 | // could report a pattern ID bigger than what can fit in the crate-wide |
362 | // PatternID type. |
363 | let pid = crate::PatternID::new_unchecked(teddym.pattern().as_usize()); |
364 | let m = crate::Match::new(pid, span); |
365 | Some(m) |
366 | } |
367 | |
368 | /// Returns the approximate total amount of heap used by this type, in |
369 | /// units of bytes. |
370 | #[inline (always)] |
371 | pub(crate) fn memory_usage(&self) -> usize { |
372 | self.memory_usage |
373 | } |
374 | |
375 | /// Returns the minimum length, in bytes, that a haystack must be in order |
376 | /// to use it with this searcher. |
377 | #[inline (always)] |
378 | pub(crate) fn minimum_len(&self) -> usize { |
379 | self.minimum_len |
380 | } |
381 | } |
382 | |
383 | /// A trait that provides dynamic dispatch over the different possible Teddy |
384 | /// variants on the same algorithm. |
385 | /// |
386 | /// On `x86_64` for example, it isn't known until runtime which of 12 possible |
387 | /// variants will be used. One might use one of the four slim 128-bit vector |
388 | /// variants, or one of the four 256-bit vector variants or even one of the |
389 | /// four fat 256-bit vector variants. |
390 | /// |
391 | /// Since this choice is generally made when the Teddy searcher is constructed |
392 | /// and this choice is based on the patterns given and what the current CPU |
393 | /// supports, it follows that there must be some kind of indirection at search |
394 | /// time that "selects" the variant chosen at build time. |
395 | /// |
396 | /// There are a few different ways to go about this. One approach is to use an |
397 | /// enum. It works fine, but in my experiments, this generally results in worse |
398 | /// codegen. Another approach, which is what we use here, is dynamic dispatch |
399 | /// via a trait object. We basically implement this trait for each possible |
400 | /// variant, select the variant we want at build time and convert it to a |
401 | /// trait object for use at search time. |
402 | /// |
403 | /// Another approach is to use function pointers and stick each of the possible |
404 | /// variants into a union. This is essentially isomorphic to the dynamic |
405 | /// dispatch approach, but doesn't require any allocations. Since this crate |
406 | /// requires `alloc`, there's no real reason (AFAIK) to go down this path. (The |
407 | /// `memchr` crate does this.) |
408 | trait SearcherT: |
409 | Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static |
410 | { |
411 | /// Execute a search on the given haystack (identified by `start` and `end` |
412 | /// raw pointers). |
413 | /// |
414 | /// # Safety |
415 | /// |
416 | /// Essentially, the `start` and `end` pointers must be valid and point |
417 | /// to a haystack one can read. As long as you derive them from, for |
418 | /// example, a `&[u8]`, they should automatically satisfy all of the safety |
419 | /// obligations: |
420 | /// |
421 | /// * Both `start` and `end` must be valid for reads. |
422 | /// * Both `start` and `end` must point to an initialized value. |
423 | /// * Both `start` and `end` must point to the same allocated object and |
424 | /// must either be in bounds or at most one byte past the end of the |
425 | /// allocated object. |
426 | /// * Both `start` and `end` must be _derived from_ a pointer to the same |
427 | /// object. |
428 | /// * The distance between `start` and `end` must not overflow `isize`. |
429 | /// * The distance being in bounds must not rely on "wrapping around" the |
430 | /// address space. |
431 | /// * It must be the case that `start <= end`. |
432 | /// * `end - start` must be greater than the minimum length for this |
433 | /// searcher. |
434 | /// |
435 | /// Also, it is expected that implementations of this trait will tag this |
436 | /// method with a `target_feature` attribute. Callers must ensure that |
437 | /// they are executing this method in an environment where that attribute |
438 | /// is valid. |
439 | unsafe fn find(&self, start: *const u8, end: *const u8) -> Option<Match>; |
440 | } |
441 | |
442 | #[cfg (all(target_arch = "x86_64" , target_feature = "sse2" ))] |
443 | mod x86_64 { |
444 | use core::arch::x86_64::{__m128i, __m256i}; |
445 | |
446 | use alloc::sync::Arc; |
447 | |
448 | use crate::packed::{ |
449 | ext::Pointer, |
450 | pattern::Patterns, |
451 | teddy::generic::{self, Match}, |
452 | }; |
453 | |
454 | use super::{Searcher, SearcherT}; |
455 | |
456 | #[derive(Clone, Debug)] |
457 | pub(super) struct SlimSSSE3<const BYTES: usize> { |
458 | slim128: generic::Slim<__m128i, BYTES>, |
459 | } |
460 | |
461 | // Defines SlimSSSE3 wrapper functions for 1, 2, 3 and 4 bytes. |
462 | macro_rules! slim_ssse3 { |
463 | ($len:expr) => { |
464 | impl SlimSSSE3<$len> { |
465 | /// Creates a new searcher using "slim" Teddy with 128-bit |
466 | /// vectors. If SSSE3 is not available in the current |
467 | /// environment, then this returns `None`. |
468 | pub(super) fn new( |
469 | patterns: &Arc<Patterns>, |
470 | ) -> Option<Searcher> { |
471 | if !is_available_ssse3() { |
472 | return None; |
473 | } |
474 | Some(unsafe { SlimSSSE3::<$len>::new_unchecked(patterns) }) |
475 | } |
476 | |
477 | /// Creates a new searcher using "slim" Teddy with 256-bit |
478 | /// vectors without checking whether SSSE3 is available or not. |
479 | /// |
480 | /// # Safety |
481 | /// |
482 | /// Callers must ensure that SSSE3 is available in the current |
483 | /// environment. |
484 | #[target_feature(enable = "ssse3" )] |
485 | unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher { |
486 | let slim128 = generic::Slim::<__m128i, $len>::new( |
487 | Arc::clone(patterns), |
488 | ); |
489 | let memory_usage = slim128.memory_usage(); |
490 | let minimum_len = slim128.minimum_len(); |
491 | let imp = Arc::new(SlimSSSE3 { slim128 }); |
492 | Searcher { imp, memory_usage, minimum_len } |
493 | } |
494 | } |
495 | |
496 | impl SearcherT for SlimSSSE3<$len> { |
497 | #[target_feature(enable = "ssse3" )] |
498 | #[inline] |
499 | unsafe fn find( |
500 | &self, |
501 | start: *const u8, |
502 | end: *const u8, |
503 | ) -> Option<Match> { |
504 | // SAFETY: All obligations except for `target_feature` are |
505 | // passed to the caller. Our use of `target_feature` is |
506 | // safe because construction of this type requires that the |
507 | // requisite target features are available. |
508 | self.slim128.find(start, end) |
509 | } |
510 | } |
511 | }; |
512 | } |
513 | |
514 | slim_ssse3!(1); |
515 | slim_ssse3!(2); |
516 | slim_ssse3!(3); |
517 | slim_ssse3!(4); |
518 | |
519 | #[derive(Clone, Debug)] |
520 | pub(super) struct SlimAVX2<const BYTES: usize> { |
521 | slim128: generic::Slim<__m128i, BYTES>, |
522 | slim256: generic::Slim<__m256i, BYTES>, |
523 | } |
524 | |
525 | // Defines SlimAVX2 wrapper functions for 1, 2, 3 and 4 bytes. |
526 | macro_rules! slim_avx2 { |
527 | ($len:expr) => { |
528 | impl SlimAVX2<$len> { |
529 | /// Creates a new searcher using "slim" Teddy with 256-bit |
530 | /// vectors. If AVX2 is not available in the current |
531 | /// environment, then this returns `None`. |
532 | pub(super) fn new( |
533 | patterns: &Arc<Patterns>, |
534 | ) -> Option<Searcher> { |
535 | if !is_available_avx2() { |
536 | return None; |
537 | } |
538 | Some(unsafe { SlimAVX2::<$len>::new_unchecked(patterns) }) |
539 | } |
540 | |
541 | /// Creates a new searcher using "slim" Teddy with 256-bit |
542 | /// vectors without checking whether AVX2 is available or not. |
543 | /// |
544 | /// # Safety |
545 | /// |
546 | /// Callers must ensure that AVX2 is available in the current |
547 | /// environment. |
548 | #[target_feature(enable = "avx2" )] |
549 | unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher { |
550 | let slim128 = generic::Slim::<__m128i, $len>::new( |
551 | Arc::clone(&patterns), |
552 | ); |
553 | let slim256 = generic::Slim::<__m256i, $len>::new( |
554 | Arc::clone(&patterns), |
555 | ); |
556 | let memory_usage = |
557 | slim128.memory_usage() + slim256.memory_usage(); |
558 | let minimum_len = slim128.minimum_len(); |
559 | let imp = Arc::new(SlimAVX2 { slim128, slim256 }); |
560 | Searcher { imp, memory_usage, minimum_len } |
561 | } |
562 | } |
563 | |
564 | impl SearcherT for SlimAVX2<$len> { |
565 | #[target_feature(enable = "avx2" )] |
566 | #[inline] |
567 | unsafe fn find( |
568 | &self, |
569 | start: *const u8, |
570 | end: *const u8, |
571 | ) -> Option<Match> { |
572 | // SAFETY: All obligations except for `target_feature` are |
573 | // passed to the caller. Our use of `target_feature` is |
574 | // safe because construction of this type requires that the |
575 | // requisite target features are available. |
576 | let len = end.distance(start); |
577 | if len < self.slim256.minimum_len() { |
578 | self.slim128.find(start, end) |
579 | } else { |
580 | self.slim256.find(start, end) |
581 | } |
582 | } |
583 | } |
584 | }; |
585 | } |
586 | |
587 | slim_avx2!(1); |
588 | slim_avx2!(2); |
589 | slim_avx2!(3); |
590 | slim_avx2!(4); |
591 | |
592 | #[derive(Clone, Debug)] |
593 | pub(super) struct FatAVX2<const BYTES: usize> { |
594 | fat256: generic::Fat<__m256i, BYTES>, |
595 | } |
596 | |
597 | // Defines SlimAVX2 wrapper functions for 1, 2, 3 and 4 bytes. |
598 | macro_rules! fat_avx2 { |
599 | ($len:expr) => { |
600 | impl FatAVX2<$len> { |
601 | /// Creates a new searcher using "slim" Teddy with 256-bit |
602 | /// vectors. If AVX2 is not available in the current |
603 | /// environment, then this returns `None`. |
604 | pub(super) fn new( |
605 | patterns: &Arc<Patterns>, |
606 | ) -> Option<Searcher> { |
607 | if !is_available_avx2() { |
608 | return None; |
609 | } |
610 | Some(unsafe { FatAVX2::<$len>::new_unchecked(patterns) }) |
611 | } |
612 | |
613 | /// Creates a new searcher using "slim" Teddy with 256-bit |
614 | /// vectors without checking whether AVX2 is available or not. |
615 | /// |
616 | /// # Safety |
617 | /// |
618 | /// Callers must ensure that AVX2 is available in the current |
619 | /// environment. |
620 | #[target_feature(enable = "avx2" )] |
621 | unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher { |
622 | let fat256 = generic::Fat::<__m256i, $len>::new( |
623 | Arc::clone(&patterns), |
624 | ); |
625 | let memory_usage = fat256.memory_usage(); |
626 | let minimum_len = fat256.minimum_len(); |
627 | let imp = Arc::new(FatAVX2 { fat256 }); |
628 | Searcher { imp, memory_usage, minimum_len } |
629 | } |
630 | } |
631 | |
632 | impl SearcherT for FatAVX2<$len> { |
633 | #[target_feature(enable = "avx2" )] |
634 | #[inline] |
635 | unsafe fn find( |
636 | &self, |
637 | start: *const u8, |
638 | end: *const u8, |
639 | ) -> Option<Match> { |
640 | // SAFETY: All obligations except for `target_feature` are |
641 | // passed to the caller. Our use of `target_feature` is |
642 | // safe because construction of this type requires that the |
643 | // requisite target features are available. |
644 | self.fat256.find(start, end) |
645 | } |
646 | } |
647 | }; |
648 | } |
649 | |
650 | fat_avx2!(1); |
651 | fat_avx2!(2); |
652 | fat_avx2!(3); |
653 | fat_avx2!(4); |
654 | |
655 | #[inline ] |
656 | pub(super) fn is_available_ssse3() -> bool { |
657 | #[cfg (not(target_feature = "sse2" ))] |
658 | { |
659 | false |
660 | } |
661 | #[cfg (target_feature = "sse2" )] |
662 | { |
663 | #[cfg (target_feature = "ssse3" )] |
664 | { |
665 | true |
666 | } |
667 | #[cfg (not(target_feature = "ssse3" ))] |
668 | { |
669 | #[cfg (feature = "std" )] |
670 | { |
671 | std::is_x86_feature_detected!("ssse3" ) |
672 | } |
673 | #[cfg (not(feature = "std" ))] |
674 | { |
675 | false |
676 | } |
677 | } |
678 | } |
679 | } |
680 | |
681 | #[inline ] |
682 | pub(super) fn is_available_avx2() -> bool { |
683 | #[cfg (not(target_feature = "sse2" ))] |
684 | { |
685 | false |
686 | } |
687 | #[cfg (target_feature = "sse2" )] |
688 | { |
689 | #[cfg (target_feature = "avx2" )] |
690 | { |
691 | true |
692 | } |
693 | #[cfg (not(target_feature = "avx2" ))] |
694 | { |
695 | #[cfg (feature = "std" )] |
696 | { |
697 | std::is_x86_feature_detected!("avx2" ) |
698 | } |
699 | #[cfg (not(feature = "std" ))] |
700 | { |
701 | false |
702 | } |
703 | } |
704 | } |
705 | } |
706 | } |
707 | |
708 | #[cfg (target_arch = "aarch64" )] |
709 | mod aarch64 { |
710 | use core::arch::aarch64::uint8x16_t; |
711 | |
712 | use alloc::sync::Arc; |
713 | |
714 | use crate::packed::{ |
715 | pattern::Patterns, |
716 | teddy::generic::{self, Match}, |
717 | }; |
718 | |
719 | use super::{Searcher, SearcherT}; |
720 | |
721 | #[derive(Clone, Debug)] |
722 | pub(super) struct SlimNeon<const BYTES: usize> { |
723 | slim128: generic::Slim<uint8x16_t, BYTES>, |
724 | } |
725 | |
726 | // Defines SlimSSSE3 wrapper functions for 1, 2, 3 and 4 bytes. |
727 | macro_rules! slim_neon { |
728 | ($len:expr) => { |
729 | impl SlimNeon<$len> { |
730 | /// Creates a new searcher using "slim" Teddy with 128-bit |
731 | /// vectors. If SSSE3 is not available in the current |
732 | /// environment, then this returns `None`. |
733 | pub(super) fn new( |
734 | patterns: &Arc<Patterns>, |
735 | ) -> Option<Searcher> { |
736 | Some(unsafe { SlimNeon::<$len>::new_unchecked(patterns) }) |
737 | } |
738 | |
739 | /// Creates a new searcher using "slim" Teddy with 256-bit |
740 | /// vectors without checking whether SSSE3 is available or not. |
741 | /// |
742 | /// # Safety |
743 | /// |
744 | /// Callers must ensure that SSSE3 is available in the current |
745 | /// environment. |
746 | #[target_feature(enable = "neon" )] |
747 | unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher { |
748 | let slim128 = generic::Slim::<uint8x16_t, $len>::new( |
749 | Arc::clone(patterns), |
750 | ); |
751 | let memory_usage = slim128.memory_usage(); |
752 | let minimum_len = slim128.minimum_len(); |
753 | let imp = Arc::new(SlimNeon { slim128 }); |
754 | Searcher { imp, memory_usage, minimum_len } |
755 | } |
756 | } |
757 | |
758 | impl SearcherT for SlimNeon<$len> { |
759 | #[target_feature(enable = "neon" )] |
760 | #[inline] |
761 | unsafe fn find( |
762 | &self, |
763 | start: *const u8, |
764 | end: *const u8, |
765 | ) -> Option<Match> { |
766 | // SAFETY: All obligations except for `target_feature` are |
767 | // passed to the caller. Our use of `target_feature` is |
768 | // safe because construction of this type requires that the |
769 | // requisite target features are available. |
770 | self.slim128.find(start, end) |
771 | } |
772 | } |
773 | }; |
774 | } |
775 | |
776 | slim_neon!(1); |
777 | slim_neon!(2); |
778 | slim_neon!(3); |
779 | slim_neon!(4); |
780 | } |
781 | |