1 | // See the README in this directory for an explanation of the Teddy algorithm. |
2 | |
3 | use core::{cmp, fmt}; |
4 | |
5 | use alloc::{collections::BTreeMap, format, vec, vec::Vec}; |
6 | |
7 | use crate::packed::{ |
8 | pattern::{PatternID, Patterns}, |
9 | teddy::Teddy, |
10 | }; |
11 | |
12 | /// A builder for constructing a Teddy matcher. |
13 | /// |
14 | /// The builder primarily permits fine grained configuration of the Teddy |
15 | /// matcher. Most options are made only available for testing/benchmarking |
16 | /// purposes. In reality, options are automatically determined by the nature |
17 | /// and number of patterns given to the builder. |
18 | #[derive (Clone, Debug)] |
19 | pub struct Builder { |
20 | /// When none, this is automatically determined. Otherwise, `false` means |
21 | /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used |
22 | /// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't |
23 | /// available and Fat Teddy was requested, no matcher will be built. |
24 | fat: Option<bool>, |
25 | /// When none, this is automatically determined. Otherwise, `false` means |
26 | /// that 128-bit vectors will be used (up to SSSE3 instructions) where as |
27 | /// `true` means that 256-bit vectors will be used. As with `fat`, if |
28 | /// 256-bit vectors are requested and they aren't available, then a |
29 | /// searcher will not be built. |
30 | avx: Option<bool>, |
31 | } |
32 | |
33 | impl Default for Builder { |
34 | fn default() -> Builder { |
35 | Builder::new() |
36 | } |
37 | } |
38 | |
39 | impl Builder { |
40 | /// Create a new builder for configuring a Teddy matcher. |
41 | pub fn new() -> Builder { |
42 | Builder { fat: None, avx: None } |
43 | } |
44 | |
45 | /// Build a matcher for the set of patterns given. If a matcher could not |
46 | /// be built, then `None` is returned. |
47 | /// |
48 | /// Generally, a matcher isn't built if the necessary CPU features aren't |
49 | /// available, an unsupported target or if the searcher is believed to be |
50 | /// slower than standard techniques (i.e., if there are too many literals). |
51 | pub fn build(&self, patterns: &Patterns) -> Option<Teddy> { |
52 | self.build_imp(patterns) |
53 | } |
54 | |
55 | /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses |
56 | /// 16 buckets where as Slim Teddy uses 8 buckets. More buckets are useful |
57 | /// for a larger set of literals. |
58 | /// |
59 | /// `None` is the default, which results in an automatic selection based |
60 | /// on the number of literals and available CPU features. |
61 | pub fn fat(&mut self, yes: Option<bool>) -> &mut Builder { |
62 | self.fat = yes; |
63 | self |
64 | } |
65 | |
66 | /// Request the use of 256-bit vectors (true) or 128-bit vectors (false). |
67 | /// Generally, a larger vector size is better since it either permits |
68 | /// matching more patterns or matching more bytes in the haystack at once. |
69 | /// |
70 | /// `None` is the default, which results in an automatic selection based on |
71 | /// the number of literals and available CPU features. |
72 | pub fn avx(&mut self, yes: Option<bool>) -> &mut Builder { |
73 | self.avx = yes; |
74 | self |
75 | } |
76 | |
77 | fn build_imp(&self, patterns: &Patterns) -> Option<Teddy> { |
78 | use crate::packed::teddy::runtime; |
79 | |
80 | // Most of the logic here is just about selecting the optimal settings, |
81 | // or perhaps even rejecting construction altogether. The choices |
82 | // we have are: fat (avx only) or not, ssse3 or avx2, and how many |
83 | // patterns we allow ourselves to search. Additionally, for testing |
84 | // and benchmarking, we permit callers to try to "force" a setting, |
85 | // and if the setting isn't allowed (e.g., forcing AVX when AVX isn't |
86 | // available), then we bail and return nothing. |
87 | |
88 | if patterns.len() > 64 { |
89 | debug!("skipping Teddy because of too many patterns" ); |
90 | return None; |
91 | } |
92 | let has_ssse3 = std::is_x86_feature_detected!("ssse3" ); |
93 | let has_avx = std::is_x86_feature_detected!("avx2" ); |
94 | let avx = if self.avx == Some(true) { |
95 | if !has_avx { |
96 | debug!( |
97 | "skipping Teddy because avx was demanded but unavailable" |
98 | ); |
99 | return None; |
100 | } |
101 | true |
102 | } else if self.avx == Some(false) { |
103 | if !has_ssse3 { |
104 | debug!( |
105 | "skipping Teddy because ssse3 was demanded but unavailable" |
106 | ); |
107 | return None; |
108 | } |
109 | false |
110 | } else if !has_ssse3 && !has_avx { |
111 | debug!("skipping Teddy because ssse3 and avx are unavailable" ); |
112 | return None; |
113 | } else { |
114 | has_avx |
115 | }; |
116 | let fat = match self.fat { |
117 | None => avx && patterns.len() > 32, |
118 | Some(false) => false, |
119 | Some(true) if !avx => { |
120 | debug!( |
121 | "skipping Teddy because it needs to be fat, but fat \ |
122 | Teddy requires avx which is unavailable" |
123 | ); |
124 | return None; |
125 | } |
126 | Some(true) => true, |
127 | }; |
128 | |
129 | let mut compiler = Compiler::new(patterns, fat); |
130 | compiler.compile(); |
131 | let Compiler { buckets, masks, .. } = compiler; |
132 | // SAFETY: It is required that the builder only produce Teddy matchers |
133 | // that are allowed to run on the current CPU, since we later assume |
134 | // that the presence of (for example) TeddySlim1Mask256 means it is |
135 | // safe to call functions marked with the `avx2` target feature. |
136 | match (masks.len(), avx, fat) { |
137 | (1, false, _) => { |
138 | debug!("Teddy choice: 128-bit slim, 1 byte" ); |
139 | Some(Teddy { |
140 | buckets, |
141 | max_pattern_id: patterns.max_pattern_id(), |
142 | exec: runtime::Exec::TeddySlim1Mask128( |
143 | runtime::TeddySlim1Mask128 { |
144 | mask1: runtime::Mask128::new(masks[0]), |
145 | }, |
146 | ), |
147 | }) |
148 | } |
149 | (1, true, false) => { |
150 | debug!("Teddy choice: 256-bit slim, 1 byte" ); |
151 | Some(Teddy { |
152 | buckets, |
153 | max_pattern_id: patterns.max_pattern_id(), |
154 | exec: runtime::Exec::TeddySlim1Mask256( |
155 | runtime::TeddySlim1Mask256 { |
156 | mask1: runtime::Mask256::new(masks[0]), |
157 | }, |
158 | ), |
159 | }) |
160 | } |
161 | (1, true, true) => { |
162 | debug!("Teddy choice: 256-bit fat, 1 byte" ); |
163 | Some(Teddy { |
164 | buckets, |
165 | max_pattern_id: patterns.max_pattern_id(), |
166 | exec: runtime::Exec::TeddyFat1Mask256( |
167 | runtime::TeddyFat1Mask256 { |
168 | mask1: runtime::Mask256::new(masks[0]), |
169 | }, |
170 | ), |
171 | }) |
172 | } |
173 | (2, false, _) => { |
174 | debug!("Teddy choice: 128-bit slim, 2 bytes" ); |
175 | Some(Teddy { |
176 | buckets, |
177 | max_pattern_id: patterns.max_pattern_id(), |
178 | exec: runtime::Exec::TeddySlim2Mask128( |
179 | runtime::TeddySlim2Mask128 { |
180 | mask1: runtime::Mask128::new(masks[0]), |
181 | mask2: runtime::Mask128::new(masks[1]), |
182 | }, |
183 | ), |
184 | }) |
185 | } |
186 | (2, true, false) => { |
187 | debug!("Teddy choice: 256-bit slim, 2 bytes" ); |
188 | Some(Teddy { |
189 | buckets, |
190 | max_pattern_id: patterns.max_pattern_id(), |
191 | exec: runtime::Exec::TeddySlim2Mask256( |
192 | runtime::TeddySlim2Mask256 { |
193 | mask1: runtime::Mask256::new(masks[0]), |
194 | mask2: runtime::Mask256::new(masks[1]), |
195 | }, |
196 | ), |
197 | }) |
198 | } |
199 | (2, true, true) => { |
200 | debug!("Teddy choice: 256-bit fat, 2 bytes" ); |
201 | Some(Teddy { |
202 | buckets, |
203 | max_pattern_id: patterns.max_pattern_id(), |
204 | exec: runtime::Exec::TeddyFat2Mask256( |
205 | runtime::TeddyFat2Mask256 { |
206 | mask1: runtime::Mask256::new(masks[0]), |
207 | mask2: runtime::Mask256::new(masks[1]), |
208 | }, |
209 | ), |
210 | }) |
211 | } |
212 | (3, false, _) => { |
213 | debug!("Teddy choice: 128-bit slim, 3 bytes" ); |
214 | Some(Teddy { |
215 | buckets, |
216 | max_pattern_id: patterns.max_pattern_id(), |
217 | exec: runtime::Exec::TeddySlim3Mask128( |
218 | runtime::TeddySlim3Mask128 { |
219 | mask1: runtime::Mask128::new(masks[0]), |
220 | mask2: runtime::Mask128::new(masks[1]), |
221 | mask3: runtime::Mask128::new(masks[2]), |
222 | }, |
223 | ), |
224 | }) |
225 | } |
226 | (3, true, false) => { |
227 | debug!("Teddy choice: 256-bit slim, 3 bytes" ); |
228 | Some(Teddy { |
229 | buckets, |
230 | max_pattern_id: patterns.max_pattern_id(), |
231 | exec: runtime::Exec::TeddySlim3Mask256( |
232 | runtime::TeddySlim3Mask256 { |
233 | mask1: runtime::Mask256::new(masks[0]), |
234 | mask2: runtime::Mask256::new(masks[1]), |
235 | mask3: runtime::Mask256::new(masks[2]), |
236 | }, |
237 | ), |
238 | }) |
239 | } |
240 | (3, true, true) => { |
241 | debug!("Teddy choice: 256-bit fat, 3 bytes" ); |
242 | Some(Teddy { |
243 | buckets, |
244 | max_pattern_id: patterns.max_pattern_id(), |
245 | exec: runtime::Exec::TeddyFat3Mask256( |
246 | runtime::TeddyFat3Mask256 { |
247 | mask1: runtime::Mask256::new(masks[0]), |
248 | mask2: runtime::Mask256::new(masks[1]), |
249 | mask3: runtime::Mask256::new(masks[2]), |
250 | }, |
251 | ), |
252 | }) |
253 | } |
254 | (4, false, _) => { |
255 | debug!("Teddy choice: 128-bit slim, 4 bytes" ); |
256 | Some(Teddy { |
257 | buckets, |
258 | max_pattern_id: patterns.max_pattern_id(), |
259 | exec: runtime::Exec::TeddySlim4Mask128( |
260 | runtime::TeddySlim4Mask128 { |
261 | mask1: runtime::Mask128::new(masks[0]), |
262 | mask2: runtime::Mask128::new(masks[1]), |
263 | mask3: runtime::Mask128::new(masks[2]), |
264 | mask4: runtime::Mask128::new(masks[3]), |
265 | }, |
266 | ), |
267 | }) |
268 | } |
269 | (4, true, false) => { |
270 | debug!("Teddy choice: 256-bit slim, 4 bytes" ); |
271 | Some(Teddy { |
272 | buckets, |
273 | max_pattern_id: patterns.max_pattern_id(), |
274 | exec: runtime::Exec::TeddySlim4Mask256( |
275 | runtime::TeddySlim4Mask256 { |
276 | mask1: runtime::Mask256::new(masks[0]), |
277 | mask2: runtime::Mask256::new(masks[1]), |
278 | mask3: runtime::Mask256::new(masks[2]), |
279 | mask4: runtime::Mask256::new(masks[3]), |
280 | }, |
281 | ), |
282 | }) |
283 | } |
284 | (4, true, true) => { |
285 | debug!("Teddy choice: 256-bit fat, 4 bytes" ); |
286 | Some(Teddy { |
287 | buckets, |
288 | max_pattern_id: patterns.max_pattern_id(), |
289 | exec: runtime::Exec::TeddyFat4Mask256( |
290 | runtime::TeddyFat4Mask256 { |
291 | mask1: runtime::Mask256::new(masks[0]), |
292 | mask2: runtime::Mask256::new(masks[1]), |
293 | mask3: runtime::Mask256::new(masks[2]), |
294 | mask4: runtime::Mask256::new(masks[3]), |
295 | }, |
296 | ), |
297 | }) |
298 | } |
299 | _ => unreachable!(), |
300 | } |
301 | } |
302 | } |
303 | |
304 | /// A compiler is in charge of allocating patterns into buckets and generating |
305 | /// the masks necessary for searching. |
306 | #[derive (Clone)] |
307 | struct Compiler<'p> { |
308 | patterns: &'p Patterns, |
309 | buckets: Vec<Vec<PatternID>>, |
310 | masks: Vec<Mask>, |
311 | } |
312 | |
313 | impl<'p> Compiler<'p> { |
314 | /// Create a new Teddy compiler for the given patterns. If `fat` is true, |
315 | /// then 16 buckets will be used instead of 8. |
316 | /// |
317 | /// This panics if any of the patterns given are empty. |
318 | fn new(patterns: &'p Patterns, fat: bool) -> Compiler<'p> { |
319 | let mask_len = cmp::min(4, patterns.minimum_len()); |
320 | assert!(1 <= mask_len && mask_len <= 4); |
321 | |
322 | Compiler { |
323 | patterns, |
324 | buckets: vec![vec![]; if fat { 16 } else { 8 }], |
325 | masks: vec![Mask::default(); mask_len], |
326 | } |
327 | } |
328 | |
329 | /// Compile the patterns in this compiler into buckets and masks. |
330 | fn compile(&mut self) { |
331 | let mut lonibble_to_bucket: BTreeMap<Vec<u8>, usize> = BTreeMap::new(); |
332 | for (id, pattern) in self.patterns.iter() { |
333 | // We try to be slightly clever in how we assign patterns into |
334 | // buckets. Generally speaking, we want patterns with the same |
335 | // prefix to be in the same bucket, since it minimizes the amount |
336 | // of time we spend churning through buckets in the verification |
337 | // step. |
338 | // |
339 | // So we could assign patterns with the same N-prefix (where N |
340 | // is the size of the mask, which is one of {1, 2, 3}) to the |
341 | // same bucket. However, case insensitive searches are fairly |
342 | // common, so we'd for example, ideally want to treat `abc` and |
343 | // `ABC` as if they shared the same prefix. ASCII has the nice |
344 | // property that the lower 4 bits of A and a are the same, so we |
345 | // therefore group patterns with the same low-nybbe-N-prefix into |
346 | // the same bucket. |
347 | // |
348 | // MOREOVER, this is actually necessary for correctness! In |
349 | // particular, by grouping patterns with the same prefix into the |
350 | // same bucket, we ensure that we preserve correct leftmost-first |
351 | // and leftmost-longest match semantics. In addition to the fact |
352 | // that `patterns.iter()` iterates in the correct order, this |
353 | // guarantees that all possible ambiguous matches will occur in |
354 | // the same bucket. The verification routine could be adjusted to |
355 | // support correct leftmost match semantics regardless of bucket |
356 | // allocation, but that results in a performance hit. It's much |
357 | // nicer to be able to just stop as soon as a match is found. |
358 | let lonybs = pattern.low_nybbles(self.masks.len()); |
359 | if let Some(&bucket) = lonibble_to_bucket.get(&lonybs) { |
360 | self.buckets[bucket].push(id); |
361 | } else { |
362 | // N.B. We assign buckets in reverse because it shouldn't have |
363 | // any influence on performance, but it does make it harder to |
364 | // get leftmost match semantics accidentally correct. |
365 | let bucket = (self.buckets.len() - 1) |
366 | - (id as usize % self.buckets.len()); |
367 | self.buckets[bucket].push(id); |
368 | lonibble_to_bucket.insert(lonybs, bucket); |
369 | } |
370 | } |
371 | for (bucket_index, bucket) in self.buckets.iter().enumerate() { |
372 | for &pat_id in bucket { |
373 | let pat = self.patterns.get(pat_id); |
374 | for (i, mask) in self.masks.iter_mut().enumerate() { |
375 | if self.buckets.len() == 8 { |
376 | mask.add_slim(bucket_index as u8, pat.bytes()[i]); |
377 | } else { |
378 | mask.add_fat(bucket_index as u8, pat.bytes()[i]); |
379 | } |
380 | } |
381 | } |
382 | } |
383 | } |
384 | } |
385 | |
386 | impl<'p> fmt::Debug for Compiler<'p> { |
387 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
388 | let mut buckets: Vec>> = vec![vec![]; self.buckets.len()]; |
389 | for (i: usize, bucket: &Vec) in self.buckets.iter().enumerate() { |
390 | for &patid: u16 in bucket { |
391 | buckets[i].push(self.patterns.get(id:patid)); |
392 | } |
393 | } |
394 | f&mut DebugStruct<'_, '_>.debug_struct("Compiler" ) |
395 | .field("buckets" , &buckets) |
396 | .field(name:"masks" , &self.masks) |
397 | .finish() |
398 | } |
399 | } |
400 | |
401 | /// Mask represents the low and high nybble masks that will be used during |
402 | /// search. Each mask is 32 bytes wide, although only the first 16 bytes are |
403 | /// used for the SSSE3 runtime. |
404 | /// |
405 | /// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set |
406 | /// if and only if the corresponding nybble is in the ith bucket. The index of |
407 | /// the byte (0-15, inclusive) corresponds to the nybble. |
408 | /// |
409 | /// Each mask is used as the target of a shuffle, where the indices for the |
410 | /// shuffle are taken from the haystack. AND'ing the shuffles for both the |
411 | /// low and high masks together also results in 8-bit bitsets, but where bit |
412 | /// `i` is set if and only if the correspond *byte* is in the ith bucket. |
413 | /// |
414 | /// During compilation, masks are just arrays. But during search, these masks |
415 | /// are represented as 128-bit or 256-bit vectors. |
416 | /// |
417 | /// (See the README is this directory for more details.) |
418 | #[derive (Clone, Copy, Default)] |
419 | pub struct Mask { |
420 | lo: [u8; 32], |
421 | hi: [u8; 32], |
422 | } |
423 | |
424 | impl Mask { |
425 | /// Update this mask by adding the given byte to the given bucket. The |
426 | /// given bucket must be in the range 0-7. |
427 | /// |
428 | /// This is for "slim" Teddy, where there are only 8 buckets. |
429 | fn add_slim(&mut self, bucket: u8, byte: u8) { |
430 | assert!(bucket < 8); |
431 | |
432 | let byte_lo = (byte & 0xF) as usize; |
433 | let byte_hi = ((byte >> 4) & 0xF) as usize; |
434 | // When using 256-bit vectors, we need to set this bucket assignment in |
435 | // the low and high 128-bit portions of the mask. This allows us to |
436 | // process 32 bytes at a time. Namely, AVX2 shuffles operate on each |
437 | // of the 128-bit lanes, rather than the full 256-bit vector at once. |
438 | self.lo[byte_lo] |= 1 << bucket; |
439 | self.lo[byte_lo + 16] |= 1 << bucket; |
440 | self.hi[byte_hi] |= 1 << bucket; |
441 | self.hi[byte_hi + 16] |= 1 << bucket; |
442 | } |
443 | |
444 | /// Update this mask by adding the given byte to the given bucket. The |
445 | /// given bucket must be in the range 0-15. |
446 | /// |
447 | /// This is for "fat" Teddy, where there are 16 buckets. |
448 | fn add_fat(&mut self, bucket: u8, byte: u8) { |
449 | assert!(bucket < 16); |
450 | |
451 | let byte_lo = (byte & 0xF) as usize; |
452 | let byte_hi = ((byte >> 4) & 0xF) as usize; |
453 | // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy, |
454 | // the high 128 bits of our mask correspond to buckets 8-15, while the |
455 | // low 128 bits correspond to buckets 0-7. |
456 | if bucket < 8 { |
457 | self.lo[byte_lo] |= 1 << bucket; |
458 | self.hi[byte_hi] |= 1 << bucket; |
459 | } else { |
460 | self.lo[byte_lo + 16] |= 1 << (bucket % 8); |
461 | self.hi[byte_hi + 16] |= 1 << (bucket % 8); |
462 | } |
463 | } |
464 | |
465 | /// Return the low 128 bits of the low-nybble mask. |
466 | pub fn lo128(&self) -> [u8; 16] { |
467 | let mut tmp = [0; 16]; |
468 | tmp.copy_from_slice(&self.lo[..16]); |
469 | tmp |
470 | } |
471 | |
472 | /// Return the full low-nybble mask. |
473 | pub fn lo256(&self) -> [u8; 32] { |
474 | self.lo |
475 | } |
476 | |
477 | /// Return the low 128 bits of the high-nybble mask. |
478 | pub fn hi128(&self) -> [u8; 16] { |
479 | let mut tmp = [0; 16]; |
480 | tmp.copy_from_slice(&self.hi[..16]); |
481 | tmp |
482 | } |
483 | |
484 | /// Return the full high-nybble mask. |
485 | pub fn hi256(&self) -> [u8; 32] { |
486 | self.hi |
487 | } |
488 | } |
489 | |
490 | impl fmt::Debug for Mask { |
491 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
492 | let (mut parts_lo: Vec, mut parts_hi: Vec) = (vec![], vec![]); |
493 | for i: usize in 0..32 { |
494 | parts_lo.push(format!(" {:02}: {:08b}" , i, self.lo[i])); |
495 | parts_hi.push(format!(" {:02}: {:08b}" , i, self.hi[i])); |
496 | } |
497 | f&mut DebugStruct<'_, '_>.debug_struct("Mask" ) |
498 | .field("lo" , &parts_lo) |
499 | .field(name:"hi" , &parts_hi) |
500 | .finish() |
501 | } |
502 | } |
503 | |