1 | // See the README in this directory for an explanation of the Teddy algorithm. |
2 | |
3 | use std::cmp; |
4 | use std::collections::BTreeMap; |
5 | use std::fmt; |
6 | |
7 | use crate::packed::pattern::{PatternID, Patterns}; |
8 | use crate::packed::teddy::Teddy; |
9 | |
10 | /// A builder for constructing a Teddy matcher. |
11 | /// |
12 | /// The builder primarily permits fine grained configuration of the Teddy |
13 | /// matcher. Most options are made only available for testing/benchmarking |
14 | /// purposes. In reality, options are automatically determined by the nature |
15 | /// and number of patterns given to the builder. |
16 | #[derive (Clone, Debug)] |
17 | pub struct Builder { |
18 | /// When none, this is automatically determined. Otherwise, `false` means |
19 | /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used |
20 | /// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't |
21 | /// available and Fat Teddy was requested, no matcher will be built. |
22 | fat: Option<bool>, |
23 | /// When none, this is automatically determined. Otherwise, `false` means |
24 | /// that 128-bit vectors will be used (up to SSSE3 instructions) where as |
25 | /// `true` means that 256-bit vectors will be used. As with `fat`, if |
26 | /// 256-bit vectors are requested and they aren't available, then a |
27 | /// searcher will not be built. |
28 | avx: Option<bool>, |
29 | } |
30 | |
31 | impl Default for Builder { |
32 | fn default() -> Builder { |
33 | Builder::new() |
34 | } |
35 | } |
36 | |
37 | impl Builder { |
38 | /// Create a new builder for configuring a Teddy matcher. |
39 | pub fn new() -> Builder { |
40 | Builder { fat: None, avx: None } |
41 | } |
42 | |
43 | /// Build a matcher for the set of patterns given. If a matcher could not |
44 | /// be built, then `None` is returned. |
45 | /// |
46 | /// Generally, a matcher isn't built if the necessary CPU features aren't |
47 | /// available, an unsupported target or if the searcher is believed to be |
48 | /// slower than standard techniques (i.e., if there are too many literals). |
49 | pub fn build(&self, patterns: &Patterns) -> Option<Teddy> { |
50 | self.build_imp(patterns) |
51 | } |
52 | |
53 | /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses |
54 | /// 16 buckets where as Slim Teddy uses 8 buckets. More buckets are useful |
55 | /// for a larger set of literals. |
56 | /// |
57 | /// `None` is the default, which results in an automatic selection based |
58 | /// on the number of literals and available CPU features. |
59 | pub fn fat(&mut self, yes: Option<bool>) -> &mut Builder { |
60 | self.fat = yes; |
61 | self |
62 | } |
63 | |
64 | /// Request the use of 256-bit vectors (true) or 128-bit vectors (false). |
65 | /// Generally, a larger vector size is better since it either permits |
66 | /// matching more patterns or matching more bytes in the haystack at once. |
67 | /// |
68 | /// `None` is the default, which results in an automatic selection based on |
69 | /// the number of literals and available CPU features. |
70 | pub fn avx(&mut self, yes: Option<bool>) -> &mut Builder { |
71 | self.avx = yes; |
72 | self |
73 | } |
74 | |
75 | fn build_imp(&self, patterns: &Patterns) -> Option<Teddy> { |
76 | use crate::packed::teddy::runtime; |
77 | |
78 | // Most of the logic here is just about selecting the optimal settings, |
79 | // or perhaps even rejecting construction altogether. The choices |
80 | // we have are: fat (avx only) or not, ssse3 or avx2, and how many |
81 | // patterns we allow ourselves to search. Additionally, for testing |
82 | // and benchmarking, we permit callers to try to "force" a setting, |
83 | // and if the setting isn't allowed (e.g., forcing AVX when AVX isn't |
84 | // available), then we bail and return nothing. |
85 | |
86 | if patterns.len() > 64 { |
87 | return None; |
88 | } |
89 | let has_ssse3 = is_x86_feature_detected!("ssse3" ); |
90 | let has_avx = is_x86_feature_detected!("avx2" ); |
91 | let avx = if self.avx == Some(true) { |
92 | if !has_avx { |
93 | return None; |
94 | } |
95 | true |
96 | } else if self.avx == Some(false) { |
97 | if !has_ssse3 { |
98 | return None; |
99 | } |
100 | false |
101 | } else if !has_ssse3 && !has_avx { |
102 | return None; |
103 | } else { |
104 | has_avx |
105 | }; |
106 | let fat = match self.fat { |
107 | None => avx && patterns.len() > 32, |
108 | Some(false) => false, |
109 | Some(true) if !avx => return None, |
110 | Some(true) => true, |
111 | }; |
112 | |
113 | let mut compiler = Compiler::new(patterns, fat); |
114 | compiler.compile(); |
115 | let Compiler { buckets, masks, .. } = compiler; |
116 | // SAFETY: It is required that the builder only produce Teddy matchers |
117 | // that are allowed to run on the current CPU, since we later assume |
118 | // that the presence of (for example) TeddySlim1Mask256 means it is |
119 | // safe to call functions marked with the `avx2` target feature. |
120 | match (masks.len(), avx, fat) { |
121 | (1, false, _) => Some(Teddy { |
122 | buckets, |
123 | max_pattern_id: patterns.max_pattern_id(), |
124 | exec: runtime::Exec::TeddySlim1Mask128( |
125 | runtime::TeddySlim1Mask128 { |
126 | mask1: runtime::Mask128::new(masks[0]), |
127 | }, |
128 | ), |
129 | }), |
130 | (1, true, false) => Some(Teddy { |
131 | buckets, |
132 | max_pattern_id: patterns.max_pattern_id(), |
133 | exec: runtime::Exec::TeddySlim1Mask256( |
134 | runtime::TeddySlim1Mask256 { |
135 | mask1: runtime::Mask256::new(masks[0]), |
136 | }, |
137 | ), |
138 | }), |
139 | (1, true, true) => Some(Teddy { |
140 | buckets, |
141 | max_pattern_id: patterns.max_pattern_id(), |
142 | exec: runtime::Exec::TeddyFat1Mask256( |
143 | runtime::TeddyFat1Mask256 { |
144 | mask1: runtime::Mask256::new(masks[0]), |
145 | }, |
146 | ), |
147 | }), |
148 | (2, false, _) => Some(Teddy { |
149 | buckets, |
150 | max_pattern_id: patterns.max_pattern_id(), |
151 | exec: runtime::Exec::TeddySlim2Mask128( |
152 | runtime::TeddySlim2Mask128 { |
153 | mask1: runtime::Mask128::new(masks[0]), |
154 | mask2: runtime::Mask128::new(masks[1]), |
155 | }, |
156 | ), |
157 | }), |
158 | (2, true, false) => Some(Teddy { |
159 | buckets, |
160 | max_pattern_id: patterns.max_pattern_id(), |
161 | exec: runtime::Exec::TeddySlim2Mask256( |
162 | runtime::TeddySlim2Mask256 { |
163 | mask1: runtime::Mask256::new(masks[0]), |
164 | mask2: runtime::Mask256::new(masks[1]), |
165 | }, |
166 | ), |
167 | }), |
168 | (2, true, true) => Some(Teddy { |
169 | buckets, |
170 | max_pattern_id: patterns.max_pattern_id(), |
171 | exec: runtime::Exec::TeddyFat2Mask256( |
172 | runtime::TeddyFat2Mask256 { |
173 | mask1: runtime::Mask256::new(masks[0]), |
174 | mask2: runtime::Mask256::new(masks[1]), |
175 | }, |
176 | ), |
177 | }), |
178 | (3, false, _) => Some(Teddy { |
179 | buckets, |
180 | max_pattern_id: patterns.max_pattern_id(), |
181 | exec: runtime::Exec::TeddySlim3Mask128( |
182 | runtime::TeddySlim3Mask128 { |
183 | mask1: runtime::Mask128::new(masks[0]), |
184 | mask2: runtime::Mask128::new(masks[1]), |
185 | mask3: runtime::Mask128::new(masks[2]), |
186 | }, |
187 | ), |
188 | }), |
189 | (3, true, false) => Some(Teddy { |
190 | buckets, |
191 | max_pattern_id: patterns.max_pattern_id(), |
192 | exec: runtime::Exec::TeddySlim3Mask256( |
193 | runtime::TeddySlim3Mask256 { |
194 | mask1: runtime::Mask256::new(masks[0]), |
195 | mask2: runtime::Mask256::new(masks[1]), |
196 | mask3: runtime::Mask256::new(masks[2]), |
197 | }, |
198 | ), |
199 | }), |
200 | (3, true, true) => Some(Teddy { |
201 | buckets, |
202 | max_pattern_id: patterns.max_pattern_id(), |
203 | exec: runtime::Exec::TeddyFat3Mask256( |
204 | runtime::TeddyFat3Mask256 { |
205 | mask1: runtime::Mask256::new(masks[0]), |
206 | mask2: runtime::Mask256::new(masks[1]), |
207 | mask3: runtime::Mask256::new(masks[2]), |
208 | }, |
209 | ), |
210 | }), |
211 | _ => unreachable!(), |
212 | } |
213 | } |
214 | } |
215 | |
216 | /// A compiler is in charge of allocating patterns into buckets and generating |
217 | /// the masks necessary for searching. |
218 | #[derive (Clone)] |
219 | struct Compiler<'p> { |
220 | patterns: &'p Patterns, |
221 | buckets: Vec<Vec<PatternID>>, |
222 | masks: Vec<Mask>, |
223 | } |
224 | |
225 | impl<'p> Compiler<'p> { |
226 | /// Create a new Teddy compiler for the given patterns. If `fat` is true, |
227 | /// then 16 buckets will be used instead of 8. |
228 | /// |
229 | /// This panics if any of the patterns given are empty. |
230 | fn new(patterns: &'p Patterns, fat: bool) -> Compiler<'p> { |
231 | let mask_len = cmp::min(3, patterns.minimum_len()); |
232 | assert!(1 <= mask_len && mask_len <= 3); |
233 | |
234 | Compiler { |
235 | patterns, |
236 | buckets: vec![vec![]; if fat { 16 } else { 8 }], |
237 | masks: vec![Mask::default(); mask_len], |
238 | } |
239 | } |
240 | |
241 | /// Compile the patterns in this compiler into buckets and masks. |
242 | fn compile(&mut self) { |
243 | let mut lonibble_to_bucket: BTreeMap<Vec<u8>, usize> = BTreeMap::new(); |
244 | for (id, pattern) in self.patterns.iter() { |
245 | // We try to be slightly clever in how we assign patterns into |
246 | // buckets. Generally speaking, we want patterns with the same |
247 | // prefix to be in the same bucket, since it minimizes the amount |
248 | // of time we spend churning through buckets in the verification |
249 | // step. |
250 | // |
251 | // So we could assign patterns with the same N-prefix (where N |
252 | // is the size of the mask, which is one of {1, 2, 3}) to the |
253 | // same bucket. However, case insensitive searches are fairly |
254 | // common, so we'd for example, ideally want to treat `abc` and |
255 | // `ABC` as if they shared the same prefix. ASCII has the nice |
256 | // property that the lower 4 bits of A and a are the same, so we |
257 | // therefore group patterns with the same low-nybbe-N-prefix into |
258 | // the same bucket. |
259 | // |
260 | // MOREOVER, this is actually necessary for correctness! In |
261 | // particular, by grouping patterns with the same prefix into the |
262 | // same bucket, we ensure that we preserve correct leftmost-first |
263 | // and leftmost-longest match semantics. In addition to the fact |
264 | // that `patterns.iter()` iterates in the correct order, this |
265 | // guarantees that all possible ambiguous matches will occur in |
266 | // the same bucket. The verification routine could be adjusted to |
267 | // support correct leftmost match semantics regardless of bucket |
268 | // allocation, but that results in a performance hit. It's much |
269 | // nicer to be able to just stop as soon as a match is found. |
270 | let lonybs = pattern.low_nybbles(self.masks.len()); |
271 | if let Some(&bucket) = lonibble_to_bucket.get(&lonybs) { |
272 | self.buckets[bucket].push(id); |
273 | } else { |
274 | // N.B. We assign buckets in reverse because it shouldn't have |
275 | // any influence on performance, but it does make it harder to |
276 | // get leftmost match semantics accidentally correct. |
277 | let bucket = (self.buckets.len() - 1) |
278 | - (id as usize % self.buckets.len()); |
279 | self.buckets[bucket].push(id); |
280 | lonibble_to_bucket.insert(lonybs, bucket); |
281 | } |
282 | } |
283 | for (bucket_index, bucket) in self.buckets.iter().enumerate() { |
284 | for &pat_id in bucket { |
285 | let pat = self.patterns.get(pat_id); |
286 | for (i, mask) in self.masks.iter_mut().enumerate() { |
287 | if self.buckets.len() == 8 { |
288 | mask.add_slim(bucket_index as u8, pat.bytes()[i]); |
289 | } else { |
290 | mask.add_fat(bucket_index as u8, pat.bytes()[i]); |
291 | } |
292 | } |
293 | } |
294 | } |
295 | } |
296 | } |
297 | |
298 | impl<'p> fmt::Debug for Compiler<'p> { |
299 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
300 | let mut buckets: Vec>> = vec![vec![]; self.buckets.len()]; |
301 | for (i: usize, bucket: &Vec) in self.buckets.iter().enumerate() { |
302 | for &patid: u16 in bucket { |
303 | buckets[i].push(self.patterns.get(id:patid)); |
304 | } |
305 | } |
306 | f&mut DebugStruct<'_, '_>.debug_struct("Compiler" ) |
307 | .field("buckets" , &buckets) |
308 | .field(name:"masks" , &self.masks) |
309 | .finish() |
310 | } |
311 | } |
312 | |
313 | /// Mask represents the low and high nybble masks that will be used during |
314 | /// search. Each mask is 32 bytes wide, although only the first 16 bytes are |
315 | /// used for the SSSE3 runtime. |
316 | /// |
317 | /// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set |
318 | /// if and only if the corresponding nybble is in the ith bucket. The index of |
319 | /// the byte (0-15, inclusive) corresponds to the nybble. |
320 | /// |
321 | /// Each mask is used as the target of a shuffle, where the indices for the |
322 | /// shuffle are taken from the haystack. AND'ing the shuffles for both the |
323 | /// low and high masks together also results in 8-bit bitsets, but where bit |
324 | /// `i` is set if and only if the correspond *byte* is in the ith bucket. |
325 | /// |
326 | /// During compilation, masks are just arrays. But during search, these masks |
327 | /// are represented as 128-bit or 256-bit vectors. |
328 | /// |
329 | /// (See the README is this directory for more details.) |
330 | #[derive (Clone, Copy, Default)] |
331 | pub struct Mask { |
332 | lo: [u8; 32], |
333 | hi: [u8; 32], |
334 | } |
335 | |
336 | impl Mask { |
337 | /// Update this mask by adding the given byte to the given bucket. The |
338 | /// given bucket must be in the range 0-7. |
339 | /// |
340 | /// This is for "slim" Teddy, where there are only 8 buckets. |
341 | fn add_slim(&mut self, bucket: u8, byte: u8) { |
342 | assert!(bucket < 8); |
343 | |
344 | let byte_lo = (byte & 0xF) as usize; |
345 | let byte_hi = ((byte >> 4) & 0xF) as usize; |
346 | // When using 256-bit vectors, we need to set this bucket assignment in |
347 | // the low and high 128-bit portions of the mask. This allows us to |
348 | // process 32 bytes at a time. Namely, AVX2 shuffles operate on each |
349 | // of the 128-bit lanes, rather than the full 256-bit vector at once. |
350 | self.lo[byte_lo] |= 1 << bucket; |
351 | self.lo[byte_lo + 16] |= 1 << bucket; |
352 | self.hi[byte_hi] |= 1 << bucket; |
353 | self.hi[byte_hi + 16] |= 1 << bucket; |
354 | } |
355 | |
356 | /// Update this mask by adding the given byte to the given bucket. The |
357 | /// given bucket must be in the range 0-15. |
358 | /// |
359 | /// This is for "fat" Teddy, where there are 16 buckets. |
360 | fn add_fat(&mut self, bucket: u8, byte: u8) { |
361 | assert!(bucket < 16); |
362 | |
363 | let byte_lo = (byte & 0xF) as usize; |
364 | let byte_hi = ((byte >> 4) & 0xF) as usize; |
365 | // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy, |
366 | // the high 128 bits of our mask correspond to buckets 8-15, while the |
367 | // low 128 bits correspond to buckets 0-7. |
368 | if bucket < 8 { |
369 | self.lo[byte_lo] |= 1 << bucket; |
370 | self.hi[byte_hi] |= 1 << bucket; |
371 | } else { |
372 | self.lo[byte_lo + 16] |= 1 << (bucket % 8); |
373 | self.hi[byte_hi + 16] |= 1 << (bucket % 8); |
374 | } |
375 | } |
376 | |
377 | /// Return the low 128 bits of the low-nybble mask. |
378 | pub fn lo128(&self) -> [u8; 16] { |
379 | let mut tmp = [0; 16]; |
380 | tmp.copy_from_slice(&self.lo[..16]); |
381 | tmp |
382 | } |
383 | |
384 | /// Return the full low-nybble mask. |
385 | pub fn lo256(&self) -> [u8; 32] { |
386 | self.lo |
387 | } |
388 | |
389 | /// Return the low 128 bits of the high-nybble mask. |
390 | pub fn hi128(&self) -> [u8; 16] { |
391 | let mut tmp = [0; 16]; |
392 | tmp.copy_from_slice(&self.hi[..16]); |
393 | tmp |
394 | } |
395 | |
396 | /// Return the full high-nybble mask. |
397 | pub fn hi256(&self) -> [u8; 32] { |
398 | self.hi |
399 | } |
400 | } |
401 | |
402 | impl fmt::Debug for Mask { |
403 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
404 | let (mut parts_lo: Vec, mut parts_hi: Vec) = (vec![], vec![]); |
405 | for i: usize in 0..32 { |
406 | parts_lo.push(format!(" {:02}: {:08b}" , i, self.lo[i])); |
407 | parts_hi.push(format!(" {:02}: {:08b}" , i, self.hi[i])); |
408 | } |
409 | f&mut DebugStruct<'_, '_>.debug_struct("Mask" ) |
410 | .field("lo" , &parts_lo) |
411 | .field(name:"hi" , &parts_hi) |
412 | .finish() |
413 | } |
414 | } |
415 | |