compile.rs source code [crates/aho-corasick-1.0.2/src/packed/teddy/compile.rs]

1	// See the README in this directory for an explanation of the Teddy algorithm.
2
3	use core::{cmp, fmt};
4
5	use alloc::{collections::BTreeMap, format, vec, vec::Vec};
6
7	use crate::packed::{
8	pattern::{PatternID, Patterns},
9	teddy::Teddy,
10	};
11
12	/// A builder for constructing a Teddy matcher.
13	///
14	/// The builder primarily permits fine grained configuration of the Teddy
15	/// matcher. Most options are made only available for testing/benchmarking
16	/// purposes. In reality, options are automatically determined by the nature
17	/// and number of patterns given to the builder.
18	#[derive(Clone, Debug)]
19	pub struct Builder {
20	/// When none, this is automatically determined. Otherwise, `false` means
21	/// slim Teddy is used (8 buckets) and `true` means fat Teddy is used
22	/// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't
23	/// available and Fat Teddy was requested, no matcher will be built.
24	fat: Option<bool>,
25	/// When none, this is automatically determined. Otherwise, `false` means
26	/// that 128-bit vectors will be used (up to SSSE3 instructions) where as
27	/// `true` means that 256-bit vectors will be used. As with `fat`, if
28	/// 256-bit vectors are requested and they aren't available, then a
29	/// searcher will not be built.
30	avx: Option<bool>,
31	}
32
33	impl Default for Builder {
34	fn default() -> Builder {
35	Builder::new()
36	}
37	}
38
39	impl Builder {
40	/// Create a new builder for configuring a Teddy matcher.
41	pub fn new() -> Builder {
42	Builder { fat: None, avx: None }
43	}
44
45	/// Build a matcher for the set of patterns given. If a matcher could not
46	/// be built, then `None` is returned.
47	///
48	/// Generally, a matcher isn't built if the necessary CPU features aren't
49	/// available, an unsupported target or if the searcher is believed to be
50	/// slower than standard techniques (i.e., if there are too many literals).
51	pub fn build(&self, patterns: &Patterns) -> Option<Teddy> {
52	self.build_imp(patterns)
53	}
54
55	/// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses
56	/// 16 buckets where as Slim Teddy uses 8 buckets. More buckets are useful
57	/// for a larger set of literals.
58	///
59	/// `None` is the default, which results in an automatic selection based
60	/// on the number of literals and available CPU features.
61	pub fn fat(&mut self, yes: Option<bool>) -> &mut Builder {
62	self.fat = yes;
63	self
64	}
65
66	/// Request the use of 256-bit vectors (true) or 128-bit vectors (false).
67	/// Generally, a larger vector size is better since it either permits
68	/// matching more patterns or matching more bytes in the haystack at once.
69	///
70	/// `None` is the default, which results in an automatic selection based on
71	/// the number of literals and available CPU features.
72	pub fn avx(&mut self, yes: Option<bool>) -> &mut Builder {
73	self.avx = yes;
74	self
75	}
76
77	fn build_imp(&self, patterns: &Patterns) -> Option<Teddy> {
78	use crate::packed::teddy::runtime;
79
80	// Most of the logic here is just about selecting the optimal settings,
81	// or perhaps even rejecting construction altogether. The choices
82	// we have are: fat (avx only) or not, ssse3 or avx2, and how many
83	// patterns we allow ourselves to search. Additionally, for testing
84	// and benchmarking, we permit callers to try to "force" a setting,
85	// and if the setting isn't allowed (e.g., forcing AVX when AVX isn't
86	// available), then we bail and return nothing.
87
88	if patterns.len() > `64` {
89	debug!("skipping Teddy because of too many patterns");
90	return None;
91	}
92	let has_ssse3 = std::is_x86_feature_detected!("ssse3");
93	let has_avx = std::is_x86_feature_detected!("avx2");
94	let avx = if self.avx == Some(`true`) {
95	if !has_avx {
96	debug!(
97	"skipping Teddy because avx was demanded but unavailable"
98	);
99	return None;
100	}
101	`true`
102	} else if self.avx == Some(`false`) {
103	if !has_ssse3 {
104	debug!(
105	"skipping Teddy because ssse3 was demanded but unavailable"
106	);
107	return None;
108	}
109	`false`
110	} else if !has_ssse3 && !has_avx {
111	debug!("skipping Teddy because ssse3 and avx are unavailable");
112	return None;
113	} else {
114	has_avx
115	};
116	let fat = match self.fat {
117	None => avx && patterns.len() > `32`,
118	Some(`false`) => `false`,
119	Some(`true`) if !avx => {
120	debug!(
121	"skipping Teddy because it needs to be fat, but fat \
122	Teddy requires avx which is unavailable"
123	);
124	return None;
125	}
126	Some(`true`) => `true`,
127	};
128
129	let mut compiler = Compiler::new(patterns, fat);
130	compiler.compile();
131	let Compiler { buckets, masks, .. } = compiler;
132	// SAFETY: It is required that the builder only produce Teddy matchers
133	// that are allowed to run on the current CPU, since we later assume
134	// that the presence of (for example) TeddySlim1Mask256 means it is
135	// safe to call functions marked with the `avx2` target feature.
136	match (masks.len(), avx, fat) {
137	(`1`, `false`, _) => {
138	debug!("Teddy choice: 128-bit slim, 1 byte");
139	Some(Teddy {
140	buckets,
141	max_pattern_id: patterns.max_pattern_id(),
142	exec: runtime::Exec::TeddySlim1Mask128(
143	runtime::TeddySlim1Mask128 {
144	mask1: runtime::Mask128::new(masks[`0`]),
145	},
146	),
147	})
148	}
149	(`1`, `true`, `false`) => {
150	debug!("Teddy choice: 256-bit slim, 1 byte");
151	Some(Teddy {
152	buckets,
153	max_pattern_id: patterns.max_pattern_id(),
154	exec: runtime::Exec::TeddySlim1Mask256(
155	runtime::TeddySlim1Mask256 {
156	mask1: runtime::Mask256::new(masks[`0`]),
157	},
158	),
159	})
160	}
161	(`1`, `true`, `true`) => {
162	debug!("Teddy choice: 256-bit fat, 1 byte");
163	Some(Teddy {
164	buckets,
165	max_pattern_id: patterns.max_pattern_id(),
166	exec: runtime::Exec::TeddyFat1Mask256(
167	runtime::TeddyFat1Mask256 {
168	mask1: runtime::Mask256::new(masks[`0`]),
169	},
170	),
171	})
172	}
173	(`2`, `false`, _) => {
174	debug!("Teddy choice: 128-bit slim, 2 bytes");
175	Some(Teddy {
176	buckets,
177	max_pattern_id: patterns.max_pattern_id(),
178	exec: runtime::Exec::TeddySlim2Mask128(
179	runtime::TeddySlim2Mask128 {
180	mask1: runtime::Mask128::new(masks[`0`]),
181	mask2: runtime::Mask128::new(masks[`1`]),
182	},
183	),
184	})
185	}
186	(`2`, `true`, `false`) => {
187	debug!("Teddy choice: 256-bit slim, 2 bytes");
188	Some(Teddy {
189	buckets,
190	max_pattern_id: patterns.max_pattern_id(),
191	exec: runtime::Exec::TeddySlim2Mask256(
192	runtime::TeddySlim2Mask256 {
193	mask1: runtime::Mask256::new(masks[`0`]),
194	mask2: runtime::Mask256::new(masks[`1`]),
195	},
196	),
197	})
198	}
199	(`2`, `true`, `true`) => {
200	debug!("Teddy choice: 256-bit fat, 2 bytes");
201	Some(Teddy {
202	buckets,
203	max_pattern_id: patterns.max_pattern_id(),
204	exec: runtime::Exec::TeddyFat2Mask256(
205	runtime::TeddyFat2Mask256 {
206	mask1: runtime::Mask256::new(masks[`0`]),
207	mask2: runtime::Mask256::new(masks[`1`]),
208	},
209	),
210	})
211	}
212	(`3`, `false`, _) => {
213	debug!("Teddy choice: 128-bit slim, 3 bytes");
214	Some(Teddy {
215	buckets,
216	max_pattern_id: patterns.max_pattern_id(),
217	exec: runtime::Exec::TeddySlim3Mask128(
218	runtime::TeddySlim3Mask128 {
219	mask1: runtime::Mask128::new(masks[`0`]),
220	mask2: runtime::Mask128::new(masks[`1`]),
221	mask3: runtime::Mask128::new(masks[`2`]),
222	},
223	),
224	})
225	}
226	(`3`, `true`, `false`) => {
227	debug!("Teddy choice: 256-bit slim, 3 bytes");
228	Some(Teddy {
229	buckets,
230	max_pattern_id: patterns.max_pattern_id(),
231	exec: runtime::Exec::TeddySlim3Mask256(
232	runtime::TeddySlim3Mask256 {
233	mask1: runtime::Mask256::new(masks[`0`]),
234	mask2: runtime::Mask256::new(masks[`1`]),
235	mask3: runtime::Mask256::new(masks[`2`]),
236	},
237	),
238	})
239	}
240	(`3`, `true`, `true`) => {
241	debug!("Teddy choice: 256-bit fat, 3 bytes");
242	Some(Teddy {
243	buckets,
244	max_pattern_id: patterns.max_pattern_id(),
245	exec: runtime::Exec::TeddyFat3Mask256(
246	runtime::TeddyFat3Mask256 {
247	mask1: runtime::Mask256::new(masks[`0`]),
248	mask2: runtime::Mask256::new(masks[`1`]),
249	mask3: runtime::Mask256::new(masks[`2`]),
250	},
251	),
252	})
253	}
254	(`4`, `false`, _) => {
255	debug!("Teddy choice: 128-bit slim, 4 bytes");
256	Some(Teddy {
257	buckets,
258	max_pattern_id: patterns.max_pattern_id(),
259	exec: runtime::Exec::TeddySlim4Mask128(
260	runtime::TeddySlim4Mask128 {
261	mask1: runtime::Mask128::new(masks[`0`]),
262	mask2: runtime::Mask128::new(masks[`1`]),
263	mask3: runtime::Mask128::new(masks[`2`]),
264	mask4: runtime::Mask128::new(masks[`3`]),
265	},
266	),
267	})
268	}
269	(`4`, `true`, `false`) => {
270	debug!("Teddy choice: 256-bit slim, 4 bytes");
271	Some(Teddy {
272	buckets,
273	max_pattern_id: patterns.max_pattern_id(),
274	exec: runtime::Exec::TeddySlim4Mask256(
275	runtime::TeddySlim4Mask256 {
276	mask1: runtime::Mask256::new(masks[`0`]),
277	mask2: runtime::Mask256::new(masks[`1`]),
278	mask3: runtime::Mask256::new(masks[`2`]),
279	mask4: runtime::Mask256::new(masks[`3`]),
280	},
281	),
282	})
283	}
284	(`4`, `true`, `true`) => {
285	debug!("Teddy choice: 256-bit fat, 4 bytes");
286	Some(Teddy {
287	buckets,
288	max_pattern_id: patterns.max_pattern_id(),
289	exec: runtime::Exec::TeddyFat4Mask256(
290	runtime::TeddyFat4Mask256 {
291	mask1: runtime::Mask256::new(masks[`0`]),
292	mask2: runtime::Mask256::new(masks[`1`]),
293	mask3: runtime::Mask256::new(masks[`2`]),
294	mask4: runtime::Mask256::new(masks[`3`]),
295	},
296	),
297	})
298	}
299	_ => unreachable!(),
300	}
301	}
302	}
303
304	/// A compiler is in charge of allocating patterns into buckets and generating
305	/// the masks necessary for searching.
306	#[derive(Clone)]
307	struct Compiler<'p> {
308	patterns: &'p Patterns,
309	buckets: Vec<Vec<PatternID>>,
310	masks: Vec<Mask>,
311	}
312
313	impl<'p> Compiler<'p> {
314	/// Create a new Teddy compiler for the given patterns. If `fat` is true,
315	/// then 16 buckets will be used instead of 8.
316	///
317	/// This panics if any of the patterns given are empty.
318	fn new(patterns: &'p Patterns, fat: bool) -> Compiler<'p> {
319	let mask_len = cmp::min(`4`, patterns.minimum_len());
320	assert!(`1` <= mask_len && mask_len <= `4`);
321
322	Compiler {
323	patterns,
324	buckets: vec![vec![]; if fat { `16` } else { `8` }],
325	masks: vec![Mask::default(); mask_len],
326	}
327	}
328
329	/// Compile the patterns in this compiler into buckets and masks.
330	fn compile(&mut self) {
331	let mut lonibble_to_bucket: BTreeMap<Vec<u8>, usize> = BTreeMap::new();
332	for (id, pattern) in self.patterns.iter() {
333	// We try to be slightly clever in how we assign patterns into
334	// buckets. Generally speaking, we want patterns with the same
335	// prefix to be in the same bucket, since it minimizes the amount
336	// of time we spend churning through buckets in the verification
337	// step.
338	//
339	// So we could assign patterns with the same N-prefix (where N
340	// is the size of the mask, which is one of {1, 2, 3}) to the
341	// same bucket. However, case insensitive searches are fairly
342	// common, so we'd for example, ideally want to treat `abc` and
343	// `ABC` as if they shared the same prefix. ASCII has the nice
344	// property that the lower 4 bits of A and a are the same, so we
345	// therefore group patterns with the same low-nybbe-N-prefix into
346	// the same bucket.
347	//
348	// MOREOVER, this is actually necessary for correctness! In
349	// particular, by grouping patterns with the same prefix into the
350	// same bucket, we ensure that we preserve correct leftmost-first
351	// and leftmost-longest match semantics. In addition to the fact
352	// that `patterns.iter()` iterates in the correct order, this
353	// guarantees that all possible ambiguous matches will occur in
354	// the same bucket. The verification routine could be adjusted to
355	// support correct leftmost match semantics regardless of bucket
356	// allocation, but that results in a performance hit. It's much
357	// nicer to be able to just stop as soon as a match is found.
358	let lonybs = pattern.low_nybbles(self.masks.len());
359	if let Some(&bucket) = lonibble_to_bucket.get(&lonybs) {
360	self.buckets[bucket].push(id);
361	} else {
362	// N.B. We assign buckets in reverse because it shouldn't have
363	// any influence on performance, but it does make it harder to
364	// get leftmost match semantics accidentally correct.
365	let bucket = (self.buckets.len() - `1`)
366	- (id as usize % self.buckets.len());
367	self.buckets[bucket].push(id);
368	lonibble_to_bucket.insert(lonybs, bucket);
369	}
370	}
371	for (bucket_index, bucket) in self.buckets.iter().enumerate() {
372	for &pat_id in bucket {
373	let pat = self.patterns.get(pat_id);
374	for (i, mask) in self.masks.iter_mut().enumerate() {
375	if self.buckets.len() == `8` {
376	mask.add_slim(bucket_index as u8, pat.bytes()[i]);
377	} else {
378	mask.add_fat(bucket_index as u8, pat.bytes()[i]);
379	}
380	}
381	}
382	}
383	}
384	}
385
386	impl<'p> fmt::Debug for Compiler<'p> {
387	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
388	let mut buckets: Vec>> = vec![vec![]; self.buckets.len()];
389	for (i: usize, bucket: &Vec) in self.buckets.iter().enumerate() {
390	for &patid: u16 in bucket {
391	buckets[i].push(self.patterns.get(id:patid));
392	}
393	}
394	f&mut DebugStruct<'_, '_>.debug_struct("Compiler")
395	.field("buckets", &buckets)
396	.field(name:"masks", &self.masks)
397	.finish()
398	}
399	}
400
401	/// Mask represents the low and high nybble masks that will be used during
402	/// search. Each mask is 32 bytes wide, although only the first 16 bytes are
403	/// used for the SSSE3 runtime.
404	///
405	/// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set
406	/// if and only if the corresponding nybble is in the ith bucket. The index of
407	/// the byte (0-15, inclusive) corresponds to the nybble.
408	///
409	/// Each mask is used as the target of a shuffle, where the indices for the
410	/// shuffle are taken from the haystack. AND'ing the shuffles for both the
411	/// low and high masks together also results in 8-bit bitsets, but where bit
412	/// `i` is set if and only if the correspond byte* is in the ith bucket.*
413	///
414	/// During compilation, masks are just arrays. But during search, these masks
415	/// are represented as 128-bit or 256-bit vectors.
416	///
417	/// (See the README is this directory for more details.)
418	#[derive(Clone, Copy, Default)]
419	pub struct Mask {
420	lo: [u8; `32`],
421	hi: [u8; `32`],
422	}
423
424	impl Mask {
425	/// Update this mask by adding the given byte to the given bucket. The
426	/// given bucket must be in the range 0-7.
427	///
428	/// This is for "slim" Teddy, where there are only 8 buckets.
429	fn add_slim(&mut self, bucket: u8, byte: u8) {
430	assert!(bucket < `8`);
431
432	let byte_lo = (byte & `0xF`) as usize;
433	let byte_hi = ((byte >> `4`) & `0xF`) as usize;
434	// When using 256-bit vectors, we need to set this bucket assignment in
435	// the low and high 128-bit portions of the mask. This allows us to
436	// process 32 bytes at a time. Namely, AVX2 shuffles operate on each
437	// of the 128-bit lanes, rather than the full 256-bit vector at once.
438	self.lo[byte_lo] \|= `1` << bucket;
439	self.lo[byte_lo + `16`] \|= `1` << bucket;
440	self.hi[byte_hi] \|= `1` << bucket;
441	self.hi[byte_hi + `16`] \|= `1` << bucket;
442	}
443
444	/// Update this mask by adding the given byte to the given bucket. The
445	/// given bucket must be in the range 0-15.
446	///
447	/// This is for "fat" Teddy, where there are 16 buckets.
448	fn add_fat(&mut self, bucket: u8, byte: u8) {
449	assert!(bucket < `16`);
450
451	let byte_lo = (byte & `0xF`) as usize;
452	let byte_hi = ((byte >> `4`) & `0xF`) as usize;
453	// Unlike slim teddy, fat teddy only works with AVX2. For fat teddy,
454	// the high 128 bits of our mask correspond to buckets 8-15, while the
455	// low 128 bits correspond to buckets 0-7.
456	if bucket < `8` {
457	self.lo[byte_lo] \|= `1` << bucket;
458	self.hi[byte_hi] \|= `1` << bucket;
459	} else {
460	self.lo[byte_lo + `16`] \|= `1` << (bucket % `8`);
461	self.hi[byte_hi + `16`] \|= `1` << (bucket % `8`);
462	}
463	}
464
465	/// Return the low 128 bits of the low-nybble mask.
466	pub fn lo128(&self) -> [u8; `16`] {
467	let mut tmp = [`0`; `16`];
468	tmp.copy_from_slice(&self.lo[..`16`]);
469	tmp
470	}
471
472	/// Return the full low-nybble mask.
473	pub fn lo256(&self) -> [u8; `32`] {
474	self.lo
475	}
476
477	/// Return the low 128 bits of the high-nybble mask.
478	pub fn hi128(&self) -> [u8; `16`] {
479	let mut tmp = [`0`; `16`];
480	tmp.copy_from_slice(&self.hi[..`16`]);
481	tmp
482	}
483
484	/// Return the full high-nybble mask.
485	pub fn hi256(&self) -> [u8; `32`] {
486	self.hi
487	}
488	}
489
490	impl fmt::Debug for Mask {
491	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
492	let (mut parts_lo: Vec, mut parts_hi: Vec) = (vec![], vec![]);
493	for i: usize in `0`..`32` {
494	parts_lo.push(format!("{:`02`}: {:`08`b}", i, self.lo[i]));
495	parts_hi.push(format!("{:`02`}: {:`08`b}", i, self.hi[i]));
496	}
497	f&mut DebugStruct<'_, '_>.debug_struct("Mask")
498	.field("lo", &parts_lo)
499	.field(name:"hi", &parts_hi)
500	.finish()
501	}
502	}
503