1 | // See the README in this directory for an explanation of the Teddy algorithm. |
2 | // It is strongly recommended to peruse the README before trying to grok this |
3 | // code, as its use of SIMD is pretty opaque, although I tried to add comments |
4 | // where appropriate. |
5 | // |
6 | // Moreover, while there is a lot of code in this file, most of it is |
7 | // repeated variants of the same thing. Specifically, there are three Teddy |
8 | // variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets) |
9 | // and Fat 256-bit Teddy (16 buckets). For each variant, there are three |
10 | // implementations, corresponding to mask lengths of 1, 2 and 3. Bringing it to |
11 | // a total of nine variants. Each one is structured roughly the same: |
12 | // |
13 | // while at <= len(haystack) - CHUNK_SIZE: |
14 | // let candidate = find_candidate_in_chunk(haystack, at) |
15 | // if not all zeroes(candidate): |
16 | // if match = verify(haystack, at, candidate): |
17 | // return match |
18 | // |
19 | // For the most part, this remains unchanged. The parts that vary are the |
20 | // verification routine (for slim vs fat Teddy) and the candidate extraction |
21 | // (based on the number of masks). |
22 | // |
23 | // In the code below, a "candidate" corresponds to a single vector with 8-bit |
24 | // lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the |
25 | // jth lane if and only if the byte occurring at position `j` is in the |
26 | // bucket `i` (where the `j`th position is the position in the current window |
27 | // of the haystack, which is always 16 or 32 bytes). Note to be careful here: |
28 | // the ith bit and the jth lane correspond to the least significant bits of the |
29 | // vector. So when visualizing how the current window of bytes is stored in a |
30 | // vector, you often need to flip it around. For example, the text `abcd` in a |
31 | // 4-byte vector would look like this: |
32 | // |
33 | // 01100100 01100011 01100010 01100001 |
34 | // d c b a |
35 | // |
// When the mask length is 1, then finding the candidate is pretty
// straightforward: you just apply the shuffle indices (from the haystack window) to
38 | // the masks, and then AND them together, as described in the README. But for |
39 | // masks of length 2 and 3, you need to keep a little state. Specifically, |
40 | // you need to store the final 1 (for mask length 2) or 2 (for mask length 3) |
41 | // bytes of the candidate for use when searching the next window. This is for |
42 | // handling matches that span two windows. |
43 | // |
44 | // With respect to the repeated code, it would likely be possible to reduce |
45 | // the number of copies of code below using polymorphism, but I find this |
46 | // formulation clearer instead of needing to reason through generics. However, |
47 | // I admit, there may be a simpler generic construction that I'm missing. |
48 | // |
49 | // All variants are fairly heavily tested in src/packed/tests.rs. |
50 | |
51 | use std::arch::x86_64::*; |
52 | use std::mem; |
53 | |
54 | use crate::packed::pattern::{PatternID, Patterns}; |
55 | use crate::packed::teddy::compile; |
56 | use crate::packed::vector::*; |
57 | use crate::Match; |
58 | |
59 | /// The Teddy runtime. |
60 | /// |
61 | /// A Teddy runtime can be used to quickly search for occurrences of one or |
62 | /// more patterns. While it does not scale to an arbitrary number of patterns |
63 | /// like Aho-Corasick, it does find occurrences for a small set of patterns |
64 | /// much more quickly than Aho-Corasick. |
65 | /// |
66 | /// Teddy cannot run on small haystacks below a certain size, which is |
67 | /// dependent on the type of matcher used. This size can be queried via the |
68 | /// `minimum_len` method. Violating this will result in a panic. |
69 | /// |
70 | /// Finally, when callers use a Teddy runtime, they must provide precisely the |
71 | /// patterns used to construct the Teddy matcher. Violating this will result |
72 | /// in either a panic or incorrect results, but will never sacrifice memory |
73 | /// safety. |
#[derive (Clone, Debug)]
pub struct Teddy {
    /// The allocation of patterns in buckets. This only contains the IDs of
    /// patterns. In order to do full verification, callers must provide the
    /// actual patterns when using Teddy.
    ///
    /// The number of buckets is either 8 (slim Teddy) or 16 (fat Teddy); the
    /// verification routines below debug-assert this invariant.
    pub buckets: Vec<Vec<PatternID>>,
    /// The maximum identifier of a pattern. This is used as a sanity check to
    /// ensure that the patterns provided by the caller are the same as the
    /// patterns that were used to compile the matcher. This sanity check
    /// permits safely eliminating bounds checks regardless of what patterns
    /// are provided by the caller.
    ///
    /// Note that users of the aho-corasick crate cannot get this wrong. Only
    /// code internal to this crate can get it wrong, since neither `Patterns`
    /// type nor the Teddy runtime are public API items.
    pub max_pattern_id: PatternID,
    /// The actual runtime to use. Constructing a variant implies the
    /// corresponding CPU features (SSSE3 or AVX2) are available; see `Exec`.
    pub exec: Exec,
}
93 | |
94 | impl Teddy { |
    /// Return the first occurrence of a match in the given haystack after or
    /// starting at `at`.
    ///
    /// The patterns provided must be precisely the same patterns given to the
    /// Teddy builder, otherwise this may panic or produce incorrect results.
    ///
    /// All matches are consistent with the match semantics (leftmost-first or
    /// leftmost-longest) set on `pats`.
    ///
    /// # Panics
    ///
    /// Panics if `pats` does not report the same maximum pattern ID as the
    /// patterns this matcher was built with, or if `haystack[at..]` is
    /// shorter than `minimum_len()`.
    pub fn find_at(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        // This assert is a bit subtle, but it's an important guarantee.
        // Namely, if the maximum pattern ID seen by Teddy is the same as the
        // one in the patterns given, then we are guaranteed that every pattern
        // ID in all Teddy buckets are valid indices into `pats`. While this
        // is nominally true, there is no guarantee that callers provide the
        // same `pats` to both the Teddy builder and the searcher, which would
        // otherwise make `find_at` unsafe to call. But this assert lets us
        // keep this routine safe and eliminate an important bounds check in
        // verification.
        assert_eq!(
            self.max_pattern_id,
            pats.max_pattern_id(),
            "teddy must be called with same patterns it was built with" ,
        );
        // SAFETY: The haystack must have at least a minimum number of bytes
        // for Teddy to be able to work. The minimum number varies depending on
        // which matcher is used below. If this is violated, then it's possible
        // for searching to do out-of-bounds writes.
        assert!(haystack[at..].len() >= self.minimum_len());
        // SAFETY: The various Teddy matchers are always safe to call because
        // the Teddy builder guarantees that a particular Exec variant is
        // built only when it can be run the current CPU. That is, the Teddy
        // builder will not produce a Exec::TeddySlim1Mask256 unless AVX2 is
        // enabled. That is, our dynamic CPU feature detection is performed
        // once in the builder, and we rely on the type system to avoid needing
        // to do it again.
        unsafe {
            match self.exec {
                Exec::TeddySlim1Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim1Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat1Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim2Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim2Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat2Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim3Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim3Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat3Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
            }
        }
    }
167 | |
168 | /// Returns the minimum length of a haystack that must be provided by |
169 | /// callers to this Teddy searcher. Providing a haystack shorter than this |
170 | /// will result in a panic, but will never violate memory safety. |
171 | pub fn minimum_len(&self) -> usize { |
172 | // SAFETY: These values must be correct in order to ensure safety. |
173 | // The Teddy runtime assumes their haystacks have at least these |
174 | // lengths. Violating this will sacrifice memory safety. |
175 | match self.exec { |
176 | Exec::TeddySlim1Mask128(_) => 16, |
177 | Exec::TeddySlim1Mask256(_) => 32, |
178 | Exec::TeddyFat1Mask256(_) => 16, |
179 | Exec::TeddySlim2Mask128(_) => 17, |
180 | Exec::TeddySlim2Mask256(_) => 33, |
181 | Exec::TeddyFat2Mask256(_) => 17, |
182 | Exec::TeddySlim3Mask128(_) => 18, |
183 | Exec::TeddySlim3Mask256(_) => 34, |
184 | Exec::TeddyFat3Mask256(_) => 34, |
185 | } |
186 | } |
187 | |
188 | /// Returns the approximate total amount of heap used by this searcher, in |
189 | /// units of bytes. |
190 | pub fn heap_bytes(&self) -> usize { |
191 | let num_patterns = self.max_pattern_id as usize + 1; |
192 | self.buckets.len() * mem::size_of::<Vec<PatternID>>() |
193 | + num_patterns * mem::size_of::<PatternID>() |
194 | } |
195 | |
196 | /// Runs the verification routine for Slim 128-bit Teddy. |
197 | /// |
198 | /// The candidate given should be a collection of 8-bit bitsets (one bitset |
199 | /// per lane), where the ith bit is set in the jth lane if and only if the |
200 | /// byte occurring at `at + j` in `haystack` is in the bucket `i`. |
201 | /// |
202 | /// This is not safe to call unless the SSSE3 target feature is enabled. |
203 | /// The `target_feature` attribute is not applied since this function is |
204 | /// always forcefully inlined. |
205 | #[inline (always)] |
206 | unsafe fn verify128( |
207 | &self, |
208 | pats: &Patterns, |
209 | haystack: &[u8], |
210 | at: usize, |
211 | cand: __m128i, |
212 | ) -> Option<Match> { |
213 | debug_assert!(!is_all_zeroes128(cand)); |
214 | debug_assert_eq!(8, self.buckets.len()); |
215 | |
216 | // Convert the candidate into 64-bit chunks, and then verify each of |
217 | // those chunks. |
218 | let parts = unpack64x128(cand); |
219 | for (i, &part) in parts.iter().enumerate() { |
220 | let pos = at + i * 8; |
221 | if let Some(m) = self.verify64(pats, 8, haystack, pos, part) { |
222 | return Some(m); |
223 | } |
224 | } |
225 | None |
226 | } |
227 | |
228 | /// Runs the verification routine for Slim 256-bit Teddy. |
229 | /// |
230 | /// The candidate given should be a collection of 8-bit bitsets (one bitset |
231 | /// per lane), where the ith bit is set in the jth lane if and only if the |
232 | /// byte occurring at `at + j` in `haystack` is in the bucket `i`. |
233 | /// |
234 | /// This is not safe to call unless the AVX2 target feature is enabled. |
235 | /// The `target_feature` attribute is not applied since this function is |
236 | /// always forcefully inlined. |
237 | #[inline (always)] |
238 | unsafe fn verify256( |
239 | &self, |
240 | pats: &Patterns, |
241 | haystack: &[u8], |
242 | at: usize, |
243 | cand: __m256i, |
244 | ) -> Option<Match> { |
245 | debug_assert!(!is_all_zeroes256(cand)); |
246 | debug_assert_eq!(8, self.buckets.len()); |
247 | |
248 | // Convert the candidate into 64-bit chunks, and then verify each of |
249 | // those chunks. |
250 | let parts = unpack64x256(cand); |
251 | for (i, &part) in parts.iter().enumerate() { |
252 | let pos = at + i * 8; |
253 | if let Some(m) = self.verify64(pats, 8, haystack, pos, part) { |
254 | return Some(m); |
255 | } |
256 | } |
257 | None |
258 | } |
259 | |
    /// Runs the verification routine for Fat 256-bit Teddy.
    ///
    /// The candidate given should be a collection of 8-bit bitsets (one bitset
    /// per lane), where the ith bit is set in the jth lane if and only if the
    /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the
    /// bucket `j < 16 ? i : i + 8`.
    ///
    /// This is not safe to call unless the AVX2 target feature is enabled.
    /// The `target_feature` attribute is not applied since this function is
    /// always forcefully inlined.
    #[inline (always)]
    unsafe fn verify_fat256(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m256i,
    ) -> Option<Match> {
        debug_assert!(!is_all_zeroes256(cand));
        debug_assert_eq!(16, self.buckets.len());

        // This is a bit tricky, but we basically want to convert our
        // candidate, which looks like this
        //
        //     a31 a30 ... a17 a16 a15 a14 ... a01 a00
        //
        // where each a(i) is an 8-bit bitset corresponding to the activated
        // buckets, to this
        //
        //     a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00
        //
        // Namely, for Fat Teddy, the high 128-bits of the candidate correspond
        // to the same bytes in the haystack in the low 128-bits (so we only
        // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7.
        //
        // The verification routine wants to look at all potentially matching
        // buckets before moving on to the next lane. So for example, both
        // a16 and a00 both correspond to the first byte in our window; a00
        // contains buckets 0-7 and a16 contains buckets 8-15. Specifically,
        // a16 should be checked before a01. So the transformation shown above
        // allows us to use our normal verification procedure with one small
        // change: we treat each bitset as 16 bits instead of 8 bits.

        // Swap the 128-bit lanes in the candidate vector.
        let swap = _mm256_permute4x64_epi64(cand, 0x4E);
        // Interleave the bytes from the low 128-bit lanes, starting with
        // cand first.
        let r1 = _mm256_unpacklo_epi8(cand, swap);
        // Interleave the bytes from the high 128-bit lanes, starting with
        // cand first.
        let r2 = _mm256_unpackhi_epi8(cand, swap);
        // Now just take the 2 low 64-bit integers from both r1 and r2. We
        // can drop the high 64-bit integers because they are a mirror image
        // of the low 64-bit integers. All we care about are the low 128-bit
        // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets
        // laid out in the desired order, as described above.
        let parts = unpacklo64x256(r1, r2);
        for (i, &part) in parts.iter().enumerate() {
            // Each 64-bit part holds four 16-bit bitsets, so each part
            // accounts for 4 haystack positions. (Contrast with the slim
            // variants, which step by 8 positions per 64-bit part.)
            let pos = at + i * 4;
            if let Some(m) = self.verify64(pats, 16, haystack, pos, part) {
                return Some(m);
            }
        }
        None
    }
325 | |
326 | /// Verify whether there are any matches starting at or after `at` in the |
327 | /// given `haystack`. The candidate given should correspond to either 8-bit |
328 | /// (for 8 buckets) or 16-bit (16 buckets) bitsets. |
329 | #[inline (always)] |
330 | fn verify64( |
331 | &self, |
332 | pats: &Patterns, |
333 | bucket_count: usize, |
334 | haystack: &[u8], |
335 | at: usize, |
336 | mut cand: u64, |
337 | ) -> Option<Match> { |
338 | // N.B. While the bucket count is known from self.buckets.len(), |
339 | // requiring it as a parameter makes it easier for the optimizer to |
340 | // know its value, and thus produce more efficient codegen. |
341 | debug_assert!(bucket_count == 8 || bucket_count == 16); |
342 | while cand != 0 { |
343 | let bit = cand.trailing_zeros() as usize; |
344 | cand &= !(1 << bit); |
345 | |
346 | let at = at + (bit / bucket_count); |
347 | let bucket = bit % bucket_count; |
348 | if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) { |
349 | return Some(m); |
350 | } |
351 | } |
352 | None |
353 | } |
354 | |
    /// Verify whether there are any matches starting at `at` in the given
    /// `haystack` corresponding only to patterns in the given bucket.
    ///
    /// `bucket` must be a valid index into `self.buckets`; callers uphold
    /// this via the `% bucket_count` reduction in `verify64`.
    #[inline (always)]
    fn verify_bucket(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        bucket: usize,
        at: usize,
    ) -> Option<Match> {
        // Forcing this function to not inline and be "cold" seems to help
        // the codegen for Teddy overall. Interestingly, this is good for a
        // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among
        // others). Overall, this seems like a problem with codegen, since
        // creating the Match itself is a very small amount of code.
        #[cold ]
        #[inline (never)]
        fn match_from_span(
            pati: PatternID,
            start: usize,
            end: usize,
        ) -> Match {
            Match::from_span(pati as usize, start, end)
        }

        // N.B. The bounds check for this bucket lookup *should* be elided
        // since we assert the number of buckets in each `find_at` routine,
        // and the compiler can prove that the `% 8` (or `% 16`) in callers
        // of this routine will always be in bounds.
        for &pati in &self.buckets[bucket] {
            // SAFETY: This is safe because we are guaranteed that every
            // index in a Teddy bucket is a valid index into `pats`. This
            // guarantee is upheld by the assert checking `max_pattern_id` in
            // the beginning of `find_at` above.
            //
            // This explicit bounds check elision is (amazingly) good for a
            // 25-50% boost in some benchmarks, particularly ones with a lot
            // of short literals.
            let pat = unsafe { pats.get_unchecked(pati) };
            if pat.is_prefix(&haystack[at..]) {
                return Some(match_from_span(pati, at, at + pat.len()));
            }
        }
        None
    }
400 | } |
401 | |
/// Exec represents the different search strategies supported by the Teddy
/// runtime.
///
/// This enum is an important safety abstraction. Namely, callers should only
/// construct a variant in this enum if it is safe to execute its corresponding
/// target features on the current CPU. The 128-bit searchers require SSSE3,
/// while the 256-bit searchers require AVX2.
#[derive (Clone, Debug)]
pub enum Exec {
    // One mask: Slim 128-bit, Slim 256-bit and Fat 256-bit.
    TeddySlim1Mask128(TeddySlim1Mask128),
    TeddySlim1Mask256(TeddySlim1Mask256),
    TeddyFat1Mask256(TeddyFat1Mask256),
    // Two masks, same three widths.
    TeddySlim2Mask128(TeddySlim2Mask128),
    TeddySlim2Mask256(TeddySlim2Mask256),
    TeddyFat2Mask256(TeddyFat2Mask256),
    // Three masks, same three widths.
    TeddySlim3Mask128(TeddySlim3Mask128),
    TeddySlim3Mask256(TeddySlim3Mask256),
    TeddyFat3Mask256(TeddyFat3Mask256),
}
421 | |
422 | // Most of the code below remains undocumented because they are effectively |
423 | // repeated versions of themselves. The general structure is described in the |
424 | // README and in the comments above. |
425 | |
#[derive (Clone, Debug)]
pub struct TeddySlim1Mask128 {
    /// The single mask used to compute candidates from each 16 byte window.
    /// (See the README in this directory for how masks are constructed.)
    pub mask1: Mask128,
}
430 | |
431 | impl TeddySlim1Mask128 { |
432 | #[target_feature (enable = "ssse3" )] |
433 | unsafe fn find_at( |
434 | &self, |
435 | pats: &Patterns, |
436 | teddy: &Teddy, |
437 | haystack: &[u8], |
438 | mut at: usize, |
439 | ) -> Option<Match> { |
440 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
441 | // This assert helps eliminate bounds checks for bucket lookups in |
442 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
443 | assert_eq!(8, teddy.buckets.len()); |
444 | |
445 | let len = haystack.len(); |
446 | while at <= len - 16 { |
447 | let c = self.candidate(haystack, at); |
448 | if !is_all_zeroes128(c) { |
449 | if let Some(m) = teddy.verify128(pats, haystack, at, c) { |
450 | return Some(m); |
451 | } |
452 | } |
453 | at += 16; |
454 | } |
455 | if at < len { |
456 | at = len - 16; |
457 | let c = self.candidate(haystack, at); |
458 | if !is_all_zeroes128(c) { |
459 | if let Some(m) = teddy.verify128(pats, haystack, at, c) { |
460 | return Some(m); |
461 | } |
462 | } |
463 | } |
464 | None |
465 | } |
466 | |
467 | #[inline (always)] |
468 | unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i { |
469 | debug_assert!(haystack[at..].len() >= 16); |
470 | |
471 | let chunk = loadu128(haystack, at); |
472 | members1m128(chunk, self.mask1) |
473 | } |
474 | } |
475 | |
#[derive (Clone, Debug)]
pub struct TeddySlim1Mask256 {
    /// The single mask used to compute candidates from each 32 byte window.
    /// (See the README in this directory for how masks are constructed.)
    pub mask1: Mask256,
}
480 | |
481 | impl TeddySlim1Mask256 { |
482 | #[target_feature (enable = "avx2" )] |
483 | unsafe fn find_at( |
484 | &self, |
485 | pats: &Patterns, |
486 | teddy: &Teddy, |
487 | haystack: &[u8], |
488 | mut at: usize, |
489 | ) -> Option<Match> { |
490 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
491 | // This assert helps eliminate bounds checks for bucket lookups in |
492 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
493 | assert_eq!(8, teddy.buckets.len()); |
494 | |
495 | let len = haystack.len(); |
496 | while at <= len - 32 { |
497 | let c = self.candidate(haystack, at); |
498 | if !is_all_zeroes256(c) { |
499 | if let Some(m) = teddy.verify256(pats, haystack, at, c) { |
500 | return Some(m); |
501 | } |
502 | } |
503 | at += 32; |
504 | } |
505 | if at < len { |
506 | at = len - 32; |
507 | let c = self.candidate(haystack, at); |
508 | if !is_all_zeroes256(c) { |
509 | if let Some(m) = teddy.verify256(pats, haystack, at, c) { |
510 | return Some(m); |
511 | } |
512 | } |
513 | } |
514 | None |
515 | } |
516 | |
517 | #[inline (always)] |
518 | unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i { |
519 | debug_assert!(haystack[at..].len() >= 32); |
520 | |
521 | let chunk = loadu256(haystack, at); |
522 | members1m256(chunk, self.mask1) |
523 | } |
524 | } |
525 | |
#[derive (Clone, Debug)]
pub struct TeddyFat1Mask256 {
    /// The single mask used to compute candidates. For Fat Teddy, the 16
    /// byte haystack window is broadcast to both 128-bit lanes so that 16
    /// buckets can be tracked at once.
    pub mask1: Mask256,
}
530 | |
531 | impl TeddyFat1Mask256 { |
532 | #[target_feature (enable = "avx2" )] |
533 | unsafe fn find_at( |
534 | &self, |
535 | pats: &Patterns, |
536 | teddy: &Teddy, |
537 | haystack: &[u8], |
538 | mut at: usize, |
539 | ) -> Option<Match> { |
540 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
541 | // This assert helps eliminate bounds checks for bucket lookups in |
542 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
543 | assert_eq!(16, teddy.buckets.len()); |
544 | |
545 | let len = haystack.len(); |
546 | while at <= len - 16 { |
547 | let c = self.candidate(haystack, at); |
548 | if !is_all_zeroes256(c) { |
549 | if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) { |
550 | return Some(m); |
551 | } |
552 | } |
553 | at += 16; |
554 | } |
555 | if at < len { |
556 | at = len - 16; |
557 | let c = self.candidate(haystack, at); |
558 | if !is_all_zeroes256(c) { |
559 | if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) { |
560 | return Some(m); |
561 | } |
562 | } |
563 | } |
564 | None |
565 | } |
566 | |
567 | #[inline (always)] |
568 | unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i { |
569 | debug_assert!(haystack[at..].len() >= 16); |
570 | |
571 | let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at)); |
572 | members1m256(chunk, self.mask1) |
573 | } |
574 | } |
575 | |
#[derive (Clone, Debug)]
pub struct TeddySlim2Mask128 {
    /// Mask for the first candidate byte position.
    pub mask1: Mask128,
    /// Mask for the second candidate byte position.
    pub mask2: Mask128,
}
581 | |
582 | impl TeddySlim2Mask128 { |
583 | #[target_feature (enable = "ssse3" )] |
584 | unsafe fn find_at( |
585 | &self, |
586 | pats: &Patterns, |
587 | teddy: &Teddy, |
588 | haystack: &[u8], |
589 | mut at: usize, |
590 | ) -> Option<Match> { |
591 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
592 | // This assert helps eliminate bounds checks for bucket lookups in |
593 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
594 | assert_eq!(8, teddy.buckets.len()); |
595 | |
596 | at += 1; |
597 | let len = haystack.len(); |
598 | let mut prev0 = ones128(); |
599 | while at <= len - 16 { |
600 | let c = self.candidate(haystack, at, &mut prev0); |
601 | if !is_all_zeroes128(c) { |
602 | if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { |
603 | return Some(m); |
604 | } |
605 | } |
606 | at += 16; |
607 | } |
608 | if at < len { |
609 | at = len - 16; |
610 | prev0 = ones128(); |
611 | |
612 | let c = self.candidate(haystack, at, &mut prev0); |
613 | if !is_all_zeroes128(c) { |
614 | if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { |
615 | return Some(m); |
616 | } |
617 | } |
618 | } |
619 | None |
620 | } |
621 | |
622 | #[inline (always)] |
623 | unsafe fn candidate( |
624 | &self, |
625 | haystack: &[u8], |
626 | at: usize, |
627 | prev0: &mut __m128i, |
628 | ) -> __m128i { |
629 | debug_assert!(haystack[at..].len() >= 16); |
630 | |
631 | let chunk = loadu128(haystack, at); |
632 | let (res0, res1) = members2m128(chunk, self.mask1, self.mask2); |
633 | let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15); |
634 | _mm_and_si128(res0prev0, res1) |
635 | } |
636 | } |
637 | |
#[derive (Clone, Debug)]
pub struct TeddySlim2Mask256 {
    /// Mask for the first candidate byte position.
    pub mask1: Mask256,
    /// Mask for the second candidate byte position.
    pub mask2: Mask256,
}
643 | |
644 | impl TeddySlim2Mask256 { |
645 | #[target_feature (enable = "avx2" )] |
646 | unsafe fn find_at( |
647 | &self, |
648 | pats: &Patterns, |
649 | teddy: &Teddy, |
650 | haystack: &[u8], |
651 | mut at: usize, |
652 | ) -> Option<Match> { |
653 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
654 | // This assert helps eliminate bounds checks for bucket lookups in |
655 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
656 | assert_eq!(8, teddy.buckets.len()); |
657 | |
658 | at += 1; |
659 | let len = haystack.len(); |
660 | let mut prev0 = ones256(); |
661 | while at <= len - 32 { |
662 | let c = self.candidate(haystack, at, &mut prev0); |
663 | if !is_all_zeroes256(c) { |
664 | if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) { |
665 | return Some(m); |
666 | } |
667 | } |
668 | at += 32; |
669 | } |
670 | if at < len { |
671 | at = len - 32; |
672 | prev0 = ones256(); |
673 | |
674 | let c = self.candidate(haystack, at, &mut prev0); |
675 | if !is_all_zeroes256(c) { |
676 | if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) { |
677 | return Some(m); |
678 | } |
679 | } |
680 | } |
681 | None |
682 | } |
683 | |
684 | #[inline (always)] |
685 | unsafe fn candidate( |
686 | &self, |
687 | haystack: &[u8], |
688 | at: usize, |
689 | prev0: &mut __m256i, |
690 | ) -> __m256i { |
691 | debug_assert!(haystack[at..].len() >= 32); |
692 | |
693 | let chunk = loadu256(haystack, at); |
694 | let (res0, res1) = members2m256(chunk, self.mask1, self.mask2); |
695 | let res0prev0 = alignr256_15(res0, *prev0); |
696 | let res = _mm256_and_si256(res0prev0, res1); |
697 | *prev0 = res0; |
698 | res |
699 | } |
700 | } |
701 | |
#[derive (Clone, Debug)]
pub struct TeddyFat2Mask256 {
    /// Mask for the first candidate byte position (16 buckets, with the 16
    /// byte window broadcast to both 128-bit lanes).
    pub mask1: Mask256,
    /// Mask for the second candidate byte position.
    pub mask2: Mask256,
}
707 | |
impl TeddyFat2Mask256 {
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        // With two masks, candidates are reported relative to `at - 1`.
        at += 1;
        let len = haystack.len();
        // Cross-window state: the previous window's mask1 results, seeded
        // with the permissive all-ones value.
        let mut prev0 = ones256();
        // Fat Teddy advances 16 bytes per iteration; the 256-bit candidate
        // holds buckets 0-7 and 8-15 for the same 16 byte window.
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0);
            if !is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
                {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            at = len - 16;
            // The final window overlaps the previous one; reset the carried
            // state to the permissive all-ones value.
            prev0 = ones256();

            let c = self.candidate(haystack, at, &mut prev0);
            if !is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
                {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
        let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
        // _mm256_alignr_epi8 operates on each 128-bit lane independently,
        // which is exactly right here: both lanes hold the same 16 byte
        // window (for different bucket ranges), so each lane shifts in the
        // matching trailing lane of the previous window's res0.
        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15);
        let res = _mm256_and_si256(res0prev0, res1);
        // Carry this window's mask1 results into the next call.
        *prev0 = res0;
        res
    }
}
767 | |
#[derive (Clone, Debug)]
pub struct TeddySlim3Mask128 {
    /// Mask for the first candidate byte position.
    pub mask1: Mask128,
    /// Mask for the second candidate byte position.
    pub mask2: Mask128,
    /// Mask for the third candidate byte position.
    pub mask3: Mask128,
}
774 | |
775 | impl TeddySlim3Mask128 { |
776 | #[target_feature (enable = "ssse3" )] |
777 | unsafe fn find_at( |
778 | &self, |
779 | pats: &Patterns, |
780 | teddy: &Teddy, |
781 | haystack: &[u8], |
782 | mut at: usize, |
783 | ) -> Option<Match> { |
784 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
785 | // This assert helps eliminate bounds checks for bucket lookups in |
786 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
787 | assert_eq!(8, teddy.buckets.len()); |
788 | |
789 | at += 2; |
790 | let len = haystack.len(); |
791 | let (mut prev0, mut prev1) = (ones128(), ones128()); |
792 | while at <= len - 16 { |
793 | let c = self.candidate(haystack, at, &mut prev0, &mut prev1); |
794 | if !is_all_zeroes128(c) { |
795 | if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) { |
796 | return Some(m); |
797 | } |
798 | } |
799 | at += 16; |
800 | } |
801 | if at < len { |
802 | at = len - 16; |
803 | prev0 = ones128(); |
804 | prev1 = ones128(); |
805 | |
806 | let c = self.candidate(haystack, at, &mut prev0, &mut prev1); |
807 | if !is_all_zeroes128(c) { |
808 | if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) { |
809 | return Some(m); |
810 | } |
811 | } |
812 | } |
813 | None |
814 | } |
815 | |
816 | #[inline (always)] |
817 | unsafe fn candidate( |
818 | &self, |
819 | haystack: &[u8], |
820 | at: usize, |
821 | prev0: &mut __m128i, |
822 | prev1: &mut __m128i, |
823 | ) -> __m128i { |
824 | debug_assert!(haystack[at..].len() >= 16); |
825 | |
826 | let chunk = loadu128(haystack, at); |
827 | let (res0, res1, res2) = |
828 | members3m128(chunk, self.mask1, self.mask2, self.mask3); |
829 | let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14); |
830 | let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15); |
831 | let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2); |
832 | *prev0 = res0; |
833 | *prev1 = res1; |
834 | res |
835 | } |
836 | } |
837 | |
#[derive (Clone, Debug)]
pub struct TeddySlim3Mask256 {
    /// Mask for the first candidate byte position.
    pub mask1: Mask256,
    /// Mask for the second candidate byte position.
    pub mask2: Mask256,
    /// Mask for the third candidate byte position.
    pub mask3: Mask256,
}
844 | |
impl TeddySlim3Mask256 {
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        // With three masks, candidates are reported relative to `at - 2`.
        at += 2;
        let len = haystack.len();
        // Cross-window state for masks 1 and 2, seeded with the permissive
        // all-ones value for the first window.
        let (mut prev0, mut prev1) = (ones256(), ones256());
        while at <= len - 32 {
            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        if at < len {
            at = len - 32;
            // The final window overlaps the previous one; reset the carried
            // state to the permissive all-ones value.
            prev0 = ones256();
            prev1 = ones256();

            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = loadu256(haystack, at);
        let (res0, res1, res2) =
            members3m256(chunk, self.mask1, self.mask2, self.mask3);
        // Shift the mask1 results back two lanes and the mask2 results back
        // one lane (via the cross-lane alignr helpers), so each lane ANDs
        // together three consecutive mask positions.
        let res0prev0 = alignr256_14(res0, *prev0);
        let res1prev1 = alignr256_15(res1, *prev1);
        let res =
            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
        // Carry this window's raw mask1/mask2 results into the next call.
        *prev0 = res0;
        *prev1 = res1;
        res
    }
}
908 | |
/// Fat 256-bit Teddy (16 buckets) using three nybble masks, i.e., matching
/// on the first three bytes of every pattern. The high 128 bits of each mask
/// correspond to buckets 8-15.
#[derive (Clone, Debug)]
pub struct TeddyFat3Mask256 {
    /// Low/high nybble mask for the first byte of every pattern.
    pub mask1: Mask256,
    /// Low/high nybble mask for the second byte of every pattern.
    pub mask2: Mask256,
    /// Low/high nybble mask for the third byte of every pattern.
    pub mask3: Mask256,
}
915 | |
impl TeddyFat3Mask256 {
    /// Search for the leftmost-first match in `haystack` starting at `at`.
    /// Fat Teddy scans 16 bytes per iteration (the 16-byte window is
    /// broadcast into both 128-bit halves, which index buckets 0-7 and
    /// 8-15 respectively).
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        // With three masks, a candidate at `at` corresponds to a pattern
        // starting at `at - 2`; the verify calls below compensate.
        at += 2;
        let len = haystack.len();
        // "Previous window" state starts as all ones (conservative: every
        // bucket) for bytes before the start of the scan.
        let (mut prev0, mut prev1) = (ones256(), ones256());
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !is_all_zeroes256(c) {
                // Some lane matched all three byte masks; run full
                // verification over the candidate buckets.
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
                {
                    return Some(m);
                }
            }
            at += 16;
        }
        // Handle the final, possibly partial chunk by re-reading the last
        // full 16-byte window (which may overlap bytes already scanned).
        if at < len {
            at = len - 16;
            // Carried state lines up with the old position; reset to the
            // conservative all-ones value.
            prev0 = ones256();
            prev1 = ones256();

            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
                {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Compute the candidate bitsets for the 16-byte window starting at
    /// `at`, duplicated across both 128-bit halves so the low half holds
    /// buckets 0-7 and the high half buckets 8-15. `prev0`/`prev1` carry the
    /// first/second byte mask results from the previous window and are
    /// updated in place.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> __m256i {
        // We need a full 16-byte window to load.
        debug_assert!(haystack[at..].len() >= 16);

        // Broadcast the 16-byte window into both 128-bit lanes.
        let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
        let (res0, res1, res2) =
            members3m256(chunk, self.mask1, self.mask2, self.mask3);
        // Shift res0 up by 2 lanes and res1 up by 1 lane within each
        // 128-bit half (alignr operates per 128-bit lane, which is exactly
        // right here since both halves hold the same 16-byte window), so
        // all three results for one occurrence share a lane.
        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14);
        let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15);
        // Candidate lanes must match at all three byte positions.
        let res =
            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
        *prev0 = res0;
        *prev1 = res1;
        res
    }
}
981 | |
/// A 128-bit mask for the low and high nybbles in a set of patterns. Each
/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
/// the nybble `j` is in the bucket `i` at a particular position.
#[derive (Clone, Copy, Debug)]
pub struct Mask128 {
    /// Bucket bitsets indexed by the low nybble of a haystack byte.
    lo: __m128i,
    /// Bucket bitsets indexed by the high nybble of a haystack byte.
    hi: __m128i,
}
990 | |
991 | impl Mask128 { |
992 | /// Create a new SIMD mask from the mask produced by the Teddy builder. |
993 | pub fn new(mask: compile::Mask) -> Mask128 { |
994 | // SAFETY: This is safe since [u8; 16] has the same representation |
995 | // as __m128i. |
996 | unsafe { |
997 | Mask128 { |
998 | lo: mem::transmute(src:mask.lo128()), |
999 | hi: mem::transmute(src:mask.hi128()), |
1000 | } |
1001 | } |
1002 | } |
1003 | } |
1004 | |
/// A 256-bit mask for the low and high nybbles in a set of patterns. Each
/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
/// the nybble `j` is in the bucket `i` at a particular position.
///
/// This is slightly tweaked dependending on whether Slim or Fat Teddy is being
/// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as
/// the bitsets in the higher 128-bits, so that we can search 32 bytes at a
/// time. (Remember, the nybbles in the haystack are used as indices into these
/// masks, and 256-bit shuffles only operate on 128-bit lanes.)
///
/// For Fat Teddy, the bitsets are not repeated, but instead, the high 128
/// bits correspond to buckets 8-15. So that a bitset `00100010` has buckets
/// 1 and 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set
/// if it's in the higher 128 bits.
#[derive (Clone, Copy, Debug)]
pub struct Mask256 {
    /// Bucket bitsets indexed by the low nybble of a haystack byte.
    lo: __m256i,
    /// Bucket bitsets indexed by the high nybble of a haystack byte.
    hi: __m256i,
}
1024 | |
1025 | impl Mask256 { |
1026 | /// Create a new SIMD mask from the mask produced by the Teddy builder. |
1027 | pub fn new(mask: compile::Mask) -> Mask256 { |
1028 | // SAFETY: This is safe since [u8; 32] has the same representation |
1029 | // as __m256i. |
1030 | unsafe { |
1031 | Mask256 { |
1032 | lo: mem::transmute(src:mask.lo256()), |
1033 | hi: mem::transmute(src:mask.hi256()), |
1034 | } |
1035 | } |
1036 | } |
1037 | } |
1038 | |
1039 | // The "members" routines below are responsible for taking a chunk of bytes, |
1040 | // a number of nybble masks and returning the result of using the masks to |
1041 | // lookup bytes in the chunk. The results of the high and low nybble masks are |
1042 | // AND'ed together, such that each candidate returned is a vector, with byte |
1043 | // sized lanes, and where each lane is an 8-bit bitset corresponding to the |
1044 | // buckets that contain the corresponding byte. |
1045 | // |
1046 | // In the case of masks of length greater than 1, callers will need to keep |
1047 | // the results from the previous haystack's window, and then shift the vectors |
1048 | // so that they all line up. Then they can be AND'ed together. |
1049 | |
1050 | /// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a |
1051 | /// 16-byte window of the haystack (where the least significant byte |
1052 | /// corresponds to the start of the window), and `mask1` corresponds to a |
1053 | /// low/high mask for the first byte of all patterns that are being searched. |
1054 | #[target_feature (enable = "ssse3" )] |
1055 | unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i { |
1056 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1057 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1058 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1059 | _mm_and_si128( |
1060 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1061 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1062 | ) |
1063 | } |
1064 | |
1065 | /// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a |
1066 | /// 32-byte window of the haystack (where the least significant byte |
1067 | /// corresponds to the start of the window), and `mask1` corresponds to a |
1068 | /// low/high mask for the first byte of all patterns that are being searched. |
1069 | /// |
1070 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1071 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1072 | /// window in the haystack. |
1073 | #[target_feature (enable = "avx2" )] |
1074 | unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i { |
1075 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1076 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1077 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1078 | _mm256_and_si256( |
1079 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1080 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1081 | ) |
1082 | } |
1083 | |
1084 | /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds |
1085 | /// to a 16-byte window of the haystack (where the least significant byte |
1086 | /// corresponds to the start of the window), and the masks correspond to a |
1087 | /// low/high mask for the first and second bytes of all patterns that are being |
1088 | /// searched. The vectors returned correspond to candidates for the first and |
1089 | /// second bytes in the patterns represented by the masks. |
1090 | #[target_feature (enable = "ssse3" )] |
1091 | unsafe fn members2m128( |
1092 | chunk: __m128i, |
1093 | mask1: Mask128, |
1094 | mask2: Mask128, |
1095 | ) -> (__m128i, __m128i) { |
1096 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1097 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1098 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1099 | let res0: __m128i = _mm_and_si128( |
1100 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1101 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1102 | ); |
1103 | let res1: __m128i = _mm_and_si128( |
1104 | a:_mm_shuffle_epi8(mask2.lo, hlo), |
1105 | b:_mm_shuffle_epi8(a:mask2.hi, b:hhi), |
1106 | ); |
1107 | (res0, res1) |
1108 | } |
1109 | |
1110 | /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds |
1111 | /// to a 32-byte window of the haystack (where the least significant byte |
1112 | /// corresponds to the start of the window), and the masks correspond to a |
1113 | /// low/high mask for the first and second bytes of all patterns that are being |
1114 | /// searched. The vectors returned correspond to candidates for the first and |
1115 | /// second bytes in the patterns represented by the masks. |
1116 | /// |
1117 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1118 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1119 | /// window in the haystack. |
1120 | #[target_feature (enable = "avx2" )] |
1121 | unsafe fn members2m256( |
1122 | chunk: __m256i, |
1123 | mask1: Mask256, |
1124 | mask2: Mask256, |
1125 | ) -> (__m256i, __m256i) { |
1126 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1127 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1128 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1129 | let res0: __m256i = _mm256_and_si256( |
1130 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1131 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1132 | ); |
1133 | let res1: __m256i = _mm256_and_si256( |
1134 | a:_mm256_shuffle_epi8(mask2.lo, hlo), |
1135 | b:_mm256_shuffle_epi8(a:mask2.hi, b:hhi), |
1136 | ); |
1137 | (res0, res1) |
1138 | } |
1139 | |
1140 | /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds |
1141 | /// to a 16-byte window of the haystack (where the least significant byte |
1142 | /// corresponds to the start of the window), and the masks correspond to a |
1143 | /// low/high mask for the first, second and third bytes of all patterns that |
1144 | /// are being searched. The vectors returned correspond to candidates for the |
1145 | /// first, second and third bytes in the patterns represented by the masks. |
1146 | #[target_feature (enable = "ssse3" )] |
1147 | unsafe fn members3m128( |
1148 | chunk: __m128i, |
1149 | mask1: Mask128, |
1150 | mask2: Mask128, |
1151 | mask3: Mask128, |
1152 | ) -> (__m128i, __m128i, __m128i) { |
1153 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1154 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1155 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1156 | let res0: __m128i = _mm_and_si128( |
1157 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1158 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1159 | ); |
1160 | let res1: __m128i = _mm_and_si128( |
1161 | a:_mm_shuffle_epi8(mask2.lo, hlo), |
1162 | b:_mm_shuffle_epi8(a:mask2.hi, b:hhi), |
1163 | ); |
1164 | let res2: __m128i = _mm_and_si128( |
1165 | a:_mm_shuffle_epi8(mask3.lo, hlo), |
1166 | b:_mm_shuffle_epi8(a:mask3.hi, b:hhi), |
1167 | ); |
1168 | (res0, res1, res2) |
1169 | } |
1170 | |
1171 | /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds |
1172 | /// to a 32-byte window of the haystack (where the least significant byte |
1173 | /// corresponds to the start of the window), and the masks correspond to a |
1174 | /// low/high mask for the first, second and third bytes of all patterns that |
1175 | /// are being searched. The vectors returned correspond to candidates for the |
1176 | /// first, second and third bytes in the patterns represented by the masks. |
1177 | /// |
1178 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1179 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1180 | /// window in the haystack. |
1181 | #[target_feature (enable = "avx2" )] |
1182 | unsafe fn members3m256( |
1183 | chunk: __m256i, |
1184 | mask1: Mask256, |
1185 | mask2: Mask256, |
1186 | mask3: Mask256, |
1187 | ) -> (__m256i, __m256i, __m256i) { |
1188 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1189 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1190 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1191 | let res0: __m256i = _mm256_and_si256( |
1192 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1193 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1194 | ); |
1195 | let res1: __m256i = _mm256_and_si256( |
1196 | a:_mm256_shuffle_epi8(mask2.lo, hlo), |
1197 | b:_mm256_shuffle_epi8(a:mask2.hi, b:hhi), |
1198 | ); |
1199 | let res2: __m256i = _mm256_and_si256( |
1200 | a:_mm256_shuffle_epi8(mask3.lo, hlo), |
1201 | b:_mm256_shuffle_epi8(a:mask3.hi, b:hhi), |
1202 | ); |
1203 | (res0, res1, res2) |
1204 | } |
1205 | |