1 | // See the README in this directory for an explanation of the Teddy algorithm. |
2 | // It is strongly recommended to peruse the README before trying to grok this |
3 | // code, as its use of SIMD is pretty opaque, although I tried to add comments |
4 | // where appropriate. |
5 | // |
6 | // Moreover, while there is a lot of code in this file, most of it is |
7 | // repeated variants of the same thing. Specifically, there are three Teddy |
8 | // variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets) |
9 | // and Fat 256-bit Teddy (16 buckets). For each variant, there are three |
10 | // implementations, corresponding to mask lengths of 1, 2 and 3. Bringing it to |
11 | // a total of nine variants. Each one is structured roughly the same: |
12 | // |
13 | // while at <= len(haystack) - CHUNK_SIZE: |
14 | // let candidate = find_candidate_in_chunk(haystack, at) |
15 | // if not all zeroes(candidate): |
16 | // if match = verify(haystack, at, candidate): |
17 | // return match |
18 | // |
19 | // For the most part, this remains unchanged. The parts that vary are the |
20 | // verification routine (for slim vs fat Teddy) and the candidate extraction |
21 | // (based on the number of masks). |
22 | // |
23 | // In the code below, a "candidate" corresponds to a single vector with 8-bit |
24 | // lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the |
25 | // jth lane if and only if the byte occurring at position `j` is in the |
26 | // bucket `i` (where the `j`th position is the position in the current window |
27 | // of the haystack, which is always 16 or 32 bytes). Note to be careful here: |
28 | // the ith bit and the jth lane correspond to the least significant bits of the |
29 | // vector. So when visualizing how the current window of bytes is stored in a |
30 | // vector, you often need to flip it around. For example, the text `abcd` in a |
31 | // 4-byte vector would look like this: |
32 | // |
33 | // 01100100 01100011 01100010 01100001 |
34 | // d c b a |
35 | // |
36 | // When the mask length is 1, then finding the candidate is pretty straight |
37 | // forward: you just apply the shuffle indices (from the haystack window) to |
38 | // the masks, and then AND them together, as described in the README. But for |
39 | // masks of length 2 and 3, you need to keep a little state. Specifically, |
40 | // you need to store the final 1 (for mask length 2) or 2 (for mask length 3) |
41 | // bytes of the candidate for use when searching the next window. This is for |
42 | // handling matches that span two windows. |
43 | // |
44 | // With respect to the repeated code, it would likely be possible to reduce |
45 | // the number of copies of code below using polymorphism, but I find this |
46 | // formulation clearer instead of needing to reason through generics. However, |
47 | // I admit, there may be a simpler generic construction that I'm missing. |
48 | // |
49 | // All variants are fairly heavily tested in src/packed/tests.rs. |
50 | |
51 | use core::{arch::x86_64::*, mem}; |
52 | |
53 | use alloc::vec::Vec; |
54 | |
55 | use crate::{ |
56 | packed::{ |
57 | pattern::{PatternID, Patterns}, |
58 | teddy::compile, |
59 | vector, |
60 | }, |
61 | util::search::Match, |
62 | }; |
63 | |
/// The Teddy runtime.
///
/// A Teddy runtime can be used to quickly search for occurrences of one or
/// more patterns. While it does not scale to an arbitrary number of patterns
/// like Aho-Corasick, it does find occurrences for a small set of patterns
/// much more quickly than Aho-Corasick.
///
/// Teddy cannot run on small haystacks below a certain size, which is
/// dependent on the type of matcher used. This size can be queried via the
/// `minimum_len` method. Violating this will result in a panic.
///
/// Finally, when callers use a Teddy runtime, they must provide precisely the
/// patterns used to construct the Teddy matcher. Violating this will result
/// in either a panic or incorrect results, but will never sacrifice memory
/// safety.
#[derive (Clone, Debug)]
pub struct Teddy {
    /// The allocation of patterns in buckets. This only contains the IDs of
    /// patterns. In order to do full verification, callers must provide the
    /// actual patterns when using Teddy.
    pub buckets: Vec<Vec<PatternID>>,
    /// The maximum identifier of a pattern. This is used as a sanity check to
    /// ensure that the patterns provided by the caller are the same as the
    /// patterns that were used to compile the matcher. This sanity check
    /// permits safely eliminating bounds checks regardless of what patterns
    /// are provided by the caller.
    ///
    /// Note that users of the aho-corasick crate cannot get this wrong. Only
    /// code internal to this crate can get it wrong, since neither `Patterns`
    /// type nor the Teddy runtime are public API items.
    pub max_pattern_id: PatternID,
    /// The actual runtime to use. The variant chosen encodes both the CPU
    /// feature requirement (SSSE3 for 128-bit, AVX2 for 256-bit) and the
    /// mask/bucket configuration. See `Exec` for details.
    pub exec: Exec,
}
98 | |
impl Teddy {
    /// Return the first occurrence of a match in the given haystack after or
    /// starting at `at`.
    ///
    /// The patterns provided must be precisely the same patterns given to the
    /// Teddy builder, otherwise this may panic or produce incorrect results.
    ///
    /// All matches are consistent with the match semantics (leftmost-first or
    /// leftmost-longest) set on `pats`.
    ///
    /// # Panics
    ///
    /// Panics if `pats` does not have the same maximum pattern ID as the
    /// patterns this matcher was built with, or if `haystack[at..]` is
    /// shorter than `self.minimum_len()`.
    pub fn find_at(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        // This assert is a bit subtle, but it's an important guarantee.
        // Namely, if the maximum pattern ID seen by Teddy is the same as the
        // one in the patterns given, then we are guaranteed that every pattern
        // ID in all Teddy buckets are valid indices into `pats`. While this
        // is nominally true, there is no guarantee that callers provide the
        // same `pats` to both the Teddy builder and the searcher, which would
        // otherwise make `find_at` unsafe to call. But this assert lets us
        // keep this routine safe and eliminate an important bounds check in
        // verification.
        assert_eq!(
            self.max_pattern_id,
            pats.max_pattern_id(),
            "teddy must be called with same patterns it was built with" ,
        );
        // SAFETY: The haystack must have at least a minimum number of bytes
        // for Teddy to be able to work. The minimum number varies depending on
        // which matcher is used below. If this is violated, then it's possible
        // for searching to do out-of-bounds writes.
        assert!(haystack[at..].len() >= self.minimum_len());
        // SAFETY: The various Teddy matchers are always safe to call because
        // the Teddy builder guarantees that a particular Exec variant is
        // built only when it can be run the current CPU. That is, the Teddy
        // builder will not produce a Exec::TeddySlim1Mask256 unless AVX2 is
        // enabled. That is, our dynamic CPU feature detection is performed
        // once in the builder, and we rely on the type system to avoid needing
        // to do it again.
        unsafe {
            match self.exec {
                Exec::TeddySlim1Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim1Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat1Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim2Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim2Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat2Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim3Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim3Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat3Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim4Mask128(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddySlim4Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
                Exec::TeddyFat4Mask256(ref e) => {
                    e.find_at(pats, self, haystack, at)
                }
            }
        }
    }

    /// Returns the minimum length of a haystack that must be provided by
    /// callers to this Teddy searcher. Providing a haystack shorter than this
    /// will result in a panic, but will never violate memory safety.
    ///
    /// The values below follow a pattern: the window size of the searcher
    /// (16 bytes for 128-bit and Fat 256-bit searchers, 32 bytes for Slim
    /// 256-bit searchers) plus one extra byte per mask beyond the first,
    /// since a searcher with k masks begins its scan k-1 bytes past `at`.
    pub fn minimum_len(&self) -> usize {
        // SAFETY: These values must be correct in order to ensure safety.
        // The Teddy runtime assumes their haystacks have at least these
        // lengths. Violating this will sacrifice memory safety.
        match self.exec {
            Exec::TeddySlim1Mask128(_) => 16,
            Exec::TeddySlim1Mask256(_) => 32,
            Exec::TeddyFat1Mask256(_) => 16,
            Exec::TeddySlim2Mask128(_) => 17,
            Exec::TeddySlim2Mask256(_) => 33,
            Exec::TeddyFat2Mask256(_) => 17,
            Exec::TeddySlim3Mask128(_) => 18,
            Exec::TeddySlim3Mask256(_) => 34,
            Exec::TeddyFat3Mask256(_) => 18,
            Exec::TeddySlim4Mask128(_) => 19,
            Exec::TeddySlim4Mask256(_) => 35,
            Exec::TeddyFat4Mask256(_) => 19,
        }
    }

    /// Returns the approximate total amount of heap used by this searcher, in
    /// units of bytes.
    pub fn memory_usage(&self) -> usize {
        let num_patterns = self.max_pattern_id as usize + 1;
        // Count one Vec header per bucket plus one PatternID per pattern
        // (assumes each pattern ID is assigned to exactly one bucket). This
        // is an approximation: per-bucket excess capacity is not counted.
        self.buckets.len() * mem::size_of::<Vec<PatternID>>()
            + num_patterns * mem::size_of::<PatternID>()
    }

    /// Runs the verification routine for Slim 128-bit Teddy.
    ///
    /// The candidate given should be a collection of 8-bit bitsets (one bitset
    /// per lane), where the ith bit is set in the jth lane if and only if the
    /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
    ///
    /// This is not safe to call unless the SSSE3 target feature is enabled.
    /// The `target_feature` attribute is not applied since this function is
    /// always forcefully inlined.
    #[inline (always)]
    unsafe fn verify128(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m128i,
    ) -> Option<Match> {
        debug_assert!(!vector::is_all_zeroes128(cand));
        debug_assert_eq!(8, self.buckets.len());

        // Convert the candidate into 64-bit chunks, and then verify each of
        // those chunks. Each 64-bit chunk covers 8 haystack positions (one
        // byte-sized bucket set per position), hence the `i * 8` offset.
        let parts = vector::unpack64x128(cand);
        for (i, &part) in parts.iter().enumerate() {
            let pos = at + i * 8;
            if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
                return Some(m);
            }
        }
        None
    }

    /// Runs the verification routine for Slim 256-bit Teddy.
    ///
    /// The candidate given should be a collection of 8-bit bitsets (one bitset
    /// per lane), where the ith bit is set in the jth lane if and only if the
    /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
    ///
    /// This is not safe to call unless the AVX2 target feature is enabled.
    /// The `target_feature` attribute is not applied since this function is
    /// always forcefully inlined.
    #[inline (always)]
    unsafe fn verify256(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m256i,
    ) -> Option<Match> {
        debug_assert!(!vector::is_all_zeroes256(cand));
        debug_assert_eq!(8, self.buckets.len());

        // Convert the candidate into 64-bit chunks, and then verify each of
        // those chunks. As with verify128, each chunk covers 8 haystack
        // positions; the checks are unrolled manually here.
        let parts = vector::unpack64x256(cand);
        let mut pos = at;
        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[0]) {
            return Some(m);
        }
        pos += 8;
        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[1]) {
            return Some(m);
        }
        pos += 8;
        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[2]) {
            return Some(m);
        }
        pos += 8;
        if let Some(m) = self.verify64(pats, 8, haystack, pos, parts[3]) {
            return Some(m);
        }
        None
    }

    /// Runs the verification routine for Fat 256-bit Teddy.
    ///
    /// The candidate given should be a collection of 8-bit bitsets (one bitset
    /// per lane), where the ith bit is set in the jth lane if and only if the
    /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the
    /// bucket `j < 16 ? i : i + 8`.
    ///
    /// This is not safe to call unless the AVX2 target feature is enabled.
    /// The `target_feature` attribute is not applied since this function is
    /// always forcefully inlined.
    #[inline (always)]
    unsafe fn verify_fat256(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m256i,
    ) -> Option<Match> {
        debug_assert!(!vector::is_all_zeroes256(cand));
        debug_assert_eq!(16, self.buckets.len());

        // This is a bit tricky, but we basically want to convert our
        // candidate, which looks like this
        //
        //     a31 a30 ... a17 a16 a15 a14 ... a01 a00
        //
        // where each a(i) is an 8-bit bitset corresponding to the activated
        // buckets, to this
        //
        //     a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00
        //
        // Namely, for Fat Teddy, the high 128-bits of the candidate correspond
        // to the same bytes in the haystack in the low 128-bits (so we only
        // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7.
        //
        // The verification routine wants to look at all potentially matching
        // buckets before moving on to the next lane. So for example, both
        // a16 and a00 both correspond to the first byte in our window; a00
        // contains buckets 0-7 and a16 contains buckets 8-15. Specifically,
        // a16 should be checked before a01. So the transformation shown above
        // allows us to use our normal verification procedure with one small
        // change: we treat each bitset as 16 bits instead of 8 bits.

        // Swap the 128-bit lanes in the candidate vector.
        let swap = _mm256_permute4x64_epi64(cand, 0x4E);
        // Interleave the bytes from the low 128-bit lanes, starting with
        // cand first.
        let r1 = _mm256_unpacklo_epi8(cand, swap);
        // Interleave the bytes from the high 128-bit lanes, starting with
        // cand first.
        let r2 = _mm256_unpackhi_epi8(cand, swap);
        // Now just take the 2 low 64-bit integers from both r1 and r2. We
        // can drop the high 64-bit integers because they are a mirror image
        // of the low 64-bit integers. All we care about are the low 128-bit
        // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets
        // laid out in the desired order, as described above.
        //
        // Each 64-bit chunk now covers 4 haystack positions (one 16-bit
        // bucket set per position), hence the `i * 4` offset below.
        let parts = vector::unpacklo64x256(r1, r2);
        for (i, &part) in parts.iter().enumerate() {
            let pos = at + i * 4;
            if let Some(m) = self.verify64(pats, 16, haystack, pos, part) {
                return Some(m);
            }
        }
        None
    }

    /// Verify whether there are any matches starting at or after `at` in the
    /// given `haystack`. The candidate given should correspond to either 8-bit
    /// (for 8 buckets) or 16-bit (16 buckets) bitsets.
    #[inline (always)]
    fn verify64(
        &self,
        pats: &Patterns,
        bucket_count: usize,
        haystack: &[u8],
        at: usize,
        mut cand: u64,
    ) -> Option<Match> {
        // N.B. While the bucket count is known from self.buckets.len(),
        // requiring it as a parameter makes it easier for the optimizer to
        // know its value, and thus produce more efficient codegen.
        debug_assert!(bucket_count == 8 || bucket_count == 16);
        // Iterate over each set bit in `cand`, clearing it as we go. Each
        // group of `bucket_count` bits corresponds to one haystack position,
        // so the position offset is `bit / bucket_count` and the bucket is
        // `bit % bucket_count`.
        while cand != 0 {
            let bit = cand.trailing_zeros() as usize;
            cand &= !(1 << bit);

            let at = at + (bit / bucket_count);
            let bucket = bit % bucket_count;
            if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) {
                return Some(m);
            }
        }
        None
    }

    /// Verify whether there are any matches starting at `at` in the given
    /// `haystack` corresponding only to patterns in the given bucket.
    #[inline (always)]
    fn verify_bucket(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        bucket: usize,
        at: usize,
    ) -> Option<Match> {
        // Forcing this function to not inline and be "cold" seems to help
        // the codegen for Teddy overall. Interestingly, this is good for a
        // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among
        // others). Overall, this seems like a problem with codegen, since
        // creating the Match itself is a very small amount of code.
        #[cold ]
        #[inline (never)]
        fn match_from_span(
            pati: PatternID,
            start: usize,
            end: usize,
        ) -> Match {
            Match::must(pati as usize, start..end)
        }

        // N.B. The bounds check for this bucket lookup *should* be elided
        // since we assert the number of buckets in each `find_at` routine,
        // and the compiler can prove that the `% 8` (or `% 16`) in callers
        // of this routine will always be in bounds.
        for &pati in &self.buckets[bucket] {
            // SAFETY: This is safe because we are guaranteed that every
            // index in a Teddy bucket is a valid index into `pats`. This
            // guarantee is upheld by the assert checking `max_pattern_id` in
            // the beginning of `find_at` above.
            //
            // This explicit bounds check elision is (amazingly) good for a
            // 25-50% boost in some benchmarks, particularly ones with a lot
            // of short literals.
            let pat = unsafe { pats.get_unchecked(pati) };
            if pat.is_prefix(&haystack[at..]) {
                return Some(match_from_span(pati, at, at + pat.len()));
            }
        }
        None
    }
}
428 | |
/// Exec represents the different search strategies supported by the Teddy
/// runtime.
///
/// This enum is an important safety abstraction. Namely, callers should only
/// construct a variant in this enum if it is safe to execute its corresponding
/// target features on the current CPU. The 128-bit searchers require SSSE3,
/// while the 256-bit searchers require AVX2.
#[derive (Clone, Debug)]
pub enum Exec {
    // Searchers using a single mask.
    TeddySlim1Mask128(TeddySlim1Mask128),
    TeddySlim1Mask256(TeddySlim1Mask256),
    TeddyFat1Mask256(TeddyFat1Mask256),
    // Searchers using two masks.
    TeddySlim2Mask128(TeddySlim2Mask128),
    TeddySlim2Mask256(TeddySlim2Mask256),
    TeddyFat2Mask256(TeddyFat2Mask256),
    // Searchers using three masks.
    TeddySlim3Mask128(TeddySlim3Mask128),
    TeddySlim3Mask256(TeddySlim3Mask256),
    TeddyFat3Mask256(TeddyFat3Mask256),
    // Searchers using four masks.
    TeddySlim4Mask128(TeddySlim4Mask128),
    TeddySlim4Mask256(TeddySlim4Mask256),
    TeddyFat4Mask256(TeddyFat4Mask256),
}
451 | |
452 | // Most of the code below remains undocumented because they are effectively |
453 | // repeated versions of themselves. The general structure is described in the |
454 | // README and in the comments above. |
455 | |
#[derive (Clone, Debug)]
pub struct TeddySlim1Mask128 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    pub mask1: Mask128,
}
460 | |
impl TeddySlim1Mask128 {
    /// Searches `haystack` starting at `at` in 16-byte windows. Any bytes
    /// remaining after the last full window are handled by re-running the
    /// search over the final 16 bytes of the haystack (which may overlap
    /// bytes already scanned).
    ///
    /// # Safety
    ///
    /// Callers must only invoke this on CPUs supporting SSSE3, and must
    /// provide at least `teddy.minimum_len()` bytes of haystack at `at`.
    #[target_feature (enable = "ssse3" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        let len = haystack.len();
        while at <= len - 16 {
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            // Fewer than 16 bytes remain, so back up and scan the final
            // 16-byte window of the haystack.
            at = len - 16;
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Computes the candidate bitset vector for the 16-byte window of
    /// `haystack` beginning at `at`, as described in the comments at the top
    /// of this file.
    ///
    /// # Safety
    ///
    /// Callers must ensure SSSE3 is available and that at least 16 bytes of
    /// haystack exist starting at `at`.
    #[inline (always)]
    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk = vector::loadu128(haystack, at);
        members1m128(chunk, self.mask1)
    }
}
505 | |
#[derive (Clone, Debug)]
pub struct TeddySlim1Mask256 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    pub mask1: Mask256,
}
510 | |
impl TeddySlim1Mask256 {
    /// Searches `haystack` starting at `at` in 32-byte windows. Any bytes
    /// remaining after the last full window are handled by re-running the
    /// search over the final 32 bytes of the haystack (which may overlap
    /// bytes already scanned).
    ///
    /// # Safety
    ///
    /// Callers must only invoke this on CPUs supporting AVX2, and must
    /// provide at least `teddy.minimum_len()` bytes of haystack at `at`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        let len = haystack.len();
        while at <= len - 32 {
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        if at < len {
            // Fewer than 32 bytes remain, so back up and scan the final
            // 32-byte window of the haystack.
            at = len - 32;
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Computes the candidate bitset vector for the 32-byte window of
    /// `haystack` beginning at `at`, as described in the comments at the top
    /// of this file.
    ///
    /// # Safety
    ///
    /// Callers must ensure AVX2 is available and that at least 32 bytes of
    /// haystack exist starting at `at`.
    #[inline (always)]
    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = vector::loadu256(haystack, at);
        members1m256(chunk, self.mask1)
    }
}
555 | |
#[derive (Clone, Debug)]
pub struct TeddyFat1Mask256 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    /// For Fat Teddy, this covers 16 buckets instead of 8.
    pub mask1: Mask256,
}
560 | |
impl TeddyFat1Mask256 {
    /// Searches `haystack` starting at `at` in 16-byte windows. Fat Teddy
    /// scans 16 bytes per 256-bit vector: the low and high 128-bit lanes
    /// cover the same window bytes, but for buckets 0-7 and 8-15
    /// respectively. A final window over the last 16 bytes handles any
    /// remainder.
    ///
    /// # Safety
    ///
    /// Callers must only invoke this on CPUs supporting AVX2, and must
    /// provide at least `teddy.minimum_len()` bytes of haystack at `at`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        let len = haystack.len();
        while at <= len - 16 {
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            // Fewer than 16 bytes remain, so back up and scan the final
            // 16-byte window of the haystack.
            at = len - 16;
            let c = self.candidate(haystack, at);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Computes the Fat Teddy candidate for the 16-byte window of `haystack`
    /// beginning at `at`. The 16-byte window is broadcast to both 128-bit
    /// lanes so a single shuffle covers all 16 buckets.
    ///
    /// # Safety
    ///
    /// Callers must ensure AVX2 is available and that at least 16 bytes of
    /// haystack exist starting at `at`.
    #[inline (always)]
    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk =
            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
        members1m256(chunk, self.mask1)
    }
}
606 | |
#[derive (Clone, Debug)]
pub struct TeddySlim2Mask128 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    pub mask1: Mask128,
    /// The mask corresponding to the second byte of every pattern's prefix.
    pub mask2: Mask128,
}
612 | |
613 | impl TeddySlim2Mask128 { |
614 | #[target_feature (enable = "ssse3" )] |
615 | unsafe fn find_at( |
616 | &self, |
617 | pats: &Patterns, |
618 | teddy: &Teddy, |
619 | haystack: &[u8], |
620 | mut at: usize, |
621 | ) -> Option<Match> { |
622 | debug_assert!(haystack[at..].len() >= teddy.minimum_len()); |
623 | // This assert helps eliminate bounds checks for bucket lookups in |
624 | // Teddy::verify_bucket, which has a small (3-4%) performance boost. |
625 | assert_eq!(8, teddy.buckets.len()); |
626 | |
627 | at += 1; |
628 | let len = haystack.len(); |
629 | let mut prev0 = vector::ones128(); |
630 | while at <= len - 16 { |
631 | let c = self.candidate(haystack, at, &mut prev0); |
632 | if !vector::is_all_zeroes128(c) { |
633 | if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { |
634 | return Some(m); |
635 | } |
636 | } |
637 | at += 16; |
638 | } |
639 | if at < len { |
640 | at = len - 16; |
641 | prev0 = vector::ones128(); |
642 | |
643 | let c = self.candidate(haystack, at, &mut prev0); |
644 | if !vector::is_all_zeroes128(c) { |
645 | if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { |
646 | return Some(m); |
647 | } |
648 | } |
649 | } |
650 | None |
651 | } |
652 | |
653 | #[inline (always)] |
654 | unsafe fn candidate( |
655 | &self, |
656 | haystack: &[u8], |
657 | at: usize, |
658 | prev0: &mut __m128i, |
659 | ) -> __m128i { |
660 | debug_assert!(haystack[at..].len() >= 16); |
661 | |
662 | let chunk = vector::loadu128(haystack, at); |
663 | let (res0, res1) = members2m128(chunk, self.mask1, self.mask2); |
664 | let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15); |
665 | _mm_and_si128(res0prev0, res1) |
666 | } |
667 | } |
668 | |
#[derive (Clone, Debug)]
pub struct TeddySlim2Mask256 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    pub mask1: Mask256,
    /// The mask corresponding to the second byte of every pattern's prefix.
    pub mask2: Mask256,
}
674 | |
impl TeddySlim2Mask256 {
    /// Searches `haystack` starting at `at + 1` in 32-byte windows. `prev0`
    /// carries the mask-1 result of the previous window so that matches
    /// spanning two windows are found; it starts as all-ones so the first
    /// byte of the first window is never filtered out. A final window over
    /// the last 32 bytes handles any remainder (resetting `prev0`).
    ///
    /// # Safety
    ///
    /// Callers must only invoke this on CPUs supporting AVX2, and must
    /// provide at least `teddy.minimum_len()` bytes of haystack at `at`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        at += 1;
        let len = haystack.len();
        let mut prev0 = vector::ones256();
        while at <= len - 32 {
            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        if at < len {
            at = len - 32;
            prev0 = vector::ones256();

            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Computes the candidate for the 32-byte window at `at` by combining
    /// the previous window's mask-1 result (shifted in via a cross-lane
    /// `alignr`) with this window's mask-2 result, then saves this window's
    /// mask-1 result into `prev0` for the next call.
    ///
    /// # Safety
    ///
    /// Callers must ensure AVX2 is available and that at least 32 bytes of
    /// haystack exist starting at `at`.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = vector::loadu256(haystack, at);
        let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
        let res0prev0 = vector::alignr256_15(res0, *prev0);
        let res = _mm256_and_si256(res0prev0, res1);
        // Save this window's mask-1 result for the next window, so that
        // matches spanning the window boundary are detected.
        *prev0 = res0;
        res
    }
}
732 | |
#[derive (Clone, Debug)]
pub struct TeddyFat2Mask256 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    /// For Fat Teddy, this covers 16 buckets instead of 8.
    pub mask1: Mask256,
    /// The mask corresponding to the second byte of every pattern's prefix.
    /// For Fat Teddy, this covers 16 buckets instead of 8.
    pub mask2: Mask256,
}
738 | |
impl TeddyFat2Mask256 {
    /// Searches `haystack` starting at `at + 1` in 16-byte windows (Fat
    /// Teddy scans 16 bytes per 256-bit vector, covering 16 buckets).
    /// `prev0` carries the mask-1 result of the previous window so that
    /// matches spanning two windows are found; it starts as all-ones so the
    /// first byte of the first window is never filtered out.
    ///
    /// # Safety
    ///
    /// Callers must only invoke this on CPUs supporting AVX2, and must
    /// provide at least `teddy.minimum_len()` bytes of haystack at `at`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        at += 1;
        let len = haystack.len();
        let mut prev0 = vector::ones256();
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
                {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            at = len - 16;
            prev0 = vector::ones256();

            let c = self.candidate(haystack, at, &mut prev0);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
                {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Computes the Fat Teddy candidate for the 16-byte window at `at`.
    /// The window is broadcast to both 128-bit lanes; since both lanes hold
    /// the same window bytes, the per-lane `_mm256_alignr_epi8` shift
    /// behaves like the 128-bit case in each lane. This window's mask-1
    /// result is saved into `prev0` for the next call.
    ///
    /// # Safety
    ///
    /// Callers must ensure AVX2 is available and that at least 16 bytes of
    /// haystack exist starting at `at`.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk =
            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
        let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15);
        let res = _mm256_and_si256(res0prev0, res1);
        // Save this window's mask-1 result for the next window, so that
        // matches spanning the window boundary are detected.
        *prev0 = res0;
        res
    }
}
799 | |
#[derive (Clone, Debug)]
pub struct TeddySlim3Mask128 {
    /// The mask corresponding to the first byte of every pattern's prefix.
    pub mask1: Mask128,
    /// The mask corresponding to the second byte of every pattern's prefix.
    pub mask2: Mask128,
    /// The mask corresponding to the third byte of every pattern's prefix.
    pub mask3: Mask128,
}
806 | |
impl TeddySlim3Mask128 {
    /// Searches `haystack` starting at `at + 2` in 16-byte windows. `prev0`
    /// and `prev1` carry the mask-1 and mask-2 results of the previous
    /// window so that matches spanning two windows are found; both start as
    /// all-ones so the leading bytes of the first window are never filtered
    /// out. A final window over the last 16 bytes handles any remainder
    /// (resetting both).
    ///
    /// # Safety
    ///
    /// Callers must only invoke this on CPUs supporting SSSE3, and must
    /// provide at least `teddy.minimum_len()` bytes of haystack at `at`.
    #[target_feature (enable = "ssse3" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        at += 2;
        let len = haystack.len();
        let (mut prev0, mut prev1) = (vector::ones128(), vector::ones128());
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            at = len - 16;
            prev0 = vector::ones128();
            prev1 = vector::ones128();

            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Computes the candidate for the 16-byte window at `at`. The mask-1
    /// result is shifted by 2 bytes (`alignr` with 14) and the mask-2 result
    /// by 1 byte (`alignr` with 15), pulling in the previous window's
    /// results, then all three are ANDed together. This window's mask-1 and
    /// mask-2 results are saved into `prev0`/`prev1` for the next call.
    ///
    /// # Safety
    ///
    /// Callers must ensure SSSE3 is available and that at least 16 bytes of
    /// haystack exist starting at `at`.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m128i,
        prev1: &mut __m128i,
    ) -> __m128i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk = vector::loadu128(haystack, at);
        let (res0, res1, res2) =
            members3m128(chunk, self.mask1, self.mask2, self.mask3);
        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14);
        let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15);
        let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2);
        *prev0 = res0;
        *prev1 = res1;
        res
    }
}
869 | |
/// Slim 256-bit Teddy with three nybble masks, covering the first three
/// bytes of every pattern. Used with 8 pattern buckets (see the assert in
/// `find_at`).
#[derive (Clone, Debug)]
pub struct TeddySlim3Mask256 {
    /// Low/high nybble mask for the first byte of each pattern.
    pub mask1: Mask256,
    /// Low/high nybble mask for the second byte of each pattern.
    pub mask2: Mask256,
    /// Low/high nybble mask for the third byte of each pattern.
    pub mask3: Mask256,
}
876 | |
impl TeddySlim3Mask256 {
    /// Search `haystack` for the first match at or after `at`, scanning
    /// 32 bytes per iteration with three nybble masks.
    ///
    /// Callers must guarantee `haystack[at..].len() >= teddy.minimum_len()`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        // With three masks, candidates are aligned to the *third* pattern
        // byte, so the scan starts two bytes in and verification rewinds
        // with `at - 2`.
        at += 2;
        let len = haystack.len();
        // Previous-window results start as all-ones (per the `ones256`
        // name), which conservatively leaves every bucket as a candidate at
        // the boundary; verification rejects any false positives.
        let (mut prev0, mut prev1) = (vector::ones256(), vector::ones256());
        while at <= len - 32 {
            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        // Handle leftover bytes by re-scanning the final (overlapping)
        // 32-byte window. The prev state is reset since it no longer lines
        // up with this window.
        if at < len {
            at = len - 32;
            prev0 = vector::ones256();
            prev1 = vector::ones256();

            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Compute the candidate bitsets for the 32-byte window starting at
    /// `at`. `prev0`/`prev1` carry the first- and second-byte results from
    /// the previous window and are updated for the next call.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = vector::loadu256(haystack, at);
        let (res0, res1, res2) =
            members3m256(chunk, self.mask1, self.mask2, self.mask3);
        // Align the first-byte results forward by two lanes and the
        // second-byte results by one lane (the `alignr256_*` helpers handle
        // the AVX2 cross-128-bit-lane fixup) so all three results refer to
        // the same position.
        let res0prev0 = vector::alignr256_14(res0, *prev0);
        let res1prev1 = vector::alignr256_15(res1, *prev1);
        let res =
            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
        *prev0 = res0;
        *prev1 = res1;
        res
    }
}
940 | |
/// Fat 256-bit Teddy with three nybble masks, covering the first three
/// bytes of every pattern. Used with 16 pattern buckets (see the assert in
/// `find_at`): the high 128 bits of each mask correspond to buckets 8-15.
#[derive (Clone, Debug)]
pub struct TeddyFat3Mask256 {
    /// Low/high nybble mask for the first byte of each pattern.
    pub mask1: Mask256,
    /// Low/high nybble mask for the second byte of each pattern.
    pub mask2: Mask256,
    /// Low/high nybble mask for the third byte of each pattern.
    pub mask3: Mask256,
}
947 | |
impl TeddyFat3Mask256 {
    /// Search `haystack` for the first match at or after `at`, scanning
    /// 16 bytes per iteration (broadcast into both 128-bit halves) with
    /// three nybble masks over 16 buckets.
    ///
    /// Callers must guarantee `haystack[at..].len() >= teddy.minimum_len()`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        // With three masks, candidates are aligned to the *third* pattern
        // byte, so the scan starts two bytes in and verification rewinds
        // with `at - 2`.
        at += 2;
        let len = haystack.len();
        // Previous-window results start as all-ones (per the `ones256`
        // name), which conservatively leaves every bucket as a candidate at
        // the boundary; verification rejects any false positives.
        let (mut prev0, mut prev1) = (vector::ones256(), vector::ones256());
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
                {
                    return Some(m);
                }
            }
            at += 16;
        }
        // Handle leftover bytes by re-scanning the final (overlapping)
        // 16-byte window. The prev state is reset since it no longer lines
        // up with this window.
        if at < len {
            at = len - 16;
            prev0 = vector::ones256();
            prev1 = vector::ones256();

            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
                {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Compute the candidate bitsets for the 16-byte window starting at
    /// `at`. The window is broadcast to both 128-bit halves so the low
    /// half looks up buckets 0-7 and the high half buckets 8-15.
    /// `prev0`/`prev1` carry the first- and second-byte results from the
    /// previous window and are updated for the next call.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk =
            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
        let (res0, res1, res2) =
            members3m256(chunk, self.mask1, self.mask2, self.mask3);
        // Align the first-byte results forward by two lanes and the
        // second-byte results by one lane. A plain alignr per 128-bit lane
        // is correct here because both halves hold the same window.
        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14);
        let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15);
        let res =
            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
        *prev0 = res0;
        *prev1 = res1;
        res
    }
}
1014 | |
/// Slim 128-bit Teddy with four nybble masks, covering the first four
/// bytes of every pattern. Used with 8 pattern buckets (see the assert in
/// `find_at`).
#[derive (Clone, Debug)]
pub struct TeddySlim4Mask128 {
    /// Low/high nybble mask for the first byte of each pattern.
    pub mask1: Mask128,
    /// Low/high nybble mask for the second byte of each pattern.
    pub mask2: Mask128,
    /// Low/high nybble mask for the third byte of each pattern.
    pub mask3: Mask128,
    /// Low/high nybble mask for the fourth byte of each pattern.
    pub mask4: Mask128,
}
1022 | |
impl TeddySlim4Mask128 {
    /// Search `haystack` for the first match at or after `at`, scanning
    /// 16 bytes per iteration with four nybble masks.
    ///
    /// Callers must guarantee `haystack[at..].len() >= teddy.minimum_len()`.
    #[target_feature (enable = "ssse3" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        // With four masks, candidates are aligned to the *fourth* pattern
        // byte, so the scan starts three bytes in and verification rewinds
        // with `at - 3`.
        at += 3;
        let len = haystack.len();
        // Previous-window results start as all-ones (per the `ones128`
        // name), which conservatively leaves every bucket as a candidate at
        // the boundary; verification rejects any false positives.
        let mut prev0 = vector::ones128();
        let mut prev1 = vector::ones128();
        let mut prev2 = vector::ones128();
        while at <= len - 16 {
            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 3, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
        // Handle leftover bytes by re-scanning the final (overlapping)
        // 16-byte window. The prev state is reset since it no longer lines
        // up with this window.
        if at < len {
            at = len - 16;
            prev0 = vector::ones128();
            prev1 = vector::ones128();
            prev2 = vector::ones128();

            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 3, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Compute the candidate bitsets for the 16-byte window starting at
    /// `at`. `prev0`/`prev1`/`prev2` carry the first-, second- and
    /// third-byte results from the previous window and are updated for the
    /// next call.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m128i,
        prev1: &mut __m128i,
        prev2: &mut __m128i,
    ) -> __m128i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk = vector::loadu128(haystack, at);
        let (res0, res1, res2, res3) = members4m128(
            chunk, self.mask1, self.mask2, self.mask3, self.mask4,
        );
        // Align the first-, second- and third-byte results forward by 3, 2
        // and 1 lanes respectively (filling from the previous window's
        // tail) so all four results refer to the same position.
        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 13);
        let res1prev1 = _mm_alignr_epi8(res1, *prev1, 14);
        let res2prev2 = _mm_alignr_epi8(res2, *prev2, 15);
        let res = _mm_and_si128(
            _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2prev2),
            res3,
        );
        *prev0 = res0;
        *prev1 = res1;
        *prev2 = res2;
        res
    }
}
1097 | |
/// Slim 256-bit Teddy with four nybble masks, covering the first four
/// bytes of every pattern. Used with 8 pattern buckets (see the assert in
/// `find_at`).
#[derive (Clone, Debug)]
pub struct TeddySlim4Mask256 {
    /// Low/high nybble mask for the first byte of each pattern.
    pub mask1: Mask256,
    /// Low/high nybble mask for the second byte of each pattern.
    pub mask2: Mask256,
    /// Low/high nybble mask for the third byte of each pattern.
    pub mask3: Mask256,
    /// Low/high nybble mask for the fourth byte of each pattern.
    pub mask4: Mask256,
}
1105 | |
impl TeddySlim4Mask256 {
    /// Search `haystack` for the first match at or after `at`, scanning
    /// 32 bytes per iteration with four nybble masks.
    ///
    /// Callers must guarantee `haystack[at..].len() >= teddy.minimum_len()`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(8, teddy.buckets.len());

        // With four masks, candidates are aligned to the *fourth* pattern
        // byte, so the scan starts three bytes in and verification rewinds
        // with `at - 3`.
        at += 3;
        let len = haystack.len();
        // Previous-window results start as all-ones (per the `ones256`
        // name), which conservatively leaves every bucket as a candidate at
        // the boundary; verification rejects any false positives.
        let mut prev0 = vector::ones256();
        let mut prev1 = vector::ones256();
        let mut prev2 = vector::ones256();
        while at <= len - 32 {
            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 3, c) {
                    return Some(m);
                }
            }
            at += 32;
        }
        // Handle leftover bytes by re-scanning the final (overlapping)
        // 32-byte window. The prev state is reset since it no longer lines
        // up with this window.
        if at < len {
            at = len - 32;
            prev0 = vector::ones256();
            prev1 = vector::ones256();
            prev2 = vector::ones256();

            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify256(pats, haystack, at - 3, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Compute the candidate bitsets for the 32-byte window starting at
    /// `at`. `prev0`/`prev1`/`prev2` carry the first-, second- and
    /// third-byte results from the previous window and are updated for the
    /// next call.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
        prev2: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let chunk = vector::loadu256(haystack, at);
        let (res0, res1, res2, res3) = members4m256(
            chunk, self.mask1, self.mask2, self.mask3, self.mask4,
        );
        // Align the first-, second- and third-byte results forward by 3, 2
        // and 1 lanes respectively (the `alignr256_*` helpers handle the
        // AVX2 cross-128-bit-lane fixup) so all four results refer to the
        // same position.
        let res0prev0 = vector::alignr256_13(res0, *prev0);
        let res1prev1 = vector::alignr256_14(res1, *prev1);
        let res2prev2 = vector::alignr256_15(res2, *prev2);
        let res = _mm256_and_si256(
            _mm256_and_si256(
                _mm256_and_si256(res0prev0, res1prev1),
                res2prev2,
            ),
            res3,
        );
        *prev0 = res0;
        *prev1 = res1;
        *prev2 = res2;
        res
    }
}
1183 | |
/// Fat 256-bit Teddy with four nybble masks, covering the first four
/// bytes of every pattern. Used with 16 pattern buckets (see the assert in
/// `find_at`): the high 128 bits of each mask correspond to buckets 8-15.
#[derive (Clone, Debug)]
pub struct TeddyFat4Mask256 {
    /// Low/high nybble mask for the first byte of each pattern.
    pub mask1: Mask256,
    /// Low/high nybble mask for the second byte of each pattern.
    pub mask2: Mask256,
    /// Low/high nybble mask for the third byte of each pattern.
    pub mask3: Mask256,
    /// Low/high nybble mask for the fourth byte of each pattern.
    pub mask4: Mask256,
}
1191 | |
impl TeddyFat4Mask256 {
    /// Search `haystack` for the first match at or after `at`, scanning
    /// 16 bytes per iteration (broadcast into both 128-bit halves) with
    /// four nybble masks over 16 buckets.
    ///
    /// Callers must guarantee `haystack[at..].len() >= teddy.minimum_len()`.
    #[target_feature (enable = "avx2" )]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        // This assert helps eliminate bounds checks for bucket lookups in
        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
        assert_eq!(16, teddy.buckets.len());

        // With four masks, candidates are aligned to the *fourth* pattern
        // byte, so the scan starts three bytes in and verification rewinds
        // with `at - 3`.
        at += 3;
        let len = haystack.len();
        // Previous-window results start as all-ones (per the `ones256`
        // name), which conservatively leaves every bucket as a candidate at
        // the boundary; verification rejects any false positives.
        let mut prev0 = vector::ones256();
        let mut prev1 = vector::ones256();
        let mut prev2 = vector::ones256();
        while at <= len - 16 {
            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 3, c)
                {
                    return Some(m);
                }
            }
            at += 16;
        }
        // Handle leftover bytes by re-scanning the final (overlapping)
        // 16-byte window. The prev state is reset since it no longer lines
        // up with this window.
        if at < len {
            at = len - 16;
            prev0 = vector::ones256();
            prev1 = vector::ones256();
            prev2 = vector::ones256();

            let c = self
                .candidate(haystack, at, &mut prev0, &mut prev1, &mut prev2);
            if !vector::is_all_zeroes256(c) {
                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 3, c)
                {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Compute the candidate bitsets for the 16-byte window starting at
    /// `at`. The window is broadcast to both 128-bit halves so the low
    /// half looks up buckets 0-7 and the high half buckets 8-15.
    /// `prev0`/`prev1`/`prev2` carry the first-, second- and third-byte
    /// results from the previous window and are updated for the next call.
    #[inline (always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
        prev2: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk =
            _mm256_broadcastsi128_si256(vector::loadu128(haystack, at));
        let (res0, res1, res2, res3) = members4m256(
            chunk, self.mask1, self.mask2, self.mask3, self.mask4,
        );
        // Align the first-, second- and third-byte results forward by 3, 2
        // and 1 lanes respectively. A plain alignr per 128-bit lane is
        // correct here because both halves hold the same window.
        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 13);
        let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 14);
        let res2prev2 = _mm256_alignr_epi8(res2, *prev2, 15);
        let res = _mm256_and_si256(
            _mm256_and_si256(
                _mm256_and_si256(res0prev0, res1prev1),
                res2prev2,
            ),
            res3,
        );
        *prev0 = res0;
        *prev1 = res1;
        *prev2 = res2;
        res
    }
}
1272 | |
/// A 128-bit mask for the low and high nybbles in a set of patterns. Each
/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
/// the nybble `j` is in the bucket `i` at a particular position.
#[derive (Clone, Copy, Debug)]
pub struct Mask128 {
    // Bucket bitsets indexed by a byte's low nybble (via shuffle lookup).
    lo: __m128i,
    // Bucket bitsets indexed by a byte's high nybble (via shuffle lookup).
    hi: __m128i,
}
1281 | |
1282 | impl Mask128 { |
1283 | /// Create a new SIMD mask from the mask produced by the Teddy builder. |
1284 | pub fn new(mask: compile::Mask) -> Mask128 { |
1285 | // SAFETY: This is safe since [u8; 16] has the same representation |
1286 | // as __m128i. |
1287 | unsafe { |
1288 | Mask128 { |
1289 | lo: mem::transmute(src:mask.lo128()), |
1290 | hi: mem::transmute(src:mask.hi128()), |
1291 | } |
1292 | } |
1293 | } |
1294 | } |
1295 | |
/// A 256-bit mask for the low and high nybbles in a set of patterns. Each
/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
/// the nybble `j` is in the bucket `i` at a particular position.
///
/// This is slightly tweaked dependending on whether Slim or Fat Teddy is being
/// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as
/// the bitsets in the higher 128-bits, so that we can search 32 bytes at a
/// time. (Remember, the nybbles in the haystack are used as indices into these
/// masks, and 256-bit shuffles only operate on 128-bit lanes.)
///
/// For Fat Teddy, the bitsets are not repeated, but instead, the high 128
/// bits correspond to buckets 8-15. So that a bitset `00100010` has buckets
/// 1 and 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set
/// if it's in the higher 128 bits.
#[derive (Clone, Copy, Debug)]
pub struct Mask256 {
    // Bucket bitsets indexed by a byte's low nybble (via shuffle lookup).
    lo: __m256i,
    // Bucket bitsets indexed by a byte's high nybble (via shuffle lookup).
    hi: __m256i,
}
1315 | |
1316 | impl Mask256 { |
1317 | /// Create a new SIMD mask from the mask produced by the Teddy builder. |
1318 | pub fn new(mask: compile::Mask) -> Mask256 { |
1319 | // SAFETY: This is safe since [u8; 32] has the same representation |
1320 | // as __m256i. |
1321 | unsafe { |
1322 | Mask256 { |
1323 | lo: mem::transmute(src:mask.lo256()), |
1324 | hi: mem::transmute(src:mask.hi256()), |
1325 | } |
1326 | } |
1327 | } |
1328 | } |
1329 | |
1330 | // The "members" routines below are responsible for taking a chunk of bytes, |
1331 | // a number of nybble masks and returning the result of using the masks to |
1332 | // lookup bytes in the chunk. The results of the high and low nybble masks are |
1333 | // AND'ed together, such that each candidate returned is a vector, with byte |
1334 | // sized lanes, and where each lane is an 8-bit bitset corresponding to the |
1335 | // buckets that contain the corresponding byte. |
1336 | // |
1337 | // In the case of masks of length greater than 1, callers will need to keep |
1338 | // the results from the previous haystack's window, and then shift the vectors |
1339 | // so that they all line up. Then they can be AND'ed together. |
1340 | |
1341 | /// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a |
1342 | /// 16-byte window of the haystack (where the least significant byte |
1343 | /// corresponds to the start of the window), and `mask1` corresponds to a |
1344 | /// low/high mask for the first byte of all patterns that are being searched. |
1345 | #[target_feature (enable = "ssse3" )] |
1346 | unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i { |
1347 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1348 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1349 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1350 | _mm_and_si128( |
1351 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1352 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1353 | ) |
1354 | } |
1355 | |
1356 | /// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a |
1357 | /// 32-byte window of the haystack (where the least significant byte |
1358 | /// corresponds to the start of the window), and `mask1` corresponds to a |
1359 | /// low/high mask for the first byte of all patterns that are being searched. |
1360 | /// |
1361 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1362 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1363 | /// window in the haystack. |
1364 | #[target_feature (enable = "avx2" )] |
1365 | unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i { |
1366 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1367 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1368 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1369 | _mm256_and_si256( |
1370 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1371 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1372 | ) |
1373 | } |
1374 | |
1375 | /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds |
1376 | /// to a 16-byte window of the haystack (where the least significant byte |
1377 | /// corresponds to the start of the window), and the masks correspond to a |
1378 | /// low/high mask for the first and second bytes of all patterns that are being |
1379 | /// searched. The vectors returned correspond to candidates for the first and |
1380 | /// second bytes in the patterns represented by the masks. |
1381 | #[target_feature (enable = "ssse3" )] |
1382 | unsafe fn members2m128( |
1383 | chunk: __m128i, |
1384 | mask1: Mask128, |
1385 | mask2: Mask128, |
1386 | ) -> (__m128i, __m128i) { |
1387 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1388 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1389 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1390 | let res0: __m128i = _mm_and_si128( |
1391 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1392 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1393 | ); |
1394 | let res1: __m128i = _mm_and_si128( |
1395 | a:_mm_shuffle_epi8(mask2.lo, hlo), |
1396 | b:_mm_shuffle_epi8(a:mask2.hi, b:hhi), |
1397 | ); |
1398 | (res0, res1) |
1399 | } |
1400 | |
1401 | /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds |
1402 | /// to a 32-byte window of the haystack (where the least significant byte |
1403 | /// corresponds to the start of the window), and the masks correspond to a |
1404 | /// low/high mask for the first and second bytes of all patterns that are being |
1405 | /// searched. The vectors returned correspond to candidates for the first and |
1406 | /// second bytes in the patterns represented by the masks. |
1407 | /// |
1408 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1409 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1410 | /// window in the haystack. |
1411 | #[target_feature (enable = "avx2" )] |
1412 | unsafe fn members2m256( |
1413 | chunk: __m256i, |
1414 | mask1: Mask256, |
1415 | mask2: Mask256, |
1416 | ) -> (__m256i, __m256i) { |
1417 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1418 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1419 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1420 | let res0: __m256i = _mm256_and_si256( |
1421 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1422 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1423 | ); |
1424 | let res1: __m256i = _mm256_and_si256( |
1425 | a:_mm256_shuffle_epi8(mask2.lo, hlo), |
1426 | b:_mm256_shuffle_epi8(a:mask2.hi, b:hhi), |
1427 | ); |
1428 | (res0, res1) |
1429 | } |
1430 | |
1431 | /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds |
1432 | /// to a 16-byte window of the haystack (where the least significant byte |
1433 | /// corresponds to the start of the window), and the masks correspond to a |
1434 | /// low/high mask for the first, second and third bytes of all patterns that |
1435 | /// are being searched. The vectors returned correspond to candidates for the |
1436 | /// first, second and third bytes in the patterns represented by the masks. |
1437 | #[target_feature (enable = "ssse3" )] |
1438 | unsafe fn members3m128( |
1439 | chunk: __m128i, |
1440 | mask1: Mask128, |
1441 | mask2: Mask128, |
1442 | mask3: Mask128, |
1443 | ) -> (__m128i, __m128i, __m128i) { |
1444 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1445 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1446 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1447 | let res0: __m128i = _mm_and_si128( |
1448 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1449 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1450 | ); |
1451 | let res1: __m128i = _mm_and_si128( |
1452 | a:_mm_shuffle_epi8(mask2.lo, hlo), |
1453 | b:_mm_shuffle_epi8(a:mask2.hi, b:hhi), |
1454 | ); |
1455 | let res2: __m128i = _mm_and_si128( |
1456 | a:_mm_shuffle_epi8(mask3.lo, hlo), |
1457 | b:_mm_shuffle_epi8(a:mask3.hi, b:hhi), |
1458 | ); |
1459 | (res0, res1, res2) |
1460 | } |
1461 | |
1462 | /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds |
1463 | /// to a 32-byte window of the haystack (where the least significant byte |
1464 | /// corresponds to the start of the window), and the masks correspond to a |
1465 | /// low/high mask for the first, second and third bytes of all patterns that |
1466 | /// are being searched. The vectors returned correspond to candidates for the |
1467 | /// first, second and third bytes in the patterns represented by the masks. |
1468 | /// |
1469 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1470 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1471 | /// window in the haystack. |
1472 | #[target_feature (enable = "avx2" )] |
1473 | unsafe fn members3m256( |
1474 | chunk: __m256i, |
1475 | mask1: Mask256, |
1476 | mask2: Mask256, |
1477 | mask3: Mask256, |
1478 | ) -> (__m256i, __m256i, __m256i) { |
1479 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1480 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1481 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1482 | let res0: __m256i = _mm256_and_si256( |
1483 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1484 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1485 | ); |
1486 | let res1: __m256i = _mm256_and_si256( |
1487 | a:_mm256_shuffle_epi8(mask2.lo, hlo), |
1488 | b:_mm256_shuffle_epi8(a:mask2.hi, b:hhi), |
1489 | ); |
1490 | let res2: __m256i = _mm256_and_si256( |
1491 | a:_mm256_shuffle_epi8(mask3.lo, hlo), |
1492 | b:_mm256_shuffle_epi8(a:mask3.hi, b:hhi), |
1493 | ); |
1494 | (res0, res1, res2) |
1495 | } |
1496 | |
1497 | /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds |
1498 | /// to a 16-byte window of the haystack (where the least significant byte |
1499 | /// corresponds to the start of the window), and the masks correspond to a |
1500 | /// low/high mask for the first, second, third and fourth bytes of all patterns |
1501 | /// that are being searched. The vectors returned correspond to candidates for |
1502 | /// the first, second, third and fourth bytes in the patterns represented by |
1503 | /// the masks. |
1504 | #[target_feature (enable = "ssse3" )] |
1505 | unsafe fn members4m128( |
1506 | chunk: __m128i, |
1507 | mask1: Mask128, |
1508 | mask2: Mask128, |
1509 | mask3: Mask128, |
1510 | mask4: Mask128, |
1511 | ) -> (__m128i, __m128i, __m128i, __m128i) { |
1512 | let lomask: __m128i = _mm_set1_epi8(0xF); |
1513 | let hlo: __m128i = _mm_and_si128(a:chunk, b:lomask); |
1514 | let hhi: __m128i = _mm_and_si128(a:_mm_srli_epi16(chunk, 4), b:lomask); |
1515 | let res0: __m128i = _mm_and_si128( |
1516 | a:_mm_shuffle_epi8(mask1.lo, hlo), |
1517 | b:_mm_shuffle_epi8(a:mask1.hi, b:hhi), |
1518 | ); |
1519 | let res1: __m128i = _mm_and_si128( |
1520 | a:_mm_shuffle_epi8(mask2.lo, hlo), |
1521 | b:_mm_shuffle_epi8(a:mask2.hi, b:hhi), |
1522 | ); |
1523 | let res2: __m128i = _mm_and_si128( |
1524 | a:_mm_shuffle_epi8(mask3.lo, hlo), |
1525 | b:_mm_shuffle_epi8(a:mask3.hi, b:hhi), |
1526 | ); |
1527 | let res3: __m128i = _mm_and_si128( |
1528 | a:_mm_shuffle_epi8(mask4.lo, hlo), |
1529 | b:_mm_shuffle_epi8(a:mask4.hi, b:hhi), |
1530 | ); |
1531 | (res0, res1, res2, res3) |
1532 | } |
1533 | |
1534 | /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds |
1535 | /// to a 32-byte window of the haystack (where the least significant byte |
1536 | /// corresponds to the start of the window), and the masks correspond to a |
1537 | /// low/high mask for the first, second, third and fourth bytes of all patterns |
1538 | /// that are being searched. The vectors returned correspond to candidates for |
1539 | /// the first, second, third and fourth bytes in the patterns represented by |
1540 | /// the masks. |
1541 | /// |
1542 | /// Note that this can also be used for Fat Teddy, where the high 128 bits in |
1543 | /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte |
1544 | /// window in the haystack. |
1545 | #[target_feature (enable = "avx2" )] |
1546 | unsafe fn members4m256( |
1547 | chunk: __m256i, |
1548 | mask1: Mask256, |
1549 | mask2: Mask256, |
1550 | mask3: Mask256, |
1551 | mask4: Mask256, |
1552 | ) -> (__m256i, __m256i, __m256i, __m256i) { |
1553 | let lomask: __m256i = _mm256_set1_epi8(0xF); |
1554 | let hlo: __m256i = _mm256_and_si256(a:chunk, b:lomask); |
1555 | let hhi: __m256i = _mm256_and_si256(a:_mm256_srli_epi16(chunk, 4), b:lomask); |
1556 | let res0: __m256i = _mm256_and_si256( |
1557 | a:_mm256_shuffle_epi8(mask1.lo, hlo), |
1558 | b:_mm256_shuffle_epi8(a:mask1.hi, b:hhi), |
1559 | ); |
1560 | let res1: __m256i = _mm256_and_si256( |
1561 | a:_mm256_shuffle_epi8(mask2.lo, hlo), |
1562 | b:_mm256_shuffle_epi8(a:mask2.hi, b:hhi), |
1563 | ); |
1564 | let res2: __m256i = _mm256_and_si256( |
1565 | a:_mm256_shuffle_epi8(mask3.lo, hlo), |
1566 | b:_mm256_shuffle_epi8(a:mask3.hi, b:hhi), |
1567 | ); |
1568 | let res3: __m256i = _mm256_and_si256( |
1569 | a:_mm256_shuffle_epi8(mask4.lo, hlo), |
1570 | b:_mm256_shuffle_epi8(a:mask4.hi, b:hhi), |
1571 | ); |
1572 | (res0, res1, res2, res3) |
1573 | } |
1574 | |