generic.rs source code [crates/aho_corasick/src/packed/teddy/generic.rs]

1	use core::fmt::Debug;
2
3	use alloc::{
4	boxed::Box, collections::BTreeMap, format, sync::Arc, vec, vec::Vec,
5	};
6
7	use crate::{
8	packed::{
9	ext::Pointer,
10	pattern::Patterns,
11	vector::{FatVector, Vector},
12	},
13	util::int::U32,
14	PatternID,
15	};
16
17	/// A match type specialized to the Teddy implementations below.
18	///
19	/// Essentially, instead of representing a match at byte offsets, we use
20	/// raw pointers. This is because the implementations below operate on raw
21	/// pointers, and so this is a more natural return type based on how the
22	/// implementation works.
23	///
24	/// Also, the `PatternID` used here is a `u16`.
25	#[derive(Clone, Copy, Debug)]
26	pub(crate) struct Match {
27	pid: PatternID,
28	start: *const u8,
29	end: *const u8,
30	}
31
32	impl Match {
33	/// Returns the ID of the pattern that matched.
34	pub(crate) fn pattern(&self) -> PatternID {
35	self.pid
36	}
37
38	/// Returns a pointer into the haystack at which the match starts.
39	pub(crate) fn start(&self) -> *const u8 {
40	self.start
41	}
42
43	/// Returns a pointer into the haystack at which the match ends.
44	pub(crate) fn end(&self) -> *const u8 {
45	self.end
46	}
47	}
48
49	/// A "slim" Teddy implementation that is generic over both the vector type
50	/// and the minimum length of the patterns being searched for.
51	///
52	/// Only 1, 2, 3 and 4 bytes are supported as minimum lengths.
53	#[derive(Clone, Debug)]
54	pub(crate) struct Slim<V, const BYTES: usize> {
55	/// A generic data structure for doing "slim" Teddy verification.
56	teddy: Teddy<`8`>,
57	/// The masks used as inputs to the shuffle operation to generate
58	/// candidates (which are fed into the verification routines).
59	masks: [Mask<V>; BYTES],
60	}
61
62	impl<V: Vector, const BYTES: usize> Slim<V, BYTES> {
63	/// Create a new "slim" Teddy searcher for the given patterns.
64	///
65	/// # Panics
66	///
67	/// This panics when `BYTES` is any value other than 1, 2, 3 or 4.
68	///
69	/// # Safety
70	///
71	/// Callers must ensure that this is okay to call in the current target for
72	/// the current CPU.
73	#[inline(always)]
74	pub(crate) unsafe fn new(patterns: Arc<Patterns>) -> Slim<V, BYTES> {
75	assert!(
76	`1` <= BYTES && BYTES <= `4`,
77	"only 1, 2, 3 or 4 bytes are supported"
78	);
79	let teddy = Teddy::new(patterns);
80	let masks = SlimMaskBuilder::from_teddy(&teddy);
81	Slim { teddy, masks }
82	}
83
84	/// Returns the approximate total amount of heap used by this type, in
85	/// units of bytes.
86	#[inline(always)]
87	pub(crate) fn memory_usage(&self) -> usize {
88	self.teddy.memory_usage()
89	}
90
91	/// Returns the minimum length, in bytes, that a haystack must be in order
92	/// to use it with this searcher.
93	#[inline(always)]
94	pub(crate) fn minimum_len(&self) -> usize {
95	V::BYTES + (BYTES - `1`)
96	}
97	}
98
99	impl<V: Vector> Slim<V, `1`> {
100	/// Look for an occurrences of the patterns in this finder in the haystack
101	/// given by the `start` and `end` pointers.
102	///
103	/// If no match could be found, then `None` is returned.
104	///
105	/// # Safety
106	///
107	/// The given pointers representing the haystack must be valid to read
108	/// from. They must also point to a region of memory that is at least the
109	/// minimum length required by this searcher.
110	///
111	/// Callers must ensure that this is okay to call in the current target for
112	/// the current CPU.
113	#[inline(always)]
114	pub(crate) unsafe fn find(
115	&self,
116	start: *const u8,
117	end: *const u8,
118	) -> Option<Match> {
119	let len = end.distance(start);
120	debug_assert!(len >= self.minimum_len());
121	let mut cur = start;
122	while cur <= end.sub(V::BYTES) {
123	if let Some(m) = self.find_one(cur, end) {
124	return Some(m);
125	}
126	cur = cur.add(V::BYTES);
127	}
128	if cur < end {
129	cur = end.sub(V::BYTES);
130	if let Some(m) = self.find_one(cur, end) {
131	return Some(m);
132	}
133	}
134	None
135	}
136
137	/// Look for a match starting at the `V::BYTES` at and after `cur`. If
138	/// there isn't one, then `None` is returned.
139	///
140	/// # Safety
141	///
142	/// The given pointers representing the haystack must be valid to read
143	/// from. They must also point to a region of memory that is at least the
144	/// minimum length required by this searcher.
145	///
146	/// Callers must ensure that this is okay to call in the current target for
147	/// the current CPU.
148	#[inline(always)]
149	unsafe fn find_one(
150	&self,
151	cur: *const u8,
152	end: *const u8,
153	) -> Option<Match> {
154	let c = self.candidate(cur);
155	if !c.is_zero() {
156	if let Some(m) = self.teddy.verify(cur, end, c) {
157	return Some(m);
158	}
159	}
160	None
161	}
162
163	/// Look for a candidate match (represented as a vector) starting at the
164	/// `V::BYTES` at and after `cur`. If there isn't one, then a vector with
165	/// all bits set to zero is returned.
166	///
167	/// # Safety
168	///
169	/// The given pointer representing the haystack must be valid to read
170	/// from.
171	///
172	/// Callers must ensure that this is okay to call in the current target for
173	/// the current CPU.
174	#[inline(always)]
175	unsafe fn candidate(&self, cur: *const u8) -> V {
176	let chunk = V::load_unaligned(cur);
177	Mask::members1(chunk, self.masks)
178	}
179	}
180
181	impl<V: Vector> Slim<V, `2`> {
182	/// See Slim<V, 1>::find.
183	#[inline(always)]
184	pub(crate) unsafe fn find(
185	&self,
186	start: *const u8,
187	end: *const u8,
188	) -> Option<Match> {
189	let len = end.distance(start);
190	debug_assert!(len >= self.minimum_len());
191	let mut cur = start.add(`1`);
192	let mut prev0 = V::splat(`0xFF`);
193	while cur <= end.sub(V::BYTES) {
194	if let Some(m) = self.find_one(cur, end, &mut prev0) {
195	return Some(m);
196	}
197	cur = cur.add(V::BYTES);
198	}
199	if cur < end {
200	cur = end.sub(V::BYTES);
201	prev0 = V::splat(`0xFF`);
202	if let Some(m) = self.find_one(cur, end, &mut prev0) {
203	return Some(m);
204	}
205	}
206	None
207	}
208
209	/// See Slim<V, 1>::find_one.
210	#[inline(always)]
211	unsafe fn find_one(
212	&self,
213	cur: *const u8,
214	end: *const u8,
215	prev0: &mut V,
216	) -> Option<Match> {
217	let c = self.candidate(cur, prev0);
218	if !c.is_zero() {
219	if let Some(m) = self.teddy.verify(cur.sub(`1`), end, c) {
220	return Some(m);
221	}
222	}
223	None
224	}
225
226	/// See Slim<V, 1>::candidate.
227	#[inline(always)]
228	unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V {
229	let chunk = V::load_unaligned(cur);
230	let (res0, res1) = Mask::members2(chunk, self.masks);
231	let res0prev0 = res0.shift_in_one_byte(*prev0);
232	let res = res0prev0.and(res1);
233	*prev0 = res0;
234	res
235	}
236	}
237
238	impl<V: Vector> Slim<V, `3`> {
239	/// See Slim<V, 1>::find.
240	#[inline(always)]
241	pub(crate) unsafe fn find(
242	&self,
243	start: *const u8,
244	end: *const u8,
245	) -> Option<Match> {
246	let len = end.distance(start);
247	debug_assert!(len >= self.minimum_len());
248	let mut cur = start.add(`2`);
249	let mut prev0 = V::splat(`0xFF`);
250	let mut prev1 = V::splat(`0xFF`);
251	while cur <= end.sub(V::BYTES) {
252	if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) {
253	return Some(m);
254	}
255	cur = cur.add(V::BYTES);
256	}
257	if cur < end {
258	cur = end.sub(V::BYTES);
259	prev0 = V::splat(`0xFF`);
260	prev1 = V::splat(`0xFF`);
261	if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) {
262	return Some(m);
263	}
264	}
265	None
266	}
267
268	/// See Slim<V, 1>::find_one.
269	#[inline(always)]
270	unsafe fn find_one(
271	&self,
272	cur: *const u8,
273	end: *const u8,
274	prev0: &mut V,
275	prev1: &mut V,
276	) -> Option<Match> {
277	let c = self.candidate(cur, prev0, prev1);
278	if !c.is_zero() {
279	if let Some(m) = self.teddy.verify(cur.sub(`2`), end, c) {
280	return Some(m);
281	}
282	}
283	None
284	}
285
286	/// See Slim<V, 1>::candidate.
287	#[inline(always)]
288	unsafe fn candidate(
289	&self,
290	cur: *const u8,
291	prev0: &mut V,
292	prev1: &mut V,
293	) -> V {
294	let chunk = V::load_unaligned(cur);
295	let (res0, res1, res2) = Mask::members3(chunk, self.masks);
296	let res0prev0 = res0.shift_in_two_bytes(*prev0);
297	let res1prev1 = res1.shift_in_one_byte(*prev1);
298	let res = res0prev0.and(res1prev1).and(res2);
299	*prev0 = res0;
300	*prev1 = res1;
301	res
302	}
303	}
304
305	impl<V: Vector> Slim<V, `4`> {
306	/// See Slim<V, 1>::find.
307	#[inline(always)]
308	pub(crate) unsafe fn find(
309	&self,
310	start: *const u8,
311	end: *const u8,
312	) -> Option<Match> {
313	let len = end.distance(start);
314	debug_assert!(len >= self.minimum_len());
315	let mut cur = start.add(`3`);
316	let mut prev0 = V::splat(`0xFF`);
317	let mut prev1 = V::splat(`0xFF`);
318	let mut prev2 = V::splat(`0xFF`);
319	while cur <= end.sub(V::BYTES) {
320	if let Some(m) =
321	self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2)
322	{
323	return Some(m);
324	}
325	cur = cur.add(V::BYTES);
326	}
327	if cur < end {
328	cur = end.sub(V::BYTES);
329	prev0 = V::splat(`0xFF`);
330	prev1 = V::splat(`0xFF`);
331	prev2 = V::splat(`0xFF`);
332	if let Some(m) =
333	self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2)
334	{
335	return Some(m);
336	}
337	}
338	None
339	}
340
341	/// See Slim<V, 1>::find_one.
342	#[inline(always)]
343	unsafe fn find_one(
344	&self,
345	cur: *const u8,
346	end: *const u8,
347	prev0: &mut V,
348	prev1: &mut V,
349	prev2: &mut V,
350	) -> Option<Match> {
351	let c = self.candidate(cur, prev0, prev1, prev2);
352	if !c.is_zero() {
353	if let Some(m) = self.teddy.verify(cur.sub(`3`), end, c) {
354	return Some(m);
355	}
356	}
357	None
358	}
359
360	/// See Slim<V, 1>::candidate.
361	#[inline(always)]
362	unsafe fn candidate(
363	&self,
364	cur: *const u8,
365	prev0: &mut V,
366	prev1: &mut V,
367	prev2: &mut V,
368	) -> V {
369	let chunk = V::load_unaligned(cur);
370	let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks);
371	let res0prev0 = res0.shift_in_three_bytes(*prev0);
372	let res1prev1 = res1.shift_in_two_bytes(*prev1);
373	let res2prev2 = res2.shift_in_one_byte(*prev2);
374	let res = res0prev0.and(res1prev1).and(res2prev2).and(res3);
375	*prev0 = res0;
376	*prev1 = res1;
377	*prev2 = res2;
378	res
379	}
380	}
381
382	/// A "fat" Teddy implementation that is generic over both the vector type
383	/// and the minimum length of the patterns being searched for.
384	///
385	/// Only 1, 2, 3 and 4 bytes are supported as minimum lengths.
386	#[derive(Clone, Debug)]
387	pub(crate) struct Fat<V, const BYTES: usize> {
388	/// A generic data structure for doing "fat" Teddy verification.
389	teddy: Teddy<`16`>,
390	/// The masks used as inputs to the shuffle operation to generate
391	/// candidates (which are fed into the verification routines).
392	masks: [Mask<V>; BYTES],
393	}
394
395	impl<V: FatVector, const BYTES: usize> Fat<V, BYTES> {
396	/// Create a new "fat" Teddy searcher for the given patterns.
397	///
398	/// # Panics
399	///
400	/// This panics when `BYTES` is any value other than 1, 2, 3 or 4.
401	///
402	/// # Safety
403	///
404	/// Callers must ensure that this is okay to call in the current target for
405	/// the current CPU.
406	#[inline(always)]
407	pub(crate) unsafe fn new(patterns: Arc<Patterns>) -> Fat<V, BYTES> {
408	assert!(
409	`1` <= BYTES && BYTES <= `4`,
410	"only 1, 2, 3 or 4 bytes are supported"
411	);
412	let teddy = Teddy::new(patterns);
413	let masks = FatMaskBuilder::from_teddy(&teddy);
414	Fat { teddy, masks }
415	}
416
417	/// Returns the approximate total amount of heap used by this type, in
418	/// units of bytes.
419	#[inline(always)]
420	pub(crate) fn memory_usage(&self) -> usize {
421	self.teddy.memory_usage()
422	}
423
424	/// Returns the minimum length, in bytes, that a haystack must be in order
425	/// to use it with this searcher.
426	#[inline(always)]
427	pub(crate) fn minimum_len(&self) -> usize {
428	V::Half::BYTES + (BYTES - `1`)
429	}
430	}
431
432	impl<V: FatVector> Fat<V, `1`> {
433	/// Look for an occurrences of the patterns in this finder in the haystack
434	/// given by the `start` and `end` pointers.
435	///
436	/// If no match could be found, then `None` is returned.
437	///
438	/// # Safety
439	///
440	/// The given pointers representing the haystack must be valid to read
441	/// from. They must also point to a region of memory that is at least the
442	/// minimum length required by this searcher.
443	///
444	/// Callers must ensure that this is okay to call in the current target for
445	/// the current CPU.
446	#[inline(always)]
447	pub(crate) unsafe fn find(
448	&self,
449	start: *const u8,
450	end: *const u8,
451	) -> Option<Match> {
452	let len = end.distance(start);
453	debug_assert!(len >= self.minimum_len());
454	let mut cur = start;
455	while cur <= end.sub(V::Half::BYTES) {
456	if let Some(m) = self.find_one(cur, end) {
457	return Some(m);
458	}
459	cur = cur.add(V::Half::BYTES);
460	}
461	if cur < end {
462	cur = end.sub(V::Half::BYTES);
463	if let Some(m) = self.find_one(cur, end) {
464	return Some(m);
465	}
466	}
467	None
468	}
469
470	/// Look for a match starting at the `V::BYTES` at and after `cur`. If
471	/// there isn't one, then `None` is returned.
472	///
473	/// # Safety
474	///
475	/// The given pointers representing the haystack must be valid to read
476	/// from. They must also point to a region of memory that is at least the
477	/// minimum length required by this searcher.
478	///
479	/// Callers must ensure that this is okay to call in the current target for
480	/// the current CPU.
481	#[inline(always)]
482	unsafe fn find_one(
483	&self,
484	cur: *const u8,
485	end: *const u8,
486	) -> Option<Match> {
487	let c = self.candidate(cur);
488	if !c.is_zero() {
489	if let Some(m) = self.teddy.verify(cur, end, c) {
490	return Some(m);
491	}
492	}
493	None
494	}
495
496	/// Look for a candidate match (represented as a vector) starting at the
497	/// `V::BYTES` at and after `cur`. If there isn't one, then a vector with
498	/// all bits set to zero is returned.
499	///
500	/// # Safety
501	///
502	/// The given pointer representing the haystack must be valid to read
503	/// from.
504	///
505	/// Callers must ensure that this is okay to call in the current target for
506	/// the current CPU.
507	#[inline(always)]
508	unsafe fn candidate(&self, cur: *const u8) -> V {
509	let chunk = V::load_half_unaligned(cur);
510	Mask::members1(chunk, self.masks)
511	}
512	}
513
514	impl<V: FatVector> Fat<V, `2`> {
515	/// See `Fat<V, 1>::find`.
516	#[inline(always)]
517	pub(crate) unsafe fn find(
518	&self,
519	start: *const u8,
520	end: *const u8,
521	) -> Option<Match> {
522	let len = end.distance(start);
523	debug_assert!(len >= self.minimum_len());
524	let mut cur = start.add(`1`);
525	let mut prev0 = V::splat(`0xFF`);
526	while cur <= end.sub(V::Half::BYTES) {
527	if let Some(m) = self.find_one(cur, end, &mut prev0) {
528	return Some(m);
529	}
530	cur = cur.add(V::Half::BYTES);
531	}
532	if cur < end {
533	cur = end.sub(V::Half::BYTES);
534	prev0 = V::splat(`0xFF`);
535	if let Some(m) = self.find_one(cur, end, &mut prev0) {
536	return Some(m);
537	}
538	}
539	None
540	}
541
542	/// See `Fat<V, 1>::find_one`.
543	#[inline(always)]
544	unsafe fn find_one(
545	&self,
546	cur: *const u8,
547	end: *const u8,
548	prev0: &mut V,
549	) -> Option<Match> {
550	let c = self.candidate(cur, prev0);
551	if !c.is_zero() {
552	if let Some(m) = self.teddy.verify(cur.sub(`1`), end, c) {
553	return Some(m);
554	}
555	}
556	None
557	}
558
559	/// See `Fat<V, 1>::candidate`.
560	#[inline(always)]
561	unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V {
562	let chunk = V::load_half_unaligned(cur);
563	let (res0, res1) = Mask::members2(chunk, self.masks);
564	let res0prev0 = res0.half_shift_in_one_byte(*prev0);
565	let res = res0prev0.and(res1);
566	*prev0 = res0;
567	res
568	}
569	}
570
571	impl<V: FatVector> Fat<V, `3`> {
572	/// See `Fat<V, 1>::find`.
573	#[inline(always)]
574	pub(crate) unsafe fn find(
575	&self,
576	start: *const u8,
577	end: *const u8,
578	) -> Option<Match> {
579	let len = end.distance(start);
580	debug_assert!(len >= self.minimum_len());
581	let mut cur = start.add(`2`);
582	let mut prev0 = V::splat(`0xFF`);
583	let mut prev1 = V::splat(`0xFF`);
584	while cur <= end.sub(V::Half::BYTES) {
585	if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) {
586	return Some(m);
587	}
588	cur = cur.add(V::Half::BYTES);
589	}
590	if cur < end {
591	cur = end.sub(V::Half::BYTES);
592	prev0 = V::splat(`0xFF`);
593	prev1 = V::splat(`0xFF`);
594	if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) {
595	return Some(m);
596	}
597	}
598	None
599	}
600
601	/// See `Fat<V, 1>::find_one`.
602	#[inline(always)]
603	unsafe fn find_one(
604	&self,
605	cur: *const u8,
606	end: *const u8,
607	prev0: &mut V,
608	prev1: &mut V,
609	) -> Option<Match> {
610	let c = self.candidate(cur, prev0, prev1);
611	if !c.is_zero() {
612	if let Some(m) = self.teddy.verify(cur.sub(`2`), end, c) {
613	return Some(m);
614	}
615	}
616	None
617	}
618
619	/// See `Fat<V, 1>::candidate`.
620	#[inline(always)]
621	unsafe fn candidate(
622	&self,
623	cur: *const u8,
624	prev0: &mut V,
625	prev1: &mut V,
626	) -> V {
627	let chunk = V::load_half_unaligned(cur);
628	let (res0, res1, res2) = Mask::members3(chunk, self.masks);
629	let res0prev0 = res0.half_shift_in_two_bytes(*prev0);
630	let res1prev1 = res1.half_shift_in_one_byte(*prev1);
631	let res = res0prev0.and(res1prev1).and(res2);
632	*prev0 = res0;
633	*prev1 = res1;
634	res
635	}
636	}
637
638	impl<V: FatVector> Fat<V, `4`> {
639	/// See `Fat<V, 1>::find`.
640	#[inline(always)]
641	pub(crate) unsafe fn find(
642	&self,
643	start: *const u8,
644	end: *const u8,
645	) -> Option<Match> {
646	let len = end.distance(start);
647	debug_assert!(len >= self.minimum_len());
648	let mut cur = start.add(`3`);
649	let mut prev0 = V::splat(`0xFF`);
650	let mut prev1 = V::splat(`0xFF`);
651	let mut prev2 = V::splat(`0xFF`);
652	while cur <= end.sub(V::Half::BYTES) {
653	if let Some(m) =
654	self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2)
655	{
656	return Some(m);
657	}
658	cur = cur.add(V::Half::BYTES);
659	}
660	if cur < end {
661	cur = end.sub(V::Half::BYTES);
662	prev0 = V::splat(`0xFF`);
663	prev1 = V::splat(`0xFF`);
664	prev2 = V::splat(`0xFF`);
665	if let Some(m) =
666	self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2)
667	{
668	return Some(m);
669	}
670	}
671	None
672	}
673
674	/// See `Fat<V, 1>::find_one`.
675	#[inline(always)]
676	unsafe fn find_one(
677	&self,
678	cur: *const u8,
679	end: *const u8,
680	prev0: &mut V,
681	prev1: &mut V,
682	prev2: &mut V,
683	) -> Option<Match> {
684	let c = self.candidate(cur, prev0, prev1, prev2);
685	if !c.is_zero() {
686	if let Some(m) = self.teddy.verify(cur.sub(`3`), end, c) {
687	return Some(m);
688	}
689	}
690	None
691	}
692
693	/// See `Fat<V, 1>::candidate`.
694	#[inline(always)]
695	unsafe fn candidate(
696	&self,
697	cur: *const u8,
698	prev0: &mut V,
699	prev1: &mut V,
700	prev2: &mut V,
701	) -> V {
702	let chunk = V::load_half_unaligned(cur);
703	let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks);
704	let res0prev0 = res0.half_shift_in_three_bytes(*prev0);
705	let res1prev1 = res1.half_shift_in_two_bytes(*prev1);
706	let res2prev2 = res2.half_shift_in_one_byte(*prev2);
707	let res = res0prev0.and(res1prev1).and(res2prev2).and(res3);
708	*prev0 = res0;
709	*prev1 = res1;
710	*prev2 = res2;
711	res
712	}
713	}
714
715	/// The common elements of all "slim" and "fat" Teddy search implementations.
716	///
717	/// Essentially, this contains the patterns and the buckets. Namely, it
718	/// contains enough to implement the verification step after candidates are
719	/// identified via the shuffle masks.
720	///
721	/// It is generic over the number of buckets used. In general, the number of
722	/// buckets is either 8 (for "slim" Teddy) or 16 (for "fat" Teddy). The generic
723	/// parameter isn't really meant to be instantiated for any value other than
724	/// 8 or 16, although it is technically possible. The main hiccup is that there
725	/// is some bit-shifting done in the critical part of verification that could
726	/// be quite expensive if `N` is not a multiple of 2.
727	#[derive(Clone, Debug)]
728	struct Teddy<const BUCKETS: usize> {
729	/// The patterns we are searching for.
730	///
731	/// A pattern string can be found by its `PatternID`.
732	patterns: Arc<Patterns>,
733	/// The allocation of patterns in buckets. This only contains the IDs of
734	/// patterns. In order to do full verification, callers must provide the
735	/// actual patterns when using Teddy.
736	buckets: [Vec<PatternID>; BUCKETS],
737	// N.B. The above representation is very simple, but it definitely results
738	// in ping-ponging between different allocations during verification. I've
739	// tried experimenting with other representations that flatten the pattern
740	// strings into a single allocation, but it doesn't seem to help much.
741	// Probably everything is small enough to fit into cache anyway, and so the
742	// pointer chasing isn't a big deal?
743	//
744	// One other avenue I haven't explored is some kind of hashing trick
745	// that let's us do another high-confidence check before launching into
746	// `memcmp`.
747	}
748
749	impl<const BUCKETS: usize> Teddy<BUCKETS> {
750	/// Create a new generic data structure for Teddy verification.
751	fn new(patterns: Arc<Patterns>) -> Teddy<BUCKETS> {
752	assert_ne!(`0`, patterns.len(), "Teddy requires at least one pattern");
753	assert_ne!(
754	`0`,
755	patterns.minimum_len(),
756	"Teddy does not support zero-length patterns"
757	);
758	assert!(
759	BUCKETS == `8` \|\| BUCKETS == `16`,
760	"Teddy only supports 8 or 16 buckets"
761	);
762	// MSRV(1.63): Use core::array::from_fn below instead of allocating a
763	// superfluous outer Vec. Not a big deal (especially given the BTreeMap
764	// allocation below), but nice to not do it.
765	let buckets =
766	<[Vec<PatternID>; BUCKETS]>::try_from(vec![vec![]; BUCKETS])
767	.unwrap();
768	let mut t = Teddy { patterns, buckets };
769
770	let mut map: BTreeMap<Box<[u8]>, usize> = BTreeMap::new();
771	for (id, pattern) in t.patterns.iter() {
772	// We try to be slightly clever in how we assign patterns into
773	// buckets. Generally speaking, we want patterns with the same
774	// prefix to be in the same bucket, since it minimizes the amount
775	// of time we spend churning through buckets in the verification
776	// step.
777	//
778	// So we could assign patterns with the same N-prefix (where N is
779	// the size of the mask, which is one of {1, 2, 3}) to the same
780	// bucket. However, case insensitive searches are fairly common, so
781	// we'd for example, ideally want to treat `abc` and `ABC` as if
782	// they shared the same prefix. ASCII has the nice property that
783	// the lower 4 bits of A and a are the same, so we therefore group
784	// patterns with the same low-nybble-N-prefix into the same bucket.
785	//
786	// MOREOVER, this is actually necessary for correctness! In
787	// particular, by grouping patterns with the same prefix into the
788	// same bucket, we ensure that we preserve correct leftmost-first
789	// and leftmost-longest match semantics. In addition to the fact
790	// that `patterns.iter()` iterates in the correct order, this
791	// guarantees that all possible ambiguous matches will occur in
792	// the same bucket. The verification routine could be adjusted to
793	// support correct leftmost match semantics regardless of bucket
794	// allocation, but that results in a performance hit. It's much
795	// nicer to be able to just stop as soon as a match is found.
796	let lonybs = pattern.low_nybbles(t.mask_len());
797	if let Some(&bucket) = map.get(&lonybs) {
798	t.buckets[bucket].push(id);
799	} else {
800	// N.B. We assign buckets in reverse because it shouldn't have
801	// any influence on performance, but it does make it harder to
802	// get leftmost match semantics accidentally correct.
803	let bucket = (BUCKETS - `1`) - (id.as_usize() % BUCKETS);
804	t.buckets[bucket].push(id);
805	map.insert(lonybs, bucket);
806	}
807	}
808	t
809	}
810
811	/// Verify whether there are any matches starting at or after `cur` in the
812	/// haystack. The candidate chunk given should correspond to 8-bit bitsets
813	/// for N buckets.
814	///
815	/// # Safety
816	///
817	/// The given pointers representing the haystack must be valid to read
818	/// from.
819	#[inline(always)]
820	unsafe fn verify64(
821	&self,
822	cur: *const u8,
823	end: *const u8,
824	mut candidate_chunk: u64,
825	) -> Option<Match> {
826	while candidate_chunk != `0` {
827	let bit = candidate_chunk.trailing_zeros().as_usize();
828	candidate_chunk &= !(`1` << bit);
829
830	let cur = cur.add(bit / BUCKETS);
831	let bucket = bit % BUCKETS;
832	if let Some(m) = self.verify_bucket(cur, end, bucket) {
833	return Some(m);
834	}
835	}
836	None
837	}
838
839	/// Verify whether there are any matches starting at `at` in the given
840	/// `haystack` corresponding only to patterns in the given bucket.
841	///
842	/// # Safety
843	///
844	/// The given pointers representing the haystack must be valid to read
845	/// from.
846	///
847	/// The bucket index must be less than or equal to `self.buckets.len()`.
848	#[inline(always)]
849	unsafe fn verify_bucket(
850	&self,
851	cur: *const u8,
852	end: *const u8,
853	bucket: usize,
854	) -> Option<Match> {
855	debug_assert!(bucket < self.buckets.len());
856	// SAFETY: The caller must ensure that the bucket index is correct.
857	for pid in self.buckets.get_unchecked(bucket).iter().copied() {
858	// SAFETY: This is safe because we are guaranteed that every
859	// index in a Teddy bucket is a valid index into `pats`, by
860	// construction.
861	debug_assert!(pid.as_usize() < self.patterns.len());
862	let pat = self.patterns.get_unchecked(pid);
863	if pat.is_prefix_raw(cur, end) {
864	let start = cur;
865	let end = start.add(pat.len());
866	return Some(Match { pid, start, end });
867	}
868	}
869	None
870	}
871
872	/// Returns the total number of masks required by the patterns in this
873	/// Teddy searcher.
874	///
875	/// Basically, the mask length corresponds to the type of Teddy searcher
876	/// to use: a 1-byte, 2-byte, 3-byte or 4-byte searcher. The bigger the
877	/// better, typically, since searching for longer substrings usually
878	/// decreases the rate of false positives. Therefore, the number of masks
879	/// needed is the length of the shortest pattern in this searcher. If the
880	/// length of the shortest pattern (in bytes) is bigger than 4, then the
881	/// mask length is 4 since there are no Teddy searchers for more than 4
882	/// bytes.
883	fn mask_len(&self) -> usize {
884	core::cmp::min(`4`, self.patterns.minimum_len())
885	}
886
887	/// Returns the approximate total amount of heap used by this type, in
888	/// units of bytes.
889	fn memory_usage(&self) -> usize {
890	// This is an upper bound rather than a precise accounting. No
891	// particular reason, other than it's probably very close to actual
892	// memory usage in practice.
893	self.patterns.len() * core::mem::size_of::<PatternID>()
894	}
895	}
896
897	impl Teddy<`8`> {
898	/// Runs the verification routine for "slim" Teddy.
899	///
900	/// The candidate given should be a collection of 8-bit bitsets (one bitset
901	/// per lane), where the ith bit is set in the jth lane if and only if the
902	/// byte occurring at `at + j` in `cur` is in the bucket `i`.
903	///
904	/// # Safety
905	///
906	/// Callers must ensure that this is okay to call in the current target for
907	/// the current CPU.
908	///
909	/// The given pointers must be valid to read from.
910	#[inline(always)]
911	unsafe fn verify<V: Vector>(
912	&self,
913	mut cur: *const u8,
914	end: *const u8,
915	candidate: V,
916	) -> Option<Match> {
917	debug_assert!(!candidate.is_zero());
918	// Convert the candidate into 64-bit chunks, and then verify each of
919	// those chunks.
920	candidate.for_each_64bit_lane(
921	#[inline(always)]
922	\|_, chunk\| {
923	let result = self.verify64(cur, end, chunk);
924	cur = cur.add(`8`);
925	result
926	},
927	)
928	}
929	}
930
931	impl Teddy<`16`> {
932	/// Runs the verification routine for "fat" Teddy.
933	///
934	/// The candidate given should be a collection of 8-bit bitsets (one bitset
935	/// per lane), where the ith bit is set in the jth lane if and only if the
936	/// byte occurring at `at + (j < 16 ? j : j - 16)` in `cur` is in the
937	/// bucket `j < 16 ? i : i + 8`.
938	///
939	/// # Safety
940	///
941	/// Callers must ensure that this is okay to call in the current target for
942	/// the current CPU.
943	///
944	/// The given pointers must be valid to read from.
945	#[inline(always)]
946	unsafe fn verify<V: FatVector>(
947	&self,
948	mut cur: *const u8,
949	end: *const u8,
950	candidate: V,
951	) -> Option<Match> {
952	// This is a bit tricky, but we basically want to convert our
953	// candidate, which looks like this (assuming a 256-bit vector):
954	//
955	// a31 a30 ... a17 a16 a15 a14 ... a01 a00
956	//
957	// where each a(i) is an 8-bit bitset corresponding to the activated
958	// buckets, to this
959	//
960	// a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00
961	//
962	// Namely, for Fat Teddy, the high 128-bits of the candidate correspond
963	// to the same bytes in the haystack in the low 128-bits (so we only
964	// scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7.
965	//
966	// The verification routine wants to look at all potentially matching
967	// buckets before moving on to the next lane. So for example, both
968	// a16 and a00 both correspond to the first byte in our window; a00
969	// contains buckets 0-7 and a16 contains buckets 8-15. Specifically,
970	// a16 should be checked before a01. So the transformation shown above
971	// allows us to use our normal verification procedure with one small
972	// change: we treat each bitset as 16 bits instead of 8 bits.
973	debug_assert!(!candidate.is_zero());
974
975	// Swap the 128-bit lanes in the candidate vector.
976	let swapped = candidate.swap_halves();
977	// Interleave the bytes from the low 128-bit lanes, starting with
978	// cand first.
979	let r1 = candidate.interleave_low_8bit_lanes(swapped);
980	// Interleave the bytes from the high 128-bit lanes, starting with
981	// cand first.
982	let r2 = candidate.interleave_high_8bit_lanes(swapped);
983	// Now just take the 2 low 64-bit integers from both r1 and r2. We
984	// can drop the high 64-bit integers because they are a mirror image
985	// of the low 64-bit integers. All we care about are the low 128-bit
986	// lanes of r1 and r2. Combined, they contain all our 16-bit bitsets
987	// laid out in the desired order, as described above.
988	r1.for_each_low_64bit_lane(
989	r2,
990	#[inline(always)]
991	\|_, chunk\| {
992	let result = self.verify64(cur, end, chunk);
993	cur = cur.add(`4`);
994	result
995	},
996	)
997	}
998	}
999
1000	/// A vector generic mask for the low and high nybbles in a set of patterns.
1001	/// Each 8-bit lane `j` in a vector corresponds to a bitset where the `i`th bit
1002	/// is set if and only if the nybble `j` is in the bucket `i` at a particular
1003	/// position.
1004	///
1005	/// This is slightly tweaked dependending on whether Slim or Fat Teddy is being
1006	/// used. For Slim Teddy, the bitsets in the lower half are the same as the
1007	/// bitsets in the higher half, so that we can search `V::BYTES` bytes at a
1008	/// time. (Remember, the nybbles in the haystack are used as indices into these
1009	/// masks, and 256-bit shuffles only operate on 128-bit lanes.)
1010	///
1011	/// For Fat Teddy, the bitsets are not repeated, but instead, the high half
1012	/// bits correspond to an addition 8 buckets. So that a bitset `00100010` has
1013	/// buckets 1 and 5 set if it's in the lower half, but has buckets 9 and 13 set
1014	/// if it's in the higher half.
1015	#[derive(Clone, Copy, Debug)]
1016	struct Mask<V> {
1017	lo: V,
1018	hi: V,
1019	}
1020
1021	impl<V: Vector> Mask<V> {
1022	/// Return a candidate for Teddy (fat or slim) that is searching for 1-byte
1023	/// candidates.
1024	///
1025	/// If a candidate is returned, it will be a collection of 8-bit bitsets
1026	/// (one bitset per lane), where the ith bit is set in the jth lane if and
1027	/// only if the byte occurring at the jth lane in `chunk` is in the bucket
1028	/// `i`. If no candidate is found, then the vector returned will have all
1029	/// lanes set to zero.
1030	///
1031	/// `chunk` should correspond to a `V::BYTES` window of the haystack (where
1032	/// the least significant byte corresponds to the start of the window). For
1033	/// fat Teddy, the haystack window length should be `V::BYTES / 2`, with
1034	/// the window repeated in each half of the vector.
1035	///
1036	/// `mask1` should correspond to a low/high mask for the first byte of all
1037	/// patterns that are being searched.
1038	#[inline(always)]
1039	unsafe fn members1(chunk: V, masks: [Mask<V>; `1`]) -> V {
1040	let lomask = V::splat(`0xF`);
1041	let hlo = chunk.and(lomask);
1042	let hhi = chunk.shift_8bit_lane_right::<`4`>().and(lomask);
1043	let locand = masks[`0`].lo.shuffle_bytes(hlo);
1044	let hicand = masks[`0`].hi.shuffle_bytes(hhi);
1045	locand.and(hicand)
1046	}
1047
1048	/// Return a candidate for Teddy (fat or slim) that is searching for 2-byte
1049	/// candidates.
1050	///
1051	/// If candidates are returned, each will be a collection of 8-bit bitsets
1052	/// (one bitset per lane), where the ith bit is set in the jth lane if and
1053	/// only if the byte occurring at the jth lane in `chunk` is in the bucket
1054	/// `i`. Each candidate returned corresponds to the first and second bytes
1055	/// of the patterns being searched. If no candidate is found, then all of
1056	/// the lanes will be set to zero in at least one of the vectors returned.
1057	///
1058	/// `chunk` should correspond to a `V::BYTES` window of the haystack (where
1059	/// the least significant byte corresponds to the start of the window). For
1060	/// fat Teddy, the haystack window length should be `V::BYTES / 2`, with
1061	/// the window repeated in each half of the vector.
1062	///
1063	/// The masks should correspond to the masks computed for the first and
1064	/// second bytes of all patterns that are being searched.
1065	#[inline(always)]
1066	unsafe fn members2(chunk: V, masks: [Mask<V>; `2`]) -> (V, V) {
1067	let lomask = V::splat(`0xF`);
1068	let hlo = chunk.and(lomask);
1069	let hhi = chunk.shift_8bit_lane_right::<`4`>().and(lomask);
1070
1071	let locand1 = masks[`0`].lo.shuffle_bytes(hlo);
1072	let hicand1 = masks[`0`].hi.shuffle_bytes(hhi);
1073	let cand1 = locand1.and(hicand1);
1074
1075	let locand2 = masks[`1`].lo.shuffle_bytes(hlo);
1076	let hicand2 = masks[`1`].hi.shuffle_bytes(hhi);
1077	let cand2 = locand2.and(hicand2);
1078
1079	(cand1, cand2)
1080	}
1081
1082	/// Return a candidate for Teddy (fat or slim) that is searching for 3-byte
1083	/// candidates.
1084	///
1085	/// If candidates are returned, each will be a collection of 8-bit bitsets
1086	/// (one bitset per lane), where the ith bit is set in the jth lane if and
1087	/// only if the byte occurring at the jth lane in `chunk` is in the bucket
1088	/// `i`. Each candidate returned corresponds to the first, second and third
1089	/// bytes of the patterns being searched. If no candidate is found, then
1090	/// all of the lanes will be set to zero in at least one of the vectors
1091	/// returned.
1092	///
1093	/// `chunk` should correspond to a `V::BYTES` window of the haystack (where
1094	/// the least significant byte corresponds to the start of the window). For
1095	/// fat Teddy, the haystack window length should be `V::BYTES / 2`, with
1096	/// the window repeated in each half of the vector.
1097	///
1098	/// The masks should correspond to the masks computed for the first, second
1099	/// and third bytes of all patterns that are being searched.
1100	#[inline(always)]
1101	unsafe fn members3(chunk: V, masks: [Mask<V>; `3`]) -> (V, V, V) {
1102	let lomask = V::splat(`0xF`);
1103	let hlo = chunk.and(lomask);
1104	let hhi = chunk.shift_8bit_lane_right::<`4`>().and(lomask);
1105
1106	let locand1 = masks[`0`].lo.shuffle_bytes(hlo);
1107	let hicand1 = masks[`0`].hi.shuffle_bytes(hhi);
1108	let cand1 = locand1.and(hicand1);
1109
1110	let locand2 = masks[`1`].lo.shuffle_bytes(hlo);
1111	let hicand2 = masks[`1`].hi.shuffle_bytes(hhi);
1112	let cand2 = locand2.and(hicand2);
1113
1114	let locand3 = masks[`2`].lo.shuffle_bytes(hlo);
1115	let hicand3 = masks[`2`].hi.shuffle_bytes(hhi);
1116	let cand3 = locand3.and(hicand3);
1117
1118	(cand1, cand2, cand3)
1119	}
1120
1121	/// Return a candidate for Teddy (fat or slim) that is searching for 4-byte
1122	/// candidates.
1123	///
1124	/// If candidates are returned, each will be a collection of 8-bit bitsets
1125	/// (one bitset per lane), where the ith bit is set in the jth lane if and
1126	/// only if the byte occurring at the jth lane in `chunk` is in the bucket
1127	/// `i`. Each candidate returned corresponds to the first, second, third
1128	/// and fourth bytes of the patterns being searched. If no candidate is
1129	/// found, then all of the lanes will be set to zero in at least one of the
1130	/// vectors returned.
1131	///
1132	/// `chunk` should correspond to a `V::BYTES` window of the haystack (where
1133	/// the least significant byte corresponds to the start of the window). For
1134	/// fat Teddy, the haystack window length should be `V::BYTES / 2`, with
1135	/// the window repeated in each half of the vector.
1136	///
1137	/// The masks should correspond to the masks computed for the first,
1138	/// second, third and fourth bytes of all patterns that are being searched.
1139	#[inline(always)]
1140	unsafe fn members4(chunk: V, masks: [Mask<V>; `4`]) -> (V, V, V, V) {
1141	let lomask = V::splat(`0xF`);
1142	let hlo = chunk.and(lomask);
1143	let hhi = chunk.shift_8bit_lane_right::<`4`>().and(lomask);
1144
1145	let locand1 = masks[`0`].lo.shuffle_bytes(hlo);
1146	let hicand1 = masks[`0`].hi.shuffle_bytes(hhi);
1147	let cand1 = locand1.and(hicand1);
1148
1149	let locand2 = masks[`1`].lo.shuffle_bytes(hlo);
1150	let hicand2 = masks[`1`].hi.shuffle_bytes(hhi);
1151	let cand2 = locand2.and(hicand2);
1152
1153	let locand3 = masks[`2`].lo.shuffle_bytes(hlo);
1154	let hicand3 = masks[`2`].hi.shuffle_bytes(hhi);
1155	let cand3 = locand3.and(hicand3);
1156
1157	let locand4 = masks[`3`].lo.shuffle_bytes(hlo);
1158	let hicand4 = masks[`3`].hi.shuffle_bytes(hhi);
1159	let cand4 = locand4.and(hicand4);
1160
1161	(cand1, cand2, cand3, cand4)
1162	}
1163	}
1164
1165	/// Represents the low and high nybble masks that will be used during
1166	/// search. Each mask is 32 bytes wide, although only the first 16 bytes are
1167	/// used for 128-bit vectors.
1168	///
1169	/// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set
1170	/// if and only if the corresponding nybble is in the ith bucket. The index of
1171	/// the byte (0-15, inclusive) corresponds to the nybble.
1172	///
1173	/// Each mask is used as the target of a shuffle, where the indices for the
1174	/// shuffle are taken from the haystack. AND'ing the shuffles for both the
1175	/// low and high masks together also results in 8-bit bitsets, but where bit
1176	/// `i` is set if and only if the correspond byte* is in the ith bucket.*
1177	#[derive(Clone, Default)]
1178	struct SlimMaskBuilder {
1179	lo: [u8; `32`],
1180	hi: [u8; `32`],
1181	}
1182
1183	impl SlimMaskBuilder {
1184	/// Update this mask by adding the given byte to the given bucket. The
1185	/// given bucket must be in the range 0-7.
1186	///
1187	/// # Panics
1188	///
1189	/// When `bucket >= 8`.
1190	fn add(&mut self, bucket: usize, byte: u8) {
1191	assert!(bucket < `8`);
1192
1193	let bucket = u8::try_from(bucket).unwrap();
1194	let byte_lo = usize::from(byte & `0xF`);
1195	let byte_hi = usize::from((byte >> `4`) & `0xF`);
1196	// When using 256-bit vectors, we need to set this bucket assignment in
1197	// the low and high 128-bit portions of the mask. This allows us to
1198	// process 32 bytes at a time. Namely, AVX2 shuffles operate on each
1199	// of the 128-bit lanes, rather than the full 256-bit vector at once.
1200	self.lo[byte_lo] \|= `1` << bucket;
1201	self.lo[byte_lo + `16`] \|= `1` << bucket;
1202	self.hi[byte_hi] \|= `1` << bucket;
1203	self.hi[byte_hi + `16`] \|= `1` << bucket;
1204	}
1205
1206	/// Turn this builder into a vector mask.
1207	///
1208	/// # Panics
1209	///
1210	/// When `V` represents a vector bigger than what `MaskBytes` can contain.
1211	///
1212	/// # Safety
1213	///
1214	/// Callers must ensure that this is okay to call in the current target for
1215	/// the current CPU.
1216	#[inline(always)]
1217	unsafe fn build<V: Vector>(&self) -> Mask<V> {
1218	assert!(V::BYTES <= self.lo.len());
1219	assert!(V::BYTES <= self.hi.len());
1220	Mask {
1221	lo: V::load_unaligned(self.lo[..].as_ptr()),
1222	hi: V::load_unaligned(self.hi[..].as_ptr()),
1223	}
1224	}
1225
1226	/// A convenience function for building `N` vector masks from a slim
1227	/// `Teddy` value.
1228	///
1229	/// # Panics
1230	///
1231	/// When `V` represents a vector bigger than what `MaskBytes` can contain.
1232	///
1233	/// # Safety
1234	///
1235	/// Callers must ensure that this is okay to call in the current target for
1236	/// the current CPU.
1237	#[inline(always)]
1238	unsafe fn from_teddy<const BYTES: usize, V: Vector>(
1239	teddy: &Teddy<`8`>,
1240	) -> [Mask<V>; BYTES] {
1241	// MSRV(1.63): Use core::array::from_fn to just build the array here
1242	// instead of creating a vector and turning it into an array.
1243	let mut mask_builders = vec![SlimMaskBuilder::default(); BYTES];
1244	for (bucket_index, bucket) in teddy.buckets.iter().enumerate() {
1245	for pid in bucket.iter().copied() {
1246	let pat = teddy.patterns.get(pid);
1247	for (i, builder) in mask_builders.iter_mut().enumerate() {
1248	builder.add(bucket_index, pat.bytes()[i]);
1249	}
1250	}
1251	}
1252	let array =
1253	<[SlimMaskBuilder; BYTES]>::try_from(mask_builders).unwrap();
1254	array.map(\|builder\| builder.build())
1255	}
1256	}
1257
1258	impl Debug for SlimMaskBuilder {
1259	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1260	let (mut parts_lo: Vec, mut parts_hi: Vec) = (vec![], vec![]);
1261	for i: usize in `0`..`32` {
1262	parts_lo.push(format!("{:`02`}: {:`08`b}", i, self.lo[i]));
1263	parts_hi.push(format!("{:`02`}: {:`08`b}", i, self.hi[i]));
1264	}
1265	f&mut DebugStruct<'_, '_>.debug_struct("SlimMaskBuilder")
1266	.field("lo", &parts_lo)
1267	.field(name:"hi", &parts_hi)
1268	.finish()
1269	}
1270	}
1271
1272	/// Represents the low and high nybble masks that will be used during "fat"
1273	/// Teddy search.
1274	///
1275	/// Each mask is 32 bytes wide, and at the time of writing, only 256-bit vectors
1276	/// support fat Teddy.
1277	///
1278	/// A fat Teddy mask is like a slim Teddy mask, except that instead of
1279	/// repeating the bitsets in the high and low 128-bits in 256-bit vectors, the
1280	/// high and low 128-bit halves each represent distinct buckets. (Bringing the
1281	/// total to 16 instead of 8.) This permits spreading the patterns out a bit
1282	/// more and thus putting less pressure on verification to be fast.
1283	///
1284	/// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set
1285	/// if and only if the corresponding nybble is in the ith bucket. The index of
1286	/// the byte (0-15, inclusive) corresponds to the nybble.
1287	#[derive(Clone, Copy, Default)]
1288	struct FatMaskBuilder {
1289	lo: [u8; `32`],
1290	hi: [u8; `32`],
1291	}
1292
1293	impl FatMaskBuilder {
1294	/// Update this mask by adding the given byte to the given bucket. The
1295	/// given bucket must be in the range 0-15.
1296	///
1297	/// # Panics
1298	///
1299	/// When `bucket >= 16`.
1300	fn add(&mut self, bucket: usize, byte: u8) {
1301	assert!(bucket < `16`);
1302
1303	let bucket = u8::try_from(bucket).unwrap();
1304	let byte_lo = usize::from(byte & `0xF`);
1305	let byte_hi = usize::from((byte >> `4`) & `0xF`);
1306	// Unlike slim teddy, fat teddy only works with AVX2. For fat teddy,
1307	// the high 128 bits of our mask correspond to buckets 8-15, while the
1308	// low 128 bits correspond to buckets 0-7.
1309	if bucket < `8` {
1310	self.lo[byte_lo] \|= `1` << bucket;
1311	self.hi[byte_hi] \|= `1` << bucket;
1312	} else {
1313	self.lo[byte_lo + `16`] \|= `1` << (bucket % `8`);
1314	self.hi[byte_hi + `16`] \|= `1` << (bucket % `8`);
1315	}
1316	}
1317
1318	/// Turn this builder into a vector mask.
1319	///
1320	/// # Panics
1321	///
1322	/// When `V` represents a vector bigger than what `MaskBytes` can contain.
1323	///
1324	/// # Safety
1325	///
1326	/// Callers must ensure that this is okay to call in the current target for
1327	/// the current CPU.
1328	#[inline(always)]
1329	unsafe fn build<V: Vector>(&self) -> Mask<V> {
1330	assert!(V::BYTES <= self.lo.len());
1331	assert!(V::BYTES <= self.hi.len());
1332	Mask {
1333	lo: V::load_unaligned(self.lo[..].as_ptr()),
1334	hi: V::load_unaligned(self.hi[..].as_ptr()),
1335	}
1336	}
1337
1338	/// A convenience function for building `N` vector masks from a fat
1339	/// `Teddy` value.
1340	///
1341	/// # Panics
1342	///
1343	/// When `V` represents a vector bigger than what `MaskBytes` can contain.
1344	///
1345	/// # Safety
1346	///
1347	/// Callers must ensure that this is okay to call in the current target for
1348	/// the current CPU.
1349	#[inline(always)]
1350	unsafe fn from_teddy<const BYTES: usize, V: Vector>(
1351	teddy: &Teddy<`16`>,
1352	) -> [Mask<V>; BYTES] {
1353	// MSRV(1.63): Use core::array::from_fn to just build the array here
1354	// instead of creating a vector and turning it into an array.
1355	let mut mask_builders = vec![FatMaskBuilder::default(); BYTES];
1356	for (bucket_index, bucket) in teddy.buckets.iter().enumerate() {
1357	for pid in bucket.iter().copied() {
1358	let pat = teddy.patterns.get(pid);
1359	for (i, builder) in mask_builders.iter_mut().enumerate() {
1360	builder.add(bucket_index, pat.bytes()[i]);
1361	}
1362	}
1363	}
1364	let array =
1365	<[FatMaskBuilder; BYTES]>::try_from(mask_builders).unwrap();
1366	array.map(\|builder\| builder.build())
1367	}
1368	}
1369
1370	impl Debug for FatMaskBuilder {
1371	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1372	let (mut parts_lo: Vec, mut parts_hi: Vec) = (vec![], vec![]);
1373	for i: usize in `0`..`32` {
1374	parts_lo.push(format!("{:`02`}: {:`08`b}", i, self.lo[i]));
1375	parts_hi.push(format!("{:`02`}: {:`08`b}", i, self.hi[i]));
1376	}
1377	f&mut DebugStruct<'_, '_>.debug_struct("FatMaskBuilder")
1378	.field("lo", &parts_lo)
1379	.field(name:"hi", &parts_hi)
1380	.finish()
1381	}
1382	}
1383