prefilter.rs source code [crates/aho-corasick-1.0.2/src/util/prefilter.rs]

1	use core::{
2	cmp,
3	fmt::Debug,
4	panic::{RefUnwindSafe, UnwindSafe},
5	u8,
6	};
7
8	use alloc::{sync::Arc, vec, vec::Vec};
9
10	use crate::{
11	packed,
12	util::{
13	alphabet::ByteSet,
14	search::{Match, MatchKind, Span},
15	},
16	};
17
18	/// A prefilter for accelerating a search.
19	///
20	/// This crate uses prefilters in the core search implementations to accelerate
21	/// common cases. They typically only apply to cases where there are a small
22	/// number of patterns (less than 100 or so), but when they do, thoughput can
23	/// be boosted considerably, perhaps by an order of magnitude. When a prefilter
24	/// is active, it is used whenever a search enters an automaton's start state.
25	///
26	/// Currently, prefilters cannot be constructed by
27	/// callers. A `Prefilter` can only be accessed via the
28	/// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter)
29	/// method and used to execute a search. In other words, a prefilter can be
30	/// used to optimize your own search implementation if necessary, but cannot do
31	/// much else. If you have a use case for more APIs, please submit an issue.
32	#[derive(Clone, Debug)]
33	pub struct Prefilter {
34	finder: Arc<dyn PrefilterI>,
35	memory_usage: usize,
36	}
37
38	impl Prefilter {
39	/// Execute a search in the haystack within the span given. If a match or
40	/// a possible match is returned, then it is guaranteed to occur within
41	/// the bounds of the span.
42	///
43	/// If the span provided is invalid for the given haystack, then behavior
44	/// is unspecified.
45	#[inline]
46	pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
47	self.finder.find_in(haystack, span)
48	}
49
50	#[inline]
51	pub(crate) fn memory_usage(&self) -> usize {
52	self.memory_usage
53	}
54	}
55
56	/// A candidate is the result of running a prefilter on a haystack at a
57	/// particular position.
58	///
59	/// The result is either no match, a confirmed match or a possible match.
60	///
61	/// When no match is returned, the prefilter is guaranteeing that no possible
62	/// match can be found in the haystack, and the caller may trust this. That is,
63	/// all correct prefilters must never report false negatives.
64	///
65	/// In some cases, a prefilter can confirm a match very quickly, in which case,
66	/// the caller may use this to stop what it's doing and report the match. In
67	/// this case, prefilter implementations must never report a false positive.
68	/// In other cases, the prefilter can only report a potential match, in which
69	/// case the callers must attempt to confirm the match. In this case, prefilter
70	/// implementations are permitted to return false positives.
71	#[derive(Clone, Debug)]
72	pub enum Candidate {
73	/// No match was found. Since false negatives are not possible, this means
74	/// the search can quit as it is guaranteed not to find another match.
75	None,
76	/// A confirmed match was found. Callers do not need to confirm it.
77	Match(Match),
78	/// The start of a possible match was found. Callers must confirm it before
79	/// reporting it as a match.
80	PossibleStartOfMatch(usize),
81	}
82
83	impl Candidate {
84	/// Convert this candidate into an option. This is useful when callers
85	/// do not distinguish between true positives and false positives (i.e.,
86	/// the caller must always confirm the match).
87	pub fn into_option(self) -> Option<usize> {
88	match self {
89	Candidate::None => None,
90	Candidate::Match(ref m: &Match) => Some(m.start()),
91	Candidate::PossibleStartOfMatch(start: usize) => Some(start),
92	}
93	}
94	}
95
96	/// A prefilter describes the behavior of fast literal scanners for quickly
97	/// skipping past bytes in the haystack that we know cannot possibly
98	/// participate in a match.
99	trait PrefilterI:
100	Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static
101	{
102	/// Returns the next possible match candidate. This may yield false
103	/// positives, so callers must confirm a match starting at the position
104	/// returned. This, however, must never produce false negatives. That is,
105	/// this must, at minimum, return the starting position of the next match
106	/// in the given haystack after or at the given position.
107	fn find_in(&self, haystack: &[u8], span: Span) -> Candidate;
108	}
109
110	impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
111	#[inline(always)]
112	fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
113	(**self).find_in(haystack, span)
114	}
115	}
116
117	/// A builder for constructing the best possible prefilter. When constructed,
118	/// this builder will heuristically select the best prefilter it can build,
119	/// if any, and discard the rest.
120	#[derive(Debug)]
121	pub(crate) struct Builder {
122	count: usize,
123	ascii_case_insensitive: bool,
124	start_bytes: StartBytesBuilder,
125	rare_bytes: RareBytesBuilder,
126	memmem: MemmemBuilder,
127	packed: Option<packed::Builder>,
128	// If we run across a condition that suggests we shouldn't use a prefilter
129	// at all (like an empty pattern), then disable prefilters entirely.
130	enabled: bool,
131	}
132
133	impl Builder {
134	/// Create a new builder for constructing the best possible prefilter.
135	pub(crate) fn new(kind: MatchKind) -> Builder {
136	let pbuilder = kind
137	.as_packed()
138	.map(\|kind\| packed::Config::new().match_kind(kind).builder());
139	Builder {
140	count: `0`,
141	ascii_case_insensitive: `false`,
142	start_bytes: StartBytesBuilder::new(),
143	rare_bytes: RareBytesBuilder::new(),
144	memmem: MemmemBuilder::default(),
145	packed: pbuilder,
146	enabled: `true`,
147	}
148	}
149
150	/// Enable ASCII case insensitivity. When set, byte strings added to this
151	/// builder will be interpreted without respect to ASCII case.
152	pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
153	self.ascii_case_insensitive = yes;
154	self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
155	self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
156	self
157	}
158
159	/// Return a prefilter suitable for quickly finding potential matches.
160	///
161	/// All patterns added to an Aho-Corasick automaton should be added to this
162	/// builder before attempting to construct the prefilter.
163	pub(crate) fn build(&self) -> Option<Prefilter> {
164	if !self.enabled {
165	return None;
166	}
167	// If we only have one pattern, then deferring to memmem is always
168	// the best choice. This is kind of a weird case, because, well, why
169	// use Aho-Corasick if you only have one pattern? But maybe you don't
170	// know exactly how many patterns you'll get up front, and you need to
171	// support the option of multiple patterns. So instead of relying on
172	// the caller to branch and use memmem explicitly, we just do it for
173	// them.
174	if !self.ascii_case_insensitive {
175	if let Some(pre) = self.memmem.build() {
176	return Some(pre);
177	}
178	}
179	match (self.start_bytes.build(), self.rare_bytes.build()) {
180	// If we could build both start and rare prefilters, then there are
181	// a few cases in which we'd want to use the start-byte prefilter
182	// over the rare-byte prefilter, since the former has lower
183	// overhead.
184	(prestart @ Some(_), prerare @ Some(_)) => {
185	// If the start-byte prefilter can scan for a smaller number
186	// of bytes than the rare-byte prefilter, then it's probably
187	// faster.
188	let has_fewer_bytes =
189	self.start_bytes.count < self.rare_bytes.count;
190	// Otherwise, if the combined frequency rank of the detected
191	// bytes in the start-byte prefilter is "close" to the combined
192	// frequency rank of the rare-byte prefilter, then we pick
193	// the start-byte prefilter even if the rare-byte prefilter
194	// heuristically searches for rare bytes. This is because the
195	// rare-byte prefilter has higher constant costs, so we tend to
196	// prefer the start-byte prefilter when we can.
197	let has_rarer_bytes =
198	self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + `50`;
199	if has_fewer_bytes \|\| has_rarer_bytes {
200	prestart
201	} else {
202	prerare
203	}
204	}
205	(prestart @ Some(_), None) => prestart,
206	(None, prerare @ Some(_)) => prerare,
207	(None, None) if self.ascii_case_insensitive => None,
208	(None, None) => {
209	self.packed.as_ref().and_then(\|b\| b.build()).map(\|s\| {
210	let memory_usage = s.memory_usage();
211	Prefilter { finder: Arc::new(Packed(s)), memory_usage }
212	})
213	}
214	}
215	}
216
217	/// Add a literal string to this prefilter builder.
218	pub(crate) fn add(&mut self, bytes: &[u8]) {
219	if bytes.is_empty() {
220	self.enabled = `false`;
221	}
222	if !self.enabled {
223	return;
224	}
225	self.count += `1`;
226	self.start_bytes.add(bytes);
227	self.rare_bytes.add(bytes);
228	self.memmem.add(bytes);
229	if let Some(ref mut pbuilder) = self.packed {
230	pbuilder.add(bytes);
231	}
232	}
233	}
234
235	/// A type that wraps a packed searcher and implements the `Prefilter`
236	/// interface.
237	#[derive(Clone, Debug)]
238	struct Packed(packed::Searcher);
239
240	impl PrefilterI for Packed {
241	fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
242	self.0
243	.find_in(&haystack, span)
244	.map_or(default:Candidate::None, f:Candidate::Match)
245	}
246	}
247
248	/// A builder for constructing a prefilter that uses memmem.
249	#[derive(Debug, Default)]
250	struct MemmemBuilder {
251	/// The number of patterns that have been added.
252	count: usize,
253	/// The singular pattern to search for. This is only set when count==1.
254	one: Option<Vec<u8>>,
255	}
256
257	impl MemmemBuilder {
258	fn build(&self) -> Option<Prefilter> {
259	#[cfg(all(feature = "std", feature = "perf-literal"))]
260	fn imp(builder: &MemmemBuilder) -> Option<Prefilter> {
261	let pattern = builder.one.as_ref()?;
262	assert_eq!(`1`, builder.count);
263	let finder = Arc::new(Memmem(
264	memchr::memmem::Finder::new(pattern).into_owned(),
265	));
266	let memory_usage = pattern.len();
267	Some(Prefilter { finder, memory_usage })
268	}
269
270	#[cfg(not(all(feature = "std", feature = "perf-literal")))]
271	fn imp(_: &MemmemBuilder) -> Option<Prefilter> {
272	None
273	}
274
275	imp(self)
276	}
277
278	fn add(&mut self, bytes: &[u8]) {
279	self.count += `1`;
280	if self.count == `1` {
281	self.one = Some(bytes.to_vec());
282	} else {
283	self.one = None;
284	}
285	}
286	}
287
288	/// A type that wraps a SIMD accelerated single substring search from the
289	/// `memchr` crate for use as a prefilter.
290	///
291	/// Currently, this prefilter is only active for Aho-Corasick searchers with
292	/// a single pattern. In theory, this could be extended to support searchers
293	/// that have a common prefix of more than one byte (for one byte, we would use
294	/// memchr), but it's not clear if it's worth it or not.
295	///
296	/// Also, unfortunately, this currently also requires the 'std' feature to
297	/// be enabled. That's because memchr doesn't have a no-std-but-with-alloc
298	/// mode, and so APIs like Finder::into_owned aren't available when 'std' is
299	/// disabled. But there should be an 'alloc' feature that brings in APIs like
300	/// Finder::into_owned but doesn't use std-only features like runtime CPU
301	/// feature detection.
302	#[cfg(all(feature = "std", feature = "perf-literal"))]
303	#[derive(Clone, Debug)]
304	struct Memmem(memchr::memmem::Finder<'static>);
305
306	#[cfg(all(feature = "std", feature = "perf-literal"))]
307	impl PrefilterI for Memmem {
308	fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
309	use crate::util::primitives::PatternID;
310
311	self.0.find(&haystack[span]).map_or(default:Candidate::None, \|i: usize\| {
312	let start: usize = span.start + i;
313	let end: usize = start + self.0.needle().len();
314	// N.B. We can declare a match and use a fixed pattern ID here
315	// because a Memmem prefilter is only ever created for searchers
316	// with exactly one pattern. Thus, every match is always a match
317	// and it is always for the first and only pattern.
318	Candidate::Match(Match::new(pattern:PatternID::ZERO, span:start..end))
319	})
320	}
321	}
322
323	/// A builder for constructing a rare byte prefilter.
324	///
325	/// A rare byte prefilter attempts to pick out a small set of rare bytes that
326	/// occurr in the patterns, and then quickly scan to matches of those rare
327	/// bytes.
328	#[derive(Clone, Debug)]
329	struct RareBytesBuilder {
330	/// Whether this prefilter should account for ASCII case insensitivity or
331	/// not.
332	ascii_case_insensitive: bool,
333	/// A set of rare bytes, indexed by byte value.
334	rare_set: ByteSet,
335	/// A set of byte offsets associated with bytes in a pattern. An entry
336	/// corresponds to a particular bytes (its index) and is only non-zero if
337	/// the byte occurred at an offset greater than 0 in at least one pattern.
338	///
339	/// If a byte's offset is not representable in 8 bits, then the rare bytes
340	/// prefilter becomes inert.
341	byte_offsets: RareByteOffsets,
342	/// Whether this is available as a prefilter or not. This can be set to
343	/// false during construction if a condition is seen that invalidates the
344	/// use of the rare-byte prefilter.
345	available: bool,
346	/// The number of bytes set to an active value in `byte_offsets`.
347	count: usize,
348	/// The sum of frequency ranks for the rare bytes detected. This is
349	/// intended to give a heuristic notion of how rare the bytes are.
350	rank_sum: u16,
351	}
352
353	/// A set of byte offsets, keyed by byte.
354	#[derive(Clone, Copy)]
355	struct RareByteOffsets {
356	/// Each entry corresponds to the maximum offset of the corresponding
357	/// byte across all patterns seen.
358	set: [RareByteOffset; `256`],
359	}
360
361	impl RareByteOffsets {
362	/// Create a new empty set of rare byte offsets.
363	pub(crate) fn empty() -> RareByteOffsets {
364	RareByteOffsets { set: [RareByteOffset::default(); `256`] }
365	}
366
367	/// Add the given offset for the given byte to this set. If the offset is
368	/// greater than the existing offset, then it overwrites the previous
369	/// value and returns false. If there is no previous value set, then this
370	/// sets it and returns true.
371	pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) {
372	self.set[byte as usize].max =
373	cmp::max(self.set[byte as usize].max, v2:off.max);
374	}
375	}
376
377	impl core::fmt::Debug for RareByteOffsets {
378	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
379	let mut offsets: Vec<&RareByteOffset> = vec![];
380	for off: &RareByteOffset in self.set.iter() {
381	if off.max > `0` {
382	offsets.push(off);
383	}
384	}
385	f.debug_struct("RareByteOffsets").field(name:"set", &offsets).finish()
386	}
387	}
388
389	/// Offsets associated with an occurrence of a "rare" byte in any of the
390	/// patterns used to construct a single Aho-Corasick automaton.
391	#[derive(Clone, Copy, Debug)]
392	struct RareByteOffset {
393	/// The maximum offset at which a particular byte occurs from the start
394	/// of any pattern. This is used as a shift amount. That is, when an
395	/// occurrence of this byte is found, the candidate position reported by
396	/// the prefilter is `position_of_byte - max`, such that the automaton
397	/// will begin its search at a position that is guaranteed to observe a
398	/// match.
399	///
400	/// To avoid accidentally quadratic behavior, a prefilter is considered
401	/// ineffective when it is asked to start scanning from a position that it
402	/// has already scanned past.
403	///
404	/// Using a `u8` here means that if we ever see a pattern that's longer
405	/// than 255 bytes, then the entire rare byte prefilter is disabled.
406	max: u8,
407	}
408
409	impl Default for RareByteOffset {
410	fn default() -> RareByteOffset {
411	RareByteOffset { max: `0` }
412	}
413	}
414
415	impl RareByteOffset {
416	/// Create a new rare byte offset. If the given offset is too big, then
417	/// None is returned. In that case, callers should render the rare bytes
418	/// prefilter inert.
419	fn new(max: usize) -> Option<RareByteOffset> {
420	if max > u8::MAX as usize {
421	None
422	} else {
423	Some(RareByteOffset { max: max as u8 })
424	}
425	}
426	}
427
428	impl RareBytesBuilder {
429	/// Create a new builder for constructing a rare byte prefilter.
430	fn new() -> RareBytesBuilder {
431	RareBytesBuilder {
432	ascii_case_insensitive: `false`,
433	rare_set: ByteSet::empty(),
434	byte_offsets: RareByteOffsets::empty(),
435	available: `true`,
436	count: `0`,
437	rank_sum: `0`,
438	}
439	}
440
441	/// Enable ASCII case insensitivity. When set, byte strings added to this
442	/// builder will be interpreted without respect to ASCII case.
443	fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
444	self.ascii_case_insensitive = yes;
445	self
446	}
447
448	/// Build the rare bytes prefilter.
449	///
450	/// If there are more than 3 distinct rare bytes found, or if heuristics
451	/// otherwise determine that this prefilter should not be used, then `None`
452	/// is returned.
453	fn build(&self) -> Option<Prefilter> {
454	#[cfg(feature = "perf-literal")]
455	fn imp(builder: &RareBytesBuilder) -> Option<Prefilter> {
456	if !builder.available \|\| builder.count > `3` {
457	return None;
458	}
459	let (mut bytes, mut len) = ([`0`; `3`], `0`);
460	for b in `0`..=`255` {
461	if builder.rare_set.contains(b) {
462	bytes[len] = b as u8;
463	len += `1`;
464	}
465	}
466	let finder: Arc<dyn PrefilterI> = match len {
467	`0` => return None,
468	`1` => Arc::new(RareBytesOne {
469	byte1: bytes[`0`],
470	offset: builder.byte_offsets.set[bytes[`0`] as usize],
471	}),
472	`2` => Arc::new(RareBytesTwo {
473	offsets: builder.byte_offsets,
474	byte1: bytes[`0`],
475	byte2: bytes[`1`],
476	}),
477	`3` => Arc::new(RareBytesThree {
478	offsets: builder.byte_offsets,
479	byte1: bytes[`0`],
480	byte2: bytes[`1`],
481	byte3: bytes[`2`],
482	}),
483	_ => unreachable!(),
484	};
485	Some(Prefilter { finder, memory_usage: `0` })
486	}
487
488	#[cfg(not(feature = "perf-literal"))]
489	fn imp(_: &RareBytesBuilder) -> Option<Prefilter> {
490	None
491	}
492
493	imp(self)
494	}
495
496	/// Add a byte string to this builder.
497	///
498	/// All patterns added to an Aho-Corasick automaton should be added to this
499	/// builder before attempting to construct the prefilter.
500	fn add(&mut self, bytes: &[u8]) {
501	// If we've already given up, then do nothing.
502	if !self.available {
503	return;
504	}
505	// If we've already blown our budget, then don't waste time looking
506	// for more rare bytes.
507	if self.count > `3` {
508	self.available = `false`;
509	return;
510	}
511	// If the pattern is too long, then our offset table is bunk, so
512	// give up.
513	if bytes.len() >= `256` {
514	self.available = `false`;
515	return;
516	}
517	let mut rarest = match bytes.get(`0`) {
518	None => return,
519	Some(&b) => (b, freq_rank(b)),
520	};
521	// The idea here is to look for the rarest byte in each pattern, and
522	// add that to our set. As a special exception, if we see a byte that
523	// we've already added, then we immediately stop and choose that byte,
524	// even if there's another rare byte in the pattern. This helps us
525	// apply the rare byte optimization in more cases by attempting to pick
526	// bytes that are in common between patterns. So for example, if we
527	// were searching for `Sherlock` and `lockjaw`, then this would pick
528	// `k` for both patterns, resulting in the use of `memchr` instead of
529	// `memchr2` for `k` and `j`.
530	let mut found = `false`;
531	for (pos, &b) in bytes.iter().enumerate() {
532	self.set_offset(pos, b);
533	if found {
534	continue;
535	}
536	if self.rare_set.contains(b) {
537	found = `true`;
538	continue;
539	}
540	let rank = freq_rank(b);
541	if rank < rarest.1 {
542	rarest = (b, rank);
543	}
544	}
545	if !found {
546	self.add_rare_byte(rarest.0);
547	}
548	}
549
550	fn set_offset(&mut self, pos: usize, byte: u8) {
551	// This unwrap is OK because pos is never bigger than our max.
552	let offset = RareByteOffset::new(pos).unwrap();
553	self.byte_offsets.set(byte, offset);
554	if self.ascii_case_insensitive {
555	self.byte_offsets.set(opposite_ascii_case(byte), offset);
556	}
557	}
558
559	fn add_rare_byte(&mut self, byte: u8) {
560	self.add_one_rare_byte(byte);
561	if self.ascii_case_insensitive {
562	self.add_one_rare_byte(opposite_ascii_case(byte));
563	}
564	}
565
566	fn add_one_rare_byte(&mut self, byte: u8) {
567	if !self.rare_set.contains(byte) {
568	self.rare_set.add(byte);
569	self.count += `1`;
570	self.rank_sum += freq_rank(byte) as u16;
571	}
572	}
573	}
574
575	/// A prefilter for scanning for a single "rare" byte.
576	#[cfg(feature = "perf-literal")]
577	#[derive(Clone, Debug)]
578	struct RareBytesOne {
579	byte1: u8,
580	offset: RareByteOffset,
581	}
582
583	#[cfg(feature = "perf-literal")]
584	impl PrefilterI for RareBytesOne {
585	fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
586	memchr::memchr(self.byte1, &haystack[span])
587	.map(\|i\| {
588	let pos = span.start + i;
589	cmp::max(
590	span.start,
591	pos.saturating_sub(usize::from(self.offset.max)),
592	)
593	})
594	.map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
595	}
596	}
597
598	/// A prefilter for scanning for two "rare" bytes.
599	#[cfg(feature = "perf-literal")]
600	#[derive(Clone, Debug)]
601	struct RareBytesTwo {
602	offsets: RareByteOffsets,
603	byte1: u8,
604	byte2: u8,
605	}
606
607	#[cfg(feature = "perf-literal")]
608	impl PrefilterI for RareBytesTwo {
609	fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
610	memchr::memchr2(self.byte1, self.byte2, &haystack[span])
611	.map(\|i\| {
612	let pos = span.start + i;
613	let offset = self.offsets.set[usize::from(haystack[pos])].max;
614	cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
615	})
616	.map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
617	}
618	}
619
620	/// A prefilter for scanning for three "rare" bytes.
621	#[cfg(feature = "perf-literal")]
622	#[derive(Clone, Debug)]
623	struct RareBytesThree {
624	offsets: RareByteOffsets,
625	byte1: u8,
626	byte2: u8,
627	byte3: u8,
628	}
629
630	#[cfg(feature = "perf-literal")]
631	impl PrefilterI for RareBytesThree {
632	fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
633	memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
634	.map(\|i\| {
635	let pos = span.start + i;
636	let offset = self.offsets.set[usize::from(haystack[pos])].max;
637	cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
638	})
639	.map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
640	}
641	}
642
643	/// A builder for constructing a starting byte prefilter.
644	///
645	/// A starting byte prefilter is a simplistic prefilter that looks for possible
646	/// matches by reporting all positions corresponding to a particular byte. This
647	/// generally only takes affect when there are at most 3 distinct possible
648	/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
649	/// distinct starting bytes (`f` and `b`), and this prefilter returns all
650	/// occurrences of either `f` or `b`.
651	///
652	/// In some cases, a heuristic frequency analysis may determine that it would
653	/// be better not to use this prefilter even when there are 3 or fewer distinct
654	/// starting bytes.
655	#[derive(Clone, Debug)]
656	struct StartBytesBuilder {
657	/// Whether this prefilter should account for ASCII case insensitivity or
658	/// not.
659	ascii_case_insensitive: bool,
660	/// The set of starting bytes observed.
661	byteset: Vec<bool>,
662	/// The number of bytes set to true in `byteset`.
663	count: usize,
664	/// The sum of frequency ranks for the rare bytes detected. This is
665	/// intended to give a heuristic notion of how rare the bytes are.
666	rank_sum: u16,
667	}
668
669	impl StartBytesBuilder {
670	/// Create a new builder for constructing a start byte prefilter.
671	fn new() -> StartBytesBuilder {
672	StartBytesBuilder {
673	ascii_case_insensitive: `false`,
674	byteset: vec![`false`; `256`],
675	count: `0`,
676	rank_sum: `0`,
677	}
678	}
679
680	/// Enable ASCII case insensitivity. When set, byte strings added to this
681	/// builder will be interpreted without respect to ASCII case.
682	fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
683	self.ascii_case_insensitive = yes;
684	self
685	}
686
687	/// Build the starting bytes prefilter.
688	///
689	/// If there are more than 3 distinct starting bytes, or if heuristics
690	/// otherwise determine that this prefilter should not be used, then `None`
691	/// is returned.
692	fn build(&self) -> Option<Prefilter> {
693	#[cfg(feature = "perf-literal")]
694	fn imp(builder: &StartBytesBuilder) -> Option<Prefilter> {
695	if builder.count > `3` {
696	return None;
697	}
698	let (mut bytes, mut len) = ([`0`; `3`], `0`);
699	for b in `0`..`256` {
700	if !builder.byteset[b] {
701	continue;
702	}
703	// We don't handle non-ASCII bytes for now. Getting non-ASCII
704	// bytes right is trickier, since we generally don't want to put
705	// a leading UTF-8 code unit into a prefilter that isn't ASCII,
706	// since they can frequently. Instead, it would be better to use a
707	// continuation byte, but this requires more sophisticated analysis
708	// of the automaton and a richer prefilter API.
709	if b > `0x7F` {
710	return None;
711	}
712	bytes[len] = b as u8;
713	len += `1`;
714	}
715	let finder: Arc<dyn PrefilterI> = match len {
716	`0` => return None,
717	`1` => Arc::new(StartBytesOne { byte1: bytes[`0`] }),
718	`2` => Arc::new(StartBytesTwo {
719	byte1: bytes[`0`],
720	byte2: bytes[`1`],
721	}),
722	`3` => Arc::new(StartBytesThree {
723	byte1: bytes[`0`],
724	byte2: bytes[`1`],
725	byte3: bytes[`2`],
726	}),
727	_ => unreachable!(),
728	};
729	Some(Prefilter { finder, memory_usage: `0` })
730	}
731
732	#[cfg(not(feature = "perf-literal"))]
733	fn imp(_: &StartBytesBuilder) -> Option<Prefilter> {
734	None
735	}
736
737	imp(self)
738	}
739
740	/// Add a byte string to this builder.
741	///
742	/// All patterns added to an Aho-Corasick automaton should be added to this
743	/// builder before attempting to construct the prefilter.
744	fn add(&mut self, bytes: &[u8]) {
745	if self.count > `3` {
746	return;
747	}
748	if let Some(&byte) = bytes.get(`0`) {
749	self.add_one_byte(byte);
750	if self.ascii_case_insensitive {
751	self.add_one_byte(opposite_ascii_case(byte));
752	}
753	}
754	}
755
756	fn add_one_byte(&mut self, byte: u8) {
757	if !self.byteset[byte as usize] {
758	self.byteset[byte as usize] = `true`;
759	self.count += `1`;
760	self.rank_sum += freq_rank(byte) as u16;
761	}
762	}
763	}
764
765	/// A prefilter for scanning for a single starting byte.
766	#[cfg(feature = "perf-literal")]
767	#[derive(Clone, Debug)]
768	struct StartBytesOne {
769	byte1: u8,
770	}
771
772	#[cfg(feature = "perf-literal")]
773	impl PrefilterI for StartBytesOne {
774	fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
775	memchr::memchr(self.byte1, &haystack[span])
776	.map(\|i\| span.start + i)
777	.map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
778	}
779	}
780
781	/// A prefilter for scanning for two starting bytes.
782	#[cfg(feature = "perf-literal")]
783	#[derive(Clone, Debug)]
784	struct StartBytesTwo {
785	byte1: u8,
786	byte2: u8,
787	}
788
789	#[cfg(feature = "perf-literal")]
790	impl PrefilterI for StartBytesTwo {
791	fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
792	memchr::memchr2(self.byte1, self.byte2, &haystack[span])
793	.map(\|i\| span.start + i)
794	.map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
795	}
796	}
797
798	/// A prefilter for scanning for three starting bytes.
799	#[cfg(feature = "perf-literal")]
800	#[derive(Clone, Debug)]
801	struct StartBytesThree {
802	byte1: u8,
803	byte2: u8,
804	byte3: u8,
805	}
806
807	#[cfg(feature = "perf-literal")]
808	impl PrefilterI for StartBytesThree {
809	fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
810	memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
811	.map(\|i\| span.start + i)
812	.map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
813	}
814	}
815
816	/// If the given byte is an ASCII letter, then return it in the opposite case.
817	/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
818	/// `b'A'`. If a non-ASCII letter is given, then the given byte is returned.
819	pub(crate) fn opposite_ascii_case(b: u8) -> u8 {
820	if b'A' <= b && b <= b'Z' {
821	b.to_ascii_lowercase()
822	} else if b'a' <= b && b <= b'z' {
823	b.to_ascii_uppercase()
824	} else {
825	b
826	}
827	}
828
829	/// Return the frequency rank of the given byte. The higher the rank, the more
830	/// common the byte (heuristically speaking).
831	fn freq_rank(b: u8) -> u8 {
832	use crate::util::byte_frequencies::BYTE_FREQUENCIES;
833	BYTE_FREQUENCIES[b as usize]
834	}
835