prefilter.rs source code [crates/aho-corasick-0.7.20/src/prefilter.rs]

1	use std::cmp;
2	use std::fmt;
3	use std::panic::{RefUnwindSafe, UnwindSafe};
4	use std::u8;
5
6	use memchr::{memchr, memchr2, memchr3};
7
8	use crate::ahocorasick::MatchKind;
9	use crate::packed;
10	use crate::Match;
11
12	/// A candidate is the result of running a prefilter on a haystack at a
13	/// particular position. The result is either no match, a confirmed match or
14	/// a possible match.
15	///
16	/// When no match is returned, the prefilter is guaranteeing that no possible
17	/// match can be found in the haystack, and the caller may trust this. That is,
18	/// all correct prefilters must never report false negatives.
19	///
20	/// In some cases, a prefilter can confirm a match very quickly, in which case,
21	/// the caller may use this to stop what it's doing and report the match. In
22	/// this case, prefilter implementations must never report a false positive.
23	/// In other cases, the prefilter can only report a potential match, in which
24	/// case the callers must attempt to confirm the match. In this case, prefilter
25	/// implementations are permitted to return false positives.
26	#[derive(Clone, Debug)]
27	pub enum Candidate {
28	None,
29	Match(Match),
30	PossibleStartOfMatch(usize),
31	}
32
33	impl Candidate {
34	/// Convert this candidate into an option. This is useful when callers
35	/// do not distinguish between true positives and false positives (i.e.,
36	/// the caller must always confirm the match in order to update some other
37	/// state).
38	pub fn into_option(self) -> Option<usize> {
39	match self {
40	Candidate::None => None,
41	Candidate::Match(ref m: &Match) => Some(m.start()),
42	Candidate::PossibleStartOfMatch(start: usize) => Some(start),
43	}
44	}
45	}
46
47	/// A prefilter describes the behavior of fast literal scanners for quickly
48	/// skipping past bytes in the haystack that we know cannot possibly
49	/// participate in a match.
50	pub trait Prefilter:
51	Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug
52	{
53	/// Returns the next possible match candidate. This may yield false
54	/// positives, so callers must confirm a match starting at the position
55	/// returned. This, however, must never produce false negatives. That is,
56	/// this must, at minimum, return the starting position of the next match
57	/// in the given haystack after or at the given position.
58	fn next_candidate(
59	&self,
60	state: &mut PrefilterState,
61	haystack: &[u8],
62	at: usize,
63	) -> Candidate;
64
65	/// A method for cloning a prefilter, to work-around the fact that Clone
66	/// is not object-safe.
67	fn clone_prefilter(&self) -> Box<dyn Prefilter>;
68
69	/// Returns the approximate total amount of heap used by this prefilter, in
70	/// units of bytes.
71	fn heap_bytes(&self) -> usize;
72
73	/// Returns true if and only if this prefilter never returns false
74	/// positives. This is useful for completely avoiding the automaton
75	/// when the prefilter can quickly confirm its own matches.
76	///
77	/// By default, this returns true, which is conservative; it is always
78	/// correct to return `true`. Returning `false` here and reporting a false
79	/// positive will result in incorrect searches.
80	fn reports_false_positives(&self) -> bool {
81	`true`
82	}
83
84	/// Returns true if and only if this prefilter may look for a non-starting
85	/// position of a match.
86	///
87	/// This is useful in a streaming context where prefilters that don't look
88	/// for a starting position of a match can be quite difficult to deal with.
89	///
90	/// This returns false by default.
91	fn looks_for_non_start_of_match(&self) -> bool {
92	`false`
93	}
94	}
95
96	impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
97	#[inline]
98	fn next_candidate(
99	&self,
100	state: &mut PrefilterState,
101	haystack: &[u8],
102	at: usize,
103	) -> Candidate {
104	(**self).next_candidate(state, haystack, at)
105	}
106
107	fn clone_prefilter(&self) -> Box<dyn Prefilter> {
108	(**self).clone_prefilter()
109	}
110
111	fn heap_bytes(&self) -> usize {
112	(**self).heap_bytes()
113	}
114
115	fn reports_false_positives(&self) -> bool {
116	(**self).reports_false_positives()
117	}
118	}
119
120	/// A convenience object for representing any type that implements Prefilter
121	/// and is cloneable.
122	#[derive(Debug)]
123	pub struct PrefilterObj(Box<dyn Prefilter>);
124
125	impl Clone for PrefilterObj {
126	fn clone(&self) -> Self {
127	PrefilterObj(self.0.clone_prefilter())
128	}
129	}
130
131	impl PrefilterObj {
132	/// Create a new prefilter object.
133	pub fn new<T: Prefilter + 'static>(t: T) -> PrefilterObj {
134	PrefilterObj(Box::new(t))
135	}
136
137	/// Return the underlying prefilter trait object.
138	pub fn as_ref(&self) -> &dyn Prefilter {
139	&*self.0
140	}
141	}
142
143	/// PrefilterState tracks state associated with the effectiveness of a
144	/// prefilter. It is used to track how many bytes, on average, are skipped by
145	/// the prefilter. If this average dips below a certain threshold over time,
146	/// then the state renders the prefilter inert and stops using it.
147	///
148	/// A prefilter state should be created for each search. (Where creating an
149	/// iterator via, e.g., `find_iter`, is treated as a single search.)
150	#[derive(Clone, Debug)]
151	pub struct PrefilterState {
152	/// The number of skips that has been executed.
153	skips: usize,
154	/// The total number of bytes that have been skipped.
155	skipped: usize,
156	/// The maximum length of a match. This is used to help determine how many
157	/// bytes on average should be skipped in order for a prefilter to be
158	/// effective.
159	max_match_len: usize,
160	/// Once this heuristic has been deemed permanently ineffective, it will be
161	/// inert throughout the rest of its lifetime. This serves as a cheap way
162	/// to check inertness.
163	inert: bool,
164	/// The last (absolute) position at which a prefilter scanned to.
165	/// Prefilters can use this position to determine whether to re-scan or
166	/// not.
167	///
168	/// Unlike other things that impact effectiveness, this is a fleeting
169	/// condition. That is, a prefilter can be considered ineffective if it is
170	/// at a position before `last_scan_at`, but can become effective again
171	/// once the search moves past `last_scan_at`.
172	///
173	/// The utility of this is to both avoid additional overhead from calling
174	/// the prefilter and to avoid quadratic behavior. This ensures that a
175	/// prefilter will scan any particular byte at most once. (Note that some
176	/// prefilters, like the start-byte prefilter, do not need to use this
177	/// field at all, since it only looks for starting bytes.)
178	last_scan_at: usize,
179	}
180
181	impl PrefilterState {
182	/// The minimum number of skip attempts to try before considering whether
183	/// a prefilter is effective or not.
184	const MIN_SKIPS: usize = `40`;
185
186	/// The minimum amount of bytes that skipping must average, expressed as a
187	/// factor of the multiple of the length of a possible match.
188	///
189	/// That is, after MIN_SKIPS have occurred, if the average number of bytes
190	/// skipped ever falls below MIN_AVG_FACTOR max-match-length, then the*
191	/// prefilter outed to be rendered inert.
192	const MIN_AVG_FACTOR: usize = `2`;
193
194	/// Create a fresh prefilter state.
195	pub fn new(max_match_len: usize) -> PrefilterState {
196	PrefilterState {
197	skips: `0`,
198	skipped: `0`,
199	max_match_len,
200	inert: `false`,
201	last_scan_at: `0`,
202	}
203	}
204
205	/// Create a prefilter state that always disables the prefilter.
206	pub fn disabled() -> PrefilterState {
207	PrefilterState {
208	skips: `0`,
209	skipped: `0`,
210	max_match_len: `0`,
211	inert: `true`,
212	last_scan_at: `0`,
213	}
214	}
215
216	/// Update this state with the number of bytes skipped on the last
217	/// invocation of the prefilter.
218	#[inline]
219	fn update_skipped_bytes(&mut self, skipped: usize) {
220	self.skips += `1`;
221	self.skipped += skipped;
222	}
223
224	/// Updates the position at which the last scan stopped. This may be
225	/// greater than the position of the last candidate reported. For example,
226	/// searching for the "rare" byte `z` in `abczdef` for the pattern `abcz`
227	/// will report a candidate at position `0`, but the end of its last scan
228	/// will be at position `3`.
229	///
230	/// This position factors into the effectiveness of this prefilter. If the
231	/// current position is less than the last position at which a scan ended,
232	/// then the prefilter should not be re-run until the search moves past
233	/// that position.
234	#[inline]
235	fn update_at(&mut self, at: usize) {
236	if at > self.last_scan_at {
237	self.last_scan_at = at;
238	}
239	}
240
241	/// Return true if and only if this state indicates that a prefilter is
242	/// still effective.
243	///
244	/// The given pos should correspond to the current starting position of the
245	/// search.
246	#[inline]
247	pub fn is_effective(&mut self, at: usize) -> bool {
248	if self.inert {
249	return `false`;
250	}
251	if at < self.last_scan_at {
252	return `false`;
253	}
254	if self.skips < PrefilterState::MIN_SKIPS {
255	return `true`;
256	}
257
258	let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len;
259	if self.skipped >= min_avg * self.skips {
260	return `true`;
261	}
262
263	// We're inert.
264	self.inert = `true`;
265	`false`
266	}
267	}
268
269	/// A builder for constructing the best possible prefilter. When constructed,
270	/// this builder will heuristically select the best prefilter it can build,
271	/// if any, and discard the rest.
272	#[derive(Debug)]
273	pub struct Builder {
274	count: usize,
275	ascii_case_insensitive: bool,
276	start_bytes: StartBytesBuilder,
277	rare_bytes: RareBytesBuilder,
278	packed: Option<packed::Builder>,
279	}
280
281	impl Builder {
282	/// Create a new builder for constructing the best possible prefilter.
283	pub fn new(kind: MatchKind) -> Builder {
284	let pbuilder = kind
285	.as_packed()
286	.map(\|kind\| packed::Config::new().match_kind(kind).builder());
287	Builder {
288	count: `0`,
289	ascii_case_insensitive: `false`,
290	start_bytes: StartBytesBuilder::new(),
291	rare_bytes: RareBytesBuilder::new(),
292	packed: pbuilder,
293	}
294	}
295
296	/// Enable ASCII case insensitivity. When set, byte strings added to this
297	/// builder will be interpreted without respect to ASCII case.
298	pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
299	self.ascii_case_insensitive = yes;
300	self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
301	self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
302	self
303	}
304
305	/// Return a prefilter suitable for quickly finding potential matches.
306	///
307	/// All patterns added to an Aho-Corasick automaton should be added to this
308	/// builder before attempting to construct the prefilter.
309	pub fn build(&self) -> Option<PrefilterObj> {
310	// match (self.start_bytes.build(), self.rare_bytes.build()) {
311	match (self.start_bytes.build(), self.rare_bytes.build()) {
312	// If we could build both start and rare prefilters, then there are
313	// a few cases in which we'd want to use the start-byte prefilter
314	// over the rare-byte prefilter, since the former has lower
315	// overhead.
316	(prestart @ Some(_), prerare @ Some(_)) => {
317	// If the start-byte prefilter can scan for a smaller number
318	// of bytes than the rare-byte prefilter, then it's probably
319	// faster.
320	let has_fewer_bytes =
321	self.start_bytes.count < self.rare_bytes.count;
322	// Otherwise, if the combined frequency rank of the detected
323	// bytes in the start-byte prefilter is "close" to the combined
324	// frequency rank of the rare-byte prefilter, then we pick
325	// the start-byte prefilter even if the rare-byte prefilter
326	// heuristically searches for rare bytes. This is because the
327	// rare-byte prefilter has higher constant costs, so we tend to
328	// prefer the start-byte prefilter when we can.
329	let has_rarer_bytes =
330	self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + `50`;
331	if has_fewer_bytes \|\| has_rarer_bytes {
332	prestart
333	} else {
334	prerare
335	}
336	}
337	(prestart @ Some(_), None) => prestart,
338	(None, prerare @ Some(_)) => prerare,
339	(None, None) if self.ascii_case_insensitive => None,
340	(None, None) => self
341	.packed
342	.as_ref()
343	.and_then(\|b\| b.build())
344	.map(\|s\| PrefilterObj::new(Packed(s))),
345	}
346	}
347
348	/// Add a literal string to this prefilter builder.
349	pub fn add(&mut self, bytes: &[u8]) {
350	self.count += `1`;
351	self.start_bytes.add(bytes);
352	self.rare_bytes.add(bytes);
353	if let Some(ref mut pbuilder) = self.packed {
354	pbuilder.add(bytes);
355	}
356	}
357	}
358
359	/// A type that wraps a packed searcher and implements the `Prefilter`
360	/// interface.
361	#[derive(Clone, Debug)]
362	struct Packed(packed::Searcher);
363
364	impl Prefilter for Packed {
365	fn next_candidate(
366	&self,
367	_state: &mut PrefilterState,
368	haystack: &[u8],
369	at: usize,
370	) -> Candidate {
371	self.0.find_at(haystack, at).map_or(default:Candidate::None, f:Candidate::Match)
372	}
373
374	fn clone_prefilter(&self) -> Box<dyn Prefilter> {
375	Box::new(self.clone())
376	}
377
378	fn heap_bytes(&self) -> usize {
379	self.0.heap_bytes()
380	}
381
382	fn reports_false_positives(&self) -> bool {
383	`false`
384	}
385	}
386
387	/// A builder for constructing a rare byte prefilter.
388	///
389	/// A rare byte prefilter attempts to pick out a small set of rare bytes that
390	/// occurr in the patterns, and then quickly scan to matches of those rare
391	/// bytes.
392	#[derive(Clone, Debug)]
393	struct RareBytesBuilder {
394	/// Whether this prefilter should account for ASCII case insensitivity or
395	/// not.
396	ascii_case_insensitive: bool,
397	/// A set of rare bytes, indexed by byte value.
398	rare_set: ByteSet,
399	/// A set of byte offsets associated with bytes in a pattern. An entry
400	/// corresponds to a particular bytes (its index) and is only non-zero if
401	/// the byte occurred at an offset greater than 0 in at least one pattern.
402	///
403	/// If a byte's offset is not representable in 8 bits, then the rare bytes
404	/// prefilter becomes inert.
405	byte_offsets: RareByteOffsets,
406	/// Whether this is available as a prefilter or not. This can be set to
407	/// false during construction if a condition is seen that invalidates the
408	/// use of the rare-byte prefilter.
409	available: bool,
410	/// The number of bytes set to an active value in `byte_offsets`.
411	count: usize,
412	/// The sum of frequency ranks for the rare bytes detected. This is
413	/// intended to give a heuristic notion of how rare the bytes are.
414	rank_sum: u16,
415	}
416
417	/// A set of bytes.
418	#[derive(Clone, Copy)]
419	struct ByteSet([bool; `256`]);
420
421	impl ByteSet {
422	fn empty() -> ByteSet {
423	ByteSet([`false`; `256`])
424	}
425
426	fn insert(&mut self, b: u8) -> bool {
427	let new: bool = !self.contains(b);
428	self.0[b as usize] = `true`;
429	new
430	}
431
432	fn contains(&self, b: u8) -> bool {
433	self.0[b as usize]
434	}
435	}
436
437	impl fmt::Debug for ByteSet {
438	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
439	let mut bytes: Vec = vec![];
440	for b: u8 in `0`..=`255` {
441	if self.contains(b) {
442	bytes.push(b);
443	}
444	}
445	f.debug_struct("ByteSet").field(name:"set", &bytes).finish()
446	}
447	}
448
449	/// A set of byte offsets, keyed by byte.
450	#[derive(Clone, Copy)]
451	struct RareByteOffsets {
452	/// Each entry corresponds to the maximum offset of the corresponding
453	/// byte across all patterns seen.
454	set: [RareByteOffset; `256`],
455	}
456
457	impl RareByteOffsets {
458	/// Create a new empty set of rare byte offsets.
459	pub fn empty() -> RareByteOffsets {
460	RareByteOffsets { set: [RareByteOffset::default(); `256`] }
461	}
462
463	/// Add the given offset for the given byte to this set. If the offset is
464	/// greater than the existing offset, then it overwrites the previous
465	/// value and returns false. If there is no previous value set, then this
466	/// sets it and returns true.
467	pub fn set(&mut self, byte: u8, off: RareByteOffset) {
468	self.set[byte as usize].max =
469	cmp::max(self.set[byte as usize].max, v2:off.max);
470	}
471	}
472
473	impl fmt::Debug for RareByteOffsets {
474	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
475	let mut offsets: Vec<&RareByteOffset> = vec![];
476	for off: &RareByteOffset in self.set.iter() {
477	if off.max > `0` {
478	offsets.push(off);
479	}
480	}
481	f.debug_struct("RareByteOffsets").field(name:"set", &offsets).finish()
482	}
483	}
484
485	/// Offsets associated with an occurrence of a "rare" byte in any of the
486	/// patterns used to construct a single Aho-Corasick automaton.
487	#[derive(Clone, Copy, Debug)]
488	struct RareByteOffset {
489	/// The maximum offset at which a particular byte occurs from the start
490	/// of any pattern. This is used as a shift amount. That is, when an
491	/// occurrence of this byte is found, the candidate position reported by
492	/// the prefilter is `position_of_byte - max`, such that the automaton
493	/// will begin its search at a position that is guaranteed to observe a
494	/// match.
495	///
496	/// To avoid accidentally quadratic behavior, a prefilter is considered
497	/// ineffective when it is asked to start scanning from a position that it
498	/// has already scanned past.
499	///
500	/// Using a `u8` here means that if we ever see a pattern that's longer
501	/// than 255 bytes, then the entire rare byte prefilter is disabled.
502	max: u8,
503	}
504
505	impl Default for RareByteOffset {
506	fn default() -> RareByteOffset {
507	RareByteOffset { max: `0` }
508	}
509	}
510
511	impl RareByteOffset {
512	/// Create a new rare byte offset. If the given offset is too big, then
513	/// None is returned. In that case, callers should render the rare bytes
514	/// prefilter inert.
515	fn new(max: usize) -> Option<RareByteOffset> {
516	if max > u8::MAX as usize {
517	None
518	} else {
519	Some(RareByteOffset { max: max as u8 })
520	}
521	}
522	}
523
524	impl RareBytesBuilder {
525	/// Create a new builder for constructing a rare byte prefilter.
526	fn new() -> RareBytesBuilder {
527	RareBytesBuilder {
528	ascii_case_insensitive: `false`,
529	rare_set: ByteSet::empty(),
530	byte_offsets: RareByteOffsets::empty(),
531	available: `true`,
532	count: `0`,
533	rank_sum: `0`,
534	}
535	}
536
537	/// Enable ASCII case insensitivity. When set, byte strings added to this
538	/// builder will be interpreted without respect to ASCII case.
539	fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
540	self.ascii_case_insensitive = yes;
541	self
542	}
543
544	/// Build the rare bytes prefilter.
545	///
546	/// If there are more than 3 distinct starting bytes, or if heuristics
547	/// otherwise determine that this prefilter should not be used, then `None`
548	/// is returned.
549	fn build(&self) -> Option<PrefilterObj> {
550	if !self.available \|\| self.count > `3` {
551	return None;
552	}
553	let (mut bytes, mut len) = ([`0`; `3`], `0`);
554	for b in `0`..=`255` {
555	if self.rare_set.contains(b) {
556	bytes[len] = b as u8;
557	len += `1`;
558	}
559	}
560	match len {
561	`0` => None,
562	`1` => Some(PrefilterObj::new(RareBytesOne {
563	byte1: bytes[`0`],
564	offset: self.byte_offsets.set[bytes[`0`] as usize],
565	})),
566	`2` => Some(PrefilterObj::new(RareBytesTwo {
567	offsets: self.byte_offsets,
568	byte1: bytes[`0`],
569	byte2: bytes[`1`],
570	})),
571	`3` => Some(PrefilterObj::new(RareBytesThree {
572	offsets: self.byte_offsets,
573	byte1: bytes[`0`],
574	byte2: bytes[`1`],
575	byte3: bytes[`2`],
576	})),
577	_ => unreachable!(),
578	}
579	}
580
581	/// Add a byte string to this builder.
582	///
583	/// All patterns added to an Aho-Corasick automaton should be added to this
584	/// builder before attempting to construct the prefilter.
585	fn add(&mut self, bytes: &[u8]) {
586	// If we've already given up, then do nothing.
587	if !self.available {
588	return;
589	}
590	// If we've already blown our budget, then don't waste time looking
591	// for more rare bytes.
592	if self.count > `3` {
593	self.available = `false`;
594	return;
595	}
596	// If the pattern is too long, then our offset table is bunk, so
597	// give up.
598	if bytes.len() >= `256` {
599	self.available = `false`;
600	return;
601	}
602	let mut rarest = match bytes.get(`0`) {
603	None => return,
604	Some(&b) => (b, freq_rank(b)),
605	};
606	// The idea here is to look for the rarest byte in each pattern, and
607	// add that to our set. As a special exception, if we see a byte that
608	// we've already added, then we immediately stop and choose that byte,
609	// even if there's another rare byte in the pattern. This helps us
610	// apply the rare byte optimization in more cases by attempting to pick
611	// bytes that are in common between patterns. So for example, if we
612	// were searching for `Sherlock` and `lockjaw`, then this would pick
613	// `k` for both patterns, resulting in the use of `memchr` instead of
614	// `memchr2` for `k` and `j`.
615	let mut found = `false`;
616	for (pos, &b) in bytes.iter().enumerate() {
617	self.set_offset(pos, b);
618	if found {
619	continue;
620	}
621	if self.rare_set.contains(b) {
622	found = `true`;
623	continue;
624	}
625	let rank = freq_rank(b);
626	if rank < rarest.1 {
627	rarest = (b, rank);
628	}
629	}
630	if !found {
631	self.add_rare_byte(rarest.0);
632	}
633	}
634
635	fn set_offset(&mut self, pos: usize, byte: u8) {
636	// This unwrap is OK because pos is never bigger than our max.
637	let offset = RareByteOffset::new(pos).unwrap();
638	self.byte_offsets.set(byte, offset);
639	if self.ascii_case_insensitive {
640	self.byte_offsets.set(opposite_ascii_case(byte), offset);
641	}
642	}
643
644	fn add_rare_byte(&mut self, byte: u8) {
645	self.add_one_rare_byte(byte);
646	if self.ascii_case_insensitive {
647	self.add_one_rare_byte(opposite_ascii_case(byte));
648	}
649	}
650
651	fn add_one_rare_byte(&mut self, byte: u8) {
652	if self.rare_set.insert(byte) {
653	self.count += `1`;
654	self.rank_sum += freq_rank(byte) as u16;
655	}
656	}
657	}
658
659	/// A prefilter for scanning for a single "rare" byte.
660	#[derive(Clone, Debug)]
661	struct RareBytesOne {
662	byte1: u8,
663	offset: RareByteOffset,
664	}
665
666	impl Prefilter for RareBytesOne {
667	fn next_candidate(
668	&self,
669	state: &mut PrefilterState,
670	haystack: &[u8],
671	at: usize,
672	) -> Candidate {
673	memchr(self.byte1, &haystack[at..])
674	.map(\|i\| {
675	let pos = at + i;
676	state.last_scan_at = pos;
677	cmp::max(at, pos.saturating_sub(self.offset.max as usize))
678	})
679	.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
680	}
681
682	fn clone_prefilter(&self) -> Box<dyn Prefilter> {
683	Box::new(self.clone())
684	}
685
686	fn heap_bytes(&self) -> usize {
687	`0`
688	}
689
690	fn looks_for_non_start_of_match(&self) -> bool {
691	// TODO: It should be possible to use a rare byte prefilter in a
692	// streaming context. The main problem is that we usually assume that
693	// if a prefilter has scanned some text and not found anything, then no
694	// match starts* in that text. This doesn't matter in non-streaming*
695	// contexts, but in a streaming context, if we're looking for a byte
696	// that doesn't start at the beginning of a match and don't find it,
697	// then it's still possible for a match to start at the end of the
698	// current buffer content. In order to fix this, the streaming searcher
699	// would need to become aware of prefilters that do this and use the
700	// appropriate offset in various places. It is quite a delicate change
701	// and probably shouldn't be attempted until streaming search has a
702	// better testing strategy. In particular, we'd really like to be able
703	// to vary the buffer size to force strange cases that occur at the
704	// edge of the buffer. If we make the buffer size minimal, then these
705	// cases occur more frequently and easier.
706	//
707	// This is also a bummer because this means that if the prefilter
708	// builder chose a rare byte prefilter, then a streaming search won't
709	// use any prefilter at all because the builder doesn't know how it's
710	// going to be used. Assuming we don't make streaming search aware of
711	// these special types of prefilters as described above, we could fix
712	// this by building a "backup" prefilter that could be used when the
713	// rare byte prefilter could not. But that's a bandaide. Sigh.
714	`true`
715	}
716	}
717
718	/// A prefilter for scanning for two "rare" bytes.
719	#[derive(Clone, Debug)]
720	struct RareBytesTwo {
721	offsets: RareByteOffsets,
722	byte1: u8,
723	byte2: u8,
724	}
725
726	impl Prefilter for RareBytesTwo {
727	fn next_candidate(
728	&self,
729	state: &mut PrefilterState,
730	haystack: &[u8],
731	at: usize,
732	) -> Candidate {
733	memchr2(self.byte1, self.byte2, &haystack[at..])
734	.map(\|i\| {
735	let pos = at + i;
736	state.update_at(pos);
737	let offset = self.offsets.set[haystack[pos] as usize].max;
738	cmp::max(at, pos.saturating_sub(offset as usize))
739	})
740	.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
741	}
742
743	fn clone_prefilter(&self) -> Box<dyn Prefilter> {
744	Box::new(self.clone())
745	}
746
747	fn heap_bytes(&self) -> usize {
748	`0`
749	}
750
751	fn looks_for_non_start_of_match(&self) -> bool {
752	// TODO: See Prefilter impl for RareBytesOne.
753	`true`
754	}
755	}
756
757	/// A prefilter for scanning for three "rare" bytes.
758	#[derive(Clone, Debug)]
759	struct RareBytesThree {
760	offsets: RareByteOffsets,
761	byte1: u8,
762	byte2: u8,
763	byte3: u8,
764	}
765
766	impl Prefilter for RareBytesThree {
767	fn next_candidate(
768	&self,
769	state: &mut PrefilterState,
770	haystack: &[u8],
771	at: usize,
772	) -> Candidate {
773	memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
774	.map(\|i\| {
775	let pos = at + i;
776	state.update_at(pos);
777	let offset = self.offsets.set[haystack[pos] as usize].max;
778	cmp::max(at, pos.saturating_sub(offset as usize))
779	})
780	.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
781	}
782
783	fn clone_prefilter(&self) -> Box<dyn Prefilter> {
784	Box::new(self.clone())
785	}
786
787	fn heap_bytes(&self) -> usize {
788	`0`
789	}
790
791	fn looks_for_non_start_of_match(&self) -> bool {
792	// TODO: See Prefilter impl for RareBytesOne.
793	`true`
794	}
795	}
796
797	/// A builder for constructing a starting byte prefilter.
798	///
799	/// A starting byte prefilter is a simplistic prefilter that looks for possible
800	/// matches by reporting all positions corresponding to a particular byte. This
801	/// generally only takes affect when there are at most 3 distinct possible
802	/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
803	/// distinct starting bytes (`f` and `b`), and this prefilter returns all
804	/// occurrences of either `f` or `b`.
805	///
806	/// In some cases, a heuristic frequency analysis may determine that it would
807	/// be better not to use this prefilter even when there are 3 or fewer distinct
808	/// starting bytes.
809	#[derive(Clone, Debug)]
810	struct StartBytesBuilder {
811	/// Whether this prefilter should account for ASCII case insensitivity or
812	/// not.
813	ascii_case_insensitive: bool,
814	/// The set of starting bytes observed.
815	byteset: Vec<bool>,
816	/// The number of bytes set to true in `byteset`.
817	count: usize,
818	/// The sum of frequency ranks for the rare bytes detected. This is
819	/// intended to give a heuristic notion of how rare the bytes are.
820	rank_sum: u16,
821	}
822
823	impl StartBytesBuilder {
824	/// Create a new builder for constructing a start byte prefilter.
825	fn new() -> StartBytesBuilder {
826	StartBytesBuilder {
827	ascii_case_insensitive: `false`,
828	byteset: vec![`false`; `256`],
829	count: `0`,
830	rank_sum: `0`,
831	}
832	}
833
834	/// Enable ASCII case insensitivity. When set, byte strings added to this
835	/// builder will be interpreted without respect to ASCII case.
836	fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
837	self.ascii_case_insensitive = yes;
838	self
839	}
840
841	/// Build the starting bytes prefilter.
842	///
843	/// If there are more than 3 distinct starting bytes, or if heuristics
844	/// otherwise determine that this prefilter should not be used, then `None`
845	/// is returned.
846	fn build(&self) -> Option<PrefilterObj> {
847	if self.count > `3` {
848	return None;
849	}
850	let (mut bytes, mut len) = ([`0`; `3`], `0`);
851	for b in `0`..`256` {
852	if !self.byteset[b] {
853	continue;
854	}
855	// We don't handle non-ASCII bytes for now. Getting non-ASCII
856	// bytes right is trickier, since we generally don't want to put
857	// a leading UTF-8 code unit into a prefilter that isn't ASCII,
858	// since they can frequently. Instead, it would be better to use a
859	// continuation byte, but this requires more sophisticated analysis
860	// of the automaton and a richer prefilter API.
861	if b > `0x7F` {
862	return None;
863	}
864	bytes[len] = b as u8;
865	len += `1`;
866	}
867	match len {
868	`0` => None,
869	`1` => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[`0`] })),
870	`2` => Some(PrefilterObj::new(StartBytesTwo {
871	byte1: bytes[`0`],
872	byte2: bytes[`1`],
873	})),
874	`3` => Some(PrefilterObj::new(StartBytesThree {
875	byte1: bytes[`0`],
876	byte2: bytes[`1`],
877	byte3: bytes[`2`],
878	})),
879	_ => unreachable!(),
880	}
881	}
882
883	/// Add a byte string to this builder.
884	///
885	/// All patterns added to an Aho-Corasick automaton should be added to this
886	/// builder before attempting to construct the prefilter.
887	fn add(&mut self, bytes: &[u8]) {
888	if self.count > `3` {
889	return;
890	}
891	if let Some(&byte) = bytes.get(`0`) {
892	self.add_one_byte(byte);
893	if self.ascii_case_insensitive {
894	self.add_one_byte(opposite_ascii_case(byte));
895	}
896	}
897	}
898
899	fn add_one_byte(&mut self, byte: u8) {
900	if !self.byteset[byte as usize] {
901	self.byteset[byte as usize] = `true`;
902	self.count += `1`;
903	self.rank_sum += freq_rank(byte) as u16;
904	}
905	}
906	}
907
908	/// A prefilter for scanning for a single starting byte.
909	#[derive(Clone, Debug)]
910	struct StartBytesOne {
911	byte1: u8,
912	}
913
914	impl Prefilter for StartBytesOne {
915	fn next_candidate(
916	&self,
917	_state: &mut PrefilterState,
918	haystack: &[u8],
919	at: usize,
920	) -> Candidate {
921	memchr(self.byte1, &haystack[at..])
922	.map(\|i\| at + i)
923	.map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
924	}
925
926	fn clone_prefilter(&self) -> Box<dyn Prefilter> {
927	Box::new(self.clone())
928	}
929
930	fn heap_bytes(&self) -> usize {
931	`0`
932	}
933	}
934
935	/// A prefilter for scanning for two starting bytes.
936	#[derive(Clone, Debug)]
937	struct StartBytesTwo {
938	byte1: u8,
939	byte2: u8,
940	}
941
942	impl Prefilter for StartBytesTwo {
943	fn next_candidate(
944	&self,
945	_state: &mut PrefilterState,
946	haystack: &[u8],
947	at: usize,
948	) -> Candidate {
949	memchr2(self.byte1, self.byte2, &haystack[at..])
950	.map(\|i\| at + i)
951	.map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
952	}
953
954	fn clone_prefilter(&self) -> Box<dyn Prefilter> {
955	Box::new(self.clone())
956	}
957
958	fn heap_bytes(&self) -> usize {
959	`0`
960	}
961	}
962
963	/// A prefilter for scanning for three starting bytes.
964	#[derive(Clone, Debug)]
965	struct StartBytesThree {
966	byte1: u8,
967	byte2: u8,
968	byte3: u8,
969	}
970
971	impl Prefilter for StartBytesThree {
972	fn next_candidate(
973	&self,
974	_state: &mut PrefilterState,
975	haystack: &[u8],
976	at: usize,
977	) -> Candidate {
978	memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
979	.map(\|i\| at + i)
980	.map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
981	}
982
983	fn clone_prefilter(&self) -> Box<dyn Prefilter> {
984	Box::new(self.clone())
985	}
986
987	fn heap_bytes(&self) -> usize {
988	`0`
989	}
990	}
991
992	/// Return the next candidate reported by the given prefilter while
993	/// simultaneously updating the given prestate.
994	///
995	/// The caller is responsible for checking the prestate before deciding whether
996	/// to initiate a search.
997	#[inline]
998	pub fn next<P: Prefilter>(
999	prestate: &mut PrefilterState,
1000	prefilter: P,
1001	haystack: &[u8],
1002	at: usize,
1003	) -> Candidate {
1004	let cand: Candidate = prefilter.next_candidate(state:prestate, haystack, at);
1005	match cand {
1006	Candidate::None => {
1007	prestate.update_skipped_bytes(skipped:haystack.len() - at);
1008	}
1009	Candidate::Match(ref m: &Match) => {
1010	prestate.update_skipped_bytes(skipped:m.start() - at);
1011	}
1012	Candidate::PossibleStartOfMatch(i: usize) => {
1013	prestate.update_skipped_bytes(skipped:i - at);
1014	}
1015	}
1016	cand
1017	}
1018
1019	/// If the given byte is an ASCII letter, then return it in the opposite case.
1020	/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
1021	/// `b'A'`. If a non-ASCII letter is given, then the given byte is returned.
1022	pub fn opposite_ascii_case(b: u8) -> u8 {
1023	if b'A' <= b && b <= b'Z' {
1024	b.to_ascii_lowercase()
1025	} else if b'a' <= b && b <= b'z' {
1026	b.to_ascii_uppercase()
1027	} else {
1028	b
1029	}
1030	}
1031
1032	/// Return the frequency rank of the given byte. The higher the rank, the more
1033	/// common the byte (heuristically speaking).
1034	fn freq_rank(b: u8) -> u8 {
1035	use crate::byte_frequencies::BYTE_FREQUENCIES;
1036	BYTE_FREQUENCIES[b as usize]
1037	}
1038
1039	#[cfg(test)]
1040	mod tests {
1041	use super::*;
1042
1043	#[test]
1044	fn scratch() {
1045	let mut b = Builder::new(MatchKind::LeftmostFirst);
1046	b.add(b"Sherlock");
1047	b.add(b"locjaw");
1048	// b.add(b"Sherlock");
1049	// b.add(b"Holmes");
1050	// b.add(b"Watson");
1051	// b.add("Шерлок Холмс".as_bytes());
1052	// b.add("Джон Уотсон".as_bytes());
1053
1054	let s = b.build().unwrap();
1055	println!("{:?}", s);
1056	}
1057	}
1058