prefilter.rs source code [crates/regex_automata/src/util/prefilter.rs]

1	use crate::Match;
2
3	/// A candidate is the result of running a prefilter on a haystack at a
4	/// particular position. The result is one of no match, a confirmed match or
5	/// a possible match.
6	///
7	/// When no match is returned, the prefilter is guaranteeing that no possible
8	/// match can be found in the haystack, and the caller may trust this. That is,
9	/// all correct prefilters must never report false negatives.
10	///
11	/// In some cases, a prefilter can confirm a match very quickly, in which case,
12	/// the caller may use this to stop what it's doing and report the match. In
13	/// this case, prefilter implementations must never report a false positive.
14	/// In other cases, the prefilter can only report a potential match, in which
15	/// case the callers must attempt to confirm the match. In this case, prefilter
16	/// implementations are permitted to return false positives.
17	#[derive(Clone, Debug)]
18	pub enum Candidate {
19	/// The prefilter reports that no match is possible. Prefilter
20	/// implementations will never report false negatives.
21	None,
22	/// The prefilter reports that a match has been confirmed at the provided
23	/// byte offsets. When this variant is reported, the prefilter is
24	/// guaranteeing a match. No false positives are permitted.
25	Match(Match),
26	/// The prefilter reports that a match may* start at the given position.*
27	/// When this variant is reported, it may correspond to a false positive.
28	PossibleStartOfMatch(usize),
29	}
30
31	impl Candidate {
32	/// Convert this candidate into an option. This is useful when callers do
33	/// not distinguish between true positives and false positives (i.e., the
34	/// caller must always confirm the match in order to update some other
35	/// state).
36	///
37	/// The byte offset in the option returned corresponds to the starting
38	/// position of the possible match.
39	pub fn into_option(self) -> Option<usize> {
40	match self {
41	Candidate::None => None,
42	Candidate::Match(ref m: &Match) => Some(m.start()),
43	Candidate::PossibleStartOfMatch(start: usize) => Some(start),
44	}
45	}
46	}
47
48	/// A prefilter describes the behavior of fast literal scanners for quickly
49	/// skipping past bytes in the haystack that we know cannot possibly
50	/// participate in a match.
51	pub trait Prefilter: core::fmt::Debug {
52	/// Returns the next possible match candidate. This may yield false
53	/// positives, so callers must confirm a match starting at the position
54	/// returned. This, however, must never produce false negatives. That is,
55	/// this must, at minimum, return the starting position of the next match
56	/// in the given haystack after or at the given position.
57	fn next_candidate(
58	&self,
59	state: &mut State,
60	haystack: &[u8],
61	at: usize,
62	) -> Candidate;
63
64	/// Returns the approximate total amount of heap used by this prefilter, in
65	/// units of bytes.
66	fn heap_bytes(&self) -> usize;
67
68	/// Returns true if and only if this prefilter may return false positives
69	/// via the `Candidate::PossibleStartOfMatch` variant. This is most useful
70	/// when false positives are not posssible (in which case, implementations
71	/// should return false), which may allow completely avoiding heavier regex
72	/// machinery when the prefilter can quickly confirm its own matches.
73	///
74	/// By default, this returns true, which is conservative; it is always
75	/// correct to return `true`. Returning `false` here and reporting a false
76	/// positive will result in incorrect searches.
77	fn reports_false_positives(&self) -> bool {
78	`true`
79	}
80	}
81
82	impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
83	#[inline]
84	fn next_candidate(
85	&self,
86	state: &mut State,
87	haystack: &[u8],
88	at: usize,
89	) -> Candidate {
90	(**self).next_candidate(state, haystack, at)
91	}
92
93	fn heap_bytes(&self) -> usize {
94	(**self).heap_bytes()
95	}
96
97	fn reports_false_positives(&self) -> bool {
98	(**self).reports_false_positives()
99	}
100	}
101
102	#[derive(Clone)]
103	pub struct Scanner<'p> {
104	prefilter: &'p dyn Prefilter,
105	state: State,
106	}
107
108	impl<'p> Scanner<'p> {
109	pub fn new(prefilter: &'p dyn Prefilter) -> Scanner<'p> {
110	Scanner { prefilter, state: State::new() }
111	}
112
113	pub(crate) fn is_effective(&mut self, at: usize) -> bool {
114	self.state.is_effective(at)
115	}
116
117	pub(crate) fn reports_false_positives(&self) -> bool {
118	self.prefilter.reports_false_positives()
119	}
120
121	pub(crate) fn next_candidate(
122	&mut self,
123	bytes: &[u8],
124	at: usize,
125	) -> Candidate {
126	let cand = self.prefilter.next_candidate(&mut self.state, bytes, at);
127	match cand {
128	Candidate::None => {
129	self.state.update_skipped_bytes(bytes.len() - at);
130	}
131	Candidate::Match(ref m) => {
132	self.state.update_skipped_bytes(m.start() - at);
133	}
134	Candidate::PossibleStartOfMatch(i) => {
135	self.state.update_skipped_bytes(i - at);
136	}
137	}
138	cand
139	}
140	}
141
142	impl<'p> core::fmt::Debug for Scanner<'p> {
143	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
144	f.debug_struct("Scanner").field(name:"state", &self.state).finish()
145	}
146	}
147
148	/// State tracks state associated with the effectiveness of a
149	/// prefilter. It is used to track how many bytes, on average, are skipped by
150	/// the prefilter. If this average dips below a certain threshold over time,
151	/// then the state renders the prefilter inert and stops using it.
152	///
153	/// A prefilter state should be created for each search. (Where creating an
154	/// iterator via, e.g., `find_iter`, is treated as a single search.)
155	#[derive(Clone, Debug)]
156	pub struct State {
157	/// The number of skips that has been executed.
158	skips: usize,
159	/// The total number of bytes that have been skipped.
160	skipped: usize,
161	/// Once this heuristic has been deemed permanently ineffective, it will be
162	/// inert throughout the rest of its lifetime. This serves as a cheap way
163	/// to check inertness.
164	inert: bool,
165	/// The last (absolute) position at which a prefilter scanned to.
166	/// Prefilters can use this position to determine whether to re-scan or
167	/// not.
168	///
169	/// Unlike other things that impact effectiveness, this is a fleeting
170	/// condition. That is, a prefilter can be considered ineffective if it is
171	/// at a position before `last_scan_at`, but can become effective again
172	/// once the search moves past `last_scan_at`.
173	///
174	/// The utility of this is to both avoid additional overhead from calling
175	/// the prefilter and to avoid quadratic behavior. This ensures that a
176	/// prefilter will scan any particular byte at most once. (Note that some
177	/// prefilters, like the start-byte prefilter, do not need to use this
178	/// field at all, since it only looks for starting bytes.)
179	last_scan_at: usize,
180	}
181
182	impl State {
183	/// The minimum number of skip attempts to try before considering whether
184	/// a prefilter is effective or not.
185	const MIN_SKIPS: usize = `40`;
186
187	/// The minimum amount of bytes that skipping must average.
188	///
189	/// That is, after MIN_SKIPS have occurred, if the average number of bytes
190	/// skipped ever falls below MIN_AVG_SKIP, then the prefilter will be
191	/// rendered inert.
192	const MIN_AVG_SKIP: usize = `16`;
193
194	/// Create a fresh prefilter state.
195	pub fn new() -> State {
196	State { skips: `0`, skipped: `0`, inert: `false`, last_scan_at: `0` }
197	}
198
199	/// Updates the position at which the last scan stopped. This may be
200	/// greater than the position of the last candidate reported. For example,
201	/// searching for the byte `z` in `abczdef` for the pattern `abcz` will
202	/// report a candidate at position `0`, but the end of its last scan will
203	/// be at position `3`.
204	///
205	/// This position factors into the effectiveness of this prefilter. If the
206	/// current position is less than the last position at which a scan ended,
207	/// then the prefilter should not be re-run until the search moves past
208	/// that position.
209	///
210	/// It is always correct to never update the last scan position. In fact,
211	/// it is also always correct to set the last scan position to an arbitrary
212	/// value. The key is setting it to a position in the future at which it
213	/// makes sense to restart the prefilter.
214	pub fn update_last_scan(&mut self, at: usize) {
215	if at > self.last_scan_at {
216	self.last_scan_at = at;
217	}
218	}
219
220	/// Return true if and only if this state indicates that a prefilter is
221	/// still effective. If the prefilter is not effective, then this state
222	/// is rendered "inert." At which point, all subsequent calls to
223	/// `is_effective` on this state will return `false`.
224	///
225	/// `at` should correspond to the current starting position of the search.
226	///
227	/// Callers typically do not need to use this, as it represents the
228	/// default implementation of
229	/// [`Prefilter::is_effective`](trait.Prefilter.html#tymethod.is_effective).
230	fn is_effective(&mut self, at: usize) -> bool {
231	if self.inert {
232	return `false`;
233	}
234	if at < self.last_scan_at {
235	return `false`;
236	}
237	if self.skips < State::MIN_SKIPS {
238	return `true`;
239	}
240
241	if self.skipped >= State::MIN_AVG_SKIP * self.skips {
242	return `true`;
243	}
244
245	// We're inert.
246	self.inert = `true`;
247	`false`
248	}
249
250	/// Update this state with the number of bytes skipped on the last
251	/// invocation of the prefilter.
252	fn update_skipped_bytes(&mut self, skipped: usize) {
253	self.skips += `1`;
254	self.skipped += skipped;
255	}
256	}
257
258	/// A `Prefilter` implementation that reports a possible match at every
259	/// position.
260	///
261	/// This should generally not be used as an actual prefilter. It is only
262	/// useful when one needs to represent the absence of a prefilter in a generic
263	/// context. For example, a [`dfa::regex::Regex`](crate::dfa::regex::Regex)
264	/// uses this prefilter by default to indicate that no prefilter should be
265	/// used.
266	///
267	/// A `None` prefilter value cannot be constructed.
268	#[derive(Clone, Debug)]
269	pub struct None {
270	_priv: (),
271	}
272
273	impl Prefilter for None {
274	fn next_candidate(&self, _: &mut State, _: &[u8], at: usize) -> Candidate {
275	Candidate::PossibleStartOfMatch(at)
276	}
277
278	fn heap_bytes(&self) -> usize {
279	`0`
280	}
281	}
282