1use crate::Match;
2
3/// A candidate is the result of running a prefilter on a haystack at a
4/// particular position. The result is one of no match, a confirmed match or
5/// a possible match.
6///
7/// When no match is returned, the prefilter is guaranteeing that no possible
8/// match can be found in the haystack, and the caller may trust this. That is,
9/// all correct prefilters must never report false negatives.
10///
11/// In some cases, a prefilter can confirm a match very quickly, in which case,
12/// the caller may use this to stop what it's doing and report the match. In
13/// this case, prefilter implementations must never report a false positive.
14/// In other cases, the prefilter can only report a potential match, in which
15/// case the callers must attempt to confirm the match. In this case, prefilter
16/// implementations are permitted to return false positives.
17#[derive(Clone, Debug)]
18pub enum Candidate {
19 /// The prefilter reports that no match is possible. Prefilter
20 /// implementations will never report false negatives.
21 None,
22 /// The prefilter reports that a match has been confirmed at the provided
23 /// byte offsets. When this variant is reported, the prefilter is
24 /// guaranteeing a match. No false positives are permitted.
25 Match(Match),
26 /// The prefilter reports that a match *may* start at the given position.
27 /// When this variant is reported, it may correspond to a false positive.
28 PossibleStartOfMatch(usize),
29}
30
31impl Candidate {
32 /// Convert this candidate into an option. This is useful when callers do
33 /// not distinguish between true positives and false positives (i.e., the
34 /// caller must always confirm the match in order to update some other
35 /// state).
36 ///
37 /// The byte offset in the option returned corresponds to the starting
38 /// position of the possible match.
39 pub fn into_option(self) -> Option<usize> {
40 match self {
41 Candidate::None => None,
42 Candidate::Match(ref m: &Match) => Some(m.start()),
43 Candidate::PossibleStartOfMatch(start: usize) => Some(start),
44 }
45 }
46}
47
48/// A prefilter describes the behavior of fast literal scanners for quickly
49/// skipping past bytes in the haystack that we know cannot possibly
50/// participate in a match.
51pub trait Prefilter: core::fmt::Debug {
52 /// Returns the next possible match candidate. This may yield false
53 /// positives, so callers must confirm a match starting at the position
54 /// returned. This, however, must never produce false negatives. That is,
55 /// this must, at minimum, return the starting position of the next match
56 /// in the given haystack after or at the given position.
57 fn next_candidate(
58 &self,
59 state: &mut State,
60 haystack: &[u8],
61 at: usize,
62 ) -> Candidate;
63
64 /// Returns the approximate total amount of heap used by this prefilter, in
65 /// units of bytes.
66 fn heap_bytes(&self) -> usize;
67
68 /// Returns true if and only if this prefilter may return false positives
69 /// via the `Candidate::PossibleStartOfMatch` variant. This is most useful
70 /// when false positives are not posssible (in which case, implementations
71 /// should return false), which may allow completely avoiding heavier regex
72 /// machinery when the prefilter can quickly confirm its own matches.
73 ///
74 /// By default, this returns true, which is conservative; it is always
75 /// correct to return `true`. Returning `false` here and reporting a false
76 /// positive will result in incorrect searches.
77 fn reports_false_positives(&self) -> bool {
78 true
79 }
80}
81
82impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
83 #[inline]
84 fn next_candidate(
85 &self,
86 state: &mut State,
87 haystack: &[u8],
88 at: usize,
89 ) -> Candidate {
90 (**self).next_candidate(state, haystack, at)
91 }
92
93 fn heap_bytes(&self) -> usize {
94 (**self).heap_bytes()
95 }
96
97 fn reports_false_positives(&self) -> bool {
98 (**self).reports_false_positives()
99 }
100}
101
102#[derive(Clone)]
103pub struct Scanner<'p> {
104 prefilter: &'p dyn Prefilter,
105 state: State,
106}
107
108impl<'p> Scanner<'p> {
109 pub fn new(prefilter: &'p dyn Prefilter) -> Scanner<'p> {
110 Scanner { prefilter, state: State::new() }
111 }
112
113 pub(crate) fn is_effective(&mut self, at: usize) -> bool {
114 self.state.is_effective(at)
115 }
116
117 pub(crate) fn reports_false_positives(&self) -> bool {
118 self.prefilter.reports_false_positives()
119 }
120
121 pub(crate) fn next_candidate(
122 &mut self,
123 bytes: &[u8],
124 at: usize,
125 ) -> Candidate {
126 let cand = self.prefilter.next_candidate(&mut self.state, bytes, at);
127 match cand {
128 Candidate::None => {
129 self.state.update_skipped_bytes(bytes.len() - at);
130 }
131 Candidate::Match(ref m) => {
132 self.state.update_skipped_bytes(m.start() - at);
133 }
134 Candidate::PossibleStartOfMatch(i) => {
135 self.state.update_skipped_bytes(i - at);
136 }
137 }
138 cand
139 }
140}
141
142impl<'p> core::fmt::Debug for Scanner<'p> {
143 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
144 f.debug_struct("Scanner").field(name:"state", &self.state).finish()
145 }
146}
147
148/// State tracks state associated with the effectiveness of a
149/// prefilter. It is used to track how many bytes, on average, are skipped by
150/// the prefilter. If this average dips below a certain threshold over time,
151/// then the state renders the prefilter inert and stops using it.
152///
153/// A prefilter state should be created for each search. (Where creating an
154/// iterator via, e.g., `find_iter`, is treated as a single search.)
155#[derive(Clone, Debug)]
156pub struct State {
157 /// The number of skips that has been executed.
158 skips: usize,
159 /// The total number of bytes that have been skipped.
160 skipped: usize,
161 /// Once this heuristic has been deemed permanently ineffective, it will be
162 /// inert throughout the rest of its lifetime. This serves as a cheap way
163 /// to check inertness.
164 inert: bool,
165 /// The last (absolute) position at which a prefilter scanned to.
166 /// Prefilters can use this position to determine whether to re-scan or
167 /// not.
168 ///
169 /// Unlike other things that impact effectiveness, this is a fleeting
170 /// condition. That is, a prefilter can be considered ineffective if it is
171 /// at a position before `last_scan_at`, but can become effective again
172 /// once the search moves past `last_scan_at`.
173 ///
174 /// The utility of this is to both avoid additional overhead from calling
175 /// the prefilter and to avoid quadratic behavior. This ensures that a
176 /// prefilter will scan any particular byte at most once. (Note that some
177 /// prefilters, like the start-byte prefilter, do not need to use this
178 /// field at all, since it only looks for starting bytes.)
179 last_scan_at: usize,
180}
181
182impl State {
183 /// The minimum number of skip attempts to try before considering whether
184 /// a prefilter is effective or not.
185 const MIN_SKIPS: usize = 40;
186
187 /// The minimum amount of bytes that skipping must average.
188 ///
189 /// That is, after MIN_SKIPS have occurred, if the average number of bytes
190 /// skipped ever falls below MIN_AVG_SKIP, then the prefilter will be
191 /// rendered inert.
192 const MIN_AVG_SKIP: usize = 16;
193
194 /// Create a fresh prefilter state.
195 pub fn new() -> State {
196 State { skips: 0, skipped: 0, inert: false, last_scan_at: 0 }
197 }
198
199 /// Updates the position at which the last scan stopped. This may be
200 /// greater than the position of the last candidate reported. For example,
201 /// searching for the byte `z` in `abczdef` for the pattern `abcz` will
202 /// report a candidate at position `0`, but the end of its last scan will
203 /// be at position `3`.
204 ///
205 /// This position factors into the effectiveness of this prefilter. If the
206 /// current position is less than the last position at which a scan ended,
207 /// then the prefilter should not be re-run until the search moves past
208 /// that position.
209 ///
210 /// It is always correct to never update the last scan position. In fact,
211 /// it is also always correct to set the last scan position to an arbitrary
212 /// value. The key is setting it to a position in the future at which it
213 /// makes sense to restart the prefilter.
214 pub fn update_last_scan(&mut self, at: usize) {
215 if at > self.last_scan_at {
216 self.last_scan_at = at;
217 }
218 }
219
220 /// Return true if and only if this state indicates that a prefilter is
221 /// still effective. If the prefilter is not effective, then this state
222 /// is rendered "inert." At which point, all subsequent calls to
223 /// `is_effective` on this state will return `false`.
224 ///
225 /// `at` should correspond to the current starting position of the search.
226 ///
227 /// Callers typically do not need to use this, as it represents the
228 /// default implementation of
229 /// [`Prefilter::is_effective`](trait.Prefilter.html#tymethod.is_effective).
230 fn is_effective(&mut self, at: usize) -> bool {
231 if self.inert {
232 return false;
233 }
234 if at < self.last_scan_at {
235 return false;
236 }
237 if self.skips < State::MIN_SKIPS {
238 return true;
239 }
240
241 if self.skipped >= State::MIN_AVG_SKIP * self.skips {
242 return true;
243 }
244
245 // We're inert.
246 self.inert = true;
247 false
248 }
249
250 /// Update this state with the number of bytes skipped on the last
251 /// invocation of the prefilter.
252 fn update_skipped_bytes(&mut self, skipped: usize) {
253 self.skips += 1;
254 self.skipped += skipped;
255 }
256}
257
258/// A `Prefilter` implementation that reports a possible match at every
259/// position.
260///
261/// This should generally not be used as an actual prefilter. It is only
262/// useful when one needs to represent the absence of a prefilter in a generic
263/// context. For example, a [`dfa::regex::Regex`](crate::dfa::regex::Regex)
264/// uses this prefilter by default to indicate that no prefilter should be
265/// used.
266///
267/// A `None` prefilter value cannot be constructed.
268#[derive(Clone, Debug)]
269pub struct None {
270 _priv: (),
271}
272
273impl Prefilter for None {
274 fn next_candidate(&self, _: &mut State, _: &[u8], at: usize) -> Candidate {
275 Candidate::PossibleStartOfMatch(at)
276 }
277
278 fn heap_bytes(&self) -> usize {
279 0
280 }
281}
282