1 | use crate::Match; |
2 | |
3 | /// A candidate is the result of running a prefilter on a haystack at a |
4 | /// particular position. The result is one of no match, a confirmed match or |
5 | /// a possible match. |
6 | /// |
7 | /// When no match is returned, the prefilter is guaranteeing that no possible |
8 | /// match can be found in the haystack, and the caller may trust this. That is, |
9 | /// all correct prefilters must never report false negatives. |
10 | /// |
11 | /// In some cases, a prefilter can confirm a match very quickly, in which case, |
12 | /// the caller may use this to stop what it's doing and report the match. In |
13 | /// this case, prefilter implementations must never report a false positive. |
14 | /// In other cases, the prefilter can only report a potential match, in which |
15 | /// case the callers must attempt to confirm the match. In this case, prefilter |
16 | /// implementations are permitted to return false positives. |
17 | #[derive (Clone, Debug)] |
18 | pub enum Candidate { |
19 | /// The prefilter reports that no match is possible. Prefilter |
20 | /// implementations will never report false negatives. |
21 | None, |
22 | /// The prefilter reports that a match has been confirmed at the provided |
23 | /// byte offsets. When this variant is reported, the prefilter is |
24 | /// guaranteeing a match. No false positives are permitted. |
25 | Match(Match), |
26 | /// The prefilter reports that a match *may* start at the given position. |
27 | /// When this variant is reported, it may correspond to a false positive. |
28 | PossibleStartOfMatch(usize), |
29 | } |
30 | |
31 | impl Candidate { |
32 | /// Convert this candidate into an option. This is useful when callers do |
33 | /// not distinguish between true positives and false positives (i.e., the |
34 | /// caller must always confirm the match in order to update some other |
35 | /// state). |
36 | /// |
37 | /// The byte offset in the option returned corresponds to the starting |
38 | /// position of the possible match. |
39 | pub fn into_option(self) -> Option<usize> { |
40 | match self { |
41 | Candidate::None => None, |
42 | Candidate::Match(ref m: &Match) => Some(m.start()), |
43 | Candidate::PossibleStartOfMatch(start: usize) => Some(start), |
44 | } |
45 | } |
46 | } |
47 | |
48 | /// A prefilter describes the behavior of fast literal scanners for quickly |
49 | /// skipping past bytes in the haystack that we know cannot possibly |
50 | /// participate in a match. |
51 | pub trait Prefilter: core::fmt::Debug { |
52 | /// Returns the next possible match candidate. This may yield false |
53 | /// positives, so callers must confirm a match starting at the position |
54 | /// returned. This, however, must never produce false negatives. That is, |
55 | /// this must, at minimum, return the starting position of the next match |
56 | /// in the given haystack after or at the given position. |
57 | fn next_candidate( |
58 | &self, |
59 | state: &mut State, |
60 | haystack: &[u8], |
61 | at: usize, |
62 | ) -> Candidate; |
63 | |
64 | /// Returns the approximate total amount of heap used by this prefilter, in |
65 | /// units of bytes. |
66 | fn heap_bytes(&self) -> usize; |
67 | |
68 | /// Returns true if and only if this prefilter may return false positives |
69 | /// via the `Candidate::PossibleStartOfMatch` variant. This is most useful |
70 | /// when false positives are not posssible (in which case, implementations |
71 | /// should return false), which may allow completely avoiding heavier regex |
72 | /// machinery when the prefilter can quickly confirm its own matches. |
73 | /// |
74 | /// By default, this returns true, which is conservative; it is always |
75 | /// correct to return `true`. Returning `false` here and reporting a false |
76 | /// positive will result in incorrect searches. |
77 | fn reports_false_positives(&self) -> bool { |
78 | true |
79 | } |
80 | } |
81 | |
82 | impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P { |
83 | #[inline ] |
84 | fn next_candidate( |
85 | &self, |
86 | state: &mut State, |
87 | haystack: &[u8], |
88 | at: usize, |
89 | ) -> Candidate { |
90 | (**self).next_candidate(state, haystack, at) |
91 | } |
92 | |
93 | fn heap_bytes(&self) -> usize { |
94 | (**self).heap_bytes() |
95 | } |
96 | |
97 | fn reports_false_positives(&self) -> bool { |
98 | (**self).reports_false_positives() |
99 | } |
100 | } |
101 | |
102 | #[derive (Clone)] |
103 | pub struct Scanner<'p> { |
104 | prefilter: &'p dyn Prefilter, |
105 | state: State, |
106 | } |
107 | |
108 | impl<'p> Scanner<'p> { |
109 | pub fn new(prefilter: &'p dyn Prefilter) -> Scanner<'p> { |
110 | Scanner { prefilter, state: State::new() } |
111 | } |
112 | |
113 | pub(crate) fn is_effective(&mut self, at: usize) -> bool { |
114 | self.state.is_effective(at) |
115 | } |
116 | |
117 | pub(crate) fn reports_false_positives(&self) -> bool { |
118 | self.prefilter.reports_false_positives() |
119 | } |
120 | |
121 | pub(crate) fn next_candidate( |
122 | &mut self, |
123 | bytes: &[u8], |
124 | at: usize, |
125 | ) -> Candidate { |
126 | let cand = self.prefilter.next_candidate(&mut self.state, bytes, at); |
127 | match cand { |
128 | Candidate::None => { |
129 | self.state.update_skipped_bytes(bytes.len() - at); |
130 | } |
131 | Candidate::Match(ref m) => { |
132 | self.state.update_skipped_bytes(m.start() - at); |
133 | } |
134 | Candidate::PossibleStartOfMatch(i) => { |
135 | self.state.update_skipped_bytes(i - at); |
136 | } |
137 | } |
138 | cand |
139 | } |
140 | } |
141 | |
142 | impl<'p> core::fmt::Debug for Scanner<'p> { |
143 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
144 | f.debug_struct("Scanner" ).field(name:"state" , &self.state).finish() |
145 | } |
146 | } |
147 | |
148 | /// State tracks state associated with the effectiveness of a |
149 | /// prefilter. It is used to track how many bytes, on average, are skipped by |
150 | /// the prefilter. If this average dips below a certain threshold over time, |
151 | /// then the state renders the prefilter inert and stops using it. |
152 | /// |
153 | /// A prefilter state should be created for each search. (Where creating an |
154 | /// iterator via, e.g., `find_iter`, is treated as a single search.) |
155 | #[derive (Clone, Debug)] |
156 | pub struct State { |
157 | /// The number of skips that has been executed. |
158 | skips: usize, |
159 | /// The total number of bytes that have been skipped. |
160 | skipped: usize, |
161 | /// Once this heuristic has been deemed permanently ineffective, it will be |
162 | /// inert throughout the rest of its lifetime. This serves as a cheap way |
163 | /// to check inertness. |
164 | inert: bool, |
165 | /// The last (absolute) position at which a prefilter scanned to. |
166 | /// Prefilters can use this position to determine whether to re-scan or |
167 | /// not. |
168 | /// |
169 | /// Unlike other things that impact effectiveness, this is a fleeting |
170 | /// condition. That is, a prefilter can be considered ineffective if it is |
171 | /// at a position before `last_scan_at`, but can become effective again |
172 | /// once the search moves past `last_scan_at`. |
173 | /// |
174 | /// The utility of this is to both avoid additional overhead from calling |
175 | /// the prefilter and to avoid quadratic behavior. This ensures that a |
176 | /// prefilter will scan any particular byte at most once. (Note that some |
177 | /// prefilters, like the start-byte prefilter, do not need to use this |
178 | /// field at all, since it only looks for starting bytes.) |
179 | last_scan_at: usize, |
180 | } |
181 | |
182 | impl State { |
183 | /// The minimum number of skip attempts to try before considering whether |
184 | /// a prefilter is effective or not. |
185 | const MIN_SKIPS: usize = 40; |
186 | |
187 | /// The minimum amount of bytes that skipping must average. |
188 | /// |
189 | /// That is, after MIN_SKIPS have occurred, if the average number of bytes |
190 | /// skipped ever falls below MIN_AVG_SKIP, then the prefilter will be |
191 | /// rendered inert. |
192 | const MIN_AVG_SKIP: usize = 16; |
193 | |
194 | /// Create a fresh prefilter state. |
195 | pub fn new() -> State { |
196 | State { skips: 0, skipped: 0, inert: false, last_scan_at: 0 } |
197 | } |
198 | |
199 | /// Updates the position at which the last scan stopped. This may be |
200 | /// greater than the position of the last candidate reported. For example, |
201 | /// searching for the byte `z` in `abczdef` for the pattern `abcz` will |
202 | /// report a candidate at position `0`, but the end of its last scan will |
203 | /// be at position `3`. |
204 | /// |
205 | /// This position factors into the effectiveness of this prefilter. If the |
206 | /// current position is less than the last position at which a scan ended, |
207 | /// then the prefilter should not be re-run until the search moves past |
208 | /// that position. |
209 | /// |
210 | /// It is always correct to never update the last scan position. In fact, |
211 | /// it is also always correct to set the last scan position to an arbitrary |
212 | /// value. The key is setting it to a position in the future at which it |
213 | /// makes sense to restart the prefilter. |
214 | pub fn update_last_scan(&mut self, at: usize) { |
215 | if at > self.last_scan_at { |
216 | self.last_scan_at = at; |
217 | } |
218 | } |
219 | |
220 | /// Return true if and only if this state indicates that a prefilter is |
221 | /// still effective. If the prefilter is not effective, then this state |
222 | /// is rendered "inert." At which point, all subsequent calls to |
223 | /// `is_effective` on this state will return `false`. |
224 | /// |
225 | /// `at` should correspond to the current starting position of the search. |
226 | /// |
227 | /// Callers typically do not need to use this, as it represents the |
228 | /// default implementation of |
229 | /// [`Prefilter::is_effective`](trait.Prefilter.html#tymethod.is_effective). |
230 | fn is_effective(&mut self, at: usize) -> bool { |
231 | if self.inert { |
232 | return false; |
233 | } |
234 | if at < self.last_scan_at { |
235 | return false; |
236 | } |
237 | if self.skips < State::MIN_SKIPS { |
238 | return true; |
239 | } |
240 | |
241 | if self.skipped >= State::MIN_AVG_SKIP * self.skips { |
242 | return true; |
243 | } |
244 | |
245 | // We're inert. |
246 | self.inert = true; |
247 | false |
248 | } |
249 | |
250 | /// Update this state with the number of bytes skipped on the last |
251 | /// invocation of the prefilter. |
252 | fn update_skipped_bytes(&mut self, skipped: usize) { |
253 | self.skips += 1; |
254 | self.skipped += skipped; |
255 | } |
256 | } |
257 | |
258 | /// A `Prefilter` implementation that reports a possible match at every |
259 | /// position. |
260 | /// |
261 | /// This should generally not be used as an actual prefilter. It is only |
262 | /// useful when one needs to represent the absence of a prefilter in a generic |
263 | /// context. For example, a [`dfa::regex::Regex`](crate::dfa::regex::Regex) |
264 | /// uses this prefilter by default to indicate that no prefilter should be |
265 | /// used. |
266 | /// |
267 | /// A `None` prefilter value cannot be constructed. |
268 | #[derive (Clone, Debug)] |
269 | pub struct None { |
270 | _priv: (), |
271 | } |
272 | |
273 | impl Prefilter for None { |
274 | fn next_candidate(&self, _: &mut State, _: &[u8], at: usize) -> Candidate { |
275 | Candidate::PossibleStartOfMatch(at) |
276 | } |
277 | |
278 | fn heap_bytes(&self) -> usize { |
279 | 0 |
280 | } |
281 | } |
282 | |