| 1 | use crate::Match; |
| 2 | |
| 3 | /// A candidate is the result of running a prefilter on a haystack at a |
| 4 | /// particular position. The result is one of no match, a confirmed match or |
| 5 | /// a possible match. |
| 6 | /// |
| 7 | /// When no match is returned, the prefilter is guaranteeing that no possible |
| 8 | /// match can be found in the haystack, and the caller may trust this. That is, |
| 9 | /// all correct prefilters must never report false negatives. |
| 10 | /// |
| 11 | /// In some cases, a prefilter can confirm a match very quickly, in which case, |
| 12 | /// the caller may use this to stop what it's doing and report the match. In |
| 13 | /// this case, prefilter implementations must never report a false positive. |
| 14 | /// In other cases, the prefilter can only report a potential match, in which |
| 15 | /// case the callers must attempt to confirm the match. In this case, prefilter |
| 16 | /// implementations are permitted to return false positives. |
| 17 | #[derive (Clone, Debug)] |
| 18 | pub enum Candidate { |
| 19 | /// The prefilter reports that no match is possible. Prefilter |
| 20 | /// implementations will never report false negatives. |
| 21 | None, |
| 22 | /// The prefilter reports that a match has been confirmed at the provided |
| 23 | /// byte offsets. When this variant is reported, the prefilter is |
| 24 | /// guaranteeing a match. No false positives are permitted. |
| 25 | Match(Match), |
| 26 | /// The prefilter reports that a match *may* start at the given position. |
| 27 | /// When this variant is reported, it may correspond to a false positive. |
| 28 | PossibleStartOfMatch(usize), |
| 29 | } |
| 30 | |
| 31 | impl Candidate { |
| 32 | /// Convert this candidate into an option. This is useful when callers do |
| 33 | /// not distinguish between true positives and false positives (i.e., the |
| 34 | /// caller must always confirm the match in order to update some other |
| 35 | /// state). |
| 36 | /// |
| 37 | /// The byte offset in the option returned corresponds to the starting |
| 38 | /// position of the possible match. |
| 39 | pub fn into_option(self) -> Option<usize> { |
| 40 | match self { |
| 41 | Candidate::None => None, |
| 42 | Candidate::Match(ref m: &Match) => Some(m.start()), |
| 43 | Candidate::PossibleStartOfMatch(start: usize) => Some(start), |
| 44 | } |
| 45 | } |
| 46 | } |
| 47 | |
| 48 | /// A prefilter describes the behavior of fast literal scanners for quickly |
| 49 | /// skipping past bytes in the haystack that we know cannot possibly |
| 50 | /// participate in a match. |
| 51 | pub trait Prefilter: core::fmt::Debug { |
| 52 | /// Returns the next possible match candidate. This may yield false |
| 53 | /// positives, so callers must confirm a match starting at the position |
| 54 | /// returned. This, however, must never produce false negatives. That is, |
| 55 | /// this must, at minimum, return the starting position of the next match |
| 56 | /// in the given haystack after or at the given position. |
| 57 | fn next_candidate( |
| 58 | &self, |
| 59 | state: &mut State, |
| 60 | haystack: &[u8], |
| 61 | at: usize, |
| 62 | ) -> Candidate; |
| 63 | |
| 64 | /// Returns the approximate total amount of heap used by this prefilter, in |
| 65 | /// units of bytes. |
| 66 | fn heap_bytes(&self) -> usize; |
| 67 | |
| 68 | /// Returns true if and only if this prefilter may return false positives |
| 69 | /// via the `Candidate::PossibleStartOfMatch` variant. This is most useful |
| 70 | /// when false positives are not posssible (in which case, implementations |
| 71 | /// should return false), which may allow completely avoiding heavier regex |
| 72 | /// machinery when the prefilter can quickly confirm its own matches. |
| 73 | /// |
| 74 | /// By default, this returns true, which is conservative; it is always |
| 75 | /// correct to return `true`. Returning `false` here and reporting a false |
| 76 | /// positive will result in incorrect searches. |
| 77 | fn reports_false_positives(&self) -> bool { |
| 78 | true |
| 79 | } |
| 80 | } |
| 81 | |
| 82 | impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P { |
| 83 | #[inline ] |
| 84 | fn next_candidate( |
| 85 | &self, |
| 86 | state: &mut State, |
| 87 | haystack: &[u8], |
| 88 | at: usize, |
| 89 | ) -> Candidate { |
| 90 | (**self).next_candidate(state, haystack, at) |
| 91 | } |
| 92 | |
| 93 | fn heap_bytes(&self) -> usize { |
| 94 | (**self).heap_bytes() |
| 95 | } |
| 96 | |
| 97 | fn reports_false_positives(&self) -> bool { |
| 98 | (**self).reports_false_positives() |
| 99 | } |
| 100 | } |
| 101 | |
| 102 | #[derive (Clone)] |
| 103 | pub struct Scanner<'p> { |
| 104 | prefilter: &'p dyn Prefilter, |
| 105 | state: State, |
| 106 | } |
| 107 | |
| 108 | impl<'p> Scanner<'p> { |
| 109 | pub fn new(prefilter: &'p dyn Prefilter) -> Scanner<'p> { |
| 110 | Scanner { prefilter, state: State::new() } |
| 111 | } |
| 112 | |
| 113 | pub(crate) fn is_effective(&mut self, at: usize) -> bool { |
| 114 | self.state.is_effective(at) |
| 115 | } |
| 116 | |
| 117 | pub(crate) fn reports_false_positives(&self) -> bool { |
| 118 | self.prefilter.reports_false_positives() |
| 119 | } |
| 120 | |
| 121 | pub(crate) fn next_candidate( |
| 122 | &mut self, |
| 123 | bytes: &[u8], |
| 124 | at: usize, |
| 125 | ) -> Candidate { |
| 126 | let cand = self.prefilter.next_candidate(&mut self.state, bytes, at); |
| 127 | match cand { |
| 128 | Candidate::None => { |
| 129 | self.state.update_skipped_bytes(bytes.len() - at); |
| 130 | } |
| 131 | Candidate::Match(ref m) => { |
| 132 | self.state.update_skipped_bytes(m.start() - at); |
| 133 | } |
| 134 | Candidate::PossibleStartOfMatch(i) => { |
| 135 | self.state.update_skipped_bytes(i - at); |
| 136 | } |
| 137 | } |
| 138 | cand |
| 139 | } |
| 140 | } |
| 141 | |
| 142 | impl<'p> core::fmt::Debug for Scanner<'p> { |
| 143 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| 144 | f.debug_struct("Scanner" ).field(name:"state" , &self.state).finish() |
| 145 | } |
| 146 | } |
| 147 | |
| 148 | /// State tracks state associated with the effectiveness of a |
| 149 | /// prefilter. It is used to track how many bytes, on average, are skipped by |
| 150 | /// the prefilter. If this average dips below a certain threshold over time, |
| 151 | /// then the state renders the prefilter inert and stops using it. |
| 152 | /// |
| 153 | /// A prefilter state should be created for each search. (Where creating an |
| 154 | /// iterator via, e.g., `find_iter`, is treated as a single search.) |
| 155 | #[derive (Clone, Debug)] |
| 156 | pub struct State { |
| 157 | /// The number of skips that has been executed. |
| 158 | skips: usize, |
| 159 | /// The total number of bytes that have been skipped. |
| 160 | skipped: usize, |
| 161 | /// Once this heuristic has been deemed permanently ineffective, it will be |
| 162 | /// inert throughout the rest of its lifetime. This serves as a cheap way |
| 163 | /// to check inertness. |
| 164 | inert: bool, |
| 165 | /// The last (absolute) position at which a prefilter scanned to. |
| 166 | /// Prefilters can use this position to determine whether to re-scan or |
| 167 | /// not. |
| 168 | /// |
| 169 | /// Unlike other things that impact effectiveness, this is a fleeting |
| 170 | /// condition. That is, a prefilter can be considered ineffective if it is |
| 171 | /// at a position before `last_scan_at`, but can become effective again |
| 172 | /// once the search moves past `last_scan_at`. |
| 173 | /// |
| 174 | /// The utility of this is to both avoid additional overhead from calling |
| 175 | /// the prefilter and to avoid quadratic behavior. This ensures that a |
| 176 | /// prefilter will scan any particular byte at most once. (Note that some |
| 177 | /// prefilters, like the start-byte prefilter, do not need to use this |
| 178 | /// field at all, since it only looks for starting bytes.) |
| 179 | last_scan_at: usize, |
| 180 | } |
| 181 | |
| 182 | impl State { |
| 183 | /// The minimum number of skip attempts to try before considering whether |
| 184 | /// a prefilter is effective or not. |
| 185 | const MIN_SKIPS: usize = 40; |
| 186 | |
| 187 | /// The minimum amount of bytes that skipping must average. |
| 188 | /// |
| 189 | /// That is, after MIN_SKIPS have occurred, if the average number of bytes |
| 190 | /// skipped ever falls below MIN_AVG_SKIP, then the prefilter will be |
| 191 | /// rendered inert. |
| 192 | const MIN_AVG_SKIP: usize = 16; |
| 193 | |
| 194 | /// Create a fresh prefilter state. |
| 195 | pub fn new() -> State { |
| 196 | State { skips: 0, skipped: 0, inert: false, last_scan_at: 0 } |
| 197 | } |
| 198 | |
| 199 | /// Updates the position at which the last scan stopped. This may be |
| 200 | /// greater than the position of the last candidate reported. For example, |
| 201 | /// searching for the byte `z` in `abczdef` for the pattern `abcz` will |
| 202 | /// report a candidate at position `0`, but the end of its last scan will |
| 203 | /// be at position `3`. |
| 204 | /// |
| 205 | /// This position factors into the effectiveness of this prefilter. If the |
| 206 | /// current position is less than the last position at which a scan ended, |
| 207 | /// then the prefilter should not be re-run until the search moves past |
| 208 | /// that position. |
| 209 | /// |
| 210 | /// It is always correct to never update the last scan position. In fact, |
| 211 | /// it is also always correct to set the last scan position to an arbitrary |
| 212 | /// value. The key is setting it to a position in the future at which it |
| 213 | /// makes sense to restart the prefilter. |
| 214 | pub fn update_last_scan(&mut self, at: usize) { |
| 215 | if at > self.last_scan_at { |
| 216 | self.last_scan_at = at; |
| 217 | } |
| 218 | } |
| 219 | |
| 220 | /// Return true if and only if this state indicates that a prefilter is |
| 221 | /// still effective. If the prefilter is not effective, then this state |
| 222 | /// is rendered "inert." At which point, all subsequent calls to |
| 223 | /// `is_effective` on this state will return `false`. |
| 224 | /// |
| 225 | /// `at` should correspond to the current starting position of the search. |
| 226 | /// |
| 227 | /// Callers typically do not need to use this, as it represents the |
| 228 | /// default implementation of |
| 229 | /// [`Prefilter::is_effective`](trait.Prefilter.html#tymethod.is_effective). |
| 230 | fn is_effective(&mut self, at: usize) -> bool { |
| 231 | if self.inert { |
| 232 | return false; |
| 233 | } |
| 234 | if at < self.last_scan_at { |
| 235 | return false; |
| 236 | } |
| 237 | if self.skips < State::MIN_SKIPS { |
| 238 | return true; |
| 239 | } |
| 240 | |
| 241 | if self.skipped >= State::MIN_AVG_SKIP * self.skips { |
| 242 | return true; |
| 243 | } |
| 244 | |
| 245 | // We're inert. |
| 246 | self.inert = true; |
| 247 | false |
| 248 | } |
| 249 | |
| 250 | /// Update this state with the number of bytes skipped on the last |
| 251 | /// invocation of the prefilter. |
| 252 | fn update_skipped_bytes(&mut self, skipped: usize) { |
| 253 | self.skips += 1; |
| 254 | self.skipped += skipped; |
| 255 | } |
| 256 | } |
| 257 | |
| 258 | /// A `Prefilter` implementation that reports a possible match at every |
| 259 | /// position. |
| 260 | /// |
| 261 | /// This should generally not be used as an actual prefilter. It is only |
| 262 | /// useful when one needs to represent the absence of a prefilter in a generic |
| 263 | /// context. For example, a [`dfa::regex::Regex`](crate::dfa::regex::Regex) |
| 264 | /// uses this prefilter by default to indicate that no prefilter should be |
| 265 | /// used. |
| 266 | /// |
| 267 | /// A `None` prefilter value cannot be constructed. |
| 268 | #[derive (Clone, Debug)] |
| 269 | pub struct None { |
| 270 | _priv: (), |
| 271 | } |
| 272 | |
| 273 | impl Prefilter for None { |
| 274 | fn next_candidate(&self, _: &mut State, _: &[u8], at: usize) -> Candidate { |
| 275 | Candidate::PossibleStartOfMatch(at) |
| 276 | } |
| 277 | |
| 278 | fn heap_bytes(&self) -> usize { |
| 279 | 0 |
| 280 | } |
| 281 | } |
| 282 | |