1 | use std::cmp; |
2 | use std::fmt; |
3 | use std::panic::{RefUnwindSafe, UnwindSafe}; |
4 | use std::u8; |
5 | |
6 | use memchr::{memchr, memchr2, memchr3}; |
7 | |
8 | use crate::ahocorasick::MatchKind; |
9 | use crate::packed; |
10 | use crate::Match; |
11 | |
12 | /// A candidate is the result of running a prefilter on a haystack at a |
13 | /// particular position. The result is either no match, a confirmed match or |
14 | /// a possible match. |
15 | /// |
16 | /// When no match is returned, the prefilter is guaranteeing that no possible |
17 | /// match can be found in the haystack, and the caller may trust this. That is, |
18 | /// all correct prefilters must never report false negatives. |
19 | /// |
20 | /// In some cases, a prefilter can confirm a match very quickly, in which case, |
21 | /// the caller may use this to stop what it's doing and report the match. In |
22 | /// this case, prefilter implementations must never report a false positive. |
23 | /// In other cases, the prefilter can only report a potential match, in which |
24 | /// case the callers must attempt to confirm the match. In this case, prefilter |
25 | /// implementations are permitted to return false positives. |
26 | #[derive (Clone, Debug)] |
27 | pub enum Candidate { |
28 | None, |
29 | Match(Match), |
30 | PossibleStartOfMatch(usize), |
31 | } |
32 | |
33 | impl Candidate { |
34 | /// Convert this candidate into an option. This is useful when callers |
35 | /// do not distinguish between true positives and false positives (i.e., |
36 | /// the caller must always confirm the match in order to update some other |
37 | /// state). |
38 | pub fn into_option(self) -> Option<usize> { |
39 | match self { |
40 | Candidate::None => None, |
41 | Candidate::Match(ref m: &Match) => Some(m.start()), |
42 | Candidate::PossibleStartOfMatch(start: usize) => Some(start), |
43 | } |
44 | } |
45 | } |
46 | |
47 | /// A prefilter describes the behavior of fast literal scanners for quickly |
48 | /// skipping past bytes in the haystack that we know cannot possibly |
49 | /// participate in a match. |
50 | pub trait Prefilter: |
51 | Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug |
52 | { |
53 | /// Returns the next possible match candidate. This may yield false |
54 | /// positives, so callers must confirm a match starting at the position |
55 | /// returned. This, however, must never produce false negatives. That is, |
56 | /// this must, at minimum, return the starting position of the next match |
57 | /// in the given haystack after or at the given position. |
58 | fn next_candidate( |
59 | &self, |
60 | state: &mut PrefilterState, |
61 | haystack: &[u8], |
62 | at: usize, |
63 | ) -> Candidate; |
64 | |
65 | /// A method for cloning a prefilter, to work-around the fact that Clone |
66 | /// is not object-safe. |
67 | fn clone_prefilter(&self) -> Box<dyn Prefilter>; |
68 | |
69 | /// Returns the approximate total amount of heap used by this prefilter, in |
70 | /// units of bytes. |
71 | fn heap_bytes(&self) -> usize; |
72 | |
73 | /// Returns true if and only if this prefilter never returns false |
74 | /// positives. This is useful for completely avoiding the automaton |
75 | /// when the prefilter can quickly confirm its own matches. |
76 | /// |
77 | /// By default, this returns true, which is conservative; it is always |
78 | /// correct to return `true`. Returning `false` here and reporting a false |
79 | /// positive will result in incorrect searches. |
80 | fn reports_false_positives(&self) -> bool { |
81 | true |
82 | } |
83 | |
84 | /// Returns true if and only if this prefilter may look for a non-starting |
85 | /// position of a match. |
86 | /// |
87 | /// This is useful in a streaming context where prefilters that don't look |
88 | /// for a starting position of a match can be quite difficult to deal with. |
89 | /// |
90 | /// This returns false by default. |
91 | fn looks_for_non_start_of_match(&self) -> bool { |
92 | false |
93 | } |
94 | } |
95 | |
96 | impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P { |
97 | #[inline ] |
98 | fn next_candidate( |
99 | &self, |
100 | state: &mut PrefilterState, |
101 | haystack: &[u8], |
102 | at: usize, |
103 | ) -> Candidate { |
104 | (**self).next_candidate(state, haystack, at) |
105 | } |
106 | |
107 | fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
108 | (**self).clone_prefilter() |
109 | } |
110 | |
111 | fn heap_bytes(&self) -> usize { |
112 | (**self).heap_bytes() |
113 | } |
114 | |
115 | fn reports_false_positives(&self) -> bool { |
116 | (**self).reports_false_positives() |
117 | } |
118 | } |
119 | |
120 | /// A convenience object for representing any type that implements Prefilter |
121 | /// and is cloneable. |
122 | #[derive (Debug)] |
123 | pub struct PrefilterObj(Box<dyn Prefilter>); |
124 | |
125 | impl Clone for PrefilterObj { |
126 | fn clone(&self) -> Self { |
127 | PrefilterObj(self.0.clone_prefilter()) |
128 | } |
129 | } |
130 | |
131 | impl PrefilterObj { |
132 | /// Create a new prefilter object. |
133 | pub fn new<T: Prefilter + 'static>(t: T) -> PrefilterObj { |
134 | PrefilterObj(Box::new(t)) |
135 | } |
136 | |
137 | /// Return the underlying prefilter trait object. |
138 | pub fn as_ref(&self) -> &dyn Prefilter { |
139 | &*self.0 |
140 | } |
141 | } |
142 | |
143 | /// PrefilterState tracks state associated with the effectiveness of a |
144 | /// prefilter. It is used to track how many bytes, on average, are skipped by |
145 | /// the prefilter. If this average dips below a certain threshold over time, |
146 | /// then the state renders the prefilter inert and stops using it. |
147 | /// |
148 | /// A prefilter state should be created for each search. (Where creating an |
149 | /// iterator via, e.g., `find_iter`, is treated as a single search.) |
150 | #[derive (Clone, Debug)] |
151 | pub struct PrefilterState { |
152 | /// The number of skips that has been executed. |
153 | skips: usize, |
154 | /// The total number of bytes that have been skipped. |
155 | skipped: usize, |
156 | /// The maximum length of a match. This is used to help determine how many |
157 | /// bytes on average should be skipped in order for a prefilter to be |
158 | /// effective. |
159 | max_match_len: usize, |
160 | /// Once this heuristic has been deemed permanently ineffective, it will be |
161 | /// inert throughout the rest of its lifetime. This serves as a cheap way |
162 | /// to check inertness. |
163 | inert: bool, |
164 | /// The last (absolute) position at which a prefilter scanned to. |
165 | /// Prefilters can use this position to determine whether to re-scan or |
166 | /// not. |
167 | /// |
168 | /// Unlike other things that impact effectiveness, this is a fleeting |
169 | /// condition. That is, a prefilter can be considered ineffective if it is |
170 | /// at a position before `last_scan_at`, but can become effective again |
171 | /// once the search moves past `last_scan_at`. |
172 | /// |
173 | /// The utility of this is to both avoid additional overhead from calling |
174 | /// the prefilter and to avoid quadratic behavior. This ensures that a |
175 | /// prefilter will scan any particular byte at most once. (Note that some |
176 | /// prefilters, like the start-byte prefilter, do not need to use this |
177 | /// field at all, since it only looks for starting bytes.) |
178 | last_scan_at: usize, |
179 | } |
180 | |
181 | impl PrefilterState { |
182 | /// The minimum number of skip attempts to try before considering whether |
183 | /// a prefilter is effective or not. |
184 | const MIN_SKIPS: usize = 40; |
185 | |
186 | /// The minimum amount of bytes that skipping must average, expressed as a |
187 | /// factor of the multiple of the length of a possible match. |
188 | /// |
189 | /// That is, after MIN_SKIPS have occurred, if the average number of bytes |
190 | /// skipped ever falls below MIN_AVG_FACTOR * max-match-length, then the |
191 | /// prefilter outed to be rendered inert. |
192 | const MIN_AVG_FACTOR: usize = 2; |
193 | |
194 | /// Create a fresh prefilter state. |
195 | pub fn new(max_match_len: usize) -> PrefilterState { |
196 | PrefilterState { |
197 | skips: 0, |
198 | skipped: 0, |
199 | max_match_len, |
200 | inert: false, |
201 | last_scan_at: 0, |
202 | } |
203 | } |
204 | |
205 | /// Create a prefilter state that always disables the prefilter. |
206 | pub fn disabled() -> PrefilterState { |
207 | PrefilterState { |
208 | skips: 0, |
209 | skipped: 0, |
210 | max_match_len: 0, |
211 | inert: true, |
212 | last_scan_at: 0, |
213 | } |
214 | } |
215 | |
216 | /// Update this state with the number of bytes skipped on the last |
217 | /// invocation of the prefilter. |
218 | #[inline ] |
219 | fn update_skipped_bytes(&mut self, skipped: usize) { |
220 | self.skips += 1; |
221 | self.skipped += skipped; |
222 | } |
223 | |
224 | /// Updates the position at which the last scan stopped. This may be |
225 | /// greater than the position of the last candidate reported. For example, |
226 | /// searching for the "rare" byte `z` in `abczdef` for the pattern `abcz` |
227 | /// will report a candidate at position `0`, but the end of its last scan |
228 | /// will be at position `3`. |
229 | /// |
230 | /// This position factors into the effectiveness of this prefilter. If the |
231 | /// current position is less than the last position at which a scan ended, |
232 | /// then the prefilter should not be re-run until the search moves past |
233 | /// that position. |
234 | #[inline ] |
235 | fn update_at(&mut self, at: usize) { |
236 | if at > self.last_scan_at { |
237 | self.last_scan_at = at; |
238 | } |
239 | } |
240 | |
241 | /// Return true if and only if this state indicates that a prefilter is |
242 | /// still effective. |
243 | /// |
244 | /// The given pos should correspond to the current starting position of the |
245 | /// search. |
246 | #[inline ] |
247 | pub fn is_effective(&mut self, at: usize) -> bool { |
248 | if self.inert { |
249 | return false; |
250 | } |
251 | if at < self.last_scan_at { |
252 | return false; |
253 | } |
254 | if self.skips < PrefilterState::MIN_SKIPS { |
255 | return true; |
256 | } |
257 | |
258 | let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len; |
259 | if self.skipped >= min_avg * self.skips { |
260 | return true; |
261 | } |
262 | |
263 | // We're inert. |
264 | self.inert = true; |
265 | false |
266 | } |
267 | } |
268 | |
269 | /// A builder for constructing the best possible prefilter. When constructed, |
270 | /// this builder will heuristically select the best prefilter it can build, |
271 | /// if any, and discard the rest. |
272 | #[derive (Debug)] |
273 | pub struct Builder { |
274 | count: usize, |
275 | ascii_case_insensitive: bool, |
276 | start_bytes: StartBytesBuilder, |
277 | rare_bytes: RareBytesBuilder, |
278 | packed: Option<packed::Builder>, |
279 | } |
280 | |
281 | impl Builder { |
282 | /// Create a new builder for constructing the best possible prefilter. |
283 | pub fn new(kind: MatchKind) -> Builder { |
284 | let pbuilder = kind |
285 | .as_packed() |
286 | .map(|kind| packed::Config::new().match_kind(kind).builder()); |
287 | Builder { |
288 | count: 0, |
289 | ascii_case_insensitive: false, |
290 | start_bytes: StartBytesBuilder::new(), |
291 | rare_bytes: RareBytesBuilder::new(), |
292 | packed: pbuilder, |
293 | } |
294 | } |
295 | |
296 | /// Enable ASCII case insensitivity. When set, byte strings added to this |
297 | /// builder will be interpreted without respect to ASCII case. |
298 | pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder { |
299 | self.ascii_case_insensitive = yes; |
300 | self.start_bytes = self.start_bytes.ascii_case_insensitive(yes); |
301 | self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes); |
302 | self |
303 | } |
304 | |
305 | /// Return a prefilter suitable for quickly finding potential matches. |
306 | /// |
307 | /// All patterns added to an Aho-Corasick automaton should be added to this |
308 | /// builder before attempting to construct the prefilter. |
309 | pub fn build(&self) -> Option<PrefilterObj> { |
310 | // match (self.start_bytes.build(), self.rare_bytes.build()) { |
311 | match (self.start_bytes.build(), self.rare_bytes.build()) { |
312 | // If we could build both start and rare prefilters, then there are |
313 | // a few cases in which we'd want to use the start-byte prefilter |
314 | // over the rare-byte prefilter, since the former has lower |
315 | // overhead. |
316 | (prestart @ Some(_), prerare @ Some(_)) => { |
317 | // If the start-byte prefilter can scan for a smaller number |
318 | // of bytes than the rare-byte prefilter, then it's probably |
319 | // faster. |
320 | let has_fewer_bytes = |
321 | self.start_bytes.count < self.rare_bytes.count; |
322 | // Otherwise, if the combined frequency rank of the detected |
323 | // bytes in the start-byte prefilter is "close" to the combined |
324 | // frequency rank of the rare-byte prefilter, then we pick |
325 | // the start-byte prefilter even if the rare-byte prefilter |
326 | // heuristically searches for rare bytes. This is because the |
327 | // rare-byte prefilter has higher constant costs, so we tend to |
328 | // prefer the start-byte prefilter when we can. |
329 | let has_rarer_bytes = |
330 | self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50; |
331 | if has_fewer_bytes || has_rarer_bytes { |
332 | prestart |
333 | } else { |
334 | prerare |
335 | } |
336 | } |
337 | (prestart @ Some(_), None) => prestart, |
338 | (None, prerare @ Some(_)) => prerare, |
339 | (None, None) if self.ascii_case_insensitive => None, |
340 | (None, None) => self |
341 | .packed |
342 | .as_ref() |
343 | .and_then(|b| b.build()) |
344 | .map(|s| PrefilterObj::new(Packed(s))), |
345 | } |
346 | } |
347 | |
348 | /// Add a literal string to this prefilter builder. |
349 | pub fn add(&mut self, bytes: &[u8]) { |
350 | self.count += 1; |
351 | self.start_bytes.add(bytes); |
352 | self.rare_bytes.add(bytes); |
353 | if let Some(ref mut pbuilder) = self.packed { |
354 | pbuilder.add(bytes); |
355 | } |
356 | } |
357 | } |
358 | |
359 | /// A type that wraps a packed searcher and implements the `Prefilter` |
360 | /// interface. |
361 | #[derive (Clone, Debug)] |
362 | struct Packed(packed::Searcher); |
363 | |
364 | impl Prefilter for Packed { |
365 | fn next_candidate( |
366 | &self, |
367 | _state: &mut PrefilterState, |
368 | haystack: &[u8], |
369 | at: usize, |
370 | ) -> Candidate { |
371 | self.0.find_at(haystack, at).map_or(default:Candidate::None, f:Candidate::Match) |
372 | } |
373 | |
374 | fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
375 | Box::new(self.clone()) |
376 | } |
377 | |
378 | fn heap_bytes(&self) -> usize { |
379 | self.0.heap_bytes() |
380 | } |
381 | |
382 | fn reports_false_positives(&self) -> bool { |
383 | false |
384 | } |
385 | } |
386 | |
387 | /// A builder for constructing a rare byte prefilter. |
388 | /// |
389 | /// A rare byte prefilter attempts to pick out a small set of rare bytes that |
390 | /// occurr in the patterns, and then quickly scan to matches of those rare |
391 | /// bytes. |
392 | #[derive (Clone, Debug)] |
393 | struct RareBytesBuilder { |
394 | /// Whether this prefilter should account for ASCII case insensitivity or |
395 | /// not. |
396 | ascii_case_insensitive: bool, |
397 | /// A set of rare bytes, indexed by byte value. |
398 | rare_set: ByteSet, |
399 | /// A set of byte offsets associated with bytes in a pattern. An entry |
400 | /// corresponds to a particular bytes (its index) and is only non-zero if |
401 | /// the byte occurred at an offset greater than 0 in at least one pattern. |
402 | /// |
403 | /// If a byte's offset is not representable in 8 bits, then the rare bytes |
404 | /// prefilter becomes inert. |
405 | byte_offsets: RareByteOffsets, |
406 | /// Whether this is available as a prefilter or not. This can be set to |
407 | /// false during construction if a condition is seen that invalidates the |
408 | /// use of the rare-byte prefilter. |
409 | available: bool, |
410 | /// The number of bytes set to an active value in `byte_offsets`. |
411 | count: usize, |
412 | /// The sum of frequency ranks for the rare bytes detected. This is |
413 | /// intended to give a heuristic notion of how rare the bytes are. |
414 | rank_sum: u16, |
415 | } |
416 | |
417 | /// A set of bytes. |
418 | #[derive (Clone, Copy)] |
419 | struct ByteSet([bool; 256]); |
420 | |
421 | impl ByteSet { |
422 | fn empty() -> ByteSet { |
423 | ByteSet([false; 256]) |
424 | } |
425 | |
426 | fn insert(&mut self, b: u8) -> bool { |
427 | let new: bool = !self.contains(b); |
428 | self.0[b as usize] = true; |
429 | new |
430 | } |
431 | |
432 | fn contains(&self, b: u8) -> bool { |
433 | self.0[b as usize] |
434 | } |
435 | } |
436 | |
437 | impl fmt::Debug for ByteSet { |
438 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
439 | let mut bytes: Vec = vec![]; |
440 | for b: u8 in 0..=255 { |
441 | if self.contains(b) { |
442 | bytes.push(b); |
443 | } |
444 | } |
445 | f.debug_struct("ByteSet" ).field(name:"set" , &bytes).finish() |
446 | } |
447 | } |
448 | |
449 | /// A set of byte offsets, keyed by byte. |
450 | #[derive (Clone, Copy)] |
451 | struct RareByteOffsets { |
452 | /// Each entry corresponds to the maximum offset of the corresponding |
453 | /// byte across all patterns seen. |
454 | set: [RareByteOffset; 256], |
455 | } |
456 | |
457 | impl RareByteOffsets { |
458 | /// Create a new empty set of rare byte offsets. |
459 | pub fn empty() -> RareByteOffsets { |
460 | RareByteOffsets { set: [RareByteOffset::default(); 256] } |
461 | } |
462 | |
463 | /// Add the given offset for the given byte to this set. If the offset is |
464 | /// greater than the existing offset, then it overwrites the previous |
465 | /// value and returns false. If there is no previous value set, then this |
466 | /// sets it and returns true. |
467 | pub fn set(&mut self, byte: u8, off: RareByteOffset) { |
468 | self.set[byte as usize].max = |
469 | cmp::max(self.set[byte as usize].max, v2:off.max); |
470 | } |
471 | } |
472 | |
473 | impl fmt::Debug for RareByteOffsets { |
474 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
475 | let mut offsets: Vec<&RareByteOffset> = vec![]; |
476 | for off: &RareByteOffset in self.set.iter() { |
477 | if off.max > 0 { |
478 | offsets.push(off); |
479 | } |
480 | } |
481 | f.debug_struct("RareByteOffsets" ).field(name:"set" , &offsets).finish() |
482 | } |
483 | } |
484 | |
485 | /// Offsets associated with an occurrence of a "rare" byte in any of the |
486 | /// patterns used to construct a single Aho-Corasick automaton. |
487 | #[derive (Clone, Copy, Debug)] |
488 | struct RareByteOffset { |
489 | /// The maximum offset at which a particular byte occurs from the start |
490 | /// of any pattern. This is used as a shift amount. That is, when an |
491 | /// occurrence of this byte is found, the candidate position reported by |
492 | /// the prefilter is `position_of_byte - max`, such that the automaton |
493 | /// will begin its search at a position that is guaranteed to observe a |
494 | /// match. |
495 | /// |
496 | /// To avoid accidentally quadratic behavior, a prefilter is considered |
497 | /// ineffective when it is asked to start scanning from a position that it |
498 | /// has already scanned past. |
499 | /// |
500 | /// Using a `u8` here means that if we ever see a pattern that's longer |
501 | /// than 255 bytes, then the entire rare byte prefilter is disabled. |
502 | max: u8, |
503 | } |
504 | |
505 | impl Default for RareByteOffset { |
506 | fn default() -> RareByteOffset { |
507 | RareByteOffset { max: 0 } |
508 | } |
509 | } |
510 | |
511 | impl RareByteOffset { |
512 | /// Create a new rare byte offset. If the given offset is too big, then |
513 | /// None is returned. In that case, callers should render the rare bytes |
514 | /// prefilter inert. |
515 | fn new(max: usize) -> Option<RareByteOffset> { |
516 | if max > u8::MAX as usize { |
517 | None |
518 | } else { |
519 | Some(RareByteOffset { max: max as u8 }) |
520 | } |
521 | } |
522 | } |
523 | |
524 | impl RareBytesBuilder { |
525 | /// Create a new builder for constructing a rare byte prefilter. |
526 | fn new() -> RareBytesBuilder { |
527 | RareBytesBuilder { |
528 | ascii_case_insensitive: false, |
529 | rare_set: ByteSet::empty(), |
530 | byte_offsets: RareByteOffsets::empty(), |
531 | available: true, |
532 | count: 0, |
533 | rank_sum: 0, |
534 | } |
535 | } |
536 | |
537 | /// Enable ASCII case insensitivity. When set, byte strings added to this |
538 | /// builder will be interpreted without respect to ASCII case. |
539 | fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder { |
540 | self.ascii_case_insensitive = yes; |
541 | self |
542 | } |
543 | |
544 | /// Build the rare bytes prefilter. |
545 | /// |
546 | /// If there are more than 3 distinct starting bytes, or if heuristics |
547 | /// otherwise determine that this prefilter should not be used, then `None` |
548 | /// is returned. |
549 | fn build(&self) -> Option<PrefilterObj> { |
550 | if !self.available || self.count > 3 { |
551 | return None; |
552 | } |
553 | let (mut bytes, mut len) = ([0; 3], 0); |
554 | for b in 0..=255 { |
555 | if self.rare_set.contains(b) { |
556 | bytes[len] = b as u8; |
557 | len += 1; |
558 | } |
559 | } |
560 | match len { |
561 | 0 => None, |
562 | 1 => Some(PrefilterObj::new(RareBytesOne { |
563 | byte1: bytes[0], |
564 | offset: self.byte_offsets.set[bytes[0] as usize], |
565 | })), |
566 | 2 => Some(PrefilterObj::new(RareBytesTwo { |
567 | offsets: self.byte_offsets, |
568 | byte1: bytes[0], |
569 | byte2: bytes[1], |
570 | })), |
571 | 3 => Some(PrefilterObj::new(RareBytesThree { |
572 | offsets: self.byte_offsets, |
573 | byte1: bytes[0], |
574 | byte2: bytes[1], |
575 | byte3: bytes[2], |
576 | })), |
577 | _ => unreachable!(), |
578 | } |
579 | } |
580 | |
581 | /// Add a byte string to this builder. |
582 | /// |
583 | /// All patterns added to an Aho-Corasick automaton should be added to this |
584 | /// builder before attempting to construct the prefilter. |
585 | fn add(&mut self, bytes: &[u8]) { |
586 | // If we've already given up, then do nothing. |
587 | if !self.available { |
588 | return; |
589 | } |
590 | // If we've already blown our budget, then don't waste time looking |
591 | // for more rare bytes. |
592 | if self.count > 3 { |
593 | self.available = false; |
594 | return; |
595 | } |
596 | // If the pattern is too long, then our offset table is bunk, so |
597 | // give up. |
598 | if bytes.len() >= 256 { |
599 | self.available = false; |
600 | return; |
601 | } |
602 | let mut rarest = match bytes.get(0) { |
603 | None => return, |
604 | Some(&b) => (b, freq_rank(b)), |
605 | }; |
606 | // The idea here is to look for the rarest byte in each pattern, and |
607 | // add that to our set. As a special exception, if we see a byte that |
608 | // we've already added, then we immediately stop and choose that byte, |
609 | // even if there's another rare byte in the pattern. This helps us |
610 | // apply the rare byte optimization in more cases by attempting to pick |
611 | // bytes that are in common between patterns. So for example, if we |
612 | // were searching for `Sherlock` and `lockjaw`, then this would pick |
613 | // `k` for both patterns, resulting in the use of `memchr` instead of |
614 | // `memchr2` for `k` and `j`. |
615 | let mut found = false; |
616 | for (pos, &b) in bytes.iter().enumerate() { |
617 | self.set_offset(pos, b); |
618 | if found { |
619 | continue; |
620 | } |
621 | if self.rare_set.contains(b) { |
622 | found = true; |
623 | continue; |
624 | } |
625 | let rank = freq_rank(b); |
626 | if rank < rarest.1 { |
627 | rarest = (b, rank); |
628 | } |
629 | } |
630 | if !found { |
631 | self.add_rare_byte(rarest.0); |
632 | } |
633 | } |
634 | |
635 | fn set_offset(&mut self, pos: usize, byte: u8) { |
636 | // This unwrap is OK because pos is never bigger than our max. |
637 | let offset = RareByteOffset::new(pos).unwrap(); |
638 | self.byte_offsets.set(byte, offset); |
639 | if self.ascii_case_insensitive { |
640 | self.byte_offsets.set(opposite_ascii_case(byte), offset); |
641 | } |
642 | } |
643 | |
644 | fn add_rare_byte(&mut self, byte: u8) { |
645 | self.add_one_rare_byte(byte); |
646 | if self.ascii_case_insensitive { |
647 | self.add_one_rare_byte(opposite_ascii_case(byte)); |
648 | } |
649 | } |
650 | |
651 | fn add_one_rare_byte(&mut self, byte: u8) { |
652 | if self.rare_set.insert(byte) { |
653 | self.count += 1; |
654 | self.rank_sum += freq_rank(byte) as u16; |
655 | } |
656 | } |
657 | } |
658 | |
659 | /// A prefilter for scanning for a single "rare" byte. |
660 | #[derive (Clone, Debug)] |
661 | struct RareBytesOne { |
662 | byte1: u8, |
663 | offset: RareByteOffset, |
664 | } |
665 | |
666 | impl Prefilter for RareBytesOne { |
667 | fn next_candidate( |
668 | &self, |
669 | state: &mut PrefilterState, |
670 | haystack: &[u8], |
671 | at: usize, |
672 | ) -> Candidate { |
673 | memchr(self.byte1, &haystack[at..]) |
674 | .map(|i| { |
675 | let pos = at + i; |
676 | state.last_scan_at = pos; |
677 | cmp::max(at, pos.saturating_sub(self.offset.max as usize)) |
678 | }) |
679 | .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
680 | } |
681 | |
682 | fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
683 | Box::new(self.clone()) |
684 | } |
685 | |
686 | fn heap_bytes(&self) -> usize { |
687 | 0 |
688 | } |
689 | |
690 | fn looks_for_non_start_of_match(&self) -> bool { |
691 | // TODO: It should be possible to use a rare byte prefilter in a |
692 | // streaming context. The main problem is that we usually assume that |
693 | // if a prefilter has scanned some text and not found anything, then no |
694 | // match *starts* in that text. This doesn't matter in non-streaming |
695 | // contexts, but in a streaming context, if we're looking for a byte |
696 | // that doesn't start at the beginning of a match and don't find it, |
697 | // then it's still possible for a match to start at the end of the |
698 | // current buffer content. In order to fix this, the streaming searcher |
699 | // would need to become aware of prefilters that do this and use the |
700 | // appropriate offset in various places. It is quite a delicate change |
701 | // and probably shouldn't be attempted until streaming search has a |
702 | // better testing strategy. In particular, we'd really like to be able |
703 | // to vary the buffer size to force strange cases that occur at the |
704 | // edge of the buffer. If we make the buffer size minimal, then these |
705 | // cases occur more frequently and easier. |
706 | // |
707 | // This is also a bummer because this means that if the prefilter |
708 | // builder chose a rare byte prefilter, then a streaming search won't |
709 | // use any prefilter at all because the builder doesn't know how it's |
710 | // going to be used. Assuming we don't make streaming search aware of |
711 | // these special types of prefilters as described above, we could fix |
712 | // this by building a "backup" prefilter that could be used when the |
713 | // rare byte prefilter could not. But that's a bandaide. Sigh. |
714 | true |
715 | } |
716 | } |
717 | |
718 | /// A prefilter for scanning for two "rare" bytes. |
719 | #[derive (Clone, Debug)] |
720 | struct RareBytesTwo { |
721 | offsets: RareByteOffsets, |
722 | byte1: u8, |
723 | byte2: u8, |
724 | } |
725 | |
726 | impl Prefilter for RareBytesTwo { |
727 | fn next_candidate( |
728 | &self, |
729 | state: &mut PrefilterState, |
730 | haystack: &[u8], |
731 | at: usize, |
732 | ) -> Candidate { |
733 | memchr2(self.byte1, self.byte2, &haystack[at..]) |
734 | .map(|i| { |
735 | let pos = at + i; |
736 | state.update_at(pos); |
737 | let offset = self.offsets.set[haystack[pos] as usize].max; |
738 | cmp::max(at, pos.saturating_sub(offset as usize)) |
739 | }) |
740 | .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
741 | } |
742 | |
743 | fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
744 | Box::new(self.clone()) |
745 | } |
746 | |
747 | fn heap_bytes(&self) -> usize { |
748 | 0 |
749 | } |
750 | |
751 | fn looks_for_non_start_of_match(&self) -> bool { |
752 | // TODO: See Prefilter impl for RareBytesOne. |
753 | true |
754 | } |
755 | } |
756 | |
757 | /// A prefilter for scanning for three "rare" bytes. |
758 | #[derive (Clone, Debug)] |
759 | struct RareBytesThree { |
760 | offsets: RareByteOffsets, |
761 | byte1: u8, |
762 | byte2: u8, |
763 | byte3: u8, |
764 | } |
765 | |
766 | impl Prefilter for RareBytesThree { |
767 | fn next_candidate( |
768 | &self, |
769 | state: &mut PrefilterState, |
770 | haystack: &[u8], |
771 | at: usize, |
772 | ) -> Candidate { |
773 | memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..]) |
774 | .map(|i| { |
775 | let pos = at + i; |
776 | state.update_at(pos); |
777 | let offset = self.offsets.set[haystack[pos] as usize].max; |
778 | cmp::max(at, pos.saturating_sub(offset as usize)) |
779 | }) |
780 | .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
781 | } |
782 | |
783 | fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
784 | Box::new(self.clone()) |
785 | } |
786 | |
787 | fn heap_bytes(&self) -> usize { |
788 | 0 |
789 | } |
790 | |
791 | fn looks_for_non_start_of_match(&self) -> bool { |
792 | // TODO: See Prefilter impl for RareBytesOne. |
793 | true |
794 | } |
795 | } |
796 | |
797 | /// A builder for constructing a starting byte prefilter. |
798 | /// |
799 | /// A starting byte prefilter is a simplistic prefilter that looks for possible |
800 | /// matches by reporting all positions corresponding to a particular byte. This |
801 | /// generally only takes affect when there are at most 3 distinct possible |
802 | /// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two |
803 | /// distinct starting bytes (`f` and `b`), and this prefilter returns all |
804 | /// occurrences of either `f` or `b`. |
805 | /// |
806 | /// In some cases, a heuristic frequency analysis may determine that it would |
807 | /// be better not to use this prefilter even when there are 3 or fewer distinct |
808 | /// starting bytes. |
809 | #[derive (Clone, Debug)] |
810 | struct StartBytesBuilder { |
811 | /// Whether this prefilter should account for ASCII case insensitivity or |
812 | /// not. |
813 | ascii_case_insensitive: bool, |
814 | /// The set of starting bytes observed. |
815 | byteset: Vec<bool>, |
816 | /// The number of bytes set to true in `byteset`. |
817 | count: usize, |
818 | /// The sum of frequency ranks for the rare bytes detected. This is |
819 | /// intended to give a heuristic notion of how rare the bytes are. |
820 | rank_sum: u16, |
821 | } |
822 | |
823 | impl StartBytesBuilder { |
824 | /// Create a new builder for constructing a start byte prefilter. |
825 | fn new() -> StartBytesBuilder { |
826 | StartBytesBuilder { |
827 | ascii_case_insensitive: false, |
828 | byteset: vec![false; 256], |
829 | count: 0, |
830 | rank_sum: 0, |
831 | } |
832 | } |
833 | |
834 | /// Enable ASCII case insensitivity. When set, byte strings added to this |
835 | /// builder will be interpreted without respect to ASCII case. |
836 | fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder { |
837 | self.ascii_case_insensitive = yes; |
838 | self |
839 | } |
840 | |
841 | /// Build the starting bytes prefilter. |
842 | /// |
843 | /// If there are more than 3 distinct starting bytes, or if heuristics |
844 | /// otherwise determine that this prefilter should not be used, then `None` |
845 | /// is returned. |
846 | fn build(&self) -> Option<PrefilterObj> { |
847 | if self.count > 3 { |
848 | return None; |
849 | } |
850 | let (mut bytes, mut len) = ([0; 3], 0); |
851 | for b in 0..256 { |
852 | if !self.byteset[b] { |
853 | continue; |
854 | } |
855 | // We don't handle non-ASCII bytes for now. Getting non-ASCII |
856 | // bytes right is trickier, since we generally don't want to put |
857 | // a leading UTF-8 code unit into a prefilter that isn't ASCII, |
858 | // since they can frequently. Instead, it would be better to use a |
859 | // continuation byte, but this requires more sophisticated analysis |
860 | // of the automaton and a richer prefilter API. |
861 | if b > 0x7F { |
862 | return None; |
863 | } |
864 | bytes[len] = b as u8; |
865 | len += 1; |
866 | } |
867 | match len { |
868 | 0 => None, |
869 | 1 => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[0] })), |
870 | 2 => Some(PrefilterObj::new(StartBytesTwo { |
871 | byte1: bytes[0], |
872 | byte2: bytes[1], |
873 | })), |
874 | 3 => Some(PrefilterObj::new(StartBytesThree { |
875 | byte1: bytes[0], |
876 | byte2: bytes[1], |
877 | byte3: bytes[2], |
878 | })), |
879 | _ => unreachable!(), |
880 | } |
881 | } |
882 | |
883 | /// Add a byte string to this builder. |
884 | /// |
885 | /// All patterns added to an Aho-Corasick automaton should be added to this |
886 | /// builder before attempting to construct the prefilter. |
887 | fn add(&mut self, bytes: &[u8]) { |
888 | if self.count > 3 { |
889 | return; |
890 | } |
891 | if let Some(&byte) = bytes.get(0) { |
892 | self.add_one_byte(byte); |
893 | if self.ascii_case_insensitive { |
894 | self.add_one_byte(opposite_ascii_case(byte)); |
895 | } |
896 | } |
897 | } |
898 | |
899 | fn add_one_byte(&mut self, byte: u8) { |
900 | if !self.byteset[byte as usize] { |
901 | self.byteset[byte as usize] = true; |
902 | self.count += 1; |
903 | self.rank_sum += freq_rank(byte) as u16; |
904 | } |
905 | } |
906 | } |
907 | |
908 | /// A prefilter for scanning for a single starting byte. |
909 | #[derive (Clone, Debug)] |
910 | struct StartBytesOne { |
911 | byte1: u8, |
912 | } |
913 | |
914 | impl Prefilter for StartBytesOne { |
915 | fn next_candidate( |
916 | &self, |
917 | _state: &mut PrefilterState, |
918 | haystack: &[u8], |
919 | at: usize, |
920 | ) -> Candidate { |
921 | memchr(self.byte1, &haystack[at..]) |
922 | .map(|i| at + i) |
923 | .map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch) |
924 | } |
925 | |
926 | fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
927 | Box::new(self.clone()) |
928 | } |
929 | |
930 | fn heap_bytes(&self) -> usize { |
931 | 0 |
932 | } |
933 | } |
934 | |
935 | /// A prefilter for scanning for two starting bytes. |
936 | #[derive (Clone, Debug)] |
937 | struct StartBytesTwo { |
938 | byte1: u8, |
939 | byte2: u8, |
940 | } |
941 | |
942 | impl Prefilter for StartBytesTwo { |
943 | fn next_candidate( |
944 | &self, |
945 | _state: &mut PrefilterState, |
946 | haystack: &[u8], |
947 | at: usize, |
948 | ) -> Candidate { |
949 | memchr2(self.byte1, self.byte2, &haystack[at..]) |
950 | .map(|i| at + i) |
951 | .map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch) |
952 | } |
953 | |
954 | fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
955 | Box::new(self.clone()) |
956 | } |
957 | |
958 | fn heap_bytes(&self) -> usize { |
959 | 0 |
960 | } |
961 | } |
962 | |
963 | /// A prefilter for scanning for three starting bytes. |
964 | #[derive (Clone, Debug)] |
965 | struct StartBytesThree { |
966 | byte1: u8, |
967 | byte2: u8, |
968 | byte3: u8, |
969 | } |
970 | |
971 | impl Prefilter for StartBytesThree { |
972 | fn next_candidate( |
973 | &self, |
974 | _state: &mut PrefilterState, |
975 | haystack: &[u8], |
976 | at: usize, |
977 | ) -> Candidate { |
978 | memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..]) |
979 | .map(|i| at + i) |
980 | .map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch) |
981 | } |
982 | |
983 | fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
984 | Box::new(self.clone()) |
985 | } |
986 | |
987 | fn heap_bytes(&self) -> usize { |
988 | 0 |
989 | } |
990 | } |
991 | |
992 | /// Return the next candidate reported by the given prefilter while |
993 | /// simultaneously updating the given prestate. |
994 | /// |
995 | /// The caller is responsible for checking the prestate before deciding whether |
996 | /// to initiate a search. |
997 | #[inline ] |
998 | pub fn next<P: Prefilter>( |
999 | prestate: &mut PrefilterState, |
1000 | prefilter: P, |
1001 | haystack: &[u8], |
1002 | at: usize, |
1003 | ) -> Candidate { |
1004 | let cand: Candidate = prefilter.next_candidate(state:prestate, haystack, at); |
1005 | match cand { |
1006 | Candidate::None => { |
1007 | prestate.update_skipped_bytes(skipped:haystack.len() - at); |
1008 | } |
1009 | Candidate::Match(ref m: &Match) => { |
1010 | prestate.update_skipped_bytes(skipped:m.start() - at); |
1011 | } |
1012 | Candidate::PossibleStartOfMatch(i: usize) => { |
1013 | prestate.update_skipped_bytes(skipped:i - at); |
1014 | } |
1015 | } |
1016 | cand |
1017 | } |
1018 | |
1019 | /// If the given byte is an ASCII letter, then return it in the opposite case. |
1020 | /// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns |
1021 | /// `b'A'`. If a non-ASCII letter is given, then the given byte is returned. |
1022 | pub fn opposite_ascii_case(b: u8) -> u8 { |
1023 | if b'A' <= b && b <= b'Z' { |
1024 | b.to_ascii_lowercase() |
1025 | } else if b'a' <= b && b <= b'z' { |
1026 | b.to_ascii_uppercase() |
1027 | } else { |
1028 | b |
1029 | } |
1030 | } |
1031 | |
1032 | /// Return the frequency rank of the given byte. The higher the rank, the more |
1033 | /// common the byte (heuristically speaking). |
1034 | fn freq_rank(b: u8) -> u8 { |
1035 | use crate::byte_frequencies::BYTE_FREQUENCIES; |
1036 | BYTE_FREQUENCIES[b as usize] |
1037 | } |
1038 | |
1039 | #[cfg (test)] |
1040 | mod tests { |
1041 | use super::*; |
1042 | |
1043 | #[test ] |
1044 | fn scratch() { |
1045 | let mut b = Builder::new(MatchKind::LeftmostFirst); |
1046 | b.add(b"Sherlock" ); |
1047 | b.add(b"locjaw" ); |
1048 | // b.add(b"Sherlock"); |
1049 | // b.add(b"Holmes"); |
1050 | // b.add(b"Watson"); |
1051 | // b.add("Шерлок Холмс".as_bytes()); |
1052 | // b.add("Джон Уотсон".as_bytes()); |
1053 | |
1054 | let s = b.build().unwrap(); |
1055 | println!(" {:?}" , s); |
1056 | } |
1057 | } |
1058 | |