1 | use core::{ |
2 | cmp, |
3 | fmt::Debug, |
4 | panic::{RefUnwindSafe, UnwindSafe}, |
5 | u8, |
6 | }; |
7 | |
8 | use alloc::{sync::Arc, vec, vec::Vec}; |
9 | |
10 | use crate::{ |
11 | packed, |
12 | util::{ |
13 | alphabet::ByteSet, |
14 | search::{Match, MatchKind, Span}, |
15 | }, |
16 | }; |
17 | |
18 | /// A prefilter for accelerating a search. |
19 | /// |
20 | /// This crate uses prefilters in the core search implementations to accelerate |
21 | /// common cases. They typically only apply to cases where there are a small |
/// number of patterns (less than 100 or so), but when they do, throughput can
23 | /// be boosted considerably, perhaps by an order of magnitude. When a prefilter |
24 | /// is active, it is used whenever a search enters an automaton's start state. |
25 | /// |
26 | /// Currently, prefilters cannot be constructed by |
27 | /// callers. A `Prefilter` can only be accessed via the |
28 | /// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter) |
29 | /// method and used to execute a search. In other words, a prefilter can be |
30 | /// used to optimize your own search implementation if necessary, but cannot do |
31 | /// much else. If you have a use case for more APIs, please submit an issue. |
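///
/// # Example
///
/// The following is a rough sketch of how a prefilter might drive a custom
/// search routine. It is illustrative only (and thus marked `ignore`), and
/// assumes the noncontiguous NFA from this crate is used as the automaton:
///
/// ```ignore
/// use aho_corasick::{automaton::Automaton, nfa::noncontiguous::NFA, Span};
///
/// let nfa = NFA::new(&["samwise", "sam"]).unwrap();
/// let haystack = b"a long string with samwise in it";
/// if let Some(pre) = nfa.prefilter() {
///     // Ask the prefilter for a candidate anywhere in the haystack. The
///     // candidate must still be confirmed by the automaton unless it is
///     // `Candidate::Match`.
///     let candidate = pre.find_in(haystack, Span::from(0..haystack.len()));
///     println!("{:?}", candidate);
/// }
/// ```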
32 | #[derive(Clone, Debug)] |
33 | pub struct Prefilter { |
34 | finder: Arc<dyn PrefilterI>, |
35 | memory_usage: usize, |
36 | } |
37 | |
38 | impl Prefilter { |
39 | /// Execute a search in the haystack within the span given. If a match or |
40 | /// a possible match is returned, then it is guaranteed to occur within |
41 | /// the bounds of the span. |
42 | /// |
43 | /// If the span provided is invalid for the given haystack, then behavior |
44 | /// is unspecified. |
    #[inline]
46 | pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { |
47 | self.finder.find_in(haystack, span) |
48 | } |
49 | |
    #[inline]
51 | pub(crate) fn memory_usage(&self) -> usize { |
52 | self.memory_usage |
53 | } |
54 | } |
55 | |
56 | /// A candidate is the result of running a prefilter on a haystack at a |
57 | /// particular position. |
58 | /// |
59 | /// The result is either no match, a confirmed match or a possible match. |
60 | /// |
61 | /// When no match is returned, the prefilter is guaranteeing that no possible |
62 | /// match can be found in the haystack, and the caller may trust this. That is, |
63 | /// all correct prefilters must never report false negatives. |
64 | /// |
65 | /// In some cases, a prefilter can confirm a match very quickly, in which case, |
66 | /// the caller may use this to stop what it's doing and report the match. In |
67 | /// this case, prefilter implementations must never report a false positive. |
68 | /// In other cases, the prefilter can only report a potential match, in which |
69 | /// case the callers must attempt to confirm the match. In this case, prefilter |
70 | /// implementations are permitted to return false positives. |
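///
/// # Example
///
/// A rough sketch of how a caller might react to each variant. This is
/// illustrative only (and thus marked `ignore`); it assumes `pre` is a
/// [`Prefilter`], `span` covers the unsearched portion of `haystack` and the
/// surrounding function returns an `Option<Match>`:
///
/// ```ignore
/// let next_candidate = match pre.find_in(haystack, span) {
///     // No match is possible anywhere in `span`, so the search can stop.
///     Candidate::None => return None,
///     // The prefilter confirmed a match itself, so report it as-is.
///     Candidate::Match(m) => return Some(m),
///     // Only a candidate position: the automaton must be resumed at `at`
///     // in order to confirm or reject the match.
///     Candidate::PossibleStartOfMatch(at) => at,
/// };
/// ```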
71 | #[derive(Clone, Debug)] |
72 | pub enum Candidate { |
73 | /// No match was found. Since false negatives are not possible, this means |
74 | /// the search can quit as it is guaranteed not to find another match. |
75 | None, |
76 | /// A confirmed match was found. Callers do not need to confirm it. |
77 | Match(Match), |
78 | /// The start of a possible match was found. Callers must confirm it before |
79 | /// reporting it as a match. |
80 | PossibleStartOfMatch(usize), |
81 | } |
82 | |
83 | impl Candidate { |
84 | /// Convert this candidate into an option. This is useful when callers |
85 | /// do not distinguish between true positives and false positives (i.e., |
86 | /// the caller must always confirm the match). |
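    ///
    /// For example (illustrative only; not compiled as a doctest):
    ///
    /// ```ignore
    /// use aho_corasick::automaton::Candidate;
    ///
    /// assert_eq!(None, Candidate::None.into_option());
    /// assert_eq!(Some(5), Candidate::PossibleStartOfMatch(5).into_option());
    /// ```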
87 | pub fn into_option(self) -> Option<usize> { |
88 | match self { |
89 | Candidate::None => None, |
90 | Candidate::Match(ref m) => Some(m.start()), |
91 | Candidate::PossibleStartOfMatch(start) => Some(start), |
92 | } |
93 | } |
94 | } |
95 | |
96 | /// A prefilter describes the behavior of fast literal scanners for quickly |
97 | /// skipping past bytes in the haystack that we know cannot possibly |
98 | /// participate in a match. |
99 | trait PrefilterI: |
100 | Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static |
101 | { |
102 | /// Returns the next possible match candidate. This may yield false |
103 | /// positives, so callers must confirm a match starting at the position |
104 | /// returned. This, however, must never produce false negatives. That is, |
    /// if a match exists at or after the given position in the haystack, then
    /// the position returned must be no greater than the start of that match.
107 | fn find_in(&self, haystack: &[u8], span: Span) -> Candidate; |
108 | } |
109 | |
110 | impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> { |
    #[inline(always)]
112 | fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { |
113 | (**self).find_in(haystack, span) |
114 | } |
115 | } |
116 | |
117 | /// A builder for constructing the best possible prefilter. When constructed, |
118 | /// this builder will heuristically select the best prefilter it can build, |
119 | /// if any, and discard the rest. |
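///
/// A rough sketch of the intended crate-internal usage, mirroring the methods
/// defined below (illustrative only):
///
/// ```ignore
/// let mut builder = Builder::new(MatchKind::LeftmostFirst);
/// builder.add(b"foo");
/// builder.add(b"bar");
/// // `build` returns `None` when no effective prefilter could be found.
/// let prefilter: Option<Prefilter> = builder.build();
/// ```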
120 | #[derive(Debug)] |
121 | pub(crate) struct Builder { |
122 | count: usize, |
123 | ascii_case_insensitive: bool, |
124 | start_bytes: StartBytesBuilder, |
125 | rare_bytes: RareBytesBuilder, |
126 | memmem: MemmemBuilder, |
127 | packed: Option<packed::Builder>, |
128 | // If we run across a condition that suggests we shouldn't use a prefilter |
129 | // at all (like an empty pattern), then disable prefilters entirely. |
130 | enabled: bool, |
131 | } |
132 | |
133 | impl Builder { |
134 | /// Create a new builder for constructing the best possible prefilter. |
135 | pub(crate) fn new(kind: MatchKind) -> Builder { |
136 | let pbuilder = kind |
137 | .as_packed() |
138 | .map(|kind| packed::Config::new().match_kind(kind).builder()); |
139 | Builder { |
140 | count: 0, |
141 | ascii_case_insensitive: false, |
142 | start_bytes: StartBytesBuilder::new(), |
143 | rare_bytes: RareBytesBuilder::new(), |
144 | memmem: MemmemBuilder::default(), |
145 | packed: pbuilder, |
146 | enabled: true, |
147 | } |
148 | } |
149 | |
150 | /// Enable ASCII case insensitivity. When set, byte strings added to this |
151 | /// builder will be interpreted without respect to ASCII case. |
152 | pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder { |
153 | self.ascii_case_insensitive = yes; |
154 | self.start_bytes = self.start_bytes.ascii_case_insensitive(yes); |
155 | self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes); |
156 | self |
157 | } |
158 | |
159 | /// Return a prefilter suitable for quickly finding potential matches. |
160 | /// |
161 | /// All patterns added to an Aho-Corasick automaton should be added to this |
162 | /// builder before attempting to construct the prefilter. |
163 | pub(crate) fn build(&self) -> Option<Prefilter> { |
164 | if !self.enabled { |
            debug!("prefilter not enabled, skipping");
166 | return None; |
167 | } |
168 | // If we only have one pattern, then deferring to memmem is always |
169 | // the best choice. This is kind of a weird case, because, well, why |
170 | // use Aho-Corasick if you only have one pattern? But maybe you don't |
171 | // know exactly how many patterns you'll get up front, and you need to |
172 | // support the option of multiple patterns. So instead of relying on |
173 | // the caller to branch and use memmem explicitly, we just do it for |
174 | // them. |
175 | if !self.ascii_case_insensitive { |
176 | if let Some(pre) = self.memmem.build() { |
                debug!("using memmem prefilter");
178 | return Some(pre); |
179 | } |
180 | } |
181 | let (packed, patlen, minlen) = if self.ascii_case_insensitive { |
182 | (None, usize::MAX, 0) |
183 | } else { |
184 | let patlen = self.packed.as_ref().map_or(usize::MAX, |p| p.len()); |
185 | let minlen = self.packed.as_ref().map_or(0, |p| p.minimum_len()); |
186 | let packed = |
187 | self.packed.as_ref().and_then(|b| b.build()).map(|s| { |
188 | let memory_usage = s.memory_usage(); |
189 | debug!( |
190 | "built packed prefilter (len: {}, \ |
191 | minimum pattern len: {}, memory usage: {}) \ |
                         for consideration",
193 | patlen, minlen, memory_usage, |
194 | ); |
195 | Prefilter { finder: Arc::new(Packed(s)), memory_usage } |
196 | }); |
197 | (packed, patlen, minlen) |
198 | }; |
199 | match (self.start_bytes.build(), self.rare_bytes.build()) { |
200 | // If we could build both start and rare prefilters, then there are |
201 | // a few cases in which we'd want to use the start-byte prefilter |
202 | // over the rare-byte prefilter, since the former has lower |
203 | // overhead. |
204 | (prestart @ Some(_), prerare @ Some(_)) => { |
205 | debug!( |
206 | "both start (len={}, rank={}) and \ |
207 | rare (len={}, rank={}) byte prefilters \ |
                     are available",
209 | self.start_bytes.count, |
210 | self.start_bytes.rank_sum, |
211 | self.rare_bytes.count, |
212 | self.rare_bytes.rank_sum, |
213 | ); |
214 | if patlen <= 16 |
215 | && minlen >= 2 |
216 | && self.start_bytes.count >= 3 |
217 | && self.rare_bytes.count >= 3 |
218 | { |
219 | debug!( |
220 | "start and rare byte prefilters available, but \ |
221 | they're probably slower than packed so using \ |
222 | packed" |
223 | ); |
224 | return packed; |
225 | } |
226 | // If the start-byte prefilter can scan for a smaller number |
227 | // of bytes than the rare-byte prefilter, then it's probably |
228 | // faster. |
229 | let has_fewer_bytes = |
230 | self.start_bytes.count < self.rare_bytes.count; |
231 | // Otherwise, if the combined frequency rank of the detected |
232 | // bytes in the start-byte prefilter is "close" to the combined |
233 | // frequency rank of the rare-byte prefilter, then we pick |
234 | // the start-byte prefilter even if the rare-byte prefilter |
235 | // heuristically searches for rare bytes. This is because the |
236 | // rare-byte prefilter has higher constant costs, so we tend to |
237 | // prefer the start-byte prefilter when we can. |
238 | let has_rarer_bytes = |
239 | self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50; |
240 | if has_fewer_bytes { |
                    debug!(
                        "using start byte prefilter because it has fewer \
                         bytes to search for than the rare byte prefilter",
                    );
245 | prestart |
246 | } else if has_rarer_bytes { |
                    debug!(
                        "using start byte prefilter because its byte \
                         frequency rank was determined to be \
                         \"good enough\" relative to the rare byte prefilter \
                         byte frequency rank",
                    );
253 | prestart |
254 | } else { |
                    debug!("using rare byte prefilter");
256 | prerare |
257 | } |
258 | } |
259 | (prestart @ Some(_), None) => { |
260 | if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 { |
261 | debug!( |
262 | "start byte prefilter available, but \ |
263 | it's probably slower than packed so using \ |
264 | packed" |
265 | ); |
266 | return packed; |
267 | } |
268 | debug!( |
269 | "have start byte prefilter but not rare byte prefilter, \ |
                     so using start byte prefilter",
271 | ); |
272 | prestart |
273 | } |
274 | (None, prerare @ Some(_)) => { |
275 | if patlen <= 16 && minlen >= 2 && self.rare_bytes.count >= 3 { |
276 | debug!( |
277 | "rare byte prefilter available, but \ |
278 | it's probably slower than packed so using \ |
279 | packed" |
280 | ); |
281 | return packed; |
282 | } |
283 | debug!( |
284 | "have rare byte prefilter but not start byte prefilter, \ |
                     so using rare byte prefilter",
286 | ); |
287 | prerare |
288 | } |
289 | (None, None) if self.ascii_case_insensitive => { |
290 | debug!( |
291 | "no start or rare byte prefilter and ASCII case \ |
                     insensitivity was enabled, so skipping prefilter",
293 | ); |
294 | None |
295 | } |
296 | (None, None) => { |
297 | if packed.is_some() { |
                    debug!("falling back to packed prefilter");
299 | } else { |
                    debug!("no prefilter available");
301 | } |
302 | packed |
303 | } |
304 | } |
305 | } |
306 | |
307 | /// Add a literal string to this prefilter builder. |
308 | pub(crate) fn add(&mut self, bytes: &[u8]) { |
309 | if bytes.is_empty() { |
310 | self.enabled = false; |
311 | } |
312 | if !self.enabled { |
313 | return; |
314 | } |
315 | self.count += 1; |
316 | self.start_bytes.add(bytes); |
317 | self.rare_bytes.add(bytes); |
318 | self.memmem.add(bytes); |
319 | if let Some(ref mut pbuilder) = self.packed { |
320 | pbuilder.add(bytes); |
321 | } |
322 | } |
323 | } |
324 | |
325 | /// A type that wraps a packed searcher and implements the `Prefilter` |
326 | /// interface. |
327 | #[derive(Clone, Debug)] |
328 | struct Packed(packed::Searcher); |
329 | |
330 | impl PrefilterI for Packed { |
331 | fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { |
332 | self.0 |
333 | .find_in(&haystack, span) |
334 | .map_or(Candidate::None, Candidate::Match) |
335 | } |
336 | } |
337 | |
338 | /// A builder for constructing a prefilter that uses memmem. |
339 | #[derive(Debug, Default)] |
340 | struct MemmemBuilder { |
341 | /// The number of patterns that have been added. |
342 | count: usize, |
343 | /// The singular pattern to search for. This is only set when count==1. |
344 | one: Option<Vec<u8>>, |
345 | } |
346 | |
347 | impl MemmemBuilder { |
348 | fn build(&self) -> Option<Prefilter> { |
        #[cfg(all(feature = "std", feature = "perf-literal"))]
350 | fn imp(builder: &MemmemBuilder) -> Option<Prefilter> { |
351 | let pattern = builder.one.as_ref()?; |
352 | assert_eq!(1, builder.count); |
353 | let finder = Arc::new(Memmem( |
354 | memchr::memmem::Finder::new(pattern).into_owned(), |
355 | )); |
356 | let memory_usage = pattern.len(); |
357 | Some(Prefilter { finder, memory_usage }) |
358 | } |
359 | |
        #[cfg(not(all(feature = "std", feature = "perf-literal")))]
361 | fn imp(_: &MemmemBuilder) -> Option<Prefilter> { |
362 | None |
363 | } |
364 | |
365 | imp(self) |
366 | } |
367 | |
368 | fn add(&mut self, bytes: &[u8]) { |
369 | self.count += 1; |
370 | if self.count == 1 { |
371 | self.one = Some(bytes.to_vec()); |
372 | } else { |
373 | self.one = None; |
374 | } |
375 | } |
376 | } |
377 | |
378 | /// A type that wraps a SIMD accelerated single substring search from the |
379 | /// `memchr` crate for use as a prefilter. |
380 | /// |
381 | /// Currently, this prefilter is only active for Aho-Corasick searchers with |
382 | /// a single pattern. In theory, this could be extended to support searchers |
383 | /// that have a common prefix of more than one byte (for one byte, we would use |
384 | /// memchr), but it's not clear if it's worth it or not. |
385 | /// |
386 | /// Also, unfortunately, this currently also requires the 'std' feature to |
387 | /// be enabled. That's because memchr doesn't have a no-std-but-with-alloc |
388 | /// mode, and so APIs like Finder::into_owned aren't available when 'std' is |
389 | /// disabled. But there should be an 'alloc' feature that brings in APIs like |
390 | /// Finder::into_owned but doesn't use std-only features like runtime CPU |
391 | /// feature detection. |
#[cfg(all(feature = "std", feature = "perf-literal"))]
393 | #[derive(Clone, Debug)] |
394 | struct Memmem(memchr::memmem::Finder<'static>); |
395 | |
#[cfg(all(feature = "std", feature = "perf-literal"))]
397 | impl PrefilterI for Memmem { |
398 | fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { |
399 | use crate::util::primitives::PatternID; |
400 | |
401 | self.0.find(&haystack[span]).map_or(Candidate::None, |i| { |
402 | let start = span.start + i; |
403 | let end = start + self.0.needle().len(); |
404 | // N.B. We can declare a match and use a fixed pattern ID here |
405 | // because a Memmem prefilter is only ever created for searchers |
406 | // with exactly one pattern. Thus, every match is always a match |
407 | // and it is always for the first and only pattern. |
408 | Candidate::Match(Match::new(PatternID::ZERO, start..end)) |
409 | }) |
410 | } |
411 | } |
412 | |
413 | /// A builder for constructing a rare byte prefilter. |
414 | /// |
415 | /// A rare byte prefilter attempts to pick out a small set of rare bytes that |
/// occur in the patterns, and then quickly scan for occurrences of those rare
/// bytes.
418 | #[derive(Clone, Debug)] |
419 | struct RareBytesBuilder { |
420 | /// Whether this prefilter should account for ASCII case insensitivity or |
421 | /// not. |
422 | ascii_case_insensitive: bool, |
423 | /// A set of rare bytes, indexed by byte value. |
424 | rare_set: ByteSet, |
425 | /// A set of byte offsets associated with bytes in a pattern. An entry |
    /// corresponds to a particular byte (its index) and is only non-zero if
427 | /// the byte occurred at an offset greater than 0 in at least one pattern. |
428 | /// |
429 | /// If a byte's offset is not representable in 8 bits, then the rare bytes |
430 | /// prefilter becomes inert. |
431 | byte_offsets: RareByteOffsets, |
432 | /// Whether this is available as a prefilter or not. This can be set to |
433 | /// false during construction if a condition is seen that invalidates the |
434 | /// use of the rare-byte prefilter. |
435 | available: bool, |
436 | /// The number of bytes set to an active value in `byte_offsets`. |
437 | count: usize, |
438 | /// The sum of frequency ranks for the rare bytes detected. This is |
439 | /// intended to give a heuristic notion of how rare the bytes are. |
440 | rank_sum: u16, |
441 | } |
442 | |
443 | /// A set of byte offsets, keyed by byte. |
444 | #[derive(Clone, Copy)] |
445 | struct RareByteOffsets { |
446 | /// Each entry corresponds to the maximum offset of the corresponding |
447 | /// byte across all patterns seen. |
448 | set: [RareByteOffset; 256], |
449 | } |
450 | |
451 | impl RareByteOffsets { |
452 | /// Create a new empty set of rare byte offsets. |
453 | pub(crate) fn empty() -> RareByteOffsets { |
454 | RareByteOffsets { set: [RareByteOffset::default(); 256] } |
455 | } |
456 | |
    /// Record the given offset for the given byte in this set. If an offset
    /// has already been recorded for this byte, then the maximum of the new
    /// and existing offsets is kept.
461 | pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) { |
462 | self.set[byte as usize].max = |
463 | cmp::max(self.set[byte as usize].max, off.max); |
464 | } |
465 | } |
466 | |
467 | impl core::fmt::Debug for RareByteOffsets { |
468 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
469 | let mut offsets = vec![]; |
470 | for off in self.set.iter() { |
471 | if off.max > 0 { |
472 | offsets.push(off); |
473 | } |
474 | } |
475 | f.debug_struct("RareByteOffsets" ).field("set" , &offsets).finish() |
476 | } |
477 | } |
478 | |
479 | /// Offsets associated with an occurrence of a "rare" byte in any of the |
480 | /// patterns used to construct a single Aho-Corasick automaton. |
481 | #[derive(Clone, Copy, Debug)] |
482 | struct RareByteOffset { |
483 | /// The maximum offset at which a particular byte occurs from the start |
484 | /// of any pattern. This is used as a shift amount. That is, when an |
485 | /// occurrence of this byte is found, the candidate position reported by |
486 | /// the prefilter is `position_of_byte - max`, such that the automaton |
487 | /// will begin its search at a position that is guaranteed to observe a |
488 | /// match. |
489 | /// |
490 | /// To avoid accidentally quadratic behavior, a prefilter is considered |
491 | /// ineffective when it is asked to start scanning from a position that it |
492 | /// has already scanned past. |
493 | /// |
494 | /// Using a `u8` here means that if we ever see a pattern that's longer |
495 | /// than 255 bytes, then the entire rare byte prefilter is disabled. |
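    ///
    /// As a worked example (with made-up numbers): if the maximum offset at
    /// which the byte `z` occurs in any pattern is `3`, then `max` is `3`,
    /// and finding `z` at haystack position `10` produces the candidate
    /// position `10 - 3 = 7`.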
496 | max: u8, |
497 | } |
498 | |
499 | impl Default for RareByteOffset { |
500 | fn default() -> RareByteOffset { |
501 | RareByteOffset { max: 0 } |
502 | } |
503 | } |
504 | |
505 | impl RareByteOffset { |
506 | /// Create a new rare byte offset. If the given offset is too big, then |
507 | /// None is returned. In that case, callers should render the rare bytes |
508 | /// prefilter inert. |
509 | fn new(max: usize) -> Option<RareByteOffset> { |
510 | if max > u8::MAX as usize { |
511 | None |
512 | } else { |
513 | Some(RareByteOffset { max: max as u8 }) |
514 | } |
515 | } |
516 | } |
517 | |
518 | impl RareBytesBuilder { |
519 | /// Create a new builder for constructing a rare byte prefilter. |
520 | fn new() -> RareBytesBuilder { |
521 | RareBytesBuilder { |
522 | ascii_case_insensitive: false, |
523 | rare_set: ByteSet::empty(), |
524 | byte_offsets: RareByteOffsets::empty(), |
525 | available: true, |
526 | count: 0, |
527 | rank_sum: 0, |
528 | } |
529 | } |
530 | |
531 | /// Enable ASCII case insensitivity. When set, byte strings added to this |
532 | /// builder will be interpreted without respect to ASCII case. |
533 | fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder { |
534 | self.ascii_case_insensitive = yes; |
535 | self |
536 | } |
537 | |
538 | /// Build the rare bytes prefilter. |
539 | /// |
540 | /// If there are more than 3 distinct rare bytes found, or if heuristics |
541 | /// otherwise determine that this prefilter should not be used, then `None` |
542 | /// is returned. |
543 | fn build(&self) -> Option<Prefilter> { |
        #[cfg(feature = "perf-literal")]
545 | fn imp(builder: &RareBytesBuilder) -> Option<Prefilter> { |
546 | if !builder.available || builder.count > 3 { |
547 | return None; |
548 | } |
549 | let (mut bytes, mut len) = ([0; 3], 0); |
550 | for b in 0..=255 { |
551 | if builder.rare_set.contains(b) { |
552 | bytes[len] = b as u8; |
553 | len += 1; |
554 | } |
555 | } |
556 | let finder: Arc<dyn PrefilterI> = match len { |
557 | 0 => return None, |
558 | 1 => Arc::new(RareBytesOne { |
559 | byte1: bytes[0], |
560 | offset: builder.byte_offsets.set[bytes[0] as usize], |
561 | }), |
562 | 2 => Arc::new(RareBytesTwo { |
563 | offsets: builder.byte_offsets, |
564 | byte1: bytes[0], |
565 | byte2: bytes[1], |
566 | }), |
567 | 3 => Arc::new(RareBytesThree { |
568 | offsets: builder.byte_offsets, |
569 | byte1: bytes[0], |
570 | byte2: bytes[1], |
571 | byte3: bytes[2], |
572 | }), |
573 | _ => unreachable!(), |
574 | }; |
575 | Some(Prefilter { finder, memory_usage: 0 }) |
576 | } |
577 | |
        #[cfg(not(feature = "perf-literal"))]
579 | fn imp(_: &RareBytesBuilder) -> Option<Prefilter> { |
580 | None |
581 | } |
582 | |
583 | imp(self) |
584 | } |
585 | |
586 | /// Add a byte string to this builder. |
587 | /// |
588 | /// All patterns added to an Aho-Corasick automaton should be added to this |
589 | /// builder before attempting to construct the prefilter. |
590 | fn add(&mut self, bytes: &[u8]) { |
591 | // If we've already given up, then do nothing. |
592 | if !self.available { |
593 | return; |
594 | } |
595 | // If we've already blown our budget, then don't waste time looking |
596 | // for more rare bytes. |
597 | if self.count > 3 { |
598 | self.available = false; |
599 | return; |
600 | } |
601 | // If the pattern is too long, then our offset table is bunk, so |
602 | // give up. |
603 | if bytes.len() >= 256 { |
604 | self.available = false; |
605 | return; |
606 | } |
607 | let mut rarest = match bytes.get(0) { |
608 | None => return, |
609 | Some(&b) => (b, freq_rank(b)), |
610 | }; |
611 | // The idea here is to look for the rarest byte in each pattern, and |
612 | // add that to our set. As a special exception, if we see a byte that |
613 | // we've already added, then we immediately stop and choose that byte, |
614 | // even if there's another rare byte in the pattern. This helps us |
615 | // apply the rare byte optimization in more cases by attempting to pick |
616 | // bytes that are in common between patterns. So for example, if we |
617 | // were searching for `Sherlock` and `lockjaw`, then this would pick |
618 | // `k` for both patterns, resulting in the use of `memchr` instead of |
619 | // `memchr2` for `k` and `j`. |
620 | let mut found = false; |
621 | for (pos, &b) in bytes.iter().enumerate() { |
622 | self.set_offset(pos, b); |
623 | if found { |
624 | continue; |
625 | } |
626 | if self.rare_set.contains(b) { |
627 | found = true; |
628 | continue; |
629 | } |
630 | let rank = freq_rank(b); |
631 | if rank < rarest.1 { |
632 | rarest = (b, rank); |
633 | } |
634 | } |
635 | if !found { |
636 | self.add_rare_byte(rarest.0); |
637 | } |
638 | } |
639 | |
640 | fn set_offset(&mut self, pos: usize, byte: u8) { |
641 | // This unwrap is OK because pos is never bigger than our max. |
642 | let offset = RareByteOffset::new(pos).unwrap(); |
643 | self.byte_offsets.set(byte, offset); |
644 | if self.ascii_case_insensitive { |
645 | self.byte_offsets.set(opposite_ascii_case(byte), offset); |
646 | } |
647 | } |
648 | |
649 | fn add_rare_byte(&mut self, byte: u8) { |
650 | self.add_one_rare_byte(byte); |
651 | if self.ascii_case_insensitive { |
652 | self.add_one_rare_byte(opposite_ascii_case(byte)); |
653 | } |
654 | } |
655 | |
656 | fn add_one_rare_byte(&mut self, byte: u8) { |
657 | if !self.rare_set.contains(byte) { |
658 | self.rare_set.add(byte); |
659 | self.count += 1; |
660 | self.rank_sum += freq_rank(byte) as u16; |
661 | } |
662 | } |
663 | } |
664 | |
665 | /// A prefilter for scanning for a single "rare" byte. |
#[cfg(feature = "perf-literal")]
667 | #[derive(Clone, Debug)] |
668 | struct RareBytesOne { |
669 | byte1: u8, |
670 | offset: RareByteOffset, |
671 | } |
672 | |
#[cfg(feature = "perf-literal")]
674 | impl PrefilterI for RareBytesOne { |
675 | fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { |
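        // Look for the next occurrence of the rare byte, then shift the
        // reported position left by that byte's maximum offset within the
        // patterns so that the automaton resumes early enough to observe the
        // full match. The shift is clamped to the start of the span so that
        // we never report a position before it.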
676 | memchr::memchr(self.byte1, &haystack[span]) |
677 | .map(|i| { |
678 | let pos = span.start + i; |
679 | cmp::max( |
680 | span.start, |
681 | pos.saturating_sub(usize::from(self.offset.max)), |
682 | ) |
683 | }) |
684 | .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
685 | } |
686 | } |
687 | |
688 | /// A prefilter for scanning for two "rare" bytes. |
#[cfg(feature = "perf-literal")]
690 | #[derive(Clone, Debug)] |
691 | struct RareBytesTwo { |
692 | offsets: RareByteOffsets, |
693 | byte1: u8, |
694 | byte2: u8, |
695 | } |
696 | |
#[cfg(feature = "perf-literal")]
698 | impl PrefilterI for RareBytesTwo { |
699 | fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { |
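        // Same idea as `RareBytesOne`, except the shift is looked up in
        // `offsets` based on which of the two rare bytes actually matched at
        // `pos`.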
700 | memchr::memchr2(self.byte1, self.byte2, &haystack[span]) |
701 | .map(|i| { |
702 | let pos = span.start + i; |
703 | let offset = self.offsets.set[usize::from(haystack[pos])].max; |
704 | cmp::max(span.start, pos.saturating_sub(usize::from(offset))) |
705 | }) |
706 | .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
707 | } |
708 | } |
709 | |
710 | /// A prefilter for scanning for three "rare" bytes. |
#[cfg(feature = "perf-literal")]
712 | #[derive(Clone, Debug)] |
713 | struct RareBytesThree { |
714 | offsets: RareByteOffsets, |
715 | byte1: u8, |
716 | byte2: u8, |
717 | byte3: u8, |
718 | } |
719 | |
#[cfg(feature = "perf-literal")]
721 | impl PrefilterI for RareBytesThree { |
722 | fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { |
723 | memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span]) |
724 | .map(|i| { |
725 | let pos = span.start + i; |
726 | let offset = self.offsets.set[usize::from(haystack[pos])].max; |
727 | cmp::max(span.start, pos.saturating_sub(usize::from(offset))) |
728 | }) |
729 | .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
730 | } |
731 | } |
732 | |
733 | /// A builder for constructing a starting byte prefilter. |
734 | /// |
735 | /// A starting byte prefilter is a simplistic prefilter that looks for possible |
736 | /// matches by reporting all positions corresponding to a particular byte. This |
/// generally only takes effect when there are at most 3 distinct possible
738 | /// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two |
739 | /// distinct starting bytes (`f` and `b`), and this prefilter returns all |
740 | /// occurrences of either `f` or `b`. |
741 | /// |
742 | /// In some cases, a heuristic frequency analysis may determine that it would |
743 | /// be better not to use this prefilter even when there are 3 or fewer distinct |
744 | /// starting bytes. |
745 | #[derive(Clone, Debug)] |
746 | struct StartBytesBuilder { |
747 | /// Whether this prefilter should account for ASCII case insensitivity or |
748 | /// not. |
749 | ascii_case_insensitive: bool, |
750 | /// The set of starting bytes observed. |
751 | byteset: Vec<bool>, |
752 | /// The number of bytes set to true in `byteset`. |
753 | count: usize, |
754 | /// The sum of frequency ranks for the rare bytes detected. This is |
755 | /// intended to give a heuristic notion of how rare the bytes are. |
756 | rank_sum: u16, |
757 | } |
758 | |
759 | impl StartBytesBuilder { |
760 | /// Create a new builder for constructing a start byte prefilter. |
761 | fn new() -> StartBytesBuilder { |
762 | StartBytesBuilder { |
763 | ascii_case_insensitive: false, |
764 | byteset: vec![false; 256], |
765 | count: 0, |
766 | rank_sum: 0, |
767 | } |
768 | } |
769 | |
770 | /// Enable ASCII case insensitivity. When set, byte strings added to this |
771 | /// builder will be interpreted without respect to ASCII case. |
772 | fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder { |
773 | self.ascii_case_insensitive = yes; |
774 | self |
775 | } |
776 | |
777 | /// Build the starting bytes prefilter. |
778 | /// |
779 | /// If there are more than 3 distinct starting bytes, or if heuristics |
780 | /// otherwise determine that this prefilter should not be used, then `None` |
781 | /// is returned. |
782 | fn build(&self) -> Option<Prefilter> { |
        #[cfg(feature = "perf-literal")]
784 | fn imp(builder: &StartBytesBuilder) -> Option<Prefilter> { |
785 | if builder.count > 3 { |
786 | return None; |
787 | } |
788 | let (mut bytes, mut len) = ([0; 3], 0); |
789 | for b in 0..256 { |
790 | if !builder.byteset[b] { |
791 | continue; |
792 | } |
                // We don't handle non-ASCII bytes for now. Getting non-ASCII
                // bytes right is trickier, since we generally don't want to
                // put a leading UTF-8 code unit into a prefilter that isn't
                // ASCII, since such bytes can occur quite frequently. Instead,
                // it would be better to use a continuation byte, but this
                // requires more sophisticated analysis of the automaton and a
                // richer prefilter API.
799 | if b > 0x7F { |
800 | return None; |
801 | } |
802 | bytes[len] = b as u8; |
803 | len += 1; |
804 | } |
805 | let finder: Arc<dyn PrefilterI> = match len { |
806 | 0 => return None, |
807 | 1 => Arc::new(StartBytesOne { byte1: bytes[0] }), |
808 | 2 => Arc::new(StartBytesTwo { |
809 | byte1: bytes[0], |
810 | byte2: bytes[1], |
811 | }), |
812 | 3 => Arc::new(StartBytesThree { |
813 | byte1: bytes[0], |
814 | byte2: bytes[1], |
815 | byte3: bytes[2], |
816 | }), |
817 | _ => unreachable!(), |
818 | }; |
819 | Some(Prefilter { finder, memory_usage: 0 }) |
820 | } |
821 | |
        #[cfg(not(feature = "perf-literal"))]
823 | fn imp(_: &StartBytesBuilder) -> Option<Prefilter> { |
824 | None |
825 | } |
826 | |
827 | imp(self) |
828 | } |
829 | |
830 | /// Add a byte string to this builder. |
831 | /// |
832 | /// All patterns added to an Aho-Corasick automaton should be added to this |
833 | /// builder before attempting to construct the prefilter. |
834 | fn add(&mut self, bytes: &[u8]) { |
835 | if self.count > 3 { |
836 | return; |
837 | } |
838 | if let Some(&byte) = bytes.get(0) { |
839 | self.add_one_byte(byte); |
840 | if self.ascii_case_insensitive { |
841 | self.add_one_byte(opposite_ascii_case(byte)); |
842 | } |
843 | } |
844 | } |
845 | |
846 | fn add_one_byte(&mut self, byte: u8) { |
847 | if !self.byteset[byte as usize] { |
848 | self.byteset[byte as usize] = true; |
849 | self.count += 1; |
850 | self.rank_sum += freq_rank(byte) as u16; |
851 | } |
852 | } |
853 | } |
854 | |
855 | /// A prefilter for scanning for a single starting byte. |
#[cfg(feature = "perf-literal")]
857 | #[derive(Clone, Debug)] |
858 | struct StartBytesOne { |
859 | byte1: u8, |
860 | } |
861 | |
#[cfg(feature = "perf-literal")]
863 | impl PrefilterI for StartBytesOne { |
864 | fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { |
865 | memchr::memchr(self.byte1, &haystack[span]) |
866 | .map(|i| span.start + i) |
867 | .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
868 | } |
869 | } |
870 | |
871 | /// A prefilter for scanning for two starting bytes. |
#[cfg(feature = "perf-literal")]
873 | #[derive(Clone, Debug)] |
874 | struct StartBytesTwo { |
875 | byte1: u8, |
876 | byte2: u8, |
877 | } |
878 | |
#[cfg(feature = "perf-literal")]
880 | impl PrefilterI for StartBytesTwo { |
881 | fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { |
882 | memchr::memchr2(self.byte1, self.byte2, &haystack[span]) |
883 | .map(|i| span.start + i) |
884 | .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
885 | } |
886 | } |
887 | |
888 | /// A prefilter for scanning for three starting bytes. |
#[cfg(feature = "perf-literal")]
890 | #[derive(Clone, Debug)] |
891 | struct StartBytesThree { |
892 | byte1: u8, |
893 | byte2: u8, |
894 | byte3: u8, |
895 | } |
896 | |
#[cfg(feature = "perf-literal")]
898 | impl PrefilterI for StartBytesThree { |
899 | fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { |
900 | memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span]) |
901 | .map(|i| span.start + i) |
902 | .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
903 | } |
904 | } |
905 | |
906 | /// If the given byte is an ASCII letter, then return it in the opposite case. |
907 | /// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns |
/// `b'A'`. If the given byte is not an ASCII letter, then it is returned
/// unchanged.
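///
/// For example (illustrative only):
///
/// ```ignore
/// assert_eq!(b'a', opposite_ascii_case(b'A'));
/// assert_eq!(b'A', opposite_ascii_case(b'a'));
/// assert_eq!(b'#', opposite_ascii_case(b'#'));
/// ```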
909 | pub(crate) fn opposite_ascii_case(b: u8) -> u8 { |
910 | if b'A' <= b && b <= b'Z' { |
911 | b.to_ascii_lowercase() |
912 | } else if b'a' <= b && b <= b'z' { |
913 | b.to_ascii_uppercase() |
914 | } else { |
915 | b |
916 | } |
917 | } |
918 | |
919 | /// Return the frequency rank of the given byte. The higher the rank, the more |
920 | /// common the byte (heuristically speaking). |
921 | fn freq_rank(b: u8) -> u8 { |
922 | use crate::util::byte_frequencies::BYTE_FREQUENCIES; |
923 | BYTE_FREQUENCIES[b as usize] |
924 | } |
925 | |