1use core::{
2 cmp,
3 fmt::Debug,
4 panic::{RefUnwindSafe, UnwindSafe},
5 u8,
6};
7
8use alloc::{sync::Arc, vec, vec::Vec};
9
10use crate::{
11 packed,
12 util::{
13 alphabet::ByteSet,
14 search::{Match, MatchKind, Span},
15 },
16};
17
18/// A prefilter for accelerating a search.
19///
20/// This crate uses prefilters in the core search implementations to accelerate
21/// common cases. They typically only apply to cases where there are a small
22/// number of patterns (less than 100 or so), but when they do, thoughput can
23/// be boosted considerably, perhaps by an order of magnitude. When a prefilter
24/// is active, it is used whenever a search enters an automaton's start state.
25///
26/// Currently, prefilters cannot be constructed by
27/// callers. A `Prefilter` can only be accessed via the
28/// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter)
29/// method and used to execute a search. In other words, a prefilter can be
30/// used to optimize your own search implementation if necessary, but cannot do
31/// much else. If you have a use case for more APIs, please submit an issue.
32#[derive(Clone, Debug)]
33pub struct Prefilter {
34 finder: Arc<dyn PrefilterI>,
35 memory_usage: usize,
36}
37
38impl Prefilter {
39 /// Execute a search in the haystack within the span given. If a match or
40 /// a possible match is returned, then it is guaranteed to occur within
41 /// the bounds of the span.
42 ///
43 /// If the span provided is invalid for the given haystack, then behavior
44 /// is unspecified.
45 #[inline]
46 pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
47 self.finder.find_in(haystack, span)
48 }
49
50 #[inline]
51 pub(crate) fn memory_usage(&self) -> usize {
52 self.memory_usage
53 }
54}
55
56/// A candidate is the result of running a prefilter on a haystack at a
57/// particular position.
58///
59/// The result is either no match, a confirmed match or a possible match.
60///
61/// When no match is returned, the prefilter is guaranteeing that no possible
62/// match can be found in the haystack, and the caller may trust this. That is,
63/// all correct prefilters must never report false negatives.
64///
65/// In some cases, a prefilter can confirm a match very quickly, in which case,
66/// the caller may use this to stop what it's doing and report the match. In
67/// this case, prefilter implementations must never report a false positive.
68/// In other cases, the prefilter can only report a potential match, in which
69/// case the callers must attempt to confirm the match. In this case, prefilter
70/// implementations are permitted to return false positives.
71#[derive(Clone, Debug)]
72pub enum Candidate {
73 /// No match was found. Since false negatives are not possible, this means
74 /// the search can quit as it is guaranteed not to find another match.
75 None,
76 /// A confirmed match was found. Callers do not need to confirm it.
77 Match(Match),
78 /// The start of a possible match was found. Callers must confirm it before
79 /// reporting it as a match.
80 PossibleStartOfMatch(usize),
81}
82
83impl Candidate {
84 /// Convert this candidate into an option. This is useful when callers
85 /// do not distinguish between true positives and false positives (i.e.,
86 /// the caller must always confirm the match).
87 pub fn into_option(self) -> Option<usize> {
88 match self {
89 Candidate::None => None,
90 Candidate::Match(ref m: &Match) => Some(m.start()),
91 Candidate::PossibleStartOfMatch(start: usize) => Some(start),
92 }
93 }
94}
95
96/// A prefilter describes the behavior of fast literal scanners for quickly
97/// skipping past bytes in the haystack that we know cannot possibly
98/// participate in a match.
99trait PrefilterI:
100 Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static
101{
102 /// Returns the next possible match candidate. This may yield false
103 /// positives, so callers must confirm a match starting at the position
104 /// returned. This, however, must never produce false negatives. That is,
105 /// this must, at minimum, return the starting position of the next match
106 /// in the given haystack after or at the given position.
107 fn find_in(&self, haystack: &[u8], span: Span) -> Candidate;
108}
109
110impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
111 #[inline(always)]
112 fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
113 (**self).find_in(haystack, span)
114 }
115}
116
117/// A builder for constructing the best possible prefilter. When constructed,
118/// this builder will heuristically select the best prefilter it can build,
119/// if any, and discard the rest.
120#[derive(Debug)]
121pub(crate) struct Builder {
122 count: usize,
123 ascii_case_insensitive: bool,
124 start_bytes: StartBytesBuilder,
125 rare_bytes: RareBytesBuilder,
126 memmem: MemmemBuilder,
127 packed: Option<packed::Builder>,
128 // If we run across a condition that suggests we shouldn't use a prefilter
129 // at all (like an empty pattern), then disable prefilters entirely.
130 enabled: bool,
131}
132
133impl Builder {
134 /// Create a new builder for constructing the best possible prefilter.
135 pub(crate) fn new(kind: MatchKind) -> Builder {
136 let pbuilder = kind
137 .as_packed()
138 .map(|kind| packed::Config::new().match_kind(kind).builder());
139 Builder {
140 count: 0,
141 ascii_case_insensitive: false,
142 start_bytes: StartBytesBuilder::new(),
143 rare_bytes: RareBytesBuilder::new(),
144 memmem: MemmemBuilder::default(),
145 packed: pbuilder,
146 enabled: true,
147 }
148 }
149
150 /// Enable ASCII case insensitivity. When set, byte strings added to this
151 /// builder will be interpreted without respect to ASCII case.
152 pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
153 self.ascii_case_insensitive = yes;
154 self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
155 self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
156 self
157 }
158
159 /// Return a prefilter suitable for quickly finding potential matches.
160 ///
161 /// All patterns added to an Aho-Corasick automaton should be added to this
162 /// builder before attempting to construct the prefilter.
163 pub(crate) fn build(&self) -> Option<Prefilter> {
164 if !self.enabled {
165 return None;
166 }
167 // If we only have one pattern, then deferring to memmem is always
168 // the best choice. This is kind of a weird case, because, well, why
169 // use Aho-Corasick if you only have one pattern? But maybe you don't
170 // know exactly how many patterns you'll get up front, and you need to
171 // support the option of multiple patterns. So instead of relying on
172 // the caller to branch and use memmem explicitly, we just do it for
173 // them.
174 if !self.ascii_case_insensitive {
175 if let Some(pre) = self.memmem.build() {
176 return Some(pre);
177 }
178 }
179 match (self.start_bytes.build(), self.rare_bytes.build()) {
180 // If we could build both start and rare prefilters, then there are
181 // a few cases in which we'd want to use the start-byte prefilter
182 // over the rare-byte prefilter, since the former has lower
183 // overhead.
184 (prestart @ Some(_), prerare @ Some(_)) => {
185 // If the start-byte prefilter can scan for a smaller number
186 // of bytes than the rare-byte prefilter, then it's probably
187 // faster.
188 let has_fewer_bytes =
189 self.start_bytes.count < self.rare_bytes.count;
190 // Otherwise, if the combined frequency rank of the detected
191 // bytes in the start-byte prefilter is "close" to the combined
192 // frequency rank of the rare-byte prefilter, then we pick
193 // the start-byte prefilter even if the rare-byte prefilter
194 // heuristically searches for rare bytes. This is because the
195 // rare-byte prefilter has higher constant costs, so we tend to
196 // prefer the start-byte prefilter when we can.
197 let has_rarer_bytes =
198 self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
199 if has_fewer_bytes || has_rarer_bytes {
200 prestart
201 } else {
202 prerare
203 }
204 }
205 (prestart @ Some(_), None) => prestart,
206 (None, prerare @ Some(_)) => prerare,
207 (None, None) if self.ascii_case_insensitive => None,
208 (None, None) => {
209 self.packed.as_ref().and_then(|b| b.build()).map(|s| {
210 let memory_usage = s.memory_usage();
211 Prefilter { finder: Arc::new(Packed(s)), memory_usage }
212 })
213 }
214 }
215 }
216
217 /// Add a literal string to this prefilter builder.
218 pub(crate) fn add(&mut self, bytes: &[u8]) {
219 if bytes.is_empty() {
220 self.enabled = false;
221 }
222 if !self.enabled {
223 return;
224 }
225 self.count += 1;
226 self.start_bytes.add(bytes);
227 self.rare_bytes.add(bytes);
228 self.memmem.add(bytes);
229 if let Some(ref mut pbuilder) = self.packed {
230 pbuilder.add(bytes);
231 }
232 }
233}
234
235/// A type that wraps a packed searcher and implements the `Prefilter`
236/// interface.
237#[derive(Clone, Debug)]
238struct Packed(packed::Searcher);
239
240impl PrefilterI for Packed {
241 fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
242 self.0
243 .find_in(&haystack, span)
244 .map_or(default:Candidate::None, f:Candidate::Match)
245 }
246}
247
248/// A builder for constructing a prefilter that uses memmem.
249#[derive(Debug, Default)]
250struct MemmemBuilder {
251 /// The number of patterns that have been added.
252 count: usize,
253 /// The singular pattern to search for. This is only set when count==1.
254 one: Option<Vec<u8>>,
255}
256
257impl MemmemBuilder {
258 fn build(&self) -> Option<Prefilter> {
259 #[cfg(all(feature = "std", feature = "perf-literal"))]
260 fn imp(builder: &MemmemBuilder) -> Option<Prefilter> {
261 let pattern = builder.one.as_ref()?;
262 assert_eq!(1, builder.count);
263 let finder = Arc::new(Memmem(
264 memchr::memmem::Finder::new(pattern).into_owned(),
265 ));
266 let memory_usage = pattern.len();
267 Some(Prefilter { finder, memory_usage })
268 }
269
270 #[cfg(not(all(feature = "std", feature = "perf-literal")))]
271 fn imp(_: &MemmemBuilder) -> Option<Prefilter> {
272 None
273 }
274
275 imp(self)
276 }
277
278 fn add(&mut self, bytes: &[u8]) {
279 self.count += 1;
280 if self.count == 1 {
281 self.one = Some(bytes.to_vec());
282 } else {
283 self.one = None;
284 }
285 }
286}
287
288/// A type that wraps a SIMD accelerated single substring search from the
289/// `memchr` crate for use as a prefilter.
290///
291/// Currently, this prefilter is only active for Aho-Corasick searchers with
292/// a single pattern. In theory, this could be extended to support searchers
293/// that have a common prefix of more than one byte (for one byte, we would use
294/// memchr), but it's not clear if it's worth it or not.
295///
296/// Also, unfortunately, this currently also requires the 'std' feature to
297/// be enabled. That's because memchr doesn't have a no-std-but-with-alloc
298/// mode, and so APIs like Finder::into_owned aren't available when 'std' is
299/// disabled. But there should be an 'alloc' feature that brings in APIs like
300/// Finder::into_owned but doesn't use std-only features like runtime CPU
301/// feature detection.
302#[cfg(all(feature = "std", feature = "perf-literal"))]
303#[derive(Clone, Debug)]
304struct Memmem(memchr::memmem::Finder<'static>);
305
306#[cfg(all(feature = "std", feature = "perf-literal"))]
307impl PrefilterI for Memmem {
308 fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
309 use crate::util::primitives::PatternID;
310
311 self.0.find(&haystack[span]).map_or(default:Candidate::None, |i: usize| {
312 let start: usize = span.start + i;
313 let end: usize = start + self.0.needle().len();
314 // N.B. We can declare a match and use a fixed pattern ID here
315 // because a Memmem prefilter is only ever created for searchers
316 // with exactly one pattern. Thus, every match is always a match
317 // and it is always for the first and only pattern.
318 Candidate::Match(Match::new(pattern:PatternID::ZERO, span:start..end))
319 })
320 }
321}
322
323/// A builder for constructing a rare byte prefilter.
324///
325/// A rare byte prefilter attempts to pick out a small set of rare bytes that
326/// occurr in the patterns, and then quickly scan to matches of those rare
327/// bytes.
328#[derive(Clone, Debug)]
329struct RareBytesBuilder {
330 /// Whether this prefilter should account for ASCII case insensitivity or
331 /// not.
332 ascii_case_insensitive: bool,
333 /// A set of rare bytes, indexed by byte value.
334 rare_set: ByteSet,
335 /// A set of byte offsets associated with bytes in a pattern. An entry
336 /// corresponds to a particular bytes (its index) and is only non-zero if
337 /// the byte occurred at an offset greater than 0 in at least one pattern.
338 ///
339 /// If a byte's offset is not representable in 8 bits, then the rare bytes
340 /// prefilter becomes inert.
341 byte_offsets: RareByteOffsets,
342 /// Whether this is available as a prefilter or not. This can be set to
343 /// false during construction if a condition is seen that invalidates the
344 /// use of the rare-byte prefilter.
345 available: bool,
346 /// The number of bytes set to an active value in `byte_offsets`.
347 count: usize,
348 /// The sum of frequency ranks for the rare bytes detected. This is
349 /// intended to give a heuristic notion of how rare the bytes are.
350 rank_sum: u16,
351}
352
353/// A set of byte offsets, keyed by byte.
354#[derive(Clone, Copy)]
355struct RareByteOffsets {
356 /// Each entry corresponds to the maximum offset of the corresponding
357 /// byte across all patterns seen.
358 set: [RareByteOffset; 256],
359}
360
361impl RareByteOffsets {
362 /// Create a new empty set of rare byte offsets.
363 pub(crate) fn empty() -> RareByteOffsets {
364 RareByteOffsets { set: [RareByteOffset::default(); 256] }
365 }
366
367 /// Add the given offset for the given byte to this set. If the offset is
368 /// greater than the existing offset, then it overwrites the previous
369 /// value and returns false. If there is no previous value set, then this
370 /// sets it and returns true.
371 pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) {
372 self.set[byte as usize].max =
373 cmp::max(self.set[byte as usize].max, v2:off.max);
374 }
375}
376
377impl core::fmt::Debug for RareByteOffsets {
378 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
379 let mut offsets: Vec<&RareByteOffset> = vec![];
380 for off: &RareByteOffset in self.set.iter() {
381 if off.max > 0 {
382 offsets.push(off);
383 }
384 }
385 f.debug_struct("RareByteOffsets").field(name:"set", &offsets).finish()
386 }
387}
388
389/// Offsets associated with an occurrence of a "rare" byte in any of the
390/// patterns used to construct a single Aho-Corasick automaton.
391#[derive(Clone, Copy, Debug)]
392struct RareByteOffset {
393 /// The maximum offset at which a particular byte occurs from the start
394 /// of any pattern. This is used as a shift amount. That is, when an
395 /// occurrence of this byte is found, the candidate position reported by
396 /// the prefilter is `position_of_byte - max`, such that the automaton
397 /// will begin its search at a position that is guaranteed to observe a
398 /// match.
399 ///
400 /// To avoid accidentally quadratic behavior, a prefilter is considered
401 /// ineffective when it is asked to start scanning from a position that it
402 /// has already scanned past.
403 ///
404 /// Using a `u8` here means that if we ever see a pattern that's longer
405 /// than 255 bytes, then the entire rare byte prefilter is disabled.
406 max: u8,
407}
408
409impl Default for RareByteOffset {
410 fn default() -> RareByteOffset {
411 RareByteOffset { max: 0 }
412 }
413}
414
415impl RareByteOffset {
416 /// Create a new rare byte offset. If the given offset is too big, then
417 /// None is returned. In that case, callers should render the rare bytes
418 /// prefilter inert.
419 fn new(max: usize) -> Option<RareByteOffset> {
420 if max > u8::MAX as usize {
421 None
422 } else {
423 Some(RareByteOffset { max: max as u8 })
424 }
425 }
426}
427
428impl RareBytesBuilder {
429 /// Create a new builder for constructing a rare byte prefilter.
430 fn new() -> RareBytesBuilder {
431 RareBytesBuilder {
432 ascii_case_insensitive: false,
433 rare_set: ByteSet::empty(),
434 byte_offsets: RareByteOffsets::empty(),
435 available: true,
436 count: 0,
437 rank_sum: 0,
438 }
439 }
440
441 /// Enable ASCII case insensitivity. When set, byte strings added to this
442 /// builder will be interpreted without respect to ASCII case.
443 fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
444 self.ascii_case_insensitive = yes;
445 self
446 }
447
448 /// Build the rare bytes prefilter.
449 ///
450 /// If there are more than 3 distinct rare bytes found, or if heuristics
451 /// otherwise determine that this prefilter should not be used, then `None`
452 /// is returned.
453 fn build(&self) -> Option<Prefilter> {
454 #[cfg(feature = "perf-literal")]
455 fn imp(builder: &RareBytesBuilder) -> Option<Prefilter> {
456 if !builder.available || builder.count > 3 {
457 return None;
458 }
459 let (mut bytes, mut len) = ([0; 3], 0);
460 for b in 0..=255 {
461 if builder.rare_set.contains(b) {
462 bytes[len] = b as u8;
463 len += 1;
464 }
465 }
466 let finder: Arc<dyn PrefilterI> = match len {
467 0 => return None,
468 1 => Arc::new(RareBytesOne {
469 byte1: bytes[0],
470 offset: builder.byte_offsets.set[bytes[0] as usize],
471 }),
472 2 => Arc::new(RareBytesTwo {
473 offsets: builder.byte_offsets,
474 byte1: bytes[0],
475 byte2: bytes[1],
476 }),
477 3 => Arc::new(RareBytesThree {
478 offsets: builder.byte_offsets,
479 byte1: bytes[0],
480 byte2: bytes[1],
481 byte3: bytes[2],
482 }),
483 _ => unreachable!(),
484 };
485 Some(Prefilter { finder, memory_usage: 0 })
486 }
487
488 #[cfg(not(feature = "perf-literal"))]
489 fn imp(_: &RareBytesBuilder) -> Option<Prefilter> {
490 None
491 }
492
493 imp(self)
494 }
495
496 /// Add a byte string to this builder.
497 ///
498 /// All patterns added to an Aho-Corasick automaton should be added to this
499 /// builder before attempting to construct the prefilter.
500 fn add(&mut self, bytes: &[u8]) {
501 // If we've already given up, then do nothing.
502 if !self.available {
503 return;
504 }
505 // If we've already blown our budget, then don't waste time looking
506 // for more rare bytes.
507 if self.count > 3 {
508 self.available = false;
509 return;
510 }
511 // If the pattern is too long, then our offset table is bunk, so
512 // give up.
513 if bytes.len() >= 256 {
514 self.available = false;
515 return;
516 }
517 let mut rarest = match bytes.get(0) {
518 None => return,
519 Some(&b) => (b, freq_rank(b)),
520 };
521 // The idea here is to look for the rarest byte in each pattern, and
522 // add that to our set. As a special exception, if we see a byte that
523 // we've already added, then we immediately stop and choose that byte,
524 // even if there's another rare byte in the pattern. This helps us
525 // apply the rare byte optimization in more cases by attempting to pick
526 // bytes that are in common between patterns. So for example, if we
527 // were searching for `Sherlock` and `lockjaw`, then this would pick
528 // `k` for both patterns, resulting in the use of `memchr` instead of
529 // `memchr2` for `k` and `j`.
530 let mut found = false;
531 for (pos, &b) in bytes.iter().enumerate() {
532 self.set_offset(pos, b);
533 if found {
534 continue;
535 }
536 if self.rare_set.contains(b) {
537 found = true;
538 continue;
539 }
540 let rank = freq_rank(b);
541 if rank < rarest.1 {
542 rarest = (b, rank);
543 }
544 }
545 if !found {
546 self.add_rare_byte(rarest.0);
547 }
548 }
549
550 fn set_offset(&mut self, pos: usize, byte: u8) {
551 // This unwrap is OK because pos is never bigger than our max.
552 let offset = RareByteOffset::new(pos).unwrap();
553 self.byte_offsets.set(byte, offset);
554 if self.ascii_case_insensitive {
555 self.byte_offsets.set(opposite_ascii_case(byte), offset);
556 }
557 }
558
559 fn add_rare_byte(&mut self, byte: u8) {
560 self.add_one_rare_byte(byte);
561 if self.ascii_case_insensitive {
562 self.add_one_rare_byte(opposite_ascii_case(byte));
563 }
564 }
565
566 fn add_one_rare_byte(&mut self, byte: u8) {
567 if !self.rare_set.contains(byte) {
568 self.rare_set.add(byte);
569 self.count += 1;
570 self.rank_sum += freq_rank(byte) as u16;
571 }
572 }
573}
574
575/// A prefilter for scanning for a single "rare" byte.
576#[cfg(feature = "perf-literal")]
577#[derive(Clone, Debug)]
578struct RareBytesOne {
579 byte1: u8,
580 offset: RareByteOffset,
581}
582
583#[cfg(feature = "perf-literal")]
584impl PrefilterI for RareBytesOne {
585 fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
586 memchr::memchr(self.byte1, &haystack[span])
587 .map(|i| {
588 let pos = span.start + i;
589 cmp::max(
590 span.start,
591 pos.saturating_sub(usize::from(self.offset.max)),
592 )
593 })
594 .map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
595 }
596}
597
598/// A prefilter for scanning for two "rare" bytes.
599#[cfg(feature = "perf-literal")]
600#[derive(Clone, Debug)]
601struct RareBytesTwo {
602 offsets: RareByteOffsets,
603 byte1: u8,
604 byte2: u8,
605}
606
607#[cfg(feature = "perf-literal")]
608impl PrefilterI for RareBytesTwo {
609 fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
610 memchr::memchr2(self.byte1, self.byte2, &haystack[span])
611 .map(|i| {
612 let pos = span.start + i;
613 let offset = self.offsets.set[usize::from(haystack[pos])].max;
614 cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
615 })
616 .map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
617 }
618}
619
620/// A prefilter for scanning for three "rare" bytes.
621#[cfg(feature = "perf-literal")]
622#[derive(Clone, Debug)]
623struct RareBytesThree {
624 offsets: RareByteOffsets,
625 byte1: u8,
626 byte2: u8,
627 byte3: u8,
628}
629
630#[cfg(feature = "perf-literal")]
631impl PrefilterI for RareBytesThree {
632 fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
633 memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
634 .map(|i| {
635 let pos = span.start + i;
636 let offset = self.offsets.set[usize::from(haystack[pos])].max;
637 cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
638 })
639 .map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
640 }
641}
642
643/// A builder for constructing a starting byte prefilter.
644///
645/// A starting byte prefilter is a simplistic prefilter that looks for possible
646/// matches by reporting all positions corresponding to a particular byte. This
647/// generally only takes affect when there are at most 3 distinct possible
648/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
649/// distinct starting bytes (`f` and `b`), and this prefilter returns all
650/// occurrences of either `f` or `b`.
651///
652/// In some cases, a heuristic frequency analysis may determine that it would
653/// be better not to use this prefilter even when there are 3 or fewer distinct
654/// starting bytes.
655#[derive(Clone, Debug)]
656struct StartBytesBuilder {
657 /// Whether this prefilter should account for ASCII case insensitivity or
658 /// not.
659 ascii_case_insensitive: bool,
660 /// The set of starting bytes observed.
661 byteset: Vec<bool>,
662 /// The number of bytes set to true in `byteset`.
663 count: usize,
664 /// The sum of frequency ranks for the rare bytes detected. This is
665 /// intended to give a heuristic notion of how rare the bytes are.
666 rank_sum: u16,
667}
668
669impl StartBytesBuilder {
670 /// Create a new builder for constructing a start byte prefilter.
671 fn new() -> StartBytesBuilder {
672 StartBytesBuilder {
673 ascii_case_insensitive: false,
674 byteset: vec![false; 256],
675 count: 0,
676 rank_sum: 0,
677 }
678 }
679
680 /// Enable ASCII case insensitivity. When set, byte strings added to this
681 /// builder will be interpreted without respect to ASCII case.
682 fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
683 self.ascii_case_insensitive = yes;
684 self
685 }
686
687 /// Build the starting bytes prefilter.
688 ///
689 /// If there are more than 3 distinct starting bytes, or if heuristics
690 /// otherwise determine that this prefilter should not be used, then `None`
691 /// is returned.
692 fn build(&self) -> Option<Prefilter> {
693 #[cfg(feature = "perf-literal")]
694 fn imp(builder: &StartBytesBuilder) -> Option<Prefilter> {
695 if builder.count > 3 {
696 return None;
697 }
698 let (mut bytes, mut len) = ([0; 3], 0);
699 for b in 0..256 {
700 if !builder.byteset[b] {
701 continue;
702 }
703 // We don't handle non-ASCII bytes for now. Getting non-ASCII
704 // bytes right is trickier, since we generally don't want to put
705 // a leading UTF-8 code unit into a prefilter that isn't ASCII,
706 // since they can frequently. Instead, it would be better to use a
707 // continuation byte, but this requires more sophisticated analysis
708 // of the automaton and a richer prefilter API.
709 if b > 0x7F {
710 return None;
711 }
712 bytes[len] = b as u8;
713 len += 1;
714 }
715 let finder: Arc<dyn PrefilterI> = match len {
716 0 => return None,
717 1 => Arc::new(StartBytesOne { byte1: bytes[0] }),
718 2 => Arc::new(StartBytesTwo {
719 byte1: bytes[0],
720 byte2: bytes[1],
721 }),
722 3 => Arc::new(StartBytesThree {
723 byte1: bytes[0],
724 byte2: bytes[1],
725 byte3: bytes[2],
726 }),
727 _ => unreachable!(),
728 };
729 Some(Prefilter { finder, memory_usage: 0 })
730 }
731
732 #[cfg(not(feature = "perf-literal"))]
733 fn imp(_: &StartBytesBuilder) -> Option<Prefilter> {
734 None
735 }
736
737 imp(self)
738 }
739
740 /// Add a byte string to this builder.
741 ///
742 /// All patterns added to an Aho-Corasick automaton should be added to this
743 /// builder before attempting to construct the prefilter.
744 fn add(&mut self, bytes: &[u8]) {
745 if self.count > 3 {
746 return;
747 }
748 if let Some(&byte) = bytes.get(0) {
749 self.add_one_byte(byte);
750 if self.ascii_case_insensitive {
751 self.add_one_byte(opposite_ascii_case(byte));
752 }
753 }
754 }
755
756 fn add_one_byte(&mut self, byte: u8) {
757 if !self.byteset[byte as usize] {
758 self.byteset[byte as usize] = true;
759 self.count += 1;
760 self.rank_sum += freq_rank(byte) as u16;
761 }
762 }
763}
764
765/// A prefilter for scanning for a single starting byte.
766#[cfg(feature = "perf-literal")]
767#[derive(Clone, Debug)]
768struct StartBytesOne {
769 byte1: u8,
770}
771
772#[cfg(feature = "perf-literal")]
773impl PrefilterI for StartBytesOne {
774 fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
775 memchr::memchr(self.byte1, &haystack[span])
776 .map(|i| span.start + i)
777 .map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
778 }
779}
780
781/// A prefilter for scanning for two starting bytes.
782#[cfg(feature = "perf-literal")]
783#[derive(Clone, Debug)]
784struct StartBytesTwo {
785 byte1: u8,
786 byte2: u8,
787}
788
789#[cfg(feature = "perf-literal")]
790impl PrefilterI for StartBytesTwo {
791 fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
792 memchr::memchr2(self.byte1, self.byte2, &haystack[span])
793 .map(|i| span.start + i)
794 .map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
795 }
796}
797
798/// A prefilter for scanning for three starting bytes.
799#[cfg(feature = "perf-literal")]
800#[derive(Clone, Debug)]
801struct StartBytesThree {
802 byte1: u8,
803 byte2: u8,
804 byte3: u8,
805}
806
807#[cfg(feature = "perf-literal")]
808impl PrefilterI for StartBytesThree {
809 fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
810 memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
811 .map(|i| span.start + i)
812 .map_or(default:Candidate::None, f:Candidate::PossibleStartOfMatch)
813 }
814}
815
816/// If the given byte is an ASCII letter, then return it in the opposite case.
817/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
818/// `b'A'`. If a non-ASCII letter is given, then the given byte is returned.
819pub(crate) fn opposite_ascii_case(b: u8) -> u8 {
820 if b'A' <= b && b <= b'Z' {
821 b.to_ascii_lowercase()
822 } else if b'a' <= b && b <= b'z' {
823 b.to_ascii_uppercase()
824 } else {
825 b
826 }
827}
828
829/// Return the frequency rank of the given byte. The higher the rank, the more
830/// common the byte (heuristically speaking).
831fn freq_rank(b: u8) -> u8 {
832 use crate::util::byte_frequencies::BYTE_FREQUENCIES;
833 BYTE_FREQUENCIES[b as usize]
834}
835