1/*
2This module implements a "fallback" prefilter that only relies on memchr to
3function. While memchr works best when it's explicitly vectorized, its
4fallback implementations are fast enough to make a prefilter like this
5worthwhile.
6
7The essence of this implementation is to identify two rare bytes in a needle
8based on a background frequency distribution of bytes. We then run memchr on the
9rarer byte. For each match, we use the second rare byte as a guard to quickly
10check if a match is possible. If the position passes the guard test, then we do
11a naive memcmp to confirm the match.
12
13In practice, this formulation works amazingly well, primarily because of the
14heuristic use of a background frequency distribution. However, it does have a
15number of weaknesses where it can get quite slow when its background frequency
16distribution doesn't line up with the haystack being searched. This is why we
17have specialized vector routines that essentially take this idea and move the
18guard check into vectorized code. (Those specialized vector routines do still
19make use of the background frequency distribution of bytes though.)
20
21This fallback implementation was originally formulated in regex many moons ago:
22https://github.com/rust-lang/regex/blob/3db8722d0b204a85380fe2a65e13d7065d7dd968/src/literal/imp.rs#L370-L501
23Prior to that, I'm not aware of anyone using this technique in any prominent
24substring search implementation. Although, I'm sure folks have had this same
25insight long before me.
26
27Another version of this also appeared in bstr:
28https://github.com/BurntSushi/bstr/blob/a444256ca7407fe180ee32534688549655b7a38e/src/search/prefilter.rs#L83-L340
29*/
30
31use crate::memmem::{
32 prefilter::{PrefilterFnTy, PrefilterState},
33 NeedleInfo,
34};
35
36// Check that the functions below satisfy the Prefilter function type.
37const _: PrefilterFnTy = find;
38
39/// Look for a possible occurrence of needle. The position returned
40/// corresponds to the beginning of the occurrence, if one exists.
41///
42/// Callers may assume that this never returns false negatives (i.e., it
43/// never misses an actual occurrence), but must check that the returned
44/// position corresponds to a match. That is, it can return false
45/// positives.
46///
47/// This should only be used when Freqy is constructed for forward
48/// searching.
49pub(crate) fn find(
50 prestate: &mut PrefilterState,
51 ninfo: &NeedleInfo,
52 haystack: &[u8],
53 needle: &[u8],
54) -> Option<usize> {
55 let mut i = 0;
56 let (rare1i, rare2i) = ninfo.rarebytes.as_rare_usize();
57 let (rare1, rare2) = ninfo.rarebytes.as_rare_bytes(needle);
58 while prestate.is_effective() {
59 // Use a fast vectorized implementation to skip to the next
60 // occurrence of the rarest byte (heuristically chosen) in the
61 // needle.
62 let found = crate::memchr(rare1, &haystack[i..])?;
63 prestate.update(found);
64 i += found;
65
66 // If we can't align our first match with the haystack, then a
67 // match is impossible.
68 if i < rare1i {
69 i += 1;
70 continue;
71 }
72
73 // Align our rare2 byte with the haystack. A mismatch means that
74 // a match is impossible.
75 let aligned_rare2i = i - rare1i + rare2i;
76 if haystack.get(aligned_rare2i) != Some(&rare2) {
77 i += 1;
78 continue;
79 }
80
81 // We've done what we can. There might be a match here.
82 return Some(i - rare1i);
83 }
84 // The only way we get here is if we believe our skipping heuristic
85 // has become ineffective. We're allowed to return false positives,
86 // so return the position at which we advanced to, aligned to the
87 // haystack.
88 Some(i.saturating_sub(rare1i))
89}
90
91#[cfg(all(test, feature = "std"))]
92mod tests {
93 use super::*;
94
95 fn freqy_find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
96 let ninfo = NeedleInfo::new(needle);
97 let mut prestate = PrefilterState::new();
98 find(&mut prestate, &ninfo, haystack, needle)
99 }
100
101 #[test]
102 fn freqy_forward() {
103 assert_eq!(Some(0), freqy_find(b"BARFOO", b"BAR"));
104 assert_eq!(Some(3), freqy_find(b"FOOBAR", b"BAR"));
105 assert_eq!(Some(0), freqy_find(b"zyzz", b"zyzy"));
106 assert_eq!(Some(2), freqy_find(b"zzzy", b"zyzy"));
107 assert_eq!(None, freqy_find(b"zazb", b"zyzy"));
108 assert_eq!(Some(0), freqy_find(b"yzyy", b"yzyz"));
109 assert_eq!(Some(2), freqy_find(b"yyyz", b"yzyz"));
110 assert_eq!(None, freqy_find(b"yayb", b"yzyz"));
111 }
112
113 #[test]
114 #[cfg(not(miri))]
115 fn prefilter_permutations() {
116 use crate::memmem::prefilter::tests::PrefilterTest;
117
118 // SAFETY: super::find is safe to call for all inputs and on all
119 // platforms.
120 unsafe { PrefilterTest::run_all_tests(super::find) };
121 }
122}
123