1 | /// A heuristic frequency based detection of rare bytes for substring search. |
2 | /// |
3 | /// This detector attempts to pick out two bytes in a needle that are predicted |
4 | /// to occur least frequently. The purpose is to use these bytes to implement |
5 | /// fast candidate search using vectorized code. |
6 | /// |
7 | /// A set of offsets is only computed for needles of length 2 or greater. |
8 | /// Smaller needles should be special cased by the substring search algorithm |
9 | /// in use. (e.g., Use memchr for single byte needles.) |
10 | /// |
11 | /// Note that we use `u8` to represent the offsets of the rare bytes in a |
12 | /// needle to reduce space usage. This means that rare byte occurring after the |
13 | /// first 255 bytes in a needle will never be used. |
14 | #[derive (Clone, Copy, Debug, Default)] |
15 | pub(crate) struct RareNeedleBytes { |
16 | /// The leftmost offset of the rarest byte in the needle, according to |
17 | /// pre-computed frequency analysis. The "leftmost offset" means that |
18 | /// rare1i <= i for all i where needle[i] == needle[rare1i]. |
19 | rare1i: u8, |
20 | /// The leftmost offset of the second rarest byte in the needle, according |
21 | /// to pre-computed frequency analysis. The "leftmost offset" means that |
22 | /// rare2i <= i for all i where needle[i] == needle[rare2i]. |
23 | /// |
24 | /// The second rarest byte is used as a type of guard for quickly detecting |
25 | /// a mismatch if the first byte matches. This is a hedge against |
26 | /// pathological cases where the pre-computed frequency analysis may be |
27 | /// off. (But of course, does not prevent *all* pathological cases.) |
28 | /// |
29 | /// In general, rare1i != rare2i by construction, although there is no hard |
30 | /// requirement that they be different. However, since the case of a single |
31 | /// byte needle is handled specially by memchr itself, rare2i generally |
32 | /// always should be different from rare1i since it would otherwise be |
33 | /// ineffective as a guard. |
34 | rare2i: u8, |
35 | } |
36 | |
37 | impl RareNeedleBytes { |
38 | /// Create a new pair of rare needle bytes with the given offsets. This is |
39 | /// only used in tests for generating input data. |
40 | #[cfg (all(test, feature = "std" ))] |
41 | pub(crate) fn new(rare1i: u8, rare2i: u8) -> RareNeedleBytes { |
42 | RareNeedleBytes { rare1i, rare2i } |
43 | } |
44 | |
45 | /// Detect the leftmost offsets of the two rarest bytes in the given |
46 | /// needle. |
47 | pub(crate) fn forward(needle: &[u8]) -> RareNeedleBytes { |
48 | if needle.len() <= 1 || needle.len() > core::u8::MAX as usize { |
49 | // For needles bigger than u8::MAX, our offsets aren't big enough. |
50 | // (We make our offsets small to reduce stack copying.) |
51 | // If you have a use case for it, please file an issue. In that |
52 | // case, we should probably just adjust the routine below to pick |
53 | // some rare bytes from the first 255 bytes of the needle. |
54 | // |
55 | // Also note that for needles of size 0 or 1, they are special |
56 | // cased in Two-Way. |
57 | // |
58 | // TODO: Benchmar this. |
59 | return RareNeedleBytes { rare1i: 0, rare2i: 0 }; |
60 | } |
61 | |
62 | // Find the rarest two bytes. We make them distinct by construction. |
63 | let (mut rare1, mut rare1i) = (needle[0], 0); |
64 | let (mut rare2, mut rare2i) = (needle[1], 1); |
65 | if rank(rare2) < rank(rare1) { |
66 | core::mem::swap(&mut rare1, &mut rare2); |
67 | core::mem::swap(&mut rare1i, &mut rare2i); |
68 | } |
69 | for (i, &b) in needle.iter().enumerate().skip(2) { |
70 | if rank(b) < rank(rare1) { |
71 | rare2 = rare1; |
72 | rare2i = rare1i; |
73 | rare1 = b; |
74 | rare1i = i as u8; |
75 | } else if b != rare1 && rank(b) < rank(rare2) { |
76 | rare2 = b; |
77 | rare2i = i as u8; |
78 | } |
79 | } |
80 | // While not strictly required, we really don't want these to be |
81 | // equivalent. If they were, it would reduce the effectiveness of |
82 | // candidate searching using these rare bytes by increasing the rate of |
83 | // false positives. |
84 | assert_ne!(rare1i, rare2i); |
85 | RareNeedleBytes { rare1i, rare2i } |
86 | } |
87 | |
88 | /// Return the rare bytes in the given needle in the forward direction. |
89 | /// The needle given must be the same one given to the RareNeedleBytes |
90 | /// constructor. |
91 | pub(crate) fn as_rare_bytes(&self, needle: &[u8]) -> (u8, u8) { |
92 | (needle[self.rare1i as usize], needle[self.rare2i as usize]) |
93 | } |
94 | |
95 | /// Return the rare offsets such that the first offset is always <= to the |
96 | /// second offset. This is useful when the caller doesn't care whether |
97 | /// rare1 is rarer than rare2, but just wants to ensure that they are |
98 | /// ordered with respect to one another. |
99 | #[cfg (memchr_runtime_simd)] |
100 | pub(crate) fn as_rare_ordered_usize(&self) -> (usize, usize) { |
101 | let (rare1i, rare2i) = self.as_rare_ordered_u8(); |
102 | (rare1i as usize, rare2i as usize) |
103 | } |
104 | |
105 | /// Like as_rare_ordered_usize, but returns the offsets as their native |
106 | /// u8 values. |
107 | #[cfg (memchr_runtime_simd)] |
108 | pub(crate) fn as_rare_ordered_u8(&self) -> (u8, u8) { |
109 | if self.rare1i <= self.rare2i { |
110 | (self.rare1i, self.rare2i) |
111 | } else { |
112 | (self.rare2i, self.rare1i) |
113 | } |
114 | } |
115 | |
116 | /// Return the rare offsets as usize values in the order in which they were |
117 | /// constructed. rare1, for example, is constructed as the "rarer" byte, |
118 | /// and thus, callers may want to treat it differently from rare2. |
119 | pub(crate) fn as_rare_usize(&self) -> (usize, usize) { |
120 | (self.rare1i as usize, self.rare2i as usize) |
121 | } |
122 | |
123 | /// Return the byte frequency rank of each byte. The higher the rank, the |
124 | /// more frequency the byte is predicted to be. The needle given must be |
125 | /// the same one given to the RareNeedleBytes constructor. |
126 | pub(crate) fn as_ranks(&self, needle: &[u8]) -> (usize, usize) { |
127 | let (b1, b2) = self.as_rare_bytes(needle); |
128 | (rank(b1), rank(b2)) |
129 | } |
130 | } |
131 | |
132 | /// Return the heuristical frequency rank of the given byte. A lower rank |
133 | /// means the byte is believed to occur less frequently. |
134 | fn rank(b: u8) -> usize { |
135 | crate::memmem::byte_frequencies::BYTE_FREQUENCIES[b as usize] as usize |
136 | } |
137 | |