| 1 | /*! |
| 2 | Provides an architecture independent implementation of the "packed pair" |
| 3 | algorithm. |
| 4 | |
| 5 | The "packed pair" algorithm is based on the [generic SIMD] algorithm. The main |
| 6 | difference is that it (by default) uses a background distribution of byte |
| 7 | frequencies to heuristically select the pair of bytes to search for. Note that |
| 8 | this module provides an architecture independent version that doesn't do as |
| 9 | good of a job keeping the search for candidates inside a SIMD hot path. It |
| 10 | however can be good enough in many circumstances. |
| 11 | |
| 12 | [generic SIMD]: http://0x80.pl/articles/simd-strfind.html#first-and-last |
| 13 | */ |
| 14 | |
| 15 | use crate::memchr; |
| 16 | |
| 17 | mod default_rank; |
| 18 | |
| 19 | /// An architecture independent "packed pair" finder. |
| 20 | /// |
| 21 | /// This finder picks two bytes that it believes have high predictive power for |
| 22 | /// indicating an overall match of a needle. At search time, it reports offsets |
| 23 | /// where the needle could match based on whether the pair of bytes it chose |
| 24 | /// match. |
| 25 | /// |
| 26 | /// This is architecture independent because it utilizes `memchr` to find the |
| 27 | /// occurrence of one of the bytes in the pair, and then checks whether the |
| 28 | /// second byte matches. If it does, in the case of [`Finder::find_prefilter`], |
| 29 | /// the location at which the needle could match is returned. |
| 30 | /// |
| 31 | /// It is generally preferred to use architecture specific routines for a |
| 32 | /// "packed pair" prefilter, but this can be a useful fallback when the |
| 33 | /// architecture independent routines are unavailable. |
| 34 | #[derive (Clone, Copy, Debug)] |
| 35 | pub struct Finder { |
| 36 | pair: Pair, |
| 37 | byte1: u8, |
| 38 | byte2: u8, |
| 39 | } |
| 40 | |
| 41 | impl Finder { |
| 42 | /// Create a new prefilter that reports possible locations where the given |
| 43 | /// needle matches. |
| 44 | #[inline ] |
| 45 | pub fn new(needle: &[u8]) -> Option<Finder> { |
| 46 | Finder::with_pair(needle, Pair::new(needle)?) |
| 47 | } |
| 48 | |
| 49 | /// Create a new prefilter using the pair given. |
| 50 | /// |
| 51 | /// If the prefilter could not be constructed, then `None` is returned. |
| 52 | /// |
| 53 | /// This constructor permits callers to control precisely which pair of |
| 54 | /// bytes is used as a predicate. |
| 55 | #[inline ] |
| 56 | pub fn with_pair(needle: &[u8], pair: Pair) -> Option<Finder> { |
| 57 | let byte1 = needle[usize::from(pair.index1())]; |
| 58 | let byte2 = needle[usize::from(pair.index2())]; |
| 59 | // Currently this can never fail so we could just return a Finder, |
| 60 | // but it's conceivable this could change. |
| 61 | Some(Finder { pair, byte1, byte2 }) |
| 62 | } |
| 63 | |
| 64 | /// Run this finder on the given haystack as a prefilter. |
| 65 | /// |
| 66 | /// If a candidate match is found, then an offset where the needle *could* |
| 67 | /// begin in the haystack is returned. |
| 68 | #[inline ] |
| 69 | pub fn find_prefilter(&self, haystack: &[u8]) -> Option<usize> { |
| 70 | let mut i = 0; |
| 71 | let index1 = usize::from(self.pair.index1()); |
| 72 | let index2 = usize::from(self.pair.index2()); |
| 73 | loop { |
| 74 | // Use a fast vectorized implementation to skip to the next |
| 75 | // occurrence of the rarest byte (heuristically chosen) in the |
| 76 | // needle. |
| 77 | i += memchr(self.byte1, &haystack[i..])?; |
| 78 | let found = i; |
| 79 | i += 1; |
| 80 | |
| 81 | // If we can't align our first byte match with the haystack, then a |
| 82 | // match is impossible. |
| 83 | let aligned1 = match found.checked_sub(index1) { |
| 84 | None => continue, |
| 85 | Some(aligned1) => aligned1, |
| 86 | }; |
| 87 | |
| 88 | // Now align the second byte match with the haystack. A mismatch |
| 89 | // means that a match is impossible. |
| 90 | let aligned2 = match aligned1.checked_add(index2) { |
| 91 | None => continue, |
| 92 | Some(aligned_index2) => aligned_index2, |
| 93 | }; |
| 94 | if haystack.get(aligned2).map_or(true, |&b| b != self.byte2) { |
| 95 | continue; |
| 96 | } |
| 97 | |
| 98 | // We've done what we can. There might be a match here. |
| 99 | return Some(aligned1); |
| 100 | } |
| 101 | } |
| 102 | |
| 103 | /// Returns the pair of offsets (into the needle) used to check as a |
| 104 | /// predicate before confirming whether a needle exists at a particular |
| 105 | /// position. |
| 106 | #[inline ] |
| 107 | pub fn pair(&self) -> &Pair { |
| 108 | &self.pair |
| 109 | } |
| 110 | } |
| 111 | |
| 112 | /// A pair of byte offsets into a needle to use as a predicate. |
| 113 | /// |
| 114 | /// This pair is used as a predicate to quickly filter out positions in a |
| 115 | /// haystack in which a needle cannot match. In some cases, this pair can even |
| 116 | /// be used in vector algorithms such that the vector algorithm only switches |
| 117 | /// over to scalar code once this pair has been found. |
| 118 | /// |
| 119 | /// A pair of offsets can be used in both substring search implementations and |
| 120 | /// in prefilters. The former will report matches of a needle in a haystack |
| 121 | /// where as the latter will only report possible matches of a needle. |
| 122 | /// |
| 123 | /// The offsets are limited each to a maximum of 255 to keep memory usage low. |
| 124 | /// Moreover, it's rarely advantageous to create a predicate using offsets |
| 125 | /// greater than 255 anyway. |
| 126 | /// |
| 127 | /// The only guarantee enforced on the pair of offsets is that they are not |
| 128 | /// equivalent. It is not necessarily the case that `index1 < index2` for |
| 129 | /// example. By convention, `index1` corresponds to the byte in the needle |
| 130 | /// that is believed to be most the predictive. Note also that because of the |
| 131 | /// requirement that the indices be both valid for the needle used to build |
| 132 | /// the pair and not equal, it follows that a pair can only be constructed for |
| 133 | /// needles with length at least 2. |
| 134 | #[derive (Clone, Copy, Debug)] |
| 135 | pub struct Pair { |
| 136 | index1: u8, |
| 137 | index2: u8, |
| 138 | } |
| 139 | |
| 140 | impl Pair { |
| 141 | /// Create a new pair of offsets from the given needle. |
| 142 | /// |
| 143 | /// If a pair could not be created (for example, if the needle is too |
| 144 | /// short), then `None` is returned. |
| 145 | /// |
| 146 | /// This chooses the pair in the needle that is believed to be as |
| 147 | /// predictive of an overall match of the needle as possible. |
| 148 | #[inline ] |
| 149 | pub fn new(needle: &[u8]) -> Option<Pair> { |
| 150 | Pair::with_ranker(needle, DefaultFrequencyRank) |
| 151 | } |
| 152 | |
| 153 | /// Create a new pair of offsets from the given needle and ranker. |
| 154 | /// |
| 155 | /// This permits the caller to choose a background frequency distribution |
| 156 | /// with which bytes are selected. The idea is to select a pair of bytes |
| 157 | /// that is believed to strongly predict a match in the haystack. This |
| 158 | /// usually means selecting bytes that occur rarely in a haystack. |
| 159 | /// |
| 160 | /// If a pair could not be created (for example, if the needle is too |
| 161 | /// short), then `None` is returned. |
| 162 | #[inline ] |
| 163 | pub fn with_ranker<R: HeuristicFrequencyRank>( |
| 164 | needle: &[u8], |
| 165 | ranker: R, |
| 166 | ) -> Option<Pair> { |
| 167 | if needle.len() <= 1 { |
| 168 | return None; |
| 169 | } |
| 170 | // Find the rarest two bytes. We make them distinct indices by |
| 171 | // construction. (The actual byte value may be the same in degenerate |
| 172 | // cases, but that's OK.) |
| 173 | let (mut rare1, mut index1) = (needle[0], 0); |
| 174 | let (mut rare2, mut index2) = (needle[1], 1); |
| 175 | if ranker.rank(rare2) < ranker.rank(rare1) { |
| 176 | core::mem::swap(&mut rare1, &mut rare2); |
| 177 | core::mem::swap(&mut index1, &mut index2); |
| 178 | } |
| 179 | let max = usize::from(core::u8::MAX); |
| 180 | for (i, &b) in needle.iter().enumerate().take(max).skip(2) { |
| 181 | if ranker.rank(b) < ranker.rank(rare1) { |
| 182 | rare2 = rare1; |
| 183 | index2 = index1; |
| 184 | rare1 = b; |
| 185 | index1 = u8::try_from(i).unwrap(); |
| 186 | } else if b != rare1 && ranker.rank(b) < ranker.rank(rare2) { |
| 187 | rare2 = b; |
| 188 | index2 = u8::try_from(i).unwrap(); |
| 189 | } |
| 190 | } |
| 191 | // While not strictly required for how a Pair is normally used, we |
| 192 | // really don't want these to be equivalent. If they were, it would |
| 193 | // reduce the effectiveness of candidate searching using these rare |
| 194 | // bytes by increasing the rate of false positives. |
| 195 | assert_ne!(index1, index2); |
| 196 | Some(Pair { index1, index2 }) |
| 197 | } |
| 198 | |
| 199 | /// Create a new pair using the offsets given for the needle given. |
| 200 | /// |
| 201 | /// This bypasses any sort of heuristic process for choosing the offsets |
| 202 | /// and permits the caller to choose the offsets themselves. |
| 203 | /// |
| 204 | /// Indices are limited to valid `u8` values so that a `Pair` uses less |
| 205 | /// memory. It is not possible to create a `Pair` with offsets bigger than |
| 206 | /// `u8::MAX`. It's likely that such a thing is not needed, but if it is, |
| 207 | /// it's suggested to build your own bespoke algorithm because you're |
| 208 | /// likely working on a very niche case. (File an issue if this suggestion |
| 209 | /// does not make sense to you.) |
| 210 | /// |
| 211 | /// If a pair could not be created (for example, if the needle is too |
| 212 | /// short), then `None` is returned. |
| 213 | #[inline ] |
| 214 | pub fn with_indices( |
| 215 | needle: &[u8], |
| 216 | index1: u8, |
| 217 | index2: u8, |
| 218 | ) -> Option<Pair> { |
| 219 | // While not strictly required for how a Pair is normally used, we |
| 220 | // really don't want these to be equivalent. If they were, it would |
| 221 | // reduce the effectiveness of candidate searching using these rare |
| 222 | // bytes by increasing the rate of false positives. |
| 223 | if index1 == index2 { |
| 224 | return None; |
| 225 | } |
| 226 | // Similarly, invalid indices means the Pair is invalid too. |
| 227 | if usize::from(index1) >= needle.len() { |
| 228 | return None; |
| 229 | } |
| 230 | if usize::from(index2) >= needle.len() { |
| 231 | return None; |
| 232 | } |
| 233 | Some(Pair { index1, index2 }) |
| 234 | } |
| 235 | |
| 236 | /// Returns the first offset of the pair. |
| 237 | #[inline ] |
| 238 | pub fn index1(&self) -> u8 { |
| 239 | self.index1 |
| 240 | } |
| 241 | |
| 242 | /// Returns the second offset of the pair. |
| 243 | #[inline ] |
| 244 | pub fn index2(&self) -> u8 { |
| 245 | self.index2 |
| 246 | } |
| 247 | } |
| 248 | |
| 249 | /// This trait allows the user to customize the heuristic used to determine the |
| 250 | /// relative frequency of a given byte in the dataset being searched. |
| 251 | /// |
| 252 | /// The use of this trait can have a dramatic impact on performance depending |
| 253 | /// on the type of data being searched. The details of why are explained in the |
| 254 | /// docs of [`crate::memmem::Prefilter`]. To summarize, the core algorithm uses |
| 255 | /// a prefilter to quickly identify candidate matches that are later verified |
| 256 | /// more slowly. This prefilter is implemented in terms of trying to find |
| 257 | /// `rare` bytes at specific offsets that will occur less frequently in the |
| 258 | /// dataset. While the concept of a `rare` byte is similar for most datasets, |
| 259 | /// there are some specific datasets (like binary executables) that have |
| 260 | /// dramatically different byte distributions. For these datasets customizing |
| 261 | /// the byte frequency heuristic can have a massive impact on performance, and |
| 262 | /// might even need to be done at runtime. |
| 263 | /// |
| 264 | /// The default implementation of `HeuristicFrequencyRank` reads from the |
| 265 | /// static frequency table defined in `src/memmem/byte_frequencies.rs`. This |
| 266 | /// is optimal for most inputs, so if you are unsure of the impact of using a |
| 267 | /// custom `HeuristicFrequencyRank` you should probably just use the default. |
| 268 | /// |
| 269 | /// # Example |
| 270 | /// |
| 271 | /// ``` |
| 272 | /// use memchr::{ |
| 273 | /// arch::all::packedpair::HeuristicFrequencyRank, |
| 274 | /// memmem::FinderBuilder, |
| 275 | /// }; |
| 276 | /// |
| 277 | /// /// A byte-frequency table that is good for scanning binary executables. |
| 278 | /// struct Binary; |
| 279 | /// |
| 280 | /// impl HeuristicFrequencyRank for Binary { |
| 281 | /// fn rank(&self, byte: u8) -> u8 { |
| 282 | /// const TABLE: [u8; 256] = [ |
| 283 | /// 255, 128, 61, 43, 50, 41, 27, 28, 57, 15, 21, 13, 24, 17, 17, |
| 284 | /// 89, 58, 16, 11, 7, 14, 23, 7, 6, 24, 9, 6, 5, 9, 4, 7, 16, |
| 285 | /// 68, 11, 9, 6, 88, 7, 4, 4, 23, 9, 4, 8, 8, 5, 10, 4, 30, 11, |
| 286 | /// 9, 24, 11, 5, 5, 5, 19, 11, 6, 17, 9, 9, 6, 8, |
| 287 | /// 48, 58, 11, 14, 53, 40, 9, 9, 254, 35, 3, 6, 52, 23, 6, 6, 27, |
| 288 | /// 4, 7, 11, 14, 13, 10, 11, 11, 5, 2, 10, 16, 12, 6, 19, |
| 289 | /// 19, 20, 5, 14, 16, 31, 19, 7, 14, 20, 4, 4, 19, 8, 18, 20, 24, |
| 290 | /// 1, 25, 19, 58, 29, 10, 5, 15, 20, 2, 2, 9, 4, 3, 5, |
| 291 | /// 51, 11, 4, 53, 23, 39, 6, 4, 13, 81, 4, 186, 5, 67, 3, 2, 15, |
| 292 | /// 0, 0, 1, 3, 2, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, |
| 293 | /// 12, 2, 1, 1, 3, 1, 1, 1, 6, 1, 2, 1, 3, 1, 1, 2, 9, 1, 1, 0, |
| 294 | /// 2, 2, 4, 4, 11, 6, 7, 3, 6, 9, 4, 5, |
| 295 | /// 46, 18, 8, 18, 17, 3, 8, 20, 16, 10, 3, 7, 175, 4, 6, 7, 13, |
| 296 | /// 3, 7, 3, 3, 1, 3, 3, 10, 3, 1, 5, 2, 0, 1, 2, |
| 297 | /// 16, 3, 5, 1, 6, 1, 1, 2, 58, 20, 3, 14, 12, 2, 1, 3, 16, 3, 5, |
| 298 | /// 8, 3, 1, 8, 6, 17, 6, 5, 3, 8, 6, 13, 175, |
| 299 | /// ]; |
| 300 | /// TABLE[byte as usize] |
| 301 | /// } |
| 302 | /// } |
| 303 | /// // Create a new finder with the custom heuristic. |
| 304 | /// let finder = FinderBuilder::new() |
| 305 | /// .build_forward_with_ranker(Binary, b" \x00\x00\xdd\xdd" ); |
| 306 | /// // Find needle with custom heuristic. |
| 307 | /// assert!(finder.find(b" \x00\x00\x00\xdd\xdd" ).is_some()); |
| 308 | /// ``` |
| 309 | pub trait HeuristicFrequencyRank { |
| 310 | /// Return the heuristic frequency rank of the given byte. A lower rank |
| 311 | /// means the byte is believed to occur less frequently in the haystack. |
| 312 | /// |
| 313 | /// Some uses of this heuristic may treat arbitrary absolute rank values as |
| 314 | /// significant. For example, an implementation detail in this crate may |
| 315 | /// determine that heuristic prefilters are inappropriate if every byte in |
| 316 | /// the needle has a "high" rank. |
| 317 | fn rank(&self, byte: u8) -> u8; |
| 318 | } |
| 319 | |
| 320 | /// The default byte frequency heuristic that is good for most haystacks. |
| 321 | pub(crate) struct DefaultFrequencyRank; |
| 322 | |
| 323 | impl HeuristicFrequencyRank for DefaultFrequencyRank { |
| 324 | fn rank(&self, byte: u8) -> u8 { |
| 325 | self::default_rank::RANK[usize::from(byte)] |
| 326 | } |
| 327 | } |
| 328 | |
| 329 | /// This permits passing any implementation of `HeuristicFrequencyRank` as a |
| 330 | /// borrowed version of itself. |
| 331 | impl<'a, R> HeuristicFrequencyRank for &'a R |
| 332 | where |
| 333 | R: HeuristicFrequencyRank, |
| 334 | { |
| 335 | fn rank(&self, byte: u8) -> u8 { |
| 336 | (**self).rank(byte) |
| 337 | } |
| 338 | } |
| 339 | |
| 340 | #[cfg (test)] |
| 341 | mod tests { |
| 342 | use super::*; |
| 343 | |
| 344 | #[test ] |
| 345 | fn forward_packedpair() { |
| 346 | fn find( |
| 347 | haystack: &[u8], |
| 348 | needle: &[u8], |
| 349 | _index1: u8, |
| 350 | _index2: u8, |
| 351 | ) -> Option<Option<usize>> { |
| 352 | // We ignore the index positions requested since it winds up making |
| 353 | // this test too slow overall. |
| 354 | let f = Finder::new(needle)?; |
| 355 | Some(f.find_prefilter(haystack)) |
| 356 | } |
| 357 | crate::tests::packedpair::Runner::new().fwd(find).run() |
| 358 | } |
| 359 | } |
| 360 | |