use alloc::{sync::Arc, vec, vec::Vec};

use crate::{packed::pattern::Patterns, util::search::Match, PatternID};

/// The type of the rolling hash used in the Rabin-Karp algorithm.
type Hash = usize;

/// The number of buckets to store our patterns in. We don't want this to be
/// too big in order to avoid wasting memory, but we don't want it to be too
/// small either to avoid spending too much time confirming literals.
///
/// The number of buckets MUST be a power of two. Otherwise, determining the
/// bucket from a hash will slow down the code considerably. Using a power
/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
/// instruction.
const NUM_BUCKETS: usize = 64;
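
// An illustrative compile-time guard for the invariant documented above
// (assuming a toolchain where `assert!` is usable in `const` contexts,
// i.e., Rust 1.57+). When `NUM_BUCKETS` is a power of two,
// `hash % NUM_BUCKETS` is equivalent to `hash & (NUM_BUCKETS - 1)`, which
// is what lets the compiler emit a single `and` instruction.
const _: () = assert!(NUM_BUCKETS.is_power_of_two());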

/// An implementation of the Rabin-Karp algorithm. The main idea of this
/// algorithm is to maintain a rolling hash as it moves through the input, and
/// then check whether that hash corresponds to the same hash for any of the
/// patterns we're looking for.
///
/// A drawback of naively scaling Rabin-Karp to multiple patterns is that
/// it requires all of the patterns to be the same length, which in turn
/// corresponds to the number of bytes to hash. We adapt this to work for
/// multiple patterns of varying size by fixing the number of bytes to hash
/// to be the length of the smallest pattern. We also split the patterns into
/// several buckets to hopefully make the confirmation step faster.
///
/// Wikipedia has a decent explanation, if a bit heavy on the theory:
/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
///
/// But ESMAJ provides something a bit more concrete:
/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html
#[derive(Clone, Debug)]
pub(crate) struct RabinKarp {
    /// The patterns we're searching for.
    patterns: Arc<Patterns>,
    /// The order of patterns in each bucket is significant. Namely, they are
    /// arranged such that the first one to match is the correct match. This
    /// may not necessarily correspond to the order provided by the caller.
    /// For example, if leftmost-longest semantics are used, then the patterns
    /// are sorted by their length in descending order. If leftmost-first
    /// semantics are used, then the patterns are sorted by their pattern ID
    /// in ascending order (which corresponds to the caller's order).
    buckets: Vec<Vec<(Hash, PatternID)>>,
    /// The length of the hashing window. Generally, this corresponds to the
    /// length of the smallest pattern.
    hash_len: usize,
    /// The factor to subtract out of a hash before updating it with a new
    /// byte. This is equal to `2^(hash_len - 1)`, i.e., the weight of the
    /// byte that falls out of the window on each update.
    hash_2pow: usize,
}

impl RabinKarp {
    /// Compile a new Rabin-Karp matcher from the patterns given.
    ///
    /// This panics if any of the patterns in the collection are empty, or if
    /// the collection is itself empty.
    pub(crate) fn new(patterns: &Arc<Patterns>) -> RabinKarp {
        assert!(patterns.len() >= 1);
        let hash_len = patterns.minimum_len();
        assert!(hash_len >= 1);

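        // Compute 2^(hash_len - 1) with wrapping arithmetic. This is the
        // weight carried by the oldest byte in the window, and thus the
        // factor needed to remove that byte when the window slides.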
        let mut hash_2pow = 1usize;
        for _ in 1..hash_len {
            hash_2pow = hash_2pow.wrapping_shl(1);
        }

        let mut rk = RabinKarp {
            patterns: Arc::clone(patterns),
            buckets: vec![vec![]; NUM_BUCKETS],
            hash_len,
            hash_2pow,
        };
        for (id, pat) in patterns.iter() {
            let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
            let bucket = hash % NUM_BUCKETS;
            rk.buckets[bucket].push((hash, id));
        }
        rk
    }

    /// Return the first matching pattern in the given haystack, beginning
    /// the search at `at`.
    pub(crate) fn find_at(
        &self,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        assert_eq!(NUM_BUCKETS, self.buckets.len());

        if at + self.hash_len > haystack.len() {
            return None;
        }
        let mut hash = self.hash(&haystack[at..at + self.hash_len]);
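        // Slide the window one byte at a time: check the bucket for the
        // current hash and, if nothing matches, roll the hash forward.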
        loop {
            let bucket = &self.buckets[hash % NUM_BUCKETS];
            for &(phash, pid) in bucket {
                if phash == hash {
                    if let Some(c) = self.verify(pid, haystack, at) {
                        return Some(c);
                    }
                }
            }
            if at + self.hash_len >= haystack.len() {
                return None;
            }
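            // Roll the hash: remove the contribution of haystack[at] and
            // mix in haystack[at + hash_len].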
            hash = self.update_hash(
                hash,
                haystack[at],
                haystack[at + self.hash_len],
            );
            at += 1;
        }
    }

    /// Returns the approximate total amount of heap used by this searcher, in
    /// units of bytes.
    pub(crate) fn memory_usage(&self) -> usize {
        self.buckets.len() * core::mem::size_of::<Vec<(Hash, PatternID)>>()
            + self.patterns.len() * core::mem::size_of::<(Hash, PatternID)>()
    }

    /// Verify whether the pattern with the given id matches at
    /// `haystack[at..]`.
    ///
    /// We tag this function as `cold` because it helps improve codegen.
    /// Intuitively, it would seem like inlining it would be better. However,
    /// the only time this is called and a match is not found is when there
    /// is a hash collision, or when a prefix of a pattern matches but the
    /// entire pattern doesn't match. This is hopefully fairly rare, and if
    /// it does occur a lot, it's going to be slow no matter what we do.
    #[cold]
    fn verify(
        &self,
        id: PatternID,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        let pat = self.patterns.get(id);
        if pat.is_prefix(&haystack[at..]) {
            Some(Match::new(id, at..at + pat.len()))
        } else {
            None
        }
    }

    /// Hash the given bytes using the recurrence `hash = hash * 2 + byte`
    /// with wrapping arithmetic, so that the byte at position `i` in a
    /// window of length `n` carries a weight of `2^(n - 1 - i)`.
    fn hash(&self, bytes: &[u8]) -> Hash {
        assert_eq!(self.hash_len, bytes.len());

        let mut hash = 0usize;
        for &b in bytes {
            hash = hash.wrapping_shl(1).wrapping_add(b as usize);
        }
        hash
    }

    /// Update the hash given based on removing `old_byte` at the beginning
    /// of some byte string, and appending `new_byte` to the end of that same
    /// byte string. Concretely, this computes
    /// `(prev - old_byte * 2^(hash_len - 1)) * 2 + new_byte`, with all
    /// arithmetic wrapping.
    fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
        prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
            .wrapping_shl(1)
            .wrapping_add(new_byte as usize)
    }
}
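
// A minimal sketch, not from the original source, checking the rolling hash
// algebra used above. It is kept independent of `Patterns` so it can run
// without constructing a full matcher: sliding the window one byte at a time
// with the `update_hash` recurrence must agree with hashing each window from
// scratch.
#[cfg(test)]
mod tests {
    /// Hash `bytes` from scratch, mirroring `RabinKarp::hash`.
    fn hash(bytes: &[u8]) -> usize {
        let mut h = 0usize;
        for &b in bytes {
            h = h.wrapping_shl(1).wrapping_add(usize::from(b));
        }
        h
    }

    #[test]
    fn rolling_update_matches_fresh_hash() {
        let haystack = b"abracadabra";
        let len = 3;
        // The weight of the oldest byte in the window, i.e., `hash_2pow`.
        let hash_2pow = 1usize << (len - 1);
        let mut h = hash(&haystack[..len]);
        for at in 0..haystack.len() - len {
            // Mirror `RabinKarp::update_hash`: drop haystack[at], then mix
            // in haystack[at + len].
            let old = usize::from(haystack[at]);
            let new = usize::from(haystack[at + len]);
            h = h
                .wrapping_sub(old.wrapping_mul(hash_2pow))
                .wrapping_shl(1)
                .wrapping_add(new);
            assert_eq!(h, hash(&haystack[at + 1..at + 1 + len]));
        }
    }
}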