1 | use alloc::{vec, vec::Vec}; |
2 | |
3 | use crate::{ |
4 | packed::pattern::{PatternID, Patterns}, |
5 | util::search::Match, |
6 | }; |
7 | |
/// The type of the rolling hash used in the Rabin-Karp algorithm.
///
/// This is the platform's native word size, and all hash arithmetic below
/// uses wrapping operations, so overflow is intentional and harmless.
type Hash = usize;
10 | |
/// The number of buckets to store our patterns in. We don't want this to be
/// too big in order to avoid wasting memory, but we don't want it to be too
/// small either to avoid spending too much time confirming literals.
///
/// The number of buckets MUST be a power of two. Otherwise, determining the
/// bucket from a hash will slow down the code considerably. Using a power
/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
/// instruction. (64 = 2^6, so the modulus reduces to masking the low 6 bits
/// of the hash.)
const NUM_BUCKETS: usize = 64;
20 | |
21 | /// An implementation of the Rabin-Karp algorithm. The main idea of this |
22 | /// algorithm is to maintain a rolling hash as it moves through the input, and |
23 | /// then check whether that hash corresponds to the same hash for any of the |
24 | /// patterns we're looking for. |
25 | /// |
26 | /// A draw back of naively scaling Rabin-Karp to multiple patterns is that |
27 | /// it requires all of the patterns to be the same length, which in turn |
28 | /// corresponds to the number of bytes to hash. We adapt this to work for |
29 | /// multiple patterns of varying size by fixing the number of bytes to hash |
30 | /// to be the length of the smallest pattern. We also split the patterns into |
31 | /// several buckets to hopefully make the confirmation step faster. |
32 | /// |
33 | /// Wikipedia has a decent explanation, if a bit heavy on the theory: |
34 | /// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm |
35 | /// |
36 | /// But ESMAJ provides something a bit more concrete: |
37 | /// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html |
38 | #[derive (Clone, Debug)] |
39 | pub struct RabinKarp { |
40 | /// The order of patterns in each bucket is significant. Namely, they are |
41 | /// arranged such that the first one to match is the correct match. This |
42 | /// may not necessarily correspond to the order provided by the caller. |
43 | /// For example, if leftmost-longest semantics are used, then the patterns |
44 | /// are sorted by their length in descending order. If leftmost-first |
45 | /// semantics are used, then the patterns are sorted by their pattern ID |
46 | /// in ascending order (which corresponds to the caller's order). |
47 | buckets: Vec<Vec<(Hash, PatternID)>>, |
48 | /// The length of the hashing window. Generally, this corresponds to the |
49 | /// length of the smallest pattern. |
50 | hash_len: usize, |
51 | /// The factor to subtract out of a hash before updating it with a new |
52 | /// byte. |
53 | hash_2pow: usize, |
54 | /// The maximum identifier of a pattern. This is used as a sanity check |
55 | /// to ensure that the patterns provided by the caller are the same as |
56 | /// the patterns that were used to compile the matcher. This sanity check |
57 | /// possibly permits safely eliminating bounds checks regardless of what |
58 | /// patterns are provided by the caller. |
59 | /// |
60 | /// (Currently, we don't use this to elide bounds checks since it doesn't |
61 | /// result in a measurable performance improvement, but we do use it for |
62 | /// better failure modes.) |
63 | max_pattern_id: PatternID, |
64 | } |
65 | |
66 | impl RabinKarp { |
67 | /// Compile a new Rabin-Karp matcher from the patterns given. |
68 | /// |
69 | /// This panics if any of the patterns in the collection are empty, or if |
70 | /// the collection is itself empty. |
71 | pub fn new(patterns: &Patterns) -> RabinKarp { |
72 | assert!(patterns.len() >= 1); |
73 | let hash_len = patterns.minimum_len(); |
74 | assert!(hash_len >= 1); |
75 | |
76 | let mut hash_2pow = 1usize; |
77 | for _ in 1..hash_len { |
78 | hash_2pow = hash_2pow.wrapping_shl(1); |
79 | } |
80 | |
81 | let mut rk = RabinKarp { |
82 | buckets: vec![vec![]; NUM_BUCKETS], |
83 | hash_len, |
84 | hash_2pow, |
85 | max_pattern_id: patterns.max_pattern_id(), |
86 | }; |
87 | for (id, pat) in patterns.iter() { |
88 | let hash = rk.hash(&pat.bytes()[..rk.hash_len]); |
89 | let bucket = hash % NUM_BUCKETS; |
90 | rk.buckets[bucket].push((hash, id)); |
91 | } |
92 | rk |
93 | } |
94 | |
95 | /// Return the first matching pattern in the given haystack, begining the |
96 | /// search at `at`. |
97 | pub fn find_at( |
98 | &self, |
99 | patterns: &Patterns, |
100 | haystack: &[u8], |
101 | mut at: usize, |
102 | ) -> Option<Match> { |
103 | assert_eq!(NUM_BUCKETS, self.buckets.len()); |
104 | assert_eq!( |
105 | self.max_pattern_id, |
106 | patterns.max_pattern_id(), |
107 | "Rabin-Karp must be called with same patterns it was built with" , |
108 | ); |
109 | |
110 | if at + self.hash_len > haystack.len() { |
111 | return None; |
112 | } |
113 | let mut hash = self.hash(&haystack[at..at + self.hash_len]); |
114 | loop { |
115 | let bucket = &self.buckets[hash % NUM_BUCKETS]; |
116 | for &(phash, pid) in bucket { |
117 | if phash == hash { |
118 | if let Some(c) = self.verify(patterns, pid, haystack, at) { |
119 | return Some(c); |
120 | } |
121 | } |
122 | } |
123 | if at + self.hash_len >= haystack.len() { |
124 | return None; |
125 | } |
126 | hash = self.update_hash( |
127 | hash, |
128 | haystack[at], |
129 | haystack[at + self.hash_len], |
130 | ); |
131 | at += 1; |
132 | } |
133 | } |
134 | |
135 | /// Returns the approximate total amount of heap used by this searcher, in |
136 | /// units of bytes. |
137 | pub fn memory_usage(&self) -> usize { |
138 | let num_patterns = self.max_pattern_id as usize + 1; |
139 | self.buckets.len() * core::mem::size_of::<Vec<(Hash, PatternID)>>() |
140 | + num_patterns * core::mem::size_of::<(Hash, PatternID)>() |
141 | } |
142 | |
143 | /// Verify whether the pattern with the given id matches at |
144 | /// `haystack[at..]`. |
145 | /// |
146 | /// We tag this function as `cold` because it helps improve codegen. |
147 | /// Intuitively, it would seem like inlining it would be better. However, |
148 | /// the only time this is called and a match is not found is when there |
149 | /// there is a hash collision, or when a prefix of a pattern matches but |
150 | /// the entire pattern doesn't match. This is hopefully fairly rare, and |
151 | /// if it does occur a lot, it's going to be slow no matter what we do. |
152 | #[cold ] |
153 | fn verify( |
154 | &self, |
155 | patterns: &Patterns, |
156 | id: PatternID, |
157 | haystack: &[u8], |
158 | at: usize, |
159 | ) -> Option<Match> { |
160 | let pat = patterns.get(id); |
161 | if pat.is_prefix(&haystack[at..]) { |
162 | Some(Match::must(id as usize, at..at + pat.len())) |
163 | } else { |
164 | None |
165 | } |
166 | } |
167 | |
168 | /// Hash the given bytes. |
169 | fn hash(&self, bytes: &[u8]) -> Hash { |
170 | assert_eq!(self.hash_len, bytes.len()); |
171 | |
172 | let mut hash = 0usize; |
173 | for &b in bytes { |
174 | hash = hash.wrapping_shl(1).wrapping_add(b as usize); |
175 | } |
176 | hash |
177 | } |
178 | |
179 | /// Update the hash given based on removing `old_byte` at the beginning |
180 | /// of some byte string, and appending `new_byte` to the end of that same |
181 | /// byte string. |
182 | fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash { |
183 | prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow)) |
184 | .wrapping_shl(1) |
185 | .wrapping_add(new_byte as usize) |
186 | } |
187 | } |
188 | |