use alloc::{vec, vec::Vec};

use crate::{
    packed::pattern::{PatternID, Patterns},
    util::search::Match,
};

/// The type of the rolling hash used in the Rabin-Karp algorithm.
type Hash = usize;

/// The number of buckets to store our patterns in. We don't want this to be
/// too big in order to avoid wasting memory, but we don't want it to be too
/// small either to avoid spending too much time confirming literals.
///
/// The number of buckets MUST be a power of two. Otherwise, determining the
/// bucket from a hash will slow down the code considerably. Using a power
/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
/// instruction.
const NUM_BUCKETS: usize = 64;
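
// Illustrative compile-time check of the invariant described above. (This
// assumes `assert!` in const contexts, i.e., Rust 1.57+.) A power-of-two
// bucket count is what lets `hash % NUM_BUCKETS` lower to a bit mask.
const _: () = assert!(NUM_BUCKETS.is_power_of_two());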

/// An implementation of the Rabin-Karp algorithm. The main idea of this
/// algorithm is to maintain a rolling hash as it moves through the input, and
/// then check whether that hash corresponds to the same hash for any of the
/// patterns we're looking for.
///
/// A drawback of naively scaling Rabin-Karp to multiple patterns is that
/// it requires all of the patterns to be the same length, which in turn
/// corresponds to the number of bytes to hash. We adapt this to work for
/// multiple patterns of varying size by fixing the number of bytes to hash
/// to be the length of the smallest pattern. We also split the patterns into
/// several buckets to hopefully make the confirmation step faster.
///
/// Wikipedia has a decent explanation, if a bit heavy on the theory:
/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
///
/// But ESMAJ provides something a bit more concrete:
/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html
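///
/// As a sketch of the arithmetic (illustrative notation only; all operations
/// below are wrapping), the hash of a window of `n` bytes and its update when
/// the window slides forward by one byte are:
///
/// ```text
/// H(b[i..i+n])     = b[i]*2^(n-1) + b[i+1]*2^(n-2) + ... + b[i+n-1]
/// H(b[i+1..i+n+1]) = (H(b[i..i+n]) - b[i]*2^(n-1)) * 2 + b[i+n]
/// ```
///
/// The `2^(n-1)` factor is precomputed below as `hash_2pow`.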
#[derive(Clone, Debug)]
pub struct RabinKarp {
    /// The order of patterns in each bucket is significant. Namely, they are
    /// arranged such that the first one to match is the correct match. This
    /// may not necessarily correspond to the order provided by the caller.
    /// For example, if leftmost-longest semantics are used, then the patterns
    /// are sorted by their length in descending order. If leftmost-first
    /// semantics are used, then the patterns are sorted by their pattern ID
    /// in ascending order (which corresponds to the caller's order).
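    ///
    /// For example (illustrative only), if both `samwise` and `sam` land in
    /// the same bucket, leftmost-longest semantics place `samwise` before
    /// `sam`, while leftmost-first semantics keep whichever pattern the
    /// caller added first in front.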
    buckets: Vec<Vec<(Hash, PatternID)>>,
    /// The length of the hashing window. Generally, this corresponds to the
    /// length of the smallest pattern.
    hash_len: usize,
    /// The factor to subtract out of a hash before updating it with a new
    /// byte.
    hash_2pow: usize,
    /// The maximum identifier of a pattern. This is used as a sanity check
    /// to ensure that the patterns provided by the caller are the same as
    /// the patterns that were used to compile the matcher. This sanity check
    /// possibly permits safely eliminating bounds checks regardless of what
    /// patterns are provided by the caller.
    ///
    /// (Currently, we don't use this to elide bounds checks since it doesn't
    /// result in a measurable performance improvement, but we do use it for
    /// better failure modes.)
    max_pattern_id: PatternID,
}

impl RabinKarp {
    /// Compile a new Rabin-Karp matcher from the patterns given.
    ///
    /// This panics if any of the patterns in the collection are empty, or if
    /// the collection is itself empty.
    pub fn new(patterns: &Patterns) -> RabinKarp {
        assert!(patterns.len() >= 1);
        let hash_len = patterns.minimum_len();
        assert!(hash_len >= 1);

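        // After this loop, `hash_2pow` is `2^(hash_len - 1)`: the weight
        // carried by the oldest byte in the hashing window, and thus the
        // factor that `update_hash` subtracts out when that byte leaves.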
        let mut hash_2pow = 1usize;
        for _ in 1..hash_len {
            hash_2pow = hash_2pow.wrapping_shl(1);
        }

        let mut rk = RabinKarp {
            buckets: vec![vec![]; NUM_BUCKETS],
            hash_len,
            hash_2pow,
            max_pattern_id: patterns.max_pattern_id(),
        };
        for (id, pat) in patterns.iter() {
            let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
            let bucket = hash % NUM_BUCKETS;
            rk.buckets[bucket].push((hash, id));
        }
        rk
    }

    /// Return the first matching pattern in the given haystack, beginning the
    /// search at `at`.
    pub fn find_at(
        &self,
        patterns: &Patterns,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        assert_eq!(NUM_BUCKETS, self.buckets.len());
        assert_eq!(
            self.max_pattern_id,
            patterns.max_pattern_id(),
            "Rabin-Karp must be called with same patterns it was built with",
        );

        if at + self.hash_len > haystack.len() {
            return None;
        }
        let mut hash = self.hash(&haystack[at..at + self.hash_len]);
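        // Scan one window position at a time: consult the bucket for the
        // current window's hash, verify any stored pattern whose hash matches
        // exactly, and otherwise roll the hash forward by one byte.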
        loop {
            let bucket = &self.buckets[hash % NUM_BUCKETS];
            for &(phash, pid) in bucket {
                if phash == hash {
                    if let Some(c) = self.verify(patterns, pid, haystack, at) {
                        return Some(c);
                    }
                }
            }
            if at + self.hash_len >= haystack.len() {
                return None;
            }
            hash = self.update_hash(
                hash,
                haystack[at],
                haystack[at + self.hash_len],
            );
            at += 1;
        }
    }

    /// Returns the approximate total amount of heap used by this searcher, in
    /// units of bytes.
    pub fn memory_usage(&self) -> usize {
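        // Each pattern occupies exactly one `(Hash, PatternID)` entry across
        // all of the buckets (see `new`), so the total number of entries is
        // the number of patterns.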
        let num_patterns = self.max_pattern_id as usize + 1;
        self.buckets.len() * core::mem::size_of::<Vec<(Hash, PatternID)>>()
            + num_patterns * core::mem::size_of::<(Hash, PatternID)>()
    }

    /// Verify whether the pattern with the given id matches at
    /// `haystack[at..]`.
    ///
    /// We tag this function as `cold` because it helps improve codegen.
    /// Intuitively, it would seem like inlining it would be better. However,
    /// the only time this is called and a match is not found is when there
    /// is a hash collision, or when a prefix of a pattern matches but the
    /// entire pattern doesn't match. This is hopefully fairly rare, and
    /// if it does occur a lot, it's going to be slow no matter what we do.
    #[cold]
    fn verify(
        &self,
        patterns: &Patterns,
        id: PatternID,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        let pat = patterns.get(id);
        if pat.is_prefix(&haystack[at..]) {
            Some(Match::must(id as usize, at..at + pat.len()))
        } else {
            None
        }
    }

    /// Hash the given bytes.
    fn hash(&self, bytes: &[u8]) -> Hash {
        assert_eq!(self.hash_len, bytes.len());

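        // Weight each byte by a power of two: by the time the loop finishes,
        // the first byte has been shifted left `hash_len - 1` times, which is
        // exactly the `hash_2pow` factor removed later by `update_hash`.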
        let mut hash = 0usize;
        for &b in bytes {
            hash = hash.wrapping_shl(1).wrapping_add(b as usize);
        }
        hash
    }

    /// Update the given hash by removing `old_byte` from the beginning of the
    /// hashing window and appending `new_byte` to the end of that same
    /// window.
    fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
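        // Subtract the departing byte's contribution (weighted by
        // `hash_2pow`), shift the rest of the window up by one bit and add
        // the incoming byte.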
        prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
            .wrapping_shl(1)
            .wrapping_add(new_byte as usize)
    }
}
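
// A small, self-contained sketch of the rolling-hash invariant relied on
// above. It duplicates the hash arithmetic rather than exercising `RabinKarp`
// or `Patterns` directly, so `sketch_hash` and the test below are purely
// illustrative and make no assumptions about the rest of the crate's API.
#[cfg(test)]
mod rolling_hash_sketch {
    /// The same arithmetic as `RabinKarp::hash`, copied for illustration.
    fn sketch_hash(bytes: &[u8]) -> usize {
        let mut hash = 0usize;
        for &b in bytes {
            hash = hash.wrapping_shl(1).wrapping_add(b as usize);
        }
        hash
    }

    /// Checks that the incremental update (subtract the departing byte's
    /// weighted contribution, shift, add the incoming byte) agrees with
    /// rehashing the shifted window from scratch.
    #[test]
    fn update_matches_rehash() {
        let haystack = b"abracadabra";
        let window = 4;
        // The weight of the oldest byte, i.e., `hash_2pow` above.
        let pow = 1usize << (window - 1);
        let mut hash = sketch_hash(&haystack[..window]);
        for at in 0..haystack.len() - window {
            hash = hash
                .wrapping_sub((haystack[at] as usize).wrapping_mul(pow))
                .wrapping_shl(1)
                .wrapping_add(haystack[at + window] as usize);
            assert_eq!(hash, sketch_hash(&haystack[at + 1..at + 1 + window]));
        }
    }
}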