1use std::mem;
2
3use crate::packed::pattern::{PatternID, Patterns};
4use crate::Match;
5
6/// The type of the rolling hash used in the Rabin-Karp algorithm.
7type Hash = usize;
8
9/// The number of buckets to store our patterns in. We don't want this to be
10/// too big in order to avoid wasting memory, but we don't want it to be too
11/// small either to avoid spending too much time confirming literals.
12///
13/// The number of buckets MUST be a power of two. Otherwise, determining the
14/// bucket from a hash will slow down the code considerably. Using a power
15/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
16/// instruction.
17const NUM_BUCKETS: usize = 64;
18
19/// An implementation of the Rabin-Karp algorithm. The main idea of this
20/// algorithm is to maintain a rolling hash as it moves through the input, and
21/// then check whether that hash corresponds to the same hash for any of the
22/// patterns we're looking for.
23///
24/// A draw back of naively scaling Rabin-Karp to multiple patterns is that
25/// it requires all of the patterns to be the same length, which in turn
26/// corresponds to the number of bytes to hash. We adapt this to work for
27/// multiple patterns of varying size by fixing the number of bytes to hash
28/// to be the length of the smallest pattern. We also split the patterns into
29/// several buckets to hopefully make the confirmation step faster.
30///
31/// Wikipedia has a decent explanation, if a bit heavy on the theory:
32/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
33///
34/// But ESMAJ provides something a bit more concrete:
35/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html
36#[derive(Clone, Debug)]
37pub struct RabinKarp {
38 /// The order of patterns in each bucket is significant. Namely, they are
39 /// arranged such that the first one to match is the correct match. This
40 /// may not necessarily correspond to the order provided by the caller.
41 /// For example, if leftmost-longest semantics are used, then the patterns
42 /// are sorted by their length in descending order. If leftmost-first
43 /// semantics are used, then the patterns are sorted by their pattern ID
44 /// in ascending order (which corresponds to the caller's order).
45 buckets: Vec<Vec<(Hash, PatternID)>>,
46 /// The length of the hashing window. Generally, this corresponds to the
47 /// length of the smallest pattern.
48 hash_len: usize,
49 /// The factor to subtract out of a hash before updating it with a new
50 /// byte.
51 hash_2pow: usize,
52 /// The maximum identifier of a pattern. This is used as a sanity check
53 /// to ensure that the patterns provided by the caller are the same as
54 /// the patterns that were used to compile the matcher. This sanity check
55 /// possibly permits safely eliminating bounds checks regardless of what
56 /// patterns are provided by the caller.
57 ///
58 /// (Currently, we don't use this to elide bounds checks since it doesn't
59 /// result in a measurable performance improvement, but we do use it for
60 /// better failure modes.)
61 max_pattern_id: PatternID,
62}
63
64impl RabinKarp {
65 /// Compile a new Rabin-Karp matcher from the patterns given.
66 ///
67 /// This panics if any of the patterns in the collection are empty, or if
68 /// the collection is itself empty.
69 pub fn new(patterns: &Patterns) -> RabinKarp {
70 assert!(patterns.len() >= 1);
71 let hash_len = patterns.minimum_len();
72 assert!(hash_len >= 1);
73
74 let mut hash_2pow = 1usize;
75 for _ in 1..hash_len {
76 hash_2pow = hash_2pow.wrapping_shl(1);
77 }
78
79 let mut rk = RabinKarp {
80 buckets: vec![vec![]; NUM_BUCKETS],
81 hash_len,
82 hash_2pow,
83 max_pattern_id: patterns.max_pattern_id(),
84 };
85 for (id, pat) in patterns.iter() {
86 let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
87 let bucket = hash % NUM_BUCKETS;
88 rk.buckets[bucket].push((hash, id));
89 }
90 rk
91 }
92
93 /// Return the first matching pattern in the given haystack, begining the
94 /// search at `at`.
95 pub fn find_at(
96 &self,
97 patterns: &Patterns,
98 haystack: &[u8],
99 mut at: usize,
100 ) -> Option<Match> {
101 assert_eq!(NUM_BUCKETS, self.buckets.len());
102 assert_eq!(
103 self.max_pattern_id,
104 patterns.max_pattern_id(),
105 "Rabin-Karp must be called with same patterns it was built with",
106 );
107
108 if at + self.hash_len > haystack.len() {
109 return None;
110 }
111 let mut hash = self.hash(&haystack[at..at + self.hash_len]);
112 loop {
113 let bucket = &self.buckets[hash % NUM_BUCKETS];
114 for &(phash, pid) in bucket {
115 if phash == hash {
116 if let Some(c) = self.verify(patterns, pid, haystack, at) {
117 return Some(c);
118 }
119 }
120 }
121 if at + self.hash_len >= haystack.len() {
122 return None;
123 }
124 hash = self.update_hash(
125 hash,
126 haystack[at],
127 haystack[at + self.hash_len],
128 );
129 at += 1;
130 }
131 }
132
133 /// Returns the approximate total amount of heap used by this searcher, in
134 /// units of bytes.
135 pub fn heap_bytes(&self) -> usize {
136 let num_patterns = self.max_pattern_id as usize + 1;
137 self.buckets.len() * mem::size_of::<Vec<(Hash, PatternID)>>()
138 + num_patterns * mem::size_of::<(Hash, PatternID)>()
139 }
140
141 /// Verify whether the pattern with the given id matches at
142 /// `haystack[at..]`.
143 ///
144 /// We tag this function as `cold` because it helps improve codegen.
145 /// Intuitively, it would seem like inlining it would be better. However,
146 /// the only time this is called and a match is not found is when there
147 /// there is a hash collision, or when a prefix of a pattern matches but
148 /// the entire pattern doesn't match. This is hopefully fairly rare, and
149 /// if it does occur a lot, it's going to be slow no matter what we do.
150 #[cold]
151 fn verify(
152 &self,
153 patterns: &Patterns,
154 id: PatternID,
155 haystack: &[u8],
156 at: usize,
157 ) -> Option<Match> {
158 let pat = patterns.get(id);
159 if pat.is_prefix(&haystack[at..]) {
160 Some(Match::from_span(id as usize, at, at + pat.len()))
161 } else {
162 None
163 }
164 }
165
166 /// Hash the given bytes.
167 fn hash(&self, bytes: &[u8]) -> Hash {
168 assert_eq!(self.hash_len, bytes.len());
169
170 let mut hash = 0usize;
171 for &b in bytes {
172 hash = hash.wrapping_shl(1).wrapping_add(b as usize);
173 }
174 hash
175 }
176
177 /// Update the hash given based on removing `old_byte` at the beginning
178 /// of some byte string, and appending `new_byte` to the end of that same
179 /// byte string.
180 fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
181 prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
182 .wrapping_shl(1)
183 .wrapping_add(new_byte as usize)
184 }
185}
186