rabinkarp.rs - Codebrowser

1	use alloc::{sync::Arc, vec, vec::Vec};
2
3	use crate::{packed::pattern::Patterns, util::search::Match, PatternID};
4
5	/// The type of the rolling hash used in the Rabin-Karp algorithm.
6	type Hash = usize;
7
8	/// The number of buckets to store our patterns in. We don't want this to be
9	/// too big in order to avoid wasting memory, but we don't want it to be too
10	/// small either to avoid spending too much time confirming literals.
11	///
12	/// The number of buckets MUST be a power of two. Otherwise, determining the
13	/// bucket from a hash will slow down the code considerably. Using a power
14	/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
15	/// instruction.
16	const NUM_BUCKETS: usize = `64`;
17
18	/// An implementation of the Rabin-Karp algorithm. The main idea of this
19	/// algorithm is to maintain a rolling hash as it moves through the input, and
20	/// then check whether that hash corresponds to the same hash for any of the
21	/// patterns we're looking for.
22	///
23	/// A draw back of naively scaling Rabin-Karp to multiple patterns is that
24	/// it requires all of the patterns to be the same length, which in turn
25	/// corresponds to the number of bytes to hash. We adapt this to work for
26	/// multiple patterns of varying size by fixing the number of bytes to hash
27	/// to be the length of the smallest pattern. We also split the patterns into
28	/// several buckets to hopefully make the confirmation step faster.
29	///
30	/// Wikipedia has a decent explanation, if a bit heavy on the theory:
31	/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
32	///
33	/// But ESMAJ provides something a bit more concrete:
34	/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html
35	#[derive(Clone, Debug)]
36	pub(crate) struct RabinKarp {
37	/// The patterns we're searching for.
38	patterns: Arc<Patterns>,
39	/// The order of patterns in each bucket is significant. Namely, they are
40	/// arranged such that the first one to match is the correct match. This
41	/// may not necessarily correspond to the order provided by the caller.
42	/// For example, if leftmost-longest semantics are used, then the patterns
43	/// are sorted by their length in descending order. If leftmost-first
44	/// semantics are used, then the patterns are sorted by their pattern ID
45	/// in ascending order (which corresponds to the caller's order).
46	buckets: Vec<Vec<(Hash, PatternID)>>,
47	/// The length of the hashing window. Generally, this corresponds to the
48	/// length of the smallest pattern.
49	hash_len: usize,
50	/// The factor to subtract out of a hash before updating it with a new
51	/// byte.
52	hash_2pow: usize,
53	}
54
55	impl RabinKarp {
56	/// Compile a new Rabin-Karp matcher from the patterns given.
57	///
58	/// This panics if any of the patterns in the collection are empty, or if
59	/// the collection is itself empty.
60	pub(crate) fn new(patterns: &Arc<Patterns>) -> RabinKarp {
61	assert!(patterns.len() >= `1`);
62	let hash_len = patterns.minimum_len();
63	assert!(hash_len >= `1`);
64
65	let mut hash_2pow = `1usize`;
66	for _ in `1`..hash_len {
67	hash_2pow = hash_2pow.wrapping_shl(`1`);
68	}
69
70	let mut rk = RabinKarp {
71	patterns: Arc::clone(patterns),
72	buckets: vec![vec![]; NUM_BUCKETS],
73	hash_len,
74	hash_2pow,
75	};
76	for (id, pat) in patterns.iter() {
77	let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
78	let bucket = hash % NUM_BUCKETS;
79	rk.buckets[bucket].push((hash, id));
80	}
81	rk
82	}
83
84	/// Return the first matching pattern in the given haystack, begining the
85	/// search at `at`.
86	pub(crate) fn find_at(
87	&self,
88	haystack: &[u8],
89	mut at: usize,
90	) -> Option<Match> {
91	assert_eq!(NUM_BUCKETS, self.buckets.len());
92
93	if at + self.hash_len > haystack.len() {
94	return None;
95	}
96	let mut hash = self.hash(&haystack[at..at + self.hash_len]);
97	loop {
98	let bucket = &self.buckets[hash % NUM_BUCKETS];
99	for &(phash, pid) in bucket {
100	if phash == hash {
101	if let Some(c) = self.verify(pid, haystack, at) {
102	return Some(c);
103	}
104	}
105	}
106	if at + self.hash_len >= haystack.len() {
107	return None;
108	}
109	hash = self.update_hash(
110	hash,
111	haystack[at],
112	haystack[at + self.hash_len],
113	);
114	at += `1`;
115	}
116	}
117
118	/// Returns the approximate total amount of heap used by this searcher, in
119	/// units of bytes.
120	pub(crate) fn memory_usage(&self) -> usize {
121	self.buckets.len() * core::mem::size_of::<Vec<(Hash, PatternID)>>()
122	+ self.patterns.len() * core::mem::size_of::<(Hash, PatternID)>()
123	}
124
125	/// Verify whether the pattern with the given id matches at
126	/// `haystack[at..]`.
127	///
128	/// We tag this function as `cold` because it helps improve codegen.
129	/// Intuitively, it would seem like inlining it would be better. However,
130	/// the only time this is called and a match is not found is when there
131	/// there is a hash collision, or when a prefix of a pattern matches but
132	/// the entire pattern doesn't match. This is hopefully fairly rare, and
133	/// if it does occur a lot, it's going to be slow no matter what we do.
134	#[cold]
135	fn verify(
136	&self,
137	id: PatternID,
138	haystack: &[u8],
139	at: usize,
140	) -> Option<Match> {
141	let pat = self.patterns.get(id);
142	if pat.is_prefix(&haystack[at..]) {
143	Some(Match::new(id, at..at + pat.len()))
144	} else {
145	None
146	}
147	}
148
149	/// Hash the given bytes.
150	fn hash(&self, bytes: &[u8]) -> Hash {
151	assert_eq!(self.hash_len, bytes.len());
152
153	let mut hash = `0usize`;
154	for &b in bytes {
155	hash = hash.wrapping_shl(`1`).wrapping_add(b as usize);
156	}
157	hash
158	}
159
160	/// Update the hash given based on removing `old_byte` at the beginning
161	/// of some byte string, and appending `new_byte` to the end of that same
162	/// byte string.
163	fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
164	prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
165	.wrapping_shl(`1`)
166	.wrapping_add(new_byte as usize)
167	}
168	}
169