1 | use alloc::{vec, vec::Vec}; |
2 | |
3 | use crate::{ |
4 | packed::pattern::{PatternID, Patterns}, |
5 | util::search::Match, |
6 | }; |
7 | |
/// The type of the rolling hash used in the Rabin-Karp algorithm.
///
/// This is the platform's native word size, and all hash arithmetic below
/// uses wrapping operations, so overflow is intentional and harmless.
type Hash = usize;
10 | |
/// The number of buckets to store our patterns in. We don't want this to be
/// too big in order to avoid wasting memory, but we don't want it to be too
/// small either to avoid spending too much time confirming literals.
///
/// The number of buckets MUST be a power of two. Otherwise, determining the
/// bucket from a hash will slow down the code considerably. Using a power
/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
/// instruction. (64 = 2^6, so the modulus reduces to masking the low 6 bits
/// of the hash.)
const NUM_BUCKETS: usize = 64;
20 | |
21 | /// An implementation of the Rabin-Karp algorithm. The main idea of this |
22 | /// algorithm is to maintain a rolling hash as it moves through the input, and |
23 | /// then check whether that hash corresponds to the same hash for any of the |
24 | /// patterns we're looking for. |
25 | /// |
26 | /// A draw back of naively scaling Rabin-Karp to multiple patterns is that |
27 | /// it requires all of the patterns to be the same length, which in turn |
28 | /// corresponds to the number of bytes to hash. We adapt this to work for |
29 | /// multiple patterns of varying size by fixing the number of bytes to hash |
30 | /// to be the length of the smallest pattern. We also split the patterns into |
31 | /// several buckets to hopefully make the confirmation step faster. |
32 | /// |
33 | /// Wikipedia has a decent explanation, if a bit heavy on the theory: |
34 | /// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm |
35 | /// |
36 | /// But ESMAJ provides something a bit more concrete: |
37 | /// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html |
38 | #[derive (Clone, Debug)] |
39 | pub struct RabinKarp { |
40 | /// The order of patterns in each bucket is significant. Namely, they are |
41 | /// arranged such that the first one to match is the correct match. This |
42 | /// may not necessarily correspond to the order provided by the caller. |
43 | /// For example, if leftmost-longest semantics are used, then the patterns |
44 | /// are sorted by their length in descending order. If leftmost-first |
45 | /// semantics are used, then the patterns are sorted by their pattern ID |
46 | /// in ascending order (which corresponds to the caller's order). |
47 | buckets: Vec<Vec<(Hash, PatternID)>>, |
48 | /// The length of the hashing window. Generally, this corresponds to the |
49 | /// length of the smallest pattern. |
50 | hash_len: usize, |
51 | /// The factor to subtract out of a hash before updating it with a new |
52 | /// byte. |
53 | hash_2pow: usize, |
54 | /// The maximum identifier of a pattern. This is used as a sanity check |
55 | /// to ensure that the patterns provided by the caller are the same as |
56 | /// the patterns that were used to compile the matcher. This sanity check |
57 | /// possibly permits safely eliminating bounds checks regardless of what |
58 | /// patterns are provided by the caller. |
59 | /// |
60 | /// (Currently, we don't use this to elide bounds checks since it doesn't |
61 | /// result in a measurable performance improvement, but we do use it for |
62 | /// better failure modes.) |
63 | max_pattern_id: PatternID, |
64 | } |
65 | |
66 | impl RabinKarp { |
67 | /// Compile a new Rabin-Karp matcher from the patterns given. |
68 | /// |
69 | /// This panics if any of the patterns in the collection are empty, or if |
70 | /// the collection is itself empty. |
71 | pub fn new(patterns: &Patterns) -> RabinKarp { |
72 | assert!(patterns.len() >= 1); |
73 | let hash_len = patterns.minimum_len(); |
74 | assert!(hash_len >= 1); |
75 | |
76 | let mut hash_2pow = 1usize; |
77 | for _ in 1..hash_len { |
78 | hash_2pow = hash_2pow.wrapping_shl(1); |
79 | } |
80 | |
81 | let mut rk = RabinKarp { |
82 | buckets: vec![vec![]; NUM_BUCKETS], |
83 | hash_len, |
84 | hash_2pow, |
85 | max_pattern_id: patterns.max_pattern_id(), |
86 | }; |
87 | for (id, pat) in patterns.iter() { |
88 | let hash = rk.hash(&pat.bytes()[..rk.hash_len]); |
89 | let bucket = hash % NUM_BUCKETS; |
90 | rk.buckets[bucket].push((hash, id)); |
91 | } |
92 | rk |
93 | } |
94 | |
95 | /// Return the first matching pattern in the given haystack, begining the |
96 | /// search at `at`. |
97 | pub fn find_at( |
98 | &self, |
99 | patterns: &Patterns, |
100 | haystack: &[u8], |
101 | mut at: usize, |
102 | ) -> Option<Match> { |
103 | assert_eq!(NUM_BUCKETS, self.buckets.len()); |
104 | assert_eq!( |
105 | self.max_pattern_id, |
106 | patterns.max_pattern_id(), |
107 | "Rabin-Karp must be called with same patterns it was built with" , |
108 | ); |
109 | |
110 | if at + self.hash_len > haystack.len() { |
111 | return None; |
112 | } |
113 | let mut hash = self.hash(&haystack[at..at + self.hash_len]); |
114 | loop { |
115 | let bucket = &self.buckets[hash % NUM_BUCKETS]; |
116 | for &(phash, pid) in bucket { |
117 | if phash == hash { |
118 | if let Some(c) = self.verify(patterns, pid, haystack, at) { |
119 | return Some(c); |
120 | } |
121 | } |
122 | } |
123 | if at + self.hash_len >= haystack.len() { |
124 | return None; |
125 | } |
126 | hash = self.update_hash( |
127 | hash, |
128 | haystack[at], |
129 | haystack[at + self.hash_len], |
130 | ); |
131 | at += 1; |
132 | } |
133 | } |
134 | |
135 | /// Returns the approximate total amount of heap used by this searcher, in |
136 | /// units of bytes. |
137 | pub fn memory_usage(&self) -> usize { |
138 | let num_patterns = self.max_pattern_id as usize + 1; |
139 | self.buckets.len() * core::mem::size_of::<Vec<(Hash, PatternID)>>() |
140 | + num_patterns * core::mem::size_of::<(Hash, PatternID)>() |
141 | } |
142 | |
143 | /// Verify whether the pattern with the given id matches at |
144 | /// `haystack[at..]`. |
145 | /// |
146 | /// We tag this function as `cold` because it helps improve codegen. |
147 | /// Intuitively, it would seem like inlining it would be better. However, |
148 | /// the only time this is called and a match is not found is when there |
149 | /// there is a hash collision, or when a prefix of a pattern matches but |
150 | /// the entire pattern doesn't match. This is hopefully fairly rare, and |
151 | /// if it does occur a lot, it's going to be slow no matter what we do. |
152 | #[cold ] |
153 | fn verify( |
154 | &self, |
155 | patterns: &Patterns, |
156 | id: PatternID, |
157 | haystack: &[u8], |
158 | at: usize, |
159 | ) -> Option<Match> { |
160 | let pat = patterns.get(id); |
161 | if pat.is_prefix(&haystack[at..]) { |
162 | Some(Match::must(id as usize, at..at + pat.len())) |
163 | } else { |
164 | None |
165 | } |
166 | } |
167 | |
168 | /// Hash the given bytes. |
169 | fn hash(&self, bytes: &[u8]) -> Hash { |
170 | assert_eq!(self.hash_len, bytes.len()); |
171 | |
172 | let mut hash = 0usize; |
173 | for &b in bytes { |
174 | hash = hash.wrapping_shl(1).wrapping_add(b as usize); |
175 | } |
176 | hash |
177 | } |
178 | |
179 | /// Update the hash given based on removing `old_byte` at the beginning |
180 | /// of some byte string, and appending `new_byte` to the end of that same |
181 | /// byte string. |
182 | fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash { |
183 | prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow)) |
184 | .wrapping_shl(1) |
185 | .wrapping_add(new_byte as usize) |
186 | } |
187 | } |
188 | |