rabinkarp.rs source code [crates/aho-corasick-0.7.20/src/packed/rabinkarp.rs]

1	use std::mem;
2
3	use crate::packed::pattern::{PatternID, Patterns};
4	use crate::Match;
5
6	/// The type of the rolling hash used in the Rabin-Karp algorithm.
7	type Hash = usize;
8
9	/// The number of buckets to store our patterns in. We don't want this to be
10	/// too big in order to avoid wasting memory, but we don't want it to be too
11	/// small either to avoid spending too much time confirming literals.
12	///
13	/// The number of buckets MUST be a power of two. Otherwise, determining the
14	/// bucket from a hash will slow down the code considerably. Using a power
15	/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
16	/// instruction.
17	const NUM_BUCKETS: usize = `64`;
18
19	/// An implementation of the Rabin-Karp algorithm. The main idea of this
20	/// algorithm is to maintain a rolling hash as it moves through the input, and
21	/// then check whether that hash corresponds to the same hash for any of the
22	/// patterns we're looking for.
23	///
24	/// A draw back of naively scaling Rabin-Karp to multiple patterns is that
25	/// it requires all of the patterns to be the same length, which in turn
26	/// corresponds to the number of bytes to hash. We adapt this to work for
27	/// multiple patterns of varying size by fixing the number of bytes to hash
28	/// to be the length of the smallest pattern. We also split the patterns into
29	/// several buckets to hopefully make the confirmation step faster.
30	///
31	/// Wikipedia has a decent explanation, if a bit heavy on the theory:
32	/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
33	///
34	/// But ESMAJ provides something a bit more concrete:
35	/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html
36	#[derive(Clone, Debug)]
37	pub struct RabinKarp {
38	/// The order of patterns in each bucket is significant. Namely, they are
39	/// arranged such that the first one to match is the correct match. This
40	/// may not necessarily correspond to the order provided by the caller.
41	/// For example, if leftmost-longest semantics are used, then the patterns
42	/// are sorted by their length in descending order. If leftmost-first
43	/// semantics are used, then the patterns are sorted by their pattern ID
44	/// in ascending order (which corresponds to the caller's order).
45	buckets: Vec<Vec<(Hash, PatternID)>>,
46	/// The length of the hashing window. Generally, this corresponds to the
47	/// length of the smallest pattern.
48	hash_len: usize,
49	/// The factor to subtract out of a hash before updating it with a new
50	/// byte.
51	hash_2pow: usize,
52	/// The maximum identifier of a pattern. This is used as a sanity check
53	/// to ensure that the patterns provided by the caller are the same as
54	/// the patterns that were used to compile the matcher. This sanity check
55	/// possibly permits safely eliminating bounds checks regardless of what
56	/// patterns are provided by the caller.
57	///
58	/// (Currently, we don't use this to elide bounds checks since it doesn't
59	/// result in a measurable performance improvement, but we do use it for
60	/// better failure modes.)
61	max_pattern_id: PatternID,
62	}
63
64	impl RabinKarp {
65	/// Compile a new Rabin-Karp matcher from the patterns given.
66	///
67	/// This panics if any of the patterns in the collection are empty, or if
68	/// the collection is itself empty.
69	pub fn new(patterns: &Patterns) -> RabinKarp {
70	assert!(patterns.len() >= `1`);
71	let hash_len = patterns.minimum_len();
72	assert!(hash_len >= `1`);
73
74	let mut hash_2pow = `1usize`;
75	for _ in `1`..hash_len {
76	hash_2pow = hash_2pow.wrapping_shl(`1`);
77	}
78
79	let mut rk = RabinKarp {
80	buckets: vec![vec![]; NUM_BUCKETS],
81	hash_len,
82	hash_2pow,
83	max_pattern_id: patterns.max_pattern_id(),
84	};
85	for (id, pat) in patterns.iter() {
86	let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
87	let bucket = hash % NUM_BUCKETS;
88	rk.buckets[bucket].push((hash, id));
89	}
90	rk
91	}
92
93	/// Return the first matching pattern in the given haystack, begining the
94	/// search at `at`.
95	pub fn find_at(
96	&self,
97	patterns: &Patterns,
98	haystack: &[u8],
99	mut at: usize,
100	) -> Option<Match> {
101	assert_eq!(NUM_BUCKETS, self.buckets.len());
102	assert_eq!(
103	self.max_pattern_id,
104	patterns.max_pattern_id(),
105	"Rabin-Karp must be called with same patterns it was built with",
106	);
107
108	if at + self.hash_len > haystack.len() {
109	return None;
110	}
111	let mut hash = self.hash(&haystack[at..at + self.hash_len]);
112	loop {
113	let bucket = &self.buckets[hash % NUM_BUCKETS];
114	for &(phash, pid) in bucket {
115	if phash == hash {
116	if let Some(c) = self.verify(patterns, pid, haystack, at) {
117	return Some(c);
118	}
119	}
120	}
121	if at + self.hash_len >= haystack.len() {
122	return None;
123	}
124	hash = self.update_hash(
125	hash,
126	haystack[at],
127	haystack[at + self.hash_len],
128	);
129	at += `1`;
130	}
131	}
132
133	/// Returns the approximate total amount of heap used by this searcher, in
134	/// units of bytes.
135	pub fn heap_bytes(&self) -> usize {
136	let num_patterns = self.max_pattern_id as usize + `1`;
137	self.buckets.len() * mem::size_of::<Vec<(Hash, PatternID)>>()
138	+ num_patterns * mem::size_of::<(Hash, PatternID)>()
139	}
140
141	/// Verify whether the pattern with the given id matches at
142	/// `haystack[at..]`.
143	///
144	/// We tag this function as `cold` because it helps improve codegen.
145	/// Intuitively, it would seem like inlining it would be better. However,
146	/// the only time this is called and a match is not found is when there
147	/// there is a hash collision, or when a prefix of a pattern matches but
148	/// the entire pattern doesn't match. This is hopefully fairly rare, and
149	/// if it does occur a lot, it's going to be slow no matter what we do.
150	#[cold]
151	fn verify(
152	&self,
153	patterns: &Patterns,
154	id: PatternID,
155	haystack: &[u8],
156	at: usize,
157	) -> Option<Match> {
158	let pat = patterns.get(id);
159	if pat.is_prefix(&haystack[at..]) {
160	Some(Match::from_span(id as usize, at, at + pat.len()))
161	} else {
162	None
163	}
164	}
165
166	/// Hash the given bytes.
167	fn hash(&self, bytes: &[u8]) -> Hash {
168	assert_eq!(self.hash_len, bytes.len());
169
170	let mut hash = `0usize`;
171	for &b in bytes {
172	hash = hash.wrapping_shl(`1`).wrapping_add(b as usize);
173	}
174	hash
175	}
176
177	/// Update the hash given based on removing `old_byte` at the beginning
178	/// of some byte string, and appending `new_byte` to the end of that same
179	/// byte string.
180	fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
181	prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
182	.wrapping_shl(`1`)
183	.wrapping_add(new_byte as usize)
184	}
185	}
186