map.rs source code [crates/regex-automata-0.4.9/src/nfa/thompson/map.rs]

1	// This module contains a couple simple and purpose built hash maps. The key
2	// trade off they make is that they serve as caches rather than true maps. That
3	// is, inserting a new entry may cause eviction of another entry. This gives
4	// us two things. First, there's less overhead associated with inserts and
5	// lookups. Secondly, it lets us control our memory usage.
6	//
7	// These maps are used in some fairly hot code when generating NFA states for
8	// large Unicode character classes.
9	//
10	// Instead of exposing a rich hashmap entry API, we just permit the caller to
11	// produce a hash of the key directly. The hash can then be reused for both
12	// lookups and insertions at the cost of leaking abstraction a bit. But these
13	// are for internal use only, so it's fine.
14	//
15	// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
16	// (almost) minimal DFA for large Unicode character classes in linear time.
17	// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
18	// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
19	// since there's a bit more expense in the reverse direction.)
20	//
21	// The Utf8SuffixMap is used when compiling large Unicode character classes for
22	// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
23	// construction of UTF-8 automata by caching common suffixes. This doesn't
24	// get the same space savings as Daciuk's algorithm, but it's basically as
25	// fast as the naive approach and typically winds up using less memory (since
26	// it generates smaller NFAs) despite the presence of the cache.
27	//
28	// These maps effectively represent caching mechanisms for sparse and
29	// byte-range NFA states, respectively. The former represents a single NFA
30	// state with many transitions of equivalent priority while the latter
31	// represents a single NFA state with a single transition. (Neither state ever
32	// has or is an epsilon transition.) Thus, they have different key types. It's
33	// likely we could make one generic map, but the machinery didn't seem worth
34	// it. They are simple enough.
35
36	use alloc::{vec, vec::Vec};
37
38	use crate::{
39	nfa::thompson::Transition,
40	util::{
41	int::{Usize, U64},
42	primitives::StateID,
43	},
44	};
45
46	// Basic FNV-1a hash constants as described in:
47	// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
48	const PRIME: u64 = `1099511628211`;
49	const INIT: u64 = `14695981039346656037`;
50
51	/// A bounded hash map where the key is a sequence of NFA transitions and the
52	/// value is a pre-existing NFA state ID.
53	///
54	/// std's hashmap can be used for this, however, this map has two important
55	/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
56	/// control our memory usage by limited the number of slots. In general, the
57	/// cost here is that this map acts as a cache. That is, inserting a new entry
58	/// may remove an old entry. We are okay with this, since it does not impact
59	/// correctness in the cases where it is used. The only effect that dropping
60	/// states from the cache has is that the resulting NFA generated may be bigger
61	/// than it otherwise would be.
62	///
63	/// This improves benchmarks that compile large Unicode character classes,
64	/// since it makes the generation of (almost) minimal UTF-8 automaton faster.
65	/// Specifically, one could observe the difference with std's hashmap via
66	/// something like the following benchmark:
67	///
68	/// hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'"
69	///
70	/// But to observe that difference, you'd have to modify the code to use
71	/// std's hashmap.
72	///
73	/// It is quite possible that there is a better way to approach this problem.
74	/// For example, if there happens to be a very common state that collides with
75	/// a lot of less frequent states, then we could wind up with very poor caching
76	/// behavior. Alas, the effectiveness of this cache has not been measured.
77	/// Instead, ad hoc experiments suggest that it is "good enough." Additional
78	/// smarts (such as an LRU eviction policy) have to be weighed against the
79	/// amount of extra time they cost.
80	#[derive(Clone, Debug)]
81	pub struct Utf8BoundedMap {
82	/// The current version of this map. Only entries with matching versions
83	/// are considered during lookups. If an entry is found with a mismatched
84	/// version, then the map behaves as if the entry does not exist.
85	///
86	/// This makes it possible to clear the map by simply incrementing the
87	/// version number instead of actually deallocating any storage.
88	version: u16,
89	/// The total number of entries this map can store.
90	capacity: usize,
91	/// The actual entries, keyed by hash. Collisions between different states
92	/// result in the old state being dropped.
93	map: Vec<Utf8BoundedEntry>,
94	}
95
96	/// An entry in this map.
97	#[derive(Clone, Debug, Default)]
98	struct Utf8BoundedEntry {
99	/// The version of the map used to produce this entry. If this entry's
100	/// version does not match the current version of the map, then the map
101	/// should behave as if this entry does not exist.
102	version: u16,
103	/// The key, which is a sorted sequence of non-overlapping NFA transitions.
104	key: Vec<Transition>,
105	/// The state ID corresponding to the state containing the transitions in
106	/// this entry.
107	val: StateID,
108	}
109
110	impl Utf8BoundedMap {
111	/// Create a new bounded map with the given capacity. The map will never
112	/// grow beyond the given size.
113	///
114	/// Note that this does not allocate. Instead, callers must call `clear`
115	/// before using this map. `clear` will allocate space if necessary.
116	///
117	/// This avoids the need to pay for the allocation of this map when
118	/// compiling regexes that lack large Unicode character classes.
119	pub fn new(capacity: usize) -> Utf8BoundedMap {
120	assert!(capacity > `0`);
121	Utf8BoundedMap { version: `0`, capacity, map: vec![] }
122	}
123
124	/// Clear this map of all entries, but permit the reuse of allocation
125	/// if possible.
126	///
127	/// This must be called before the map can be used.
128	pub fn clear(&mut self) {
129	if self.map.is_empty() {
130	self.map = vec![Utf8BoundedEntry::default(); self.capacity];
131	} else {
132	self.version = self.version.wrapping_add(`1`);
133	// If we loop back to version 0, then we forcefully clear the
134	// entire map. Otherwise, it might be possible to incorrectly
135	// match entries used to generate other NFAs.
136	if self.version == `0` {
137	self.map = vec![Utf8BoundedEntry::default(); self.capacity];
138	}
139	}
140	}
141
142	/// Return a hash of the given transitions.
143	pub fn hash(&self, key: &[Transition]) -> usize {
144	let mut h = INIT;
145	for t in key {
146	h = (h ^ u64::from(t.start)).wrapping_mul(PRIME);
147	h = (h ^ u64::from(t.end)).wrapping_mul(PRIME);
148	h = (h ^ t.next.as_u64()).wrapping_mul(PRIME);
149	}
150	(h % self.map.len().as_u64()).as_usize()
151	}
152
153	/// Retrieve the cached state ID corresponding to the given key. The hash
154	/// given must have been computed with `hash` using the same key value.
155	///
156	/// If there is no cached state with the given transitions, then None is
157	/// returned.
158	pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
159	let entry = &self.map[hash];
160	if entry.version != self.version {
161	return None;
162	}
163	// There may be a hash collision, so we need to confirm real equality.
164	if entry.key != key {
165	return None;
166	}
167	Some(entry.val)
168	}
169
170	/// Add a cached state to this map with the given key. Callers should
171	/// ensure that `state_id` points to a state that contains precisely the
172	/// NFA transitions given.
173	///
174	/// `hash` must have been computed using the `hash` method with the same
175	/// key.
176	pub fn set(
177	&mut self,
178	key: Vec<Transition>,
179	hash: usize,
180	state_id: StateID,
181	) {
182	self.map[hash] =
183	Utf8BoundedEntry { version: self.version, key, val: state_id };
184	}
185	}
186
187	/// A cache of suffixes used to modestly compress UTF-8 automata for large
188	/// Unicode character classes.
189	#[derive(Clone, Debug)]
190	pub struct Utf8SuffixMap {
191	/// The current version of this map. Only entries with matching versions
192	/// are considered during lookups. If an entry is found with a mismatched
193	/// version, then the map behaves as if the entry does not exist.
194	version: u16,
195	/// The total number of entries this map can store.
196	capacity: usize,
197	/// The actual entries, keyed by hash. Collisions between different states
198	/// result in the old state being dropped.
199	map: Vec<Utf8SuffixEntry>,
200	}
201
202	/// A key that uniquely identifies an NFA state. It is a triple that represents
203	/// a transition from one state for a particular byte range.
204	#[derive(Clone, Debug, Default, Eq, PartialEq)]
205	pub struct Utf8SuffixKey {
206	pub from: StateID,
207	pub start: u8,
208	pub end: u8,
209	}
210
211	/// An entry in this map.
212	#[derive(Clone, Debug, Default)]
213	struct Utf8SuffixEntry {
214	/// The version of the map used to produce this entry. If this entry's
215	/// version does not match the current version of the map, then the map
216	/// should behave as if this entry does not exist.
217	version: u16,
218	/// The key, which consists of a transition in a particular state.
219	key: Utf8SuffixKey,
220	/// The identifier that the transition in the key maps to.
221	val: StateID,
222	}
223
224	impl Utf8SuffixMap {
225	/// Create a new bounded map with the given capacity. The map will never
226	/// grow beyond the given size.
227	///
228	/// Note that this does not allocate. Instead, callers must call `clear`
229	/// before using this map. `clear` will allocate space if necessary.
230	///
231	/// This avoids the need to pay for the allocation of this map when
232	/// compiling regexes that lack large Unicode character classes.
233	pub fn new(capacity: usize) -> Utf8SuffixMap {
234	assert!(capacity > `0`);
235	Utf8SuffixMap { version: `0`, capacity, map: vec![] }
236	}
237
238	/// Clear this map of all entries, but permit the reuse of allocation
239	/// if possible.
240	///
241	/// This must be called before the map can be used.
242	pub fn clear(&mut self) {
243	if self.map.is_empty() {
244	self.map = vec![Utf8SuffixEntry::default(); self.capacity];
245	} else {
246	self.version = self.version.wrapping_add(`1`);
247	if self.version == `0` {
248	self.map = vec![Utf8SuffixEntry::default(); self.capacity];
249	}
250	}
251	}
252
253	/// Return a hash of the given transition.
254	pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
255	// Basic FNV-1a hash as described:
256	// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
257	const PRIME: u64 = `1099511628211`;
258	const INIT: u64 = `14695981039346656037`;
259
260	let mut h = INIT;
261	h = (h ^ key.from.as_u64()).wrapping_mul(PRIME);
262	h = (h ^ u64::from(key.start)).wrapping_mul(PRIME);
263	h = (h ^ u64::from(key.end)).wrapping_mul(PRIME);
264	(h % self.map.len().as_u64()).as_usize()
265	}
266
267	/// Retrieve the cached state ID corresponding to the given key. The hash
268	/// given must have been computed with `hash` using the same key value.
269	///
270	/// If there is no cached state with the given key, then None is returned.
271	pub fn get(
272	&mut self,
273	key: &Utf8SuffixKey,
274	hash: usize,
275	) -> Option<StateID> {
276	let entry = &self.map[hash];
277	if entry.version != self.version {
278	return None;
279	}
280	if key != &entry.key {
281	return None;
282	}
283	Some(entry.val)
284	}
285
286	/// Add a cached state to this map with the given key. Callers should
287	/// ensure that `state_id` points to a state that contains precisely the
288	/// NFA transition given.
289	///
290	/// `hash` must have been computed using the `hash` method with the same
291	/// key.
292	pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
293	self.map[hash] =
294	Utf8SuffixEntry { version: self.version, key, val: state_id };
295	}
296	}
297