map.rs source code [crates/regex-automata-0.1.10/src/nfa/map.rs]

1	// This module contains a couple simple and purpose built hash maps. The key
2	// trade off they make is that they serve as caches rather than true maps. That
3	// is, inserting a new entry may cause eviction of another entry. This gives
4	// us two things. First, there's less overhead associated with inserts and
5	// lookups. Secondly, it lets us control our memory usage.
6	//
7	// These maps are used in some fairly hot code when generating NFA states for
8	// large Unicode character classes.
9	//
10	// Instead of exposing a rich hashmap entry API, we just permit the caller
11	// to produce a hash of the key directly. The hash can then be reused for both
12	// lookups and insertions at the cost of leaking things a bit. But these are
13	// for internal use only, so it's fine.
14	//
15	// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
16	// (almost) minimal DFA for large Unicode character classes in linear time.
17	// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
18	// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
19	// since there's a bit more expense in the reverse direction.)
20	//
21	// The Utf8SuffixMap is used when compiling large Unicode character classes for
22	// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
23	// construction of UTF-8 automata by caching common suffixes. This doesn't
24	// get the same space savings as Daciuk's algorithm, but it's basically as
25	// fast as the naive approach and typically winds up using less memory (since
26	// it generates smaller NFAs) despite the presence of the cache.
27	//
28	// These maps effectively represent caching mechanisms for CState::Sparse and
29	// CState::Range, respectively. The former represents a single NFA state with
30	// many transitions of equivalent priority while the latter represents a single
31	// NFA state with a single transition. (Neither state ever has or is an
32	// epsilon transition.) Thus, they have different key types. It's likely we
33	// could make one generic map, but the machinery didn't seem worth it. They
34	// are simple enough.
35
36	use nfa::{StateID, Transition};
37
38	// Basic FNV-1a hash constants as described in:
39	// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
40	const PRIME: u64 = `1099511628211`;
41	const INIT: u64 = `14695981039346656037`;
42
43	/// A bounded hash map where the key is a sequence of NFA transitions and the
44	/// value is a pre-existing NFA state ID.
45	///
46	/// std's hashmap can be used for this, however, this map has two important
47	/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
48	/// control our memory usage by limited the number of slots. In general, the
49	/// cost here is that this map acts as a cache. That is, inserting a new entry
50	/// may remove an old entry. We are okay with this, since it does not impact
51	/// correctness in the cases where it is used. The only effect that dropping
52	/// states from the cache has is that the resulting NFA generated may be bigger
53	/// than it otherwise would be.
54	///
55	/// This improves benchmarks that compile large Unicode character classes,
56	/// since it makes the generation of (almost) minimal UTF-8 automaton faster.
57	/// Specifically, one could observe the difference with std's hashmap via
58	/// something like the following benchmark:
59	///
60	/// hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
61	///
62	/// But to observe that difference, you'd have to modify the code to use
63	/// std's hashmap.
64	///
65	/// It is quite possible that there is a better way to approach this problem.
66	/// For example, if there happens to be a very common state that collides with
67	/// a lot of less frequent states, then we could wind up with very poor caching
68	/// behavior. Alas, the effectiveness of this cache has not been measured.
69	/// Instead, ad hoc experiments suggest that it is "good enough." Additional
70	/// smarts (such as an LRU eviction policy) have to be weighed against the
71	/// amount of extra time they cost.
72	#[derive(Clone, Debug)]
73	pub struct Utf8BoundedMap {
74	/// The current version of this map. Only entries with matching versions
75	/// are considered during lookups. If an entry is found with a mismatched
76	/// version, then the map behaves as if the entry does not exist.
77	version: u16,
78	/// The total number of entries this map can store.
79	capacity: usize,
80	/// The actual entries, keyed by hash. Collisions between different states
81	/// result in the old state being dropped.
82	map: Vec<Utf8BoundedEntry>,
83	}
84
85	/// An entry in this map.
86	#[derive(Clone, Debug, Default)]
87	struct Utf8BoundedEntry {
88	/// The version of the map used to produce this entry. If this entry's
89	/// version does not match the current version of the map, then the map
90	/// should behave as if this entry does not exist.
91	version: u16,
92	/// The key, which is a sorted sequence of non-overlapping NFA transitions.
93	key: Vec<Transition>,
94	/// The state ID corresponding to the state containing the transitions in
95	/// this entry.
96	val: StateID,
97	}
98
99	impl Utf8BoundedMap {
100	/// Create a new bounded map with the given capacity. The map will never
101	/// grow beyond the given size.
102	///
103	/// Note that this does not allocate. Instead, callers must call `clear`
104	/// before using this map. `clear` will allocate space if necessary.
105	///
106	/// This avoids the need to pay for the allocation of this map when
107	/// compiling regexes that lack large Unicode character classes.
108	pub fn new(capacity: usize) -> Utf8BoundedMap {
109	assert!(capacity > `0`);
110	Utf8BoundedMap { version: `0`, capacity, map: vec![] }
111	}
112
113	/// Clear this map of all entries, but permit the reuse of allocation
114	/// if possible.
115	///
116	/// This must be called before the map can be used.
117	pub fn clear(&mut self) {
118	if self.map.is_empty() {
119	self.map = vec![Utf8BoundedEntry::default(); self.capacity];
120	} else {
121	self.version = self.version.wrapping_add(`1`);
122	if self.version == `0` {
123	self.map = vec![Utf8BoundedEntry::default(); self.capacity];
124	}
125	}
126	}
127
128	/// Return a hash of the given transitions.
129	pub fn hash(&self, key: &[Transition]) -> usize {
130	let mut h = INIT;
131	for t in key {
132	h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
133	h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
134	h = (h ^ (t.next as u64)).wrapping_mul(PRIME);
135	}
136	(h as usize) % self.map.len()
137	}
138
139	/// Retrieve the cached state ID corresponding to the given key. The hash
140	/// given must have been computed with `hash` using the same key value.
141	///
142	/// If there is no cached state with the given transitions, then None is
143	/// returned.
144	pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
145	let entry = &self.map[hash];
146	if entry.version != self.version {
147	return None;
148	}
149	// There may be a hash collision, so we need to confirm real equality.
150	if entry.key != key {
151	return None;
152	}
153	Some(entry.val)
154	}
155
156	/// Add a cached state to this map with the given key. Callers should
157	/// ensure that `state_id` points to a state that contains precisely the
158	/// NFA transitions given.
159	///
160	/// `hash` must have been computed using the `hash` method with the same
161	/// key.
162	pub fn set(
163	&mut self,
164	key: Vec<Transition>,
165	hash: usize,
166	state_id: StateID,
167	) {
168	self.map[hash] =
169	Utf8BoundedEntry { version: self.version, key, val: state_id };
170	}
171	}
172
173	/// A cache of suffixes used to modestly compress UTF-8 automata for large
174	/// Unicode character classes.
175	#[derive(Clone, Debug)]
176	pub struct Utf8SuffixMap {
177	/// The current version of this map. Only entries with matching versions
178	/// are considered during lookups. If an entry is found with a mismatched
179	/// version, then the map behaves as if the entry does not exist.
180	version: u16,
181	/// The total number of entries this map can store.
182	capacity: usize,
183	/// The actual entries, keyed by hash. Collisions between different states
184	/// result in the old state being dropped.
185	map: Vec<Utf8SuffixEntry>,
186	}
187
188	/// A key that uniquely identifies an NFA state. It is a triple that represents
189	/// a transition from one state for a particular byte range.
190	#[derive(Clone, Debug, Default, Eq, PartialEq)]
191	pub struct Utf8SuffixKey {
192	pub from: StateID,
193	pub start: u8,
194	pub end: u8,
195	}
196
197	/// An entry in this map.
198	#[derive(Clone, Debug, Default)]
199	struct Utf8SuffixEntry {
200	/// The version of the map used to produce this entry. If this entry's
201	/// version does not match the current version of the map, then the map
202	/// should behave as if this entry does not exist.
203	version: u16,
204	/// The key, which consists of a transition in a particular state.
205	key: Utf8SuffixKey,
206	/// The identifier that the transition in the key maps to.
207	val: StateID,
208	}
209
210	impl Utf8SuffixMap {
211	/// Create a new bounded map with the given capacity. The map will never
212	/// grow beyond the given size.
213	///
214	/// Note that this does not allocate. Instead, callers must call `clear`
215	/// before using this map. `clear` will allocate space if necessary.
216	///
217	/// This avoids the need to pay for the allocation of this map when
218	/// compiling regexes that lack large Unicode character classes.
219	pub fn new(capacity: usize) -> Utf8SuffixMap {
220	assert!(capacity > `0`);
221	Utf8SuffixMap { version: `0`, capacity, map: vec![] }
222	}
223
224	/// Clear this map of all entries, but permit the reuse of allocation
225	/// if possible.
226	///
227	/// This must be called before the map can be used.
228	pub fn clear(&mut self) {
229	if self.map.is_empty() {
230	self.map = vec![Utf8SuffixEntry::default(); self.capacity];
231	} else {
232	self.version = self.version.wrapping_add(`1`);
233	if self.version == `0` {
234	self.map = vec![Utf8SuffixEntry::default(); self.capacity];
235	}
236	}
237	}
238
239	/// Return a hash of the given transition.
240	pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
241	// Basic FNV-1a hash as described:
242	// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
243	const PRIME: u64 = `1099511628211`;
244	const INIT: u64 = `14695981039346656037`;
245
246	let mut h = INIT;
247	h = (h ^ (key.from as u64)).wrapping_mul(PRIME);
248	h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
249	h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
250	(h as usize) % self.map.len()
251	}
252
253	/// Retrieve the cached state ID corresponding to the given key. The hash
254	/// given must have been computed with `hash` using the same key value.
255	///
256	/// If there is no cached state with the given key, then None is returned.
257	pub fn get(
258	&mut self,
259	key: &Utf8SuffixKey,
260	hash: usize,
261	) -> Option<StateID> {
262	let entry = &self.map[hash];
263	if entry.version != self.version {
264	return None;
265	}
266	if key != &entry.key {
267	return None;
268	}
269	Some(entry.val)
270	}
271
272	/// Add a cached state to this map with the given key. Callers should
273	/// ensure that `state_id` points to a state that contains precisely the
274	/// NFA transition given.
275	///
276	/// `hash` must have been computed using the `hash` method with the same
277	/// key.
278	pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
279	self.map[hash] =
280	Utf8SuffixEntry { version: self.version, key, val: state_id };
281	}
282	}
283