1// This module contains a couple simple and purpose built hash maps. The key
2// trade off they make is that they serve as caches rather than true maps. That
3// is, inserting a new entry may cause eviction of another entry. This gives
4// us two things. First, there's less overhead associated with inserts and
5// lookups. Secondly, it lets us control our memory usage.
6//
7// These maps are used in some fairly hot code when generating NFA states for
8// large Unicode character classes.
9//
10// Instead of exposing a rich hashmap entry API, we just permit the caller
11// to produce a hash of the key directly. The hash can then be reused for both
12// lookups and insertions at the cost of leaking things a bit. But these are
13// for internal use only, so it's fine.
14//
15// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
16// (almost) minimal DFA for large Unicode character classes in linear time.
17// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
18// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
19// since there's a bit more expense in the reverse direction.)
20//
21// The Utf8SuffixMap is used when compiling large Unicode character classes for
22// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
23// construction of UTF-8 automata by caching common suffixes. This doesn't
24// get the same space savings as Daciuk's algorithm, but it's basically as
25// fast as the naive approach and typically winds up using less memory (since
26// it generates smaller NFAs) despite the presence of the cache.
27//
28// These maps effectively represent caching mechanisms for CState::Sparse and
29// CState::Range, respectively. The former represents a single NFA state with
30// many transitions of equivalent priority while the latter represents a single
31// NFA state with a single transition. (Neither state ever has or is an
32// epsilon transition.) Thus, they have different key types. It's likely we
33// could make one generic map, but the machinery didn't seem worth it. They
34// are simple enough.
35
36use nfa::{StateID, Transition};
37
38// Basic FNV-1a hash constants as described in:
39// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
40const PRIME: u64 = 1099511628211;
41const INIT: u64 = 14695981039346656037;
42
43/// A bounded hash map where the key is a sequence of NFA transitions and the
44/// value is a pre-existing NFA state ID.
45///
46/// std's hashmap can be used for this, however, this map has two important
47/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
48/// control our memory usage by limited the number of slots. In general, the
49/// cost here is that this map acts as a cache. That is, inserting a new entry
50/// may remove an old entry. We are okay with this, since it does not impact
51/// correctness in the cases where it is used. The only effect that dropping
52/// states from the cache has is that the resulting NFA generated may be bigger
53/// than it otherwise would be.
54///
55/// This improves benchmarks that compile large Unicode character classes,
56/// since it makes the generation of (almost) minimal UTF-8 automaton faster.
57/// Specifically, one could observe the difference with std's hashmap via
58/// something like the following benchmark:
59///
60/// hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
61///
62/// But to observe that difference, you'd have to modify the code to use
63/// std's hashmap.
64///
65/// It is quite possible that there is a better way to approach this problem.
66/// For example, if there happens to be a very common state that collides with
67/// a lot of less frequent states, then we could wind up with very poor caching
68/// behavior. Alas, the effectiveness of this cache has not been measured.
69/// Instead, ad hoc experiments suggest that it is "good enough." Additional
70/// smarts (such as an LRU eviction policy) have to be weighed against the
71/// amount of extra time they cost.
72#[derive(Clone, Debug)]
73pub struct Utf8BoundedMap {
74 /// The current version of this map. Only entries with matching versions
75 /// are considered during lookups. If an entry is found with a mismatched
76 /// version, then the map behaves as if the entry does not exist.
77 version: u16,
78 /// The total number of entries this map can store.
79 capacity: usize,
80 /// The actual entries, keyed by hash. Collisions between different states
81 /// result in the old state being dropped.
82 map: Vec<Utf8BoundedEntry>,
83}
84
85/// An entry in this map.
86#[derive(Clone, Debug, Default)]
87struct Utf8BoundedEntry {
88 /// The version of the map used to produce this entry. If this entry's
89 /// version does not match the current version of the map, then the map
90 /// should behave as if this entry does not exist.
91 version: u16,
92 /// The key, which is a sorted sequence of non-overlapping NFA transitions.
93 key: Vec<Transition>,
94 /// The state ID corresponding to the state containing the transitions in
95 /// this entry.
96 val: StateID,
97}
98
99impl Utf8BoundedMap {
100 /// Create a new bounded map with the given capacity. The map will never
101 /// grow beyond the given size.
102 ///
103 /// Note that this does not allocate. Instead, callers must call `clear`
104 /// before using this map. `clear` will allocate space if necessary.
105 ///
106 /// This avoids the need to pay for the allocation of this map when
107 /// compiling regexes that lack large Unicode character classes.
108 pub fn new(capacity: usize) -> Utf8BoundedMap {
109 assert!(capacity > 0);
110 Utf8BoundedMap { version: 0, capacity, map: vec![] }
111 }
112
113 /// Clear this map of all entries, but permit the reuse of allocation
114 /// if possible.
115 ///
116 /// This must be called before the map can be used.
117 pub fn clear(&mut self) {
118 if self.map.is_empty() {
119 self.map = vec![Utf8BoundedEntry::default(); self.capacity];
120 } else {
121 self.version = self.version.wrapping_add(1);
122 if self.version == 0 {
123 self.map = vec![Utf8BoundedEntry::default(); self.capacity];
124 }
125 }
126 }
127
128 /// Return a hash of the given transitions.
129 pub fn hash(&self, key: &[Transition]) -> usize {
130 let mut h = INIT;
131 for t in key {
132 h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
133 h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
134 h = (h ^ (t.next as u64)).wrapping_mul(PRIME);
135 }
136 (h as usize) % self.map.len()
137 }
138
139 /// Retrieve the cached state ID corresponding to the given key. The hash
140 /// given must have been computed with `hash` using the same key value.
141 ///
142 /// If there is no cached state with the given transitions, then None is
143 /// returned.
144 pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
145 let entry = &self.map[hash];
146 if entry.version != self.version {
147 return None;
148 }
149 // There may be a hash collision, so we need to confirm real equality.
150 if entry.key != key {
151 return None;
152 }
153 Some(entry.val)
154 }
155
156 /// Add a cached state to this map with the given key. Callers should
157 /// ensure that `state_id` points to a state that contains precisely the
158 /// NFA transitions given.
159 ///
160 /// `hash` must have been computed using the `hash` method with the same
161 /// key.
162 pub fn set(
163 &mut self,
164 key: Vec<Transition>,
165 hash: usize,
166 state_id: StateID,
167 ) {
168 self.map[hash] =
169 Utf8BoundedEntry { version: self.version, key, val: state_id };
170 }
171}
172
173/// A cache of suffixes used to modestly compress UTF-8 automata for large
174/// Unicode character classes.
175#[derive(Clone, Debug)]
176pub struct Utf8SuffixMap {
177 /// The current version of this map. Only entries with matching versions
178 /// are considered during lookups. If an entry is found with a mismatched
179 /// version, then the map behaves as if the entry does not exist.
180 version: u16,
181 /// The total number of entries this map can store.
182 capacity: usize,
183 /// The actual entries, keyed by hash. Collisions between different states
184 /// result in the old state being dropped.
185 map: Vec<Utf8SuffixEntry>,
186}
187
188/// A key that uniquely identifies an NFA state. It is a triple that represents
189/// a transition from one state for a particular byte range.
190#[derive(Clone, Debug, Default, Eq, PartialEq)]
191pub struct Utf8SuffixKey {
192 pub from: StateID,
193 pub start: u8,
194 pub end: u8,
195}
196
197/// An entry in this map.
198#[derive(Clone, Debug, Default)]
199struct Utf8SuffixEntry {
200 /// The version of the map used to produce this entry. If this entry's
201 /// version does not match the current version of the map, then the map
202 /// should behave as if this entry does not exist.
203 version: u16,
204 /// The key, which consists of a transition in a particular state.
205 key: Utf8SuffixKey,
206 /// The identifier that the transition in the key maps to.
207 val: StateID,
208}
209
210impl Utf8SuffixMap {
211 /// Create a new bounded map with the given capacity. The map will never
212 /// grow beyond the given size.
213 ///
214 /// Note that this does not allocate. Instead, callers must call `clear`
215 /// before using this map. `clear` will allocate space if necessary.
216 ///
217 /// This avoids the need to pay for the allocation of this map when
218 /// compiling regexes that lack large Unicode character classes.
219 pub fn new(capacity: usize) -> Utf8SuffixMap {
220 assert!(capacity > 0);
221 Utf8SuffixMap { version: 0, capacity, map: vec![] }
222 }
223
224 /// Clear this map of all entries, but permit the reuse of allocation
225 /// if possible.
226 ///
227 /// This must be called before the map can be used.
228 pub fn clear(&mut self) {
229 if self.map.is_empty() {
230 self.map = vec![Utf8SuffixEntry::default(); self.capacity];
231 } else {
232 self.version = self.version.wrapping_add(1);
233 if self.version == 0 {
234 self.map = vec![Utf8SuffixEntry::default(); self.capacity];
235 }
236 }
237 }
238
239 /// Return a hash of the given transition.
240 pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
241 // Basic FNV-1a hash as described:
242 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
243 const PRIME: u64 = 1099511628211;
244 const INIT: u64 = 14695981039346656037;
245
246 let mut h = INIT;
247 h = (h ^ (key.from as u64)).wrapping_mul(PRIME);
248 h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
249 h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
250 (h as usize) % self.map.len()
251 }
252
253 /// Retrieve the cached state ID corresponding to the given key. The hash
254 /// given must have been computed with `hash` using the same key value.
255 ///
256 /// If there is no cached state with the given key, then None is returned.
257 pub fn get(
258 &mut self,
259 key: &Utf8SuffixKey,
260 hash: usize,
261 ) -> Option<StateID> {
262 let entry = &self.map[hash];
263 if entry.version != self.version {
264 return None;
265 }
266 if key != &entry.key {
267 return None;
268 }
269 Some(entry.val)
270 }
271
272 /// Add a cached state to this map with the given key. Callers should
273 /// ensure that `state_id` points to a state that contains precisely the
274 /// NFA transition given.
275 ///
276 /// `hash` must have been computed using the `hash` method with the same
277 /// key.
278 pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
279 self.map[hash] =
280 Utf8SuffixEntry { version: self.version, key, val: state_id };
281 }
282}
283