| 1 | // This module contains a couple simple and purpose built hash maps. The key |
| 2 | // trade off they make is that they serve as caches rather than true maps. That |
| 3 | // is, inserting a new entry may cause eviction of another entry. This gives |
| 4 | // us two things. First, there's less overhead associated with inserts and |
| 5 | // lookups. Secondly, it lets us control our memory usage. |
| 6 | // |
| 7 | // These maps are used in some fairly hot code when generating NFA states for |
| 8 | // large Unicode character classes. |
| 9 | // |
| 10 | // Instead of exposing a rich hashmap entry API, we just permit the caller to |
| 11 | // produce a hash of the key directly. The hash can then be reused for both |
| 12 | // lookups and insertions at the cost of leaking abstraction a bit. But these |
| 13 | // are for internal use only, so it's fine. |
| 14 | // |
| 15 | // The Utf8BoundedMap is used for Daciuk's algorithm for constructing a |
| 16 | // (almost) minimal DFA for large Unicode character classes in linear time. |
| 17 | // (Daciuk's algorithm is always used when compiling forward NFAs. For reverse |
| 18 | // NFAs, it's only used when the compiler is configured to 'shrink' the NFA, |
| 19 | // since there's a bit more expense in the reverse direction.) |
| 20 | // |
| 21 | // The Utf8SuffixMap is used when compiling large Unicode character classes for |
| 22 | // reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive |
| 23 | // construction of UTF-8 automata by caching common suffixes. This doesn't |
| 24 | // get the same space savings as Daciuk's algorithm, but it's basically as |
| 25 | // fast as the naive approach and typically winds up using less memory (since |
| 26 | // it generates smaller NFAs) despite the presence of the cache. |
| 27 | // |
| 28 | // These maps effectively represent caching mechanisms for sparse and |
| 29 | // byte-range NFA states, respectively. The former represents a single NFA |
| 30 | // state with many transitions of equivalent priority while the latter |
| 31 | // represents a single NFA state with a single transition. (Neither state ever |
| 32 | // has or is an epsilon transition.) Thus, they have different key types. It's |
| 33 | // likely we could make one generic map, but the machinery didn't seem worth |
| 34 | // it. They are simple enough. |
| 35 | |
| 36 | use alloc::{vec, vec::Vec}; |
| 37 | |
| 38 | use crate::{ |
| 39 | nfa::thompson::Transition, |
| 40 | util::{ |
| 41 | int::{Usize, U64}, |
| 42 | primitives::StateID, |
| 43 | }, |
| 44 | }; |
| 45 | |
| 46 | // Basic FNV-1a hash constants as described in: |
| 47 | // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function |
| 48 | const PRIME: u64 = 1099511628211; |
| 49 | const INIT: u64 = 14695981039346656037; |
| 50 | |
| 51 | /// A bounded hash map where the key is a sequence of NFA transitions and the |
| 52 | /// value is a pre-existing NFA state ID. |
| 53 | /// |
| 54 | /// std's hashmap can be used for this, however, this map has two important |
| 55 | /// advantages. Firstly, it has lower overhead. Secondly, it permits us to |
| 56 | /// control our memory usage by limited the number of slots. In general, the |
| 57 | /// cost here is that this map acts as a cache. That is, inserting a new entry |
| 58 | /// may remove an old entry. We are okay with this, since it does not impact |
| 59 | /// correctness in the cases where it is used. The only effect that dropping |
| 60 | /// states from the cache has is that the resulting NFA generated may be bigger |
| 61 | /// than it otherwise would be. |
| 62 | /// |
| 63 | /// This improves benchmarks that compile large Unicode character classes, |
| 64 | /// since it makes the generation of (almost) minimal UTF-8 automaton faster. |
| 65 | /// Specifically, one could observe the difference with std's hashmap via |
| 66 | /// something like the following benchmark: |
| 67 | /// |
| 68 | /// hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" |
| 69 | /// |
| 70 | /// But to observe that difference, you'd have to modify the code to use |
| 71 | /// std's hashmap. |
| 72 | /// |
| 73 | /// It is quite possible that there is a better way to approach this problem. |
| 74 | /// For example, if there happens to be a very common state that collides with |
| 75 | /// a lot of less frequent states, then we could wind up with very poor caching |
| 76 | /// behavior. Alas, the effectiveness of this cache has not been measured. |
| 77 | /// Instead, ad hoc experiments suggest that it is "good enough." Additional |
| 78 | /// smarts (such as an LRU eviction policy) have to be weighed against the |
| 79 | /// amount of extra time they cost. |
| 80 | #[derive (Clone, Debug)] |
| 81 | pub struct Utf8BoundedMap { |
| 82 | /// The current version of this map. Only entries with matching versions |
| 83 | /// are considered during lookups. If an entry is found with a mismatched |
| 84 | /// version, then the map behaves as if the entry does not exist. |
| 85 | /// |
| 86 | /// This makes it possible to clear the map by simply incrementing the |
| 87 | /// version number instead of actually deallocating any storage. |
| 88 | version: u16, |
| 89 | /// The total number of entries this map can store. |
| 90 | capacity: usize, |
| 91 | /// The actual entries, keyed by hash. Collisions between different states |
| 92 | /// result in the old state being dropped. |
| 93 | map: Vec<Utf8BoundedEntry>, |
| 94 | } |
| 95 | |
| 96 | /// An entry in this map. |
| 97 | #[derive (Clone, Debug, Default)] |
| 98 | struct Utf8BoundedEntry { |
| 99 | /// The version of the map used to produce this entry. If this entry's |
| 100 | /// version does not match the current version of the map, then the map |
| 101 | /// should behave as if this entry does not exist. |
| 102 | version: u16, |
| 103 | /// The key, which is a sorted sequence of non-overlapping NFA transitions. |
| 104 | key: Vec<Transition>, |
| 105 | /// The state ID corresponding to the state containing the transitions in |
| 106 | /// this entry. |
| 107 | val: StateID, |
| 108 | } |
| 109 | |
| 110 | impl Utf8BoundedMap { |
| 111 | /// Create a new bounded map with the given capacity. The map will never |
| 112 | /// grow beyond the given size. |
| 113 | /// |
| 114 | /// Note that this does not allocate. Instead, callers must call `clear` |
| 115 | /// before using this map. `clear` will allocate space if necessary. |
| 116 | /// |
| 117 | /// This avoids the need to pay for the allocation of this map when |
| 118 | /// compiling regexes that lack large Unicode character classes. |
| 119 | pub fn new(capacity: usize) -> Utf8BoundedMap { |
| 120 | assert!(capacity > 0); |
| 121 | Utf8BoundedMap { version: 0, capacity, map: vec![] } |
| 122 | } |
| 123 | |
| 124 | /// Clear this map of all entries, but permit the reuse of allocation |
| 125 | /// if possible. |
| 126 | /// |
| 127 | /// This must be called before the map can be used. |
| 128 | pub fn clear(&mut self) { |
| 129 | if self.map.is_empty() { |
| 130 | self.map = vec![Utf8BoundedEntry::default(); self.capacity]; |
| 131 | } else { |
| 132 | self.version = self.version.wrapping_add(1); |
| 133 | // If we loop back to version 0, then we forcefully clear the |
| 134 | // entire map. Otherwise, it might be possible to incorrectly |
| 135 | // match entries used to generate other NFAs. |
| 136 | if self.version == 0 { |
| 137 | self.map = vec![Utf8BoundedEntry::default(); self.capacity]; |
| 138 | } |
| 139 | } |
| 140 | } |
| 141 | |
| 142 | /// Return a hash of the given transitions. |
| 143 | pub fn hash(&self, key: &[Transition]) -> usize { |
| 144 | let mut h = INIT; |
| 145 | for t in key { |
| 146 | h = (h ^ u64::from(t.start)).wrapping_mul(PRIME); |
| 147 | h = (h ^ u64::from(t.end)).wrapping_mul(PRIME); |
| 148 | h = (h ^ t.next.as_u64()).wrapping_mul(PRIME); |
| 149 | } |
| 150 | (h % self.map.len().as_u64()).as_usize() |
| 151 | } |
| 152 | |
| 153 | /// Retrieve the cached state ID corresponding to the given key. The hash |
| 154 | /// given must have been computed with `hash` using the same key value. |
| 155 | /// |
| 156 | /// If there is no cached state with the given transitions, then None is |
| 157 | /// returned. |
| 158 | pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> { |
| 159 | let entry = &self.map[hash]; |
| 160 | if entry.version != self.version { |
| 161 | return None; |
| 162 | } |
| 163 | // There may be a hash collision, so we need to confirm real equality. |
| 164 | if entry.key != key { |
| 165 | return None; |
| 166 | } |
| 167 | Some(entry.val) |
| 168 | } |
| 169 | |
| 170 | /// Add a cached state to this map with the given key. Callers should |
| 171 | /// ensure that `state_id` points to a state that contains precisely the |
| 172 | /// NFA transitions given. |
| 173 | /// |
| 174 | /// `hash` must have been computed using the `hash` method with the same |
| 175 | /// key. |
| 176 | pub fn set( |
| 177 | &mut self, |
| 178 | key: Vec<Transition>, |
| 179 | hash: usize, |
| 180 | state_id: StateID, |
| 181 | ) { |
| 182 | self.map[hash] = |
| 183 | Utf8BoundedEntry { version: self.version, key, val: state_id }; |
| 184 | } |
| 185 | } |
| 186 | |
| 187 | /// A cache of suffixes used to modestly compress UTF-8 automata for large |
| 188 | /// Unicode character classes. |
| 189 | #[derive (Clone, Debug)] |
| 190 | pub struct Utf8SuffixMap { |
| 191 | /// The current version of this map. Only entries with matching versions |
| 192 | /// are considered during lookups. If an entry is found with a mismatched |
| 193 | /// version, then the map behaves as if the entry does not exist. |
| 194 | version: u16, |
| 195 | /// The total number of entries this map can store. |
| 196 | capacity: usize, |
| 197 | /// The actual entries, keyed by hash. Collisions between different states |
| 198 | /// result in the old state being dropped. |
| 199 | map: Vec<Utf8SuffixEntry>, |
| 200 | } |
| 201 | |
| 202 | /// A key that uniquely identifies an NFA state. It is a triple that represents |
| 203 | /// a transition from one state for a particular byte range. |
| 204 | #[derive (Clone, Debug, Default, Eq, PartialEq)] |
| 205 | pub struct Utf8SuffixKey { |
| 206 | pub from: StateID, |
| 207 | pub start: u8, |
| 208 | pub end: u8, |
| 209 | } |
| 210 | |
| 211 | /// An entry in this map. |
| 212 | #[derive (Clone, Debug, Default)] |
| 213 | struct Utf8SuffixEntry { |
| 214 | /// The version of the map used to produce this entry. If this entry's |
| 215 | /// version does not match the current version of the map, then the map |
| 216 | /// should behave as if this entry does not exist. |
| 217 | version: u16, |
| 218 | /// The key, which consists of a transition in a particular state. |
| 219 | key: Utf8SuffixKey, |
| 220 | /// The identifier that the transition in the key maps to. |
| 221 | val: StateID, |
| 222 | } |
| 223 | |
| 224 | impl Utf8SuffixMap { |
| 225 | /// Create a new bounded map with the given capacity. The map will never |
| 226 | /// grow beyond the given size. |
| 227 | /// |
| 228 | /// Note that this does not allocate. Instead, callers must call `clear` |
| 229 | /// before using this map. `clear` will allocate space if necessary. |
| 230 | /// |
| 231 | /// This avoids the need to pay for the allocation of this map when |
| 232 | /// compiling regexes that lack large Unicode character classes. |
| 233 | pub fn new(capacity: usize) -> Utf8SuffixMap { |
| 234 | assert!(capacity > 0); |
| 235 | Utf8SuffixMap { version: 0, capacity, map: vec![] } |
| 236 | } |
| 237 | |
| 238 | /// Clear this map of all entries, but permit the reuse of allocation |
| 239 | /// if possible. |
| 240 | /// |
| 241 | /// This must be called before the map can be used. |
| 242 | pub fn clear(&mut self) { |
| 243 | if self.map.is_empty() { |
| 244 | self.map = vec![Utf8SuffixEntry::default(); self.capacity]; |
| 245 | } else { |
| 246 | self.version = self.version.wrapping_add(1); |
| 247 | if self.version == 0 { |
| 248 | self.map = vec![Utf8SuffixEntry::default(); self.capacity]; |
| 249 | } |
| 250 | } |
| 251 | } |
| 252 | |
| 253 | /// Return a hash of the given transition. |
| 254 | pub fn hash(&self, key: &Utf8SuffixKey) -> usize { |
| 255 | // Basic FNV-1a hash as described: |
| 256 | // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function |
| 257 | const PRIME: u64 = 1099511628211; |
| 258 | const INIT: u64 = 14695981039346656037; |
| 259 | |
| 260 | let mut h = INIT; |
| 261 | h = (h ^ key.from.as_u64()).wrapping_mul(PRIME); |
| 262 | h = (h ^ u64::from(key.start)).wrapping_mul(PRIME); |
| 263 | h = (h ^ u64::from(key.end)).wrapping_mul(PRIME); |
| 264 | (h % self.map.len().as_u64()).as_usize() |
| 265 | } |
| 266 | |
| 267 | /// Retrieve the cached state ID corresponding to the given key. The hash |
| 268 | /// given must have been computed with `hash` using the same key value. |
| 269 | /// |
| 270 | /// If there is no cached state with the given key, then None is returned. |
| 271 | pub fn get( |
| 272 | &mut self, |
| 273 | key: &Utf8SuffixKey, |
| 274 | hash: usize, |
| 275 | ) -> Option<StateID> { |
| 276 | let entry = &self.map[hash]; |
| 277 | if entry.version != self.version { |
| 278 | return None; |
| 279 | } |
| 280 | if key != &entry.key { |
| 281 | return None; |
| 282 | } |
| 283 | Some(entry.val) |
| 284 | } |
| 285 | |
| 286 | /// Add a cached state to this map with the given key. Callers should |
| 287 | /// ensure that `state_id` points to a state that contains precisely the |
| 288 | /// NFA transition given. |
| 289 | /// |
| 290 | /// `hash` must have been computed using the `hash` method with the same |
| 291 | /// key. |
| 292 | pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) { |
| 293 | self.map[hash] = |
| 294 | Utf8SuffixEntry { version: self.version, key, val: state_id }; |
| 295 | } |
| 296 | } |
| 297 | |