| 1 | use core::{cell::RefCell, fmt, mem}; |
| 2 | |
| 3 | use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec}; |
| 4 | |
| 5 | use crate::{ |
| 6 | dfa::{automaton::Automaton, dense, DEAD}, |
| 7 | util::{ |
| 8 | alphabet, |
| 9 | primitives::{PatternID, StateID}, |
| 10 | }, |
| 11 | }; |
| 12 | |
/// An implementation of Hopcroft's algorithm for minimizing DFAs.
///
/// The algorithm implemented here is mostly taken from Wikipedia:
/// <https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm>
///
/// This code has had some light optimization attention paid to it,
/// particularly in the form of reducing allocation as much as possible.
/// However, it is still generally slow. Future optimization work should
/// probably focus on the bigger picture rather than micro-optimizations. For
/// example:
///
/// 1. Figure out how to more intelligently create initial partitions. That is,
///    Hopcroft's algorithm starts by creating two partitions of DFA states
///    that are known to NOT be equivalent: match states and non-match states.
///    The algorithm proceeds by progressively refining these partitions into
///    smaller partitions. If we could start with more partitions, then we
///    could reduce the amount of work that Hopcroft's algorithm needs to do.
/// 2. For every partition that we visit, we find all incoming transitions to
///    every state in the partition for *every* element in the alphabet. (This
///    is why using byte classes can significantly decrease minimization times,
///    since byte classes shrink the alphabet.) This is quite costly and there
///    is perhaps some redundant work being performed depending on the specific
///    states in the set. For example, we might be able to only visit some
///    elements of the alphabet based on the transitions.
/// 3. Move parts of minimization into determinization. If minimization has
///    fewer states to deal with, then it should run faster. A prime example
///    of this might be large Unicode classes, which are generated in way that
///    can create a lot of redundant states. (Some work has been done on this
///    point during NFA compilation via the algorithm described in the
///    "Incremental Construction of Minimal Acyclic Finite-State Automata"
///    paper.)
pub(crate) struct Minimizer<'a> {
    /// The DFA being minimized. It is rewritten in place once the final
    /// partitioning has been computed.
    dfa: &'a mut dense::OwnedDFA,
    /// The reverse transition table: for each state index and each alphabet
    /// unit, the list of states with a transition to that state on that
    /// unit. Built once up front by `incoming_transitions`.
    in_transitions: Vec<Vec<Vec<StateID>>>,
    /// The current partitioning of states into equivalence classes. Refined
    /// progressively until no more splits are possible.
    partitions: Vec<StateSet>,
    /// The work queue of "distinguisher" sets still waiting to be used to
    /// refine the partitions.
    waiting: Vec<StateSet>,
}
| 50 | |
| 51 | impl<'a> fmt::Debug for Minimizer<'a> { |
| 52 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 53 | f.debug_struct("Minimizer" ) |
| 54 | .field("dfa" , &self.dfa) |
| 55 | .field("in_transitions" , &self.in_transitions) |
| 56 | .field("partitions" , &self.partitions) |
| 57 | .field("waiting" , &self.waiting) |
| 58 | .finish() |
| 59 | } |
| 60 | } |
| 61 | |
/// A set of states. A state set makes up a single partition in Hopcroft's
/// algorithm.
///
/// It is represented by an ordered set of state identifiers. We use shared
/// ownership so that a single state set can be in both the set of partitions
/// and in the set of waiting sets simultaneously without an additional
/// allocation. Generally, once a state set is built, it becomes immutable.
///
/// We use this representation because it avoids the overhead of more
/// traditional set data structures (HashSet/BTreeSet), and also because
/// computing intersection/subtraction on this representation is especially
/// fast.
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
struct StateSet {
    /// The states in this set, in ascending sorted order once canonicalized.
    /// The `Rc` makes `Clone` a cheap reference-count bump (shared storage);
    /// use `deep_clone` when an independent copy is required.
    ids: Rc<RefCell<Vec<StateID>>>,
}
| 78 | |
impl<'a> Minimizer<'a> {
    /// Create a new minimizer for the given DFA.
    ///
    /// This precomputes the reverse transition table and the initial
    /// partitions (match states grouped by pattern ID list, quit states,
    /// and everything else). The waiting queue starts as a copy of the
    /// initial partitions; the copy is cheap since a `StateSet` uses
    /// shared ownership internally.
    pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> {
        let in_transitions = Minimizer::incoming_transitions(dfa);
        let partitions = Minimizer::initial_partitions(dfa);
        let waiting = partitions.clone();
        Minimizer { dfa, in_transitions, partitions, waiting }
    }

    /// Run Hopcroft's algorithm and then rewrite the DFA in place so that
    /// it contains exactly one state per equivalence class, remapping all
    /// transitions, start states, the pattern map and the special state
    /// ranges accordingly.
    pub fn run(mut self) {
        let stride2 = self.dfa.stride2();
        // Convert a state index to its premultiplied state ID. The unwrap
        // is on StateID construction; all indices here come from existing
        // valid states.
        let as_state_id = |index: usize| -> StateID {
            StateID::new(index << stride2).unwrap()
        };
        // The inverse: premultiplied state ID back to a state index.
        let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };

        // Scratch sets reused across all iterations to avoid allocation.
        let mut incoming = StateSet::empty();
        let mut scratch1 = StateSet::empty();
        let mut scratch2 = StateSet::empty();
        let mut newparts = vec![];

        // This loop is basically Hopcroft's algorithm. Everything else is just
        // shuffling data around to fit our representation.
        while let Some(set) = self.waiting.pop() {
            for b in self.dfa.byte_classes().iter() {
                self.find_incoming_to(b, &set, &mut incoming);
                // If incoming is empty, then the intersection with any other
                // set must also be empty. So 'newparts' just ends up being
                // 'self.partitions'. So there's no need to go through the loop
                // below.
                //
                // This actually turns out to be rather large optimization. On
                // the order of making minimization 4-5x faster. It's likely
                // that the vast majority of all states have very few incoming
                // transitions.
                if incoming.is_empty() {
                    continue;
                }

                for p in 0..self.partitions.len() {
                    // scratch1 = partition ∩ incoming
                    self.partitions[p].intersection(&incoming, &mut scratch1);
                    if scratch1.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    // scratch2 = partition \ incoming
                    self.partitions[p].subtract(&incoming, &mut scratch2);
                    if scratch2.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    // Both pieces are non-empty: split this partition in two.
                    // Deep clones are needed since the scratch sets are
                    // reused on the next iteration.
                    let (x, y) =
                        (scratch1.deep_clone(), scratch2.deep_clone());
                    newparts.push(x.clone());
                    newparts.push(y.clone());
                    match self.find_waiting(&self.partitions[p]) {
                        // If the split partition is itself waiting, replace
                        // it with both halves.
                        Some(i) => {
                            self.waiting[i] = x;
                            self.waiting.push(y);
                        }
                        // Otherwise only the smaller half needs to be
                        // queued (the classic Hopcroft trick that keeps the
                        // overall work bounded).
                        None => {
                            if x.len() <= y.len() {
                                self.waiting.push(x);
                            } else {
                                self.waiting.push(y);
                            }
                        }
                    }
                }
                // Swap in the refined partitioning and recycle the old
                // buffer for the next alphabet unit.
                newparts = mem::replace(&mut self.partitions, newparts);
                newparts.clear();
            }
        }

        // At this point, we now have a minimal partitioning of states, where
        // each partition is an equivalence class of DFA states. Now we need to
        // use this partitioning to update the DFA to only contain one state for
        // each partition.

        // Create a map from DFA state ID to the representative ID of the
        // equivalence class to which it belongs. The representative ID of an
        // equivalence class of states is the minimum ID in that class.
        let mut state_to_part = vec![DEAD; self.dfa.state_len()];
        for p in &self.partitions {
            p.iter(|id| state_to_part[as_index(id)] = p.min());
        }

        // Generate a new contiguous sequence of IDs for minimal states, and
        // create a map from equivalence IDs to the new IDs. Thus, the new
        // minimal ID of *any* state in the unminimized DFA can be obtained
        // with minimals_ids[state_to_part[old_id]].
        let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
        let mut new_index = 0;
        for state in self.dfa.states() {
            if state_to_part[as_index(state.id())] == state.id() {
                minimal_ids[as_index(state.id())] = as_state_id(new_index);
                new_index += 1;
            }
        }
        // The total number of states in the minimal DFA.
        let minimal_count = new_index;
        // Convenience function for remapping state IDs. This takes an old ID,
        // looks up its Hopcroft partition and then maps that to the new ID
        // range.
        let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];

        // Re-map this DFA in place such that the only states remaining
        // correspond to the representative states of every equivalence class.
        for id in (0..self.dfa.state_len()).map(as_state_id) {
            // If this state isn't a representative for an equivalence class,
            // then we skip it since it won't appear in the minimal DFA.
            if state_to_part[as_index(id)] != id {
                continue;
            }
            self.dfa.remap_state(id, remap);
            self.dfa.swap_states(id, minimal_ids[as_index(id)]);
        }
        // Trim off all unused states from the pre-minimized DFA. This
        // represents all states that were merged into a non-singleton
        // equivalence class of states, and appeared after the first state
        // in each such class. (Because the state with the smallest ID in each
        // equivalence class is its representative ID.)
        self.dfa.truncate_states(minimal_count);

        // Update the new start states, which is now just the minimal ID of
        // whatever state the old start state was collapsed into. Also, we
        // collect everything before-hand to work around the borrow checker.
        // We're already allocating so much that this is probably fine. If this
        // turns out to be costly, then I guess add a `starts_mut` iterator.
        let starts: Vec<_> = self.dfa.starts().collect();
        for (old_start_id, anchored, start_type) in starts {
            self.dfa.set_start_state(
                anchored,
                start_type,
                remap(old_start_id),
            );
        }

        // Update the match state pattern ID list for multi-regexes. All we
        // need to do is remap the match state IDs. The pattern ID lists are
        // always the same as they were since match states with distinct
        // pattern ID lists are always considered distinct states.
        let mut pmap = BTreeMap::new();
        for (match_id, pattern_ids) in self.dfa.pattern_map() {
            let new_id = remap(match_id);
            pmap.insert(new_id, pattern_ids);
        }
        // This unwrap is OK because minimization never increases the number of
        // match states or patterns in those match states. Since minimization
        // runs after the pattern map has already been set at least once, we
        // know that our match states cannot error.
        self.dfa.set_pattern_map(&pmap).unwrap();

        // In order to update the ID of the maximum match state, we need to
        // find the maximum ID among all of the match states in the minimized
        // DFA. This is not necessarily the new ID of the unminimized maximum
        // match state, since that could have been collapsed with a much
        // earlier match state. Therefore, to find the new max match state,
        // we iterate over all previous match states, find their corresponding
        // new minimal ID, and take the maximum of those.
        let old = self.dfa.special().clone();
        let new = self.dfa.special_mut();
        // ... but only remap if we had match states.
        if old.matches() {
            new.min_match = StateID::MAX;
            new.max_match = StateID::ZERO;
            for i in as_index(old.min_match)..=as_index(old.max_match) {
                let new_id = remap(as_state_id(i));
                if new_id < new.min_match {
                    new.min_match = new_id;
                }
                if new_id > new.max_match {
                    new.max_match = new_id;
                }
            }
        }
        // ... same, but for start states.
        if old.starts() {
            new.min_start = StateID::MAX;
            new.max_start = StateID::ZERO;
            for i in as_index(old.min_start)..=as_index(old.max_start) {
                let new_id = remap(as_state_id(i));
                // Start states remapped to DEAD don't participate in the
                // new min/max start range.
                if new_id == DEAD {
                    continue;
                }
                if new_id < new.min_start {
                    new.min_start = new_id;
                }
                if new_id > new.max_start {
                    new.max_start = new_id;
                }
            }
            if new.max_start == DEAD {
                new.min_start = DEAD;
            }
        }
        new.quit_id = remap(new.quit_id);
        new.set_max();
    }

    /// Find the position of the given set in the waiting queue, if present.
    ///
    /// Note that this is a linear scan over the waiting queue, comparing
    /// sets by value.
    fn find_waiting(&self, set: &StateSet) -> Option<usize> {
        self.waiting.iter().position(|s| s == set)
    }

    /// Compute the set of all states that transition, on alphabet unit `b`,
    /// into any state in `set`. The result is written into `incoming`,
    /// which is cleared first and canonicalized (sorted and deduplicated)
    /// afterward.
    fn find_incoming_to(
        &self,
        b: alphabet::Unit,
        set: &StateSet,
        incoming: &mut StateSet,
    ) {
        incoming.clear();
        set.iter(|id| {
            for &inid in
                &self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
            {
                incoming.add(inid);
            }
        });
        incoming.canonicalize();
    }

    /// Compute the initial partitioning of all DFA states: match states
    /// grouped by their pattern ID list, quit states and all remaining
    /// (non-match) states, each in their own set.
    fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> {
        // For match states, we know that two match states with different
        // pattern ID lists will *always* be distinct, so we can partition them
        // initially based on that.
        let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new();
        let mut is_quit = StateSet::empty();
        let mut no_match = StateSet::empty();
        for state in dfa.states() {
            if dfa.is_match_state(state.id()) {
                let mut pids = vec![];
                for i in 0..dfa.match_len(state.id()) {
                    pids.push(dfa.match_pattern(state.id(), i));
                }
                matching
                    .entry(pids)
                    .or_insert(StateSet::empty())
                    .add(state.id());
            } else if dfa.is_quit_state(state.id()) {
                is_quit.add(state.id());
            } else {
                no_match.add(state.id());
            }
        }

        let mut sets: Vec<StateSet> =
            matching.into_iter().map(|(_, set)| set).collect();
        sets.push(no_match);
        sets.push(is_quit);
        sets
    }

    /// Build the reverse transition table for the given DFA: for each
    /// state index and each alphabet unit, the list of state IDs with a
    /// transition on that unit to that state.
    fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> {
        let mut incoming = vec![];
        for _ in dfa.states() {
            incoming.push(vec![vec![]; dfa.alphabet_len()]);
        }
        for state in dfa.states() {
            for (b, next) in state.transitions() {
                incoming[dfa.to_index(next)][b.as_usize()].push(state.id());
            }
        }
        incoming
    }
}
| 344 | |
| 345 | impl StateSet { |
| 346 | fn empty() -> StateSet { |
| 347 | StateSet { ids: Rc::new(RefCell::new(vec![])) } |
| 348 | } |
| 349 | |
| 350 | fn add(&mut self, id: StateID) { |
| 351 | self.ids.borrow_mut().push(id); |
| 352 | } |
| 353 | |
| 354 | fn min(&self) -> StateID { |
| 355 | self.ids.borrow()[0] |
| 356 | } |
| 357 | |
| 358 | fn canonicalize(&mut self) { |
| 359 | self.ids.borrow_mut().sort(); |
| 360 | self.ids.borrow_mut().dedup(); |
| 361 | } |
| 362 | |
| 363 | fn clear(&mut self) { |
| 364 | self.ids.borrow_mut().clear(); |
| 365 | } |
| 366 | |
| 367 | fn len(&self) -> usize { |
| 368 | self.ids.borrow().len() |
| 369 | } |
| 370 | |
| 371 | fn is_empty(&self) -> bool { |
| 372 | self.len() == 0 |
| 373 | } |
| 374 | |
| 375 | fn deep_clone(&self) -> StateSet { |
| 376 | let ids = self.ids.borrow().iter().cloned().collect(); |
| 377 | StateSet { ids: Rc::new(RefCell::new(ids)) } |
| 378 | } |
| 379 | |
| 380 | fn iter<F: FnMut(StateID)>(&self, mut f: F) { |
| 381 | for &id in self.ids.borrow().iter() { |
| 382 | f(id); |
| 383 | } |
| 384 | } |
| 385 | |
| 386 | fn intersection(&self, other: &StateSet, dest: &mut StateSet) { |
| 387 | dest.clear(); |
| 388 | if self.is_empty() || other.is_empty() { |
| 389 | return; |
| 390 | } |
| 391 | |
| 392 | let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); |
| 393 | let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); |
| 394 | let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); |
| 395 | loop { |
| 396 | if a == b { |
| 397 | dest.add(a); |
| 398 | a = match ita.next() { |
| 399 | None => break, |
| 400 | Some(a) => a, |
| 401 | }; |
| 402 | b = match itb.next() { |
| 403 | None => break, |
| 404 | Some(b) => b, |
| 405 | }; |
| 406 | } else if a < b { |
| 407 | a = match ita.next() { |
| 408 | None => break, |
| 409 | Some(a) => a, |
| 410 | }; |
| 411 | } else { |
| 412 | b = match itb.next() { |
| 413 | None => break, |
| 414 | Some(b) => b, |
| 415 | }; |
| 416 | } |
| 417 | } |
| 418 | } |
| 419 | |
| 420 | fn subtract(&self, other: &StateSet, dest: &mut StateSet) { |
| 421 | dest.clear(); |
| 422 | if self.is_empty() || other.is_empty() { |
| 423 | self.iter(|s| dest.add(s)); |
| 424 | return; |
| 425 | } |
| 426 | |
| 427 | let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); |
| 428 | let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); |
| 429 | let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); |
| 430 | loop { |
| 431 | if a == b { |
| 432 | a = match ita.next() { |
| 433 | None => break, |
| 434 | Some(a) => a, |
| 435 | }; |
| 436 | b = match itb.next() { |
| 437 | None => { |
| 438 | dest.add(a); |
| 439 | break; |
| 440 | } |
| 441 | Some(b) => b, |
| 442 | }; |
| 443 | } else if a < b { |
| 444 | dest.add(a); |
| 445 | a = match ita.next() { |
| 446 | None => break, |
| 447 | Some(a) => a, |
| 448 | }; |
| 449 | } else { |
| 450 | b = match itb.next() { |
| 451 | None => { |
| 452 | dest.add(a); |
| 453 | break; |
| 454 | } |
| 455 | Some(b) => b, |
| 456 | }; |
| 457 | } |
| 458 | } |
| 459 | for a in ita { |
| 460 | dest.add(a); |
| 461 | } |
| 462 | } |
| 463 | } |
| 464 | |