1 | use std::cell::RefCell; |
2 | use std::fmt; |
3 | use std::mem; |
4 | use std::rc::Rc; |
5 | |
6 | use dense; |
7 | use state_id::{dead_id, StateID}; |
8 | |
9 | type DFARepr<S> = dense::Repr<Vec<S>, S>; |
10 | |
11 | /// An implementation of Hopcroft's algorithm for minimizing DFAs. |
12 | /// |
13 | /// The algorithm implemented here is mostly taken from Wikipedia: |
14 | /// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm |
15 | /// |
16 | /// This code has had some light optimization attention paid to it, |
17 | /// particularly in the form of reducing allocation as much as possible. |
18 | /// However, it is still generally slow. Future optimization work should |
19 | /// probably focus on the bigger picture rather than micro-optimizations. For |
20 | /// example: |
21 | /// |
22 | /// 1. Figure out how to more intelligently create initial partitions. That is, |
23 | /// Hopcroft's algorithm starts by creating two partitions of DFA states |
24 | /// that are known to NOT be equivalent: match states and non-match states. |
25 | /// The algorithm proceeds by progressively refining these partitions into |
26 | /// smaller partitions. If we could start with more partitions, then we |
27 | /// could reduce the amount of work that Hopcroft's algorithm needs to do. |
28 | /// 2. For every partition that we visit, we find all incoming transitions to |
29 | /// every state in the partition for *every* element in the alphabet. (This |
30 | /// is why using byte classes can significantly decrease minimization times, |
31 | /// since byte classes shrink the alphabet.) This is quite costly and there |
32 | /// is perhaps some redundant work being performed depending on the specific |
33 | /// states in the set. For example, we might be able to only visit some |
34 | /// elements of the alphabet based on the transitions. |
35 | /// 3. Move parts of minimization into determinization. If minimization has |
36 | /// fewer states to deal with, then it should run faster. A prime example |
37 | /// of this might be large Unicode classes, which are generated in way that |
38 | /// can create a lot of redundant states. (Some work has been done on this |
39 | /// point during NFA compilation via the algorithm described in the |
40 | /// "Incremental Construction of MinimalAcyclic Finite-State Automata" |
41 | /// paper.) |
42 | pub(crate) struct Minimizer<'a, S: 'a> { |
43 | dfa: &'a mut DFARepr<S>, |
44 | in_transitions: Vec<Vec<Vec<S>>>, |
45 | partitions: Vec<StateSet<S>>, |
46 | waiting: Vec<StateSet<S>>, |
47 | } |
48 | |
49 | impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> { |
50 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
51 | f.debug_struct("Minimizer" ) |
52 | .field("dfa" , &self.dfa) |
53 | .field("in_transitions" , &self.in_transitions) |
54 | .field("partitions" , &self.partitions) |
55 | .field("waiting" , &self.waiting) |
56 | .finish() |
57 | } |
58 | } |
59 | |
60 | /// A set of states. A state set makes up a single partition in Hopcroft's |
61 | /// algorithm. |
62 | /// |
63 | /// It is represented by an ordered set of state identifiers. We use shared |
64 | /// ownership so that a single state set can be in both the set of partitions |
65 | /// and in the set of waiting sets simultaneously without an additional |
66 | /// allocation. Generally, once a state set is built, it becomes immutable. |
67 | /// |
68 | /// We use this representation because it avoids the overhead of more |
69 | /// traditional set data structures (HashSet/BTreeSet), and also because |
70 | /// computing intersection/subtraction on this representation is especially |
71 | /// fast. |
72 | #[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] |
73 | struct StateSet<S>(Rc<RefCell<Vec<S>>>); |
74 | |
75 | impl<'a, S: StateID> Minimizer<'a, S> { |
76 | pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> { |
77 | let in_transitions = Minimizer::incoming_transitions(dfa); |
78 | let partitions = Minimizer::initial_partitions(dfa); |
79 | let waiting = vec![partitions[0].clone()]; |
80 | |
81 | Minimizer { dfa, in_transitions, partitions, waiting } |
82 | } |
83 | |
84 | pub fn run(mut self) { |
85 | let mut incoming = StateSet::empty(); |
86 | let mut scratch1 = StateSet::empty(); |
87 | let mut scratch2 = StateSet::empty(); |
88 | let mut newparts = vec![]; |
89 | |
90 | while let Some(set) = self.waiting.pop() { |
91 | for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) { |
92 | self.find_incoming_to(b, &set, &mut incoming); |
93 | |
94 | for p in 0..self.partitions.len() { |
95 | self.partitions[p].intersection(&incoming, &mut scratch1); |
96 | if scratch1.is_empty() { |
97 | newparts.push(self.partitions[p].clone()); |
98 | continue; |
99 | } |
100 | |
101 | self.partitions[p].subtract(&incoming, &mut scratch2); |
102 | if scratch2.is_empty() { |
103 | newparts.push(self.partitions[p].clone()); |
104 | continue; |
105 | } |
106 | |
107 | let (x, y) = |
108 | (scratch1.deep_clone(), scratch2.deep_clone()); |
109 | newparts.push(x.clone()); |
110 | newparts.push(y.clone()); |
111 | match self.find_waiting(&self.partitions[p]) { |
112 | Some(i) => { |
113 | self.waiting[i] = x; |
114 | self.waiting.push(y); |
115 | } |
116 | None => { |
117 | if x.len() <= y.len() { |
118 | self.waiting.push(x); |
119 | } else { |
120 | self.waiting.push(y); |
121 | } |
122 | } |
123 | } |
124 | } |
125 | newparts = mem::replace(&mut self.partitions, newparts); |
126 | newparts.clear(); |
127 | } |
128 | } |
129 | |
130 | // At this point, we now have a minimal partitioning of states, where |
131 | // each partition is an equivalence class of DFA states. Now we need to |
132 | // use this partioning to update the DFA to only contain one state for |
133 | // each partition. |
134 | |
135 | // Create a map from DFA state ID to the representative ID of the |
136 | // equivalence class to which it belongs. The representative ID of an |
137 | // equivalence class of states is the minimum ID in that class. |
138 | let mut state_to_part = vec![dead_id(); self.dfa.state_count()]; |
139 | for p in &self.partitions { |
140 | p.iter(|id| state_to_part[id.to_usize()] = p.min()); |
141 | } |
142 | |
143 | // Generate a new contiguous sequence of IDs for minimal states, and |
144 | // create a map from equivalence IDs to the new IDs. Thus, the new |
145 | // minimal ID of *any* state in the unminimized DFA can be obtained |
146 | // with minimals_ids[state_to_part[old_id]]. |
147 | let mut minimal_ids = vec![dead_id(); self.dfa.state_count()]; |
148 | let mut new_id = S::from_usize(0); |
149 | for (id, _) in self.dfa.states() { |
150 | if state_to_part[id.to_usize()] == id { |
151 | minimal_ids[id.to_usize()] = new_id; |
152 | new_id = S::from_usize(new_id.to_usize() + 1); |
153 | } |
154 | } |
155 | // The total number of states in the minimal DFA. |
156 | let minimal_count = new_id.to_usize(); |
157 | |
158 | // Re-map this DFA in place such that the only states remaining |
159 | // correspond to the representative states of every equivalence class. |
160 | for id in (0..self.dfa.state_count()).map(S::from_usize) { |
161 | // If this state isn't a representative for an equivalence class, |
162 | // then we skip it since it won't appear in the minimal DFA. |
163 | if state_to_part[id.to_usize()] != id { |
164 | continue; |
165 | } |
166 | for (_, next) in self.dfa.get_state_mut(id).iter_mut() { |
167 | *next = minimal_ids[state_to_part[next.to_usize()].to_usize()]; |
168 | } |
169 | self.dfa.swap_states(id, minimal_ids[id.to_usize()]); |
170 | } |
171 | // Trim off all unused states from the pre-minimized DFA. This |
172 | // represents all states that were merged into a non-singleton |
173 | // equivalence class of states, and appeared after the first state |
174 | // in each such class. (Because the state with the smallest ID in each |
175 | // equivalence class is its representative ID.) |
176 | self.dfa.truncate_states(minimal_count); |
177 | |
178 | // Update the new start state, which is now just the minimal ID of |
179 | // whatever state the old start state was collapsed into. |
180 | let old_start = self.dfa.start_state(); |
181 | self.dfa.set_start_state( |
182 | minimal_ids[state_to_part[old_start.to_usize()].to_usize()], |
183 | ); |
184 | |
185 | // In order to update the ID of the maximum match state, we need to |
186 | // find the maximum ID among all of the match states in the minimized |
187 | // DFA. This is not necessarily the new ID of the unminimized maximum |
188 | // match state, since that could have been collapsed with a much |
189 | // earlier match state. Therefore, to find the new max match state, |
190 | // we iterate over all previous match states, find their corresponding |
191 | // new minimal ID, and take the maximum of those. |
192 | let old_max = self.dfa.max_match_state(); |
193 | self.dfa.set_max_match_state(dead_id()); |
194 | for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) { |
195 | let part = state_to_part[id.to_usize()]; |
196 | let new_id = minimal_ids[part.to_usize()]; |
197 | if new_id > self.dfa.max_match_state() { |
198 | self.dfa.set_max_match_state(new_id); |
199 | } |
200 | } |
201 | } |
202 | |
203 | fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> { |
204 | self.waiting.iter().position(|s| s == set) |
205 | } |
206 | |
207 | fn find_incoming_to( |
208 | &self, |
209 | b: u8, |
210 | set: &StateSet<S>, |
211 | incoming: &mut StateSet<S>, |
212 | ) { |
213 | incoming.clear(); |
214 | set.iter(|id| { |
215 | for &inid in &self.in_transitions[id.to_usize()][b as usize] { |
216 | incoming.add(inid); |
217 | } |
218 | }); |
219 | incoming.canonicalize(); |
220 | } |
221 | |
222 | fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> { |
223 | let mut is_match = StateSet::empty(); |
224 | let mut no_match = StateSet::empty(); |
225 | for (id, _) in dfa.states() { |
226 | if dfa.is_match_state(id) { |
227 | is_match.add(id); |
228 | } else { |
229 | no_match.add(id); |
230 | } |
231 | } |
232 | |
233 | let mut sets = vec![is_match]; |
234 | if !no_match.is_empty() { |
235 | sets.push(no_match); |
236 | } |
237 | sets.sort_by_key(|s| s.len()); |
238 | sets |
239 | } |
240 | |
241 | fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> { |
242 | let mut incoming = vec![]; |
243 | for _ in dfa.states() { |
244 | incoming.push(vec![vec![]; dfa.alphabet_len()]); |
245 | } |
246 | for (id, state) in dfa.states() { |
247 | for (b, next) in state.transitions() { |
248 | incoming[next.to_usize()][b as usize].push(id); |
249 | } |
250 | } |
251 | incoming |
252 | } |
253 | } |
254 | |
255 | impl<S: StateID> StateSet<S> { |
256 | fn empty() -> StateSet<S> { |
257 | StateSet(Rc::new(RefCell::new(vec![]))) |
258 | } |
259 | |
260 | fn add(&mut self, id: S) { |
261 | self.0.borrow_mut().push(id); |
262 | } |
263 | |
264 | fn min(&self) -> S { |
265 | self.0.borrow()[0] |
266 | } |
267 | |
268 | fn canonicalize(&mut self) { |
269 | self.0.borrow_mut().sort(); |
270 | self.0.borrow_mut().dedup(); |
271 | } |
272 | |
273 | fn clear(&mut self) { |
274 | self.0.borrow_mut().clear(); |
275 | } |
276 | |
277 | fn len(&self) -> usize { |
278 | self.0.borrow().len() |
279 | } |
280 | |
281 | fn is_empty(&self) -> bool { |
282 | self.len() == 0 |
283 | } |
284 | |
285 | fn deep_clone(&self) -> StateSet<S> { |
286 | let ids = self.0.borrow().iter().cloned().collect(); |
287 | StateSet(Rc::new(RefCell::new(ids))) |
288 | } |
289 | |
290 | fn iter<F: FnMut(S)>(&self, mut f: F) { |
291 | for &id in self.0.borrow().iter() { |
292 | f(id); |
293 | } |
294 | } |
295 | |
296 | fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) { |
297 | dest.clear(); |
298 | if self.is_empty() || other.is_empty() { |
299 | return; |
300 | } |
301 | |
302 | let (seta, setb) = (self.0.borrow(), other.0.borrow()); |
303 | let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); |
304 | let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); |
305 | loop { |
306 | if a == b { |
307 | dest.add(a); |
308 | a = match ita.next() { |
309 | None => break, |
310 | Some(a) => a, |
311 | }; |
312 | b = match itb.next() { |
313 | None => break, |
314 | Some(b) => b, |
315 | }; |
316 | } else if a < b { |
317 | a = match ita.next() { |
318 | None => break, |
319 | Some(a) => a, |
320 | }; |
321 | } else { |
322 | b = match itb.next() { |
323 | None => break, |
324 | Some(b) => b, |
325 | }; |
326 | } |
327 | } |
328 | } |
329 | |
330 | fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) { |
331 | dest.clear(); |
332 | if self.is_empty() || other.is_empty() { |
333 | self.iter(|s| dest.add(s)); |
334 | return; |
335 | } |
336 | |
337 | let (seta, setb) = (self.0.borrow(), other.0.borrow()); |
338 | let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); |
339 | let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); |
340 | loop { |
341 | if a == b { |
342 | a = match ita.next() { |
343 | None => break, |
344 | Some(a) => a, |
345 | }; |
346 | b = match itb.next() { |
347 | None => { |
348 | dest.add(a); |
349 | break; |
350 | } |
351 | Some(b) => b, |
352 | }; |
353 | } else if a < b { |
354 | dest.add(a); |
355 | a = match ita.next() { |
356 | None => break, |
357 | Some(a) => a, |
358 | }; |
359 | } else { |
360 | b = match itb.next() { |
361 | None => { |
362 | dest.add(a); |
363 | break; |
364 | } |
365 | Some(b) => b, |
366 | }; |
367 | } |
368 | } |
369 | for a in ita { |
370 | dest.add(a); |
371 | } |
372 | } |
373 | } |
374 | |