1use std::cell::RefCell;
2use std::fmt;
3use std::mem;
4use std::rc::Rc;
5
6use dense;
7use state_id::{dead_id, StateID};
8
9type DFARepr<S> = dense::Repr<Vec<S>, S>;
10
11/// An implementation of Hopcroft's algorithm for minimizing DFAs.
12///
13/// The algorithm implemented here is mostly taken from Wikipedia:
14/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
15///
16/// This code has had some light optimization attention paid to it,
17/// particularly in the form of reducing allocation as much as possible.
18/// However, it is still generally slow. Future optimization work should
19/// probably focus on the bigger picture rather than micro-optimizations. For
20/// example:
21///
22/// 1. Figure out how to more intelligently create initial partitions. That is,
23/// Hopcroft's algorithm starts by creating two partitions of DFA states
24/// that are known to NOT be equivalent: match states and non-match states.
25/// The algorithm proceeds by progressively refining these partitions into
26/// smaller partitions. If we could start with more partitions, then we
27/// could reduce the amount of work that Hopcroft's algorithm needs to do.
28/// 2. For every partition that we visit, we find all incoming transitions to
29/// every state in the partition for *every* element in the alphabet. (This
30/// is why using byte classes can significantly decrease minimization times,
31/// since byte classes shrink the alphabet.) This is quite costly and there
32/// is perhaps some redundant work being performed depending on the specific
33/// states in the set. For example, we might be able to only visit some
34/// elements of the alphabet based on the transitions.
35/// 3. Move parts of minimization into determinization. If minimization has
36/// fewer states to deal with, then it should run faster. A prime example
37/// of this might be large Unicode classes, which are generated in way that
38/// can create a lot of redundant states. (Some work has been done on this
39/// point during NFA compilation via the algorithm described in the
40/// "Incremental Construction of MinimalAcyclic Finite-State Automata"
41/// paper.)
42pub(crate) struct Minimizer<'a, S: 'a> {
43 dfa: &'a mut DFARepr<S>,
44 in_transitions: Vec<Vec<Vec<S>>>,
45 partitions: Vec<StateSet<S>>,
46 waiting: Vec<StateSet<S>>,
47}
48
49impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> {
50 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
51 f.debug_struct("Minimizer")
52 .field("dfa", &self.dfa)
53 .field("in_transitions", &self.in_transitions)
54 .field("partitions", &self.partitions)
55 .field("waiting", &self.waiting)
56 .finish()
57 }
58}
59
60/// A set of states. A state set makes up a single partition in Hopcroft's
61/// algorithm.
62///
63/// It is represented by an ordered set of state identifiers. We use shared
64/// ownership so that a single state set can be in both the set of partitions
65/// and in the set of waiting sets simultaneously without an additional
66/// allocation. Generally, once a state set is built, it becomes immutable.
67///
68/// We use this representation because it avoids the overhead of more
69/// traditional set data structures (HashSet/BTreeSet), and also because
70/// computing intersection/subtraction on this representation is especially
71/// fast.
72#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
73struct StateSet<S>(Rc<RefCell<Vec<S>>>);
74
75impl<'a, S: StateID> Minimizer<'a, S> {
76 pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> {
77 let in_transitions = Minimizer::incoming_transitions(dfa);
78 let partitions = Minimizer::initial_partitions(dfa);
79 let waiting = vec![partitions[0].clone()];
80
81 Minimizer { dfa, in_transitions, partitions, waiting }
82 }
83
84 pub fn run(mut self) {
85 let mut incoming = StateSet::empty();
86 let mut scratch1 = StateSet::empty();
87 let mut scratch2 = StateSet::empty();
88 let mut newparts = vec![];
89
90 while let Some(set) = self.waiting.pop() {
91 for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) {
92 self.find_incoming_to(b, &set, &mut incoming);
93
94 for p in 0..self.partitions.len() {
95 self.partitions[p].intersection(&incoming, &mut scratch1);
96 if scratch1.is_empty() {
97 newparts.push(self.partitions[p].clone());
98 continue;
99 }
100
101 self.partitions[p].subtract(&incoming, &mut scratch2);
102 if scratch2.is_empty() {
103 newparts.push(self.partitions[p].clone());
104 continue;
105 }
106
107 let (x, y) =
108 (scratch1.deep_clone(), scratch2.deep_clone());
109 newparts.push(x.clone());
110 newparts.push(y.clone());
111 match self.find_waiting(&self.partitions[p]) {
112 Some(i) => {
113 self.waiting[i] = x;
114 self.waiting.push(y);
115 }
116 None => {
117 if x.len() <= y.len() {
118 self.waiting.push(x);
119 } else {
120 self.waiting.push(y);
121 }
122 }
123 }
124 }
125 newparts = mem::replace(&mut self.partitions, newparts);
126 newparts.clear();
127 }
128 }
129
130 // At this point, we now have a minimal partitioning of states, where
131 // each partition is an equivalence class of DFA states. Now we need to
132 // use this partioning to update the DFA to only contain one state for
133 // each partition.
134
135 // Create a map from DFA state ID to the representative ID of the
136 // equivalence class to which it belongs. The representative ID of an
137 // equivalence class of states is the minimum ID in that class.
138 let mut state_to_part = vec![dead_id(); self.dfa.state_count()];
139 for p in &self.partitions {
140 p.iter(|id| state_to_part[id.to_usize()] = p.min());
141 }
142
143 // Generate a new contiguous sequence of IDs for minimal states, and
144 // create a map from equivalence IDs to the new IDs. Thus, the new
145 // minimal ID of *any* state in the unminimized DFA can be obtained
146 // with minimals_ids[state_to_part[old_id]].
147 let mut minimal_ids = vec![dead_id(); self.dfa.state_count()];
148 let mut new_id = S::from_usize(0);
149 for (id, _) in self.dfa.states() {
150 if state_to_part[id.to_usize()] == id {
151 minimal_ids[id.to_usize()] = new_id;
152 new_id = S::from_usize(new_id.to_usize() + 1);
153 }
154 }
155 // The total number of states in the minimal DFA.
156 let minimal_count = new_id.to_usize();
157
158 // Re-map this DFA in place such that the only states remaining
159 // correspond to the representative states of every equivalence class.
160 for id in (0..self.dfa.state_count()).map(S::from_usize) {
161 // If this state isn't a representative for an equivalence class,
162 // then we skip it since it won't appear in the minimal DFA.
163 if state_to_part[id.to_usize()] != id {
164 continue;
165 }
166 for (_, next) in self.dfa.get_state_mut(id).iter_mut() {
167 *next = minimal_ids[state_to_part[next.to_usize()].to_usize()];
168 }
169 self.dfa.swap_states(id, minimal_ids[id.to_usize()]);
170 }
171 // Trim off all unused states from the pre-minimized DFA. This
172 // represents all states that were merged into a non-singleton
173 // equivalence class of states, and appeared after the first state
174 // in each such class. (Because the state with the smallest ID in each
175 // equivalence class is its representative ID.)
176 self.dfa.truncate_states(minimal_count);
177
178 // Update the new start state, which is now just the minimal ID of
179 // whatever state the old start state was collapsed into.
180 let old_start = self.dfa.start_state();
181 self.dfa.set_start_state(
182 minimal_ids[state_to_part[old_start.to_usize()].to_usize()],
183 );
184
185 // In order to update the ID of the maximum match state, we need to
186 // find the maximum ID among all of the match states in the minimized
187 // DFA. This is not necessarily the new ID of the unminimized maximum
188 // match state, since that could have been collapsed with a much
189 // earlier match state. Therefore, to find the new max match state,
190 // we iterate over all previous match states, find their corresponding
191 // new minimal ID, and take the maximum of those.
192 let old_max = self.dfa.max_match_state();
193 self.dfa.set_max_match_state(dead_id());
194 for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) {
195 let part = state_to_part[id.to_usize()];
196 let new_id = minimal_ids[part.to_usize()];
197 if new_id > self.dfa.max_match_state() {
198 self.dfa.set_max_match_state(new_id);
199 }
200 }
201 }
202
203 fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> {
204 self.waiting.iter().position(|s| s == set)
205 }
206
207 fn find_incoming_to(
208 &self,
209 b: u8,
210 set: &StateSet<S>,
211 incoming: &mut StateSet<S>,
212 ) {
213 incoming.clear();
214 set.iter(|id| {
215 for &inid in &self.in_transitions[id.to_usize()][b as usize] {
216 incoming.add(inid);
217 }
218 });
219 incoming.canonicalize();
220 }
221
222 fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> {
223 let mut is_match = StateSet::empty();
224 let mut no_match = StateSet::empty();
225 for (id, _) in dfa.states() {
226 if dfa.is_match_state(id) {
227 is_match.add(id);
228 } else {
229 no_match.add(id);
230 }
231 }
232
233 let mut sets = vec![is_match];
234 if !no_match.is_empty() {
235 sets.push(no_match);
236 }
237 sets.sort_by_key(|s| s.len());
238 sets
239 }
240
241 fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> {
242 let mut incoming = vec![];
243 for _ in dfa.states() {
244 incoming.push(vec![vec![]; dfa.alphabet_len()]);
245 }
246 for (id, state) in dfa.states() {
247 for (b, next) in state.transitions() {
248 incoming[next.to_usize()][b as usize].push(id);
249 }
250 }
251 incoming
252 }
253}
254
255impl<S: StateID> StateSet<S> {
256 fn empty() -> StateSet<S> {
257 StateSet(Rc::new(RefCell::new(vec![])))
258 }
259
260 fn add(&mut self, id: S) {
261 self.0.borrow_mut().push(id);
262 }
263
264 fn min(&self) -> S {
265 self.0.borrow()[0]
266 }
267
268 fn canonicalize(&mut self) {
269 self.0.borrow_mut().sort();
270 self.0.borrow_mut().dedup();
271 }
272
273 fn clear(&mut self) {
274 self.0.borrow_mut().clear();
275 }
276
277 fn len(&self) -> usize {
278 self.0.borrow().len()
279 }
280
281 fn is_empty(&self) -> bool {
282 self.len() == 0
283 }
284
285 fn deep_clone(&self) -> StateSet<S> {
286 let ids = self.0.borrow().iter().cloned().collect();
287 StateSet(Rc::new(RefCell::new(ids)))
288 }
289
290 fn iter<F: FnMut(S)>(&self, mut f: F) {
291 for &id in self.0.borrow().iter() {
292 f(id);
293 }
294 }
295
296 fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
297 dest.clear();
298 if self.is_empty() || other.is_empty() {
299 return;
300 }
301
302 let (seta, setb) = (self.0.borrow(), other.0.borrow());
303 let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
304 let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
305 loop {
306 if a == b {
307 dest.add(a);
308 a = match ita.next() {
309 None => break,
310 Some(a) => a,
311 };
312 b = match itb.next() {
313 None => break,
314 Some(b) => b,
315 };
316 } else if a < b {
317 a = match ita.next() {
318 None => break,
319 Some(a) => a,
320 };
321 } else {
322 b = match itb.next() {
323 None => break,
324 Some(b) => b,
325 };
326 }
327 }
328 }
329
330 fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
331 dest.clear();
332 if self.is_empty() || other.is_empty() {
333 self.iter(|s| dest.add(s));
334 return;
335 }
336
337 let (seta, setb) = (self.0.borrow(), other.0.borrow());
338 let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
339 let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
340 loop {
341 if a == b {
342 a = match ita.next() {
343 None => break,
344 Some(a) => a,
345 };
346 b = match itb.next() {
347 None => {
348 dest.add(a);
349 break;
350 }
351 Some(b) => b,
352 };
353 } else if a < b {
354 dest.add(a);
355 a = match ita.next() {
356 None => break,
357 Some(a) => a,
358 };
359 } else {
360 b = match itb.next() {
361 None => {
362 dest.add(a);
363 break;
364 }
365 Some(b) => b,
366 };
367 }
368 }
369 for a in ita {
370 dest.add(a);
371 }
372 }
373}
374