minimize.rs source code [crates/regex-automata-0.1.10/src/minimize.rs]

1	use std::cell::RefCell;
2	use std::fmt;
3	use std::mem;
4	use std::rc::Rc;
5
6	use dense;
7	use state_id::{dead_id, StateID};
8
9	type DFARepr<S> = dense::Repr<Vec<S>, S>;
10
11	/// An implementation of Hopcroft's algorithm for minimizing DFAs.
12	///
13	/// The algorithm implemented here is mostly taken from Wikipedia:
14	/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
15	///
16	/// This code has had some light optimization attention paid to it,
17	/// particularly in the form of reducing allocation as much as possible.
18	/// However, it is still generally slow. Future optimization work should
19	/// probably focus on the bigger picture rather than micro-optimizations. For
20	/// example:
21	///
22	/// 1. Figure out how to more intelligently create initial partitions. That is,
23	/// Hopcroft's algorithm starts by creating two partitions of DFA states
24	/// that are known to NOT be equivalent: match states and non-match states.
25	/// The algorithm proceeds by progressively refining these partitions into
26	/// smaller partitions. If we could start with more partitions, then we
27	/// could reduce the amount of work that Hopcroft's algorithm needs to do.
28	/// 2. For every partition that we visit, we find all incoming transitions to
29	/// every state in the partition for every* element in the alphabet. (This*
30	/// is why using byte classes can significantly decrease minimization times,
31	/// since byte classes shrink the alphabet.) This is quite costly and there
32	/// is perhaps some redundant work being performed depending on the specific
33	/// states in the set. For example, we might be able to only visit some
34	/// elements of the alphabet based on the transitions.
35	/// 3. Move parts of minimization into determinization. If minimization has
36	/// fewer states to deal with, then it should run faster. A prime example
37	/// of this might be large Unicode classes, which are generated in way that
38	/// can create a lot of redundant states. (Some work has been done on this
39	/// point during NFA compilation via the algorithm described in the
40	/// "Incremental Construction of MinimalAcyclic Finite-State Automata"
41	/// paper.)
42	pub(crate) struct Minimizer<'a, S: 'a> {
43	dfa: &'a mut DFARepr<S>,
44	in_transitions: Vec<Vec<Vec<S>>>,
45	partitions: Vec<StateSet<S>>,
46	waiting: Vec<StateSet<S>>,
47	}
48
49	impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> {
50	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
51	f&mut DebugStruct<'_, '_>.debug_struct("Minimizer")
52	.field("dfa", &self.dfa)
53	.field("in_transitions", &self.in_transitions)
54	.field("partitions", &self.partitions)
55	.field(name:"waiting", &self.waiting)
56	.finish()
57	}
58	}
59
60	/// A set of states. A state set makes up a single partition in Hopcroft's
61	/// algorithm.
62	///
63	/// It is represented by an ordered set of state identifiers. We use shared
64	/// ownership so that a single state set can be in both the set of partitions
65	/// and in the set of waiting sets simultaneously without an additional
66	/// allocation. Generally, once a state set is built, it becomes immutable.
67	///
68	/// We use this representation because it avoids the overhead of more
69	/// traditional set data structures (HashSet/BTreeSet), and also because
70	/// computing intersection/subtraction on this representation is especially
71	/// fast.
72	#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
73	struct StateSet<S>(Rc<RefCell<Vec<S>>>);
74
75	impl<'a, S: StateID> Minimizer<'a, S> {
76	pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> {
77	let in_transitions = Minimizer::incoming_transitions(dfa);
78	let partitions = Minimizer::initial_partitions(dfa);
79	let waiting = vec![partitions[`0`].clone()];
80
81	Minimizer { dfa, in_transitions, partitions, waiting }
82	}
83
84	pub fn run(mut self) {
85	let mut incoming = StateSet::empty();
86	let mut scratch1 = StateSet::empty();
87	let mut scratch2 = StateSet::empty();
88	let mut newparts = vec![];
89
90	while let Some(set) = self.waiting.pop() {
91	for b in (`0`..self.dfa.alphabet_len()).map(\|b\| b as u8) {
92	self.find_incoming_to(b, &set, &mut incoming);
93
94	for p in `0`..self.partitions.len() {
95	self.partitions[p].intersection(&incoming, &mut scratch1);
96	if scratch1.is_empty() {
97	newparts.push(self.partitions[p].clone());
98	continue;
99	}
100
101	self.partitions[p].subtract(&incoming, &mut scratch2);
102	if scratch2.is_empty() {
103	newparts.push(self.partitions[p].clone());
104	continue;
105	}
106
107	let (x, y) =
108	(scratch1.deep_clone(), scratch2.deep_clone());
109	newparts.push(x.clone());
110	newparts.push(y.clone());
111	match self.find_waiting(&self.partitions[p]) {
112	Some(i) => {
113	self.waiting[i] = x;
114	self.waiting.push(y);
115	}
116	None => {
117	if x.len() <= y.len() {
118	self.waiting.push(x);
119	} else {
120	self.waiting.push(y);
121	}
122	}
123	}
124	}
125	newparts = mem::replace(&mut self.partitions, newparts);
126	newparts.clear();
127	}
128	}
129
130	// At this point, we now have a minimal partitioning of states, where
131	// each partition is an equivalence class of DFA states. Now we need to
132	// use this partioning to update the DFA to only contain one state for
133	// each partition.
134
135	// Create a map from DFA state ID to the representative ID of the
136	// equivalence class to which it belongs. The representative ID of an
137	// equivalence class of states is the minimum ID in that class.
138	let mut state_to_part = vec![dead_id(); self.dfa.state_count()];
139	for p in &self.partitions {
140	p.iter(\|id\| state_to_part[id.to_usize()] = p.min());
141	}
142
143	// Generate a new contiguous sequence of IDs for minimal states, and
144	// create a map from equivalence IDs to the new IDs. Thus, the new
145	// minimal ID of any* state in the unminimized DFA can be obtained*
146	// with minimals_ids[state_to_part[old_id]].
147	let mut minimal_ids = vec![dead_id(); self.dfa.state_count()];
148	let mut new_id = S::from_usize(`0`);
149	for (id, _) in self.dfa.states() {
150	if state_to_part[id.to_usize()] == id {
151	minimal_ids[id.to_usize()] = new_id;
152	new_id = S::from_usize(new_id.to_usize() + `1`);
153	}
154	}
155	// The total number of states in the minimal DFA.
156	let minimal_count = new_id.to_usize();
157
158	// Re-map this DFA in place such that the only states remaining
159	// correspond to the representative states of every equivalence class.
160	for id in (`0`..self.dfa.state_count()).map(S::from_usize) {
161	// If this state isn't a representative for an equivalence class,
162	// then we skip it since it won't appear in the minimal DFA.
163	if state_to_part[id.to_usize()] != id {
164	continue;
165	}
166	for (_, next) in self.dfa.get_state_mut(id).iter_mut() {
167	*next = minimal_ids[state_to_part[next.to_usize()].to_usize()];
168	}
169	self.dfa.swap_states(id, minimal_ids[id.to_usize()]);
170	}
171	// Trim off all unused states from the pre-minimized DFA. This
172	// represents all states that were merged into a non-singleton
173	// equivalence class of states, and appeared after the first state
174	// in each such class. (Because the state with the smallest ID in each
175	// equivalence class is its representative ID.)
176	self.dfa.truncate_states(minimal_count);
177
178	// Update the new start state, which is now just the minimal ID of
179	// whatever state the old start state was collapsed into.
180	let old_start = self.dfa.start_state();
181	self.dfa.set_start_state(
182	minimal_ids[state_to_part[old_start.to_usize()].to_usize()],
183	);
184
185	// In order to update the ID of the maximum match state, we need to
186	// find the maximum ID among all of the match states in the minimized
187	// DFA. This is not necessarily the new ID of the unminimized maximum
188	// match state, since that could have been collapsed with a much
189	// earlier match state. Therefore, to find the new max match state,
190	// we iterate over all previous match states, find their corresponding
191	// new minimal ID, and take the maximum of those.
192	let old_max = self.dfa.max_match_state();
193	self.dfa.set_max_match_state(dead_id());
194	for id in (`0`..(old_max.to_usize() + `1`)).map(S::from_usize) {
195	let part = state_to_part[id.to_usize()];
196	let new_id = minimal_ids[part.to_usize()];
197	if new_id > self.dfa.max_match_state() {
198	self.dfa.set_max_match_state(new_id);
199	}
200	}
201	}
202
203	fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> {
204	self.waiting.iter().position(\|s\| s == set)
205	}
206
207	fn find_incoming_to(
208	&self,
209	b: u8,
210	set: &StateSet<S>,
211	incoming: &mut StateSet<S>,
212	) {
213	incoming.clear();
214	set.iter(\|id\| {
215	for &inid in &self.in_transitions[id.to_usize()][b as usize] {
216	incoming.add(inid);
217	}
218	});
219	incoming.canonicalize();
220	}
221
222	fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> {
223	let mut is_match = StateSet::empty();
224	let mut no_match = StateSet::empty();
225	for (id, _) in dfa.states() {
226	if dfa.is_match_state(id) {
227	is_match.add(id);
228	} else {
229	no_match.add(id);
230	}
231	}
232
233	let mut sets = vec![is_match];
234	if !no_match.is_empty() {
235	sets.push(no_match);
236	}
237	sets.sort_by_key(\|s\| s.len());
238	sets
239	}
240
241	fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> {
242	let mut incoming = vec![];
243	for _ in dfa.states() {
244	incoming.push(vec![vec![]; dfa.alphabet_len()]);
245	}
246	for (id, state) in dfa.states() {
247	for (b, next) in state.transitions() {
248	incoming[next.to_usize()][b as usize].push(id);
249	}
250	}
251	incoming
252	}
253	}
254
255	impl<S: StateID> StateSet<S> {
256	fn empty() -> StateSet<S> {
257	StateSet(Rc::new(RefCell::new(vec![])))
258	}
259
260	fn add(&mut self, id: S) {
261	self.0.borrow_mut().push(id);
262	}
263
264	fn min(&self) -> S {
265	self.0.borrow()[`0`]
266	}
267
268	fn canonicalize(&mut self) {
269	self.0.borrow_mut().sort();
270	self.0.borrow_mut().dedup();
271	}
272
273	fn clear(&mut self) {
274	self.0.borrow_mut().clear();
275	}
276
277	fn len(&self) -> usize {
278	self.0.borrow().len()
279	}
280
281	fn is_empty(&self) -> bool {
282	self.len() == `0`
283	}
284
285	fn deep_clone(&self) -> StateSet<S> {
286	let ids = self.0.borrow().iter().cloned().collect();
287	StateSet(Rc::new(RefCell::new(ids)))
288	}
289
290	fn iter<F: FnMut(S)>(&self, mut f: F) {
291	for &id in self.0.borrow().iter() {
292	f(id);
293	}
294	}
295
296	fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
297	dest.clear();
298	if self.is_empty() \|\| other.is_empty() {
299	return;
300	}
301
302	let (seta, setb) = (self.0.borrow(), other.0.borrow());
303	let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
304	let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
305	loop {
306	if a == b {
307	dest.add(a);
308	a = match ita.next() {
309	None => break,
310	Some(a) => a,
311	};
312	b = match itb.next() {
313	None => break,
314	Some(b) => b,
315	};
316	} else if a < b {
317	a = match ita.next() {
318	None => break,
319	Some(a) => a,
320	};
321	} else {
322	b = match itb.next() {
323	None => break,
324	Some(b) => b,
325	};
326	}
327	}
328	}
329
330	fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
331	dest.clear();
332	if self.is_empty() \|\| other.is_empty() {
333	self.iter(\|s\| dest.add(s));
334	return;
335	}
336
337	let (seta, setb) = (self.0.borrow(), other.0.borrow());
338	let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
339	let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
340	loop {
341	if a == b {
342	a = match ita.next() {
343	None => break,
344	Some(a) => a,
345	};
346	b = match itb.next() {
347	None => {
348	dest.add(a);
349	break;
350	}
351	Some(b) => b,
352	};
353	} else if a < b {
354	dest.add(a);
355	a = match ita.next() {
356	None => break,
357	Some(a) => a,
358	};
359	} else {
360	b = match itb.next() {
361	None => {
362	dest.add(a);
363	break;
364	}
365	Some(b) => b,
366	};
367	}
368	}
369	for a in ita {
370	dest.add(a);
371	}
372	}
373	}
374