minimize.rs - Codebrowser

1	use core::{cell::RefCell, fmt, mem};
2
3	use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec};
4
5	use crate::{
6	dfa::{automaton::Automaton, dense, DEAD},
7	util::{
8	alphabet,
9	primitives::{PatternID, StateID},
10	},
11	};
12
13	/// An implementation of Hopcroft's algorithm for minimizing DFAs.
14	///
15	/// The algorithm implemented here is mostly taken from Wikipedia:
16	/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
17	///
18	/// This code has had some light optimization attention paid to it,
19	/// particularly in the form of reducing allocation as much as possible.
20	/// However, it is still generally slow. Future optimization work should
21	/// probably focus on the bigger picture rather than micro-optimizations. For
22	/// example:
23	///
24	/// 1. Figure out how to more intelligently create initial partitions. That is,
25	/// Hopcroft's algorithm starts by creating two partitions of DFA states
26	/// that are known to NOT be equivalent: match states and non-match states.
27	/// The algorithm proceeds by progressively refining these partitions into
28	/// smaller partitions. If we could start with more partitions, then we
29	/// could reduce the amount of work that Hopcroft's algorithm needs to do.
30	/// 2. For every partition that we visit, we find all incoming transitions to
31	/// every state in the partition for every* element in the alphabet. (This*
32	/// is why using byte classes can significantly decrease minimization times,
33	/// since byte classes shrink the alphabet.) This is quite costly and there
34	/// is perhaps some redundant work being performed depending on the specific
35	/// states in the set. For example, we might be able to only visit some
36	/// elements of the alphabet based on the transitions.
37	/// 3. Move parts of minimization into determinization. If minimization has
38	/// fewer states to deal with, then it should run faster. A prime example
39	/// of this might be large Unicode classes, which are generated in way that
40	/// can create a lot of redundant states. (Some work has been done on this
41	/// point during NFA compilation via the algorithm described in the
42	/// "Incremental Construction of MinimalAcyclic Finite-State Automata"
43	/// paper.)
44	pub(crate) struct Minimizer<'a> {
45	dfa: &'a mut dense::OwnedDFA,
46	in_transitions: Vec<Vec<Vec<StateID>>>,
47	partitions: Vec<StateSet>,
48	waiting: Vec<StateSet>,
49	}
50
51	impl<'a> fmt::Debug for Minimizer<'a> {
52	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
53	f.debug_struct("Minimizer")
54	.field("dfa", &self.dfa)
55	.field("in_transitions", &self.in_transitions)
56	.field("partitions", &self.partitions)
57	.field("waiting", &self.waiting)
58	.finish()
59	}
60	}
61
62	/// A set of states. A state set makes up a single partition in Hopcroft's
63	/// algorithm.
64	///
65	/// It is represented by an ordered set of state identifiers. We use shared
66	/// ownership so that a single state set can be in both the set of partitions
67	/// and in the set of waiting sets simultaneously without an additional
68	/// allocation. Generally, once a state set is built, it becomes immutable.
69	///
70	/// We use this representation because it avoids the overhead of more
71	/// traditional set data structures (HashSet/BTreeSet), and also because
72	/// computing intersection/subtraction on this representation is especially
73	/// fast.
74	#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
75	struct StateSet {
76	ids: Rc<RefCell<Vec<StateID>>>,
77	}
78
79	impl<'a> Minimizer<'a> {
80	pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> {
81	let in_transitions = Minimizer::incoming_transitions(dfa);
82	let partitions = Minimizer::initial_partitions(dfa);
83	let waiting = partitions.clone();
84	Minimizer { dfa, in_transitions, partitions, waiting }
85	}
86
87	pub fn run(mut self) {
88	let stride2 = self.dfa.stride2();
89	let as_state_id = \|index: usize\| -> StateID {
90	StateID::new(index << stride2).unwrap()
91	};
92	let as_index = \|id: StateID\| -> usize { id.as_usize() >> stride2 };
93
94	let mut incoming = StateSet::empty();
95	let mut scratch1 = StateSet::empty();
96	let mut scratch2 = StateSet::empty();
97	let mut newparts = vec![];
98
99	// This loop is basically Hopcroft's algorithm. Everything else is just
100	// shuffling data around to fit our representation.
101	while let Some(set) = self.waiting.pop() {
102	for b in self.dfa.byte_classes().iter() {
103	self.find_incoming_to(b, &set, &mut incoming);
104	// If incoming is empty, then the intersection with any other
105	// set must also be empty. So 'newparts' just ends up being
106	// 'self.partitions'. So there's no need to go through the loop
107	// below.
108	//
109	// This actually turns out to be rather large optimization. On
110	// the order of making minimization 4-5x faster. It's likely
111	// that the vast majority of all states have very few incoming
112	// transitions.
113	if incoming.is_empty() {
114	continue;
115	}
116
117	for p in `0`..self.partitions.len() {
118	self.partitions[p].intersection(&incoming, &mut scratch1);
119	if scratch1.is_empty() {
120	newparts.push(self.partitions[p].clone());
121	continue;
122	}
123
124	self.partitions[p].subtract(&incoming, &mut scratch2);
125	if scratch2.is_empty() {
126	newparts.push(self.partitions[p].clone());
127	continue;
128	}
129
130	let (x, y) =
131	(scratch1.deep_clone(), scratch2.deep_clone());
132	newparts.push(x.clone());
133	newparts.push(y.clone());
134	match self.find_waiting(&self.partitions[p]) {
135	Some(i) => {
136	self.waiting[i] = x;
137	self.waiting.push(y);
138	}
139	None => {
140	if x.len() <= y.len() {
141	self.waiting.push(x);
142	} else {
143	self.waiting.push(y);
144	}
145	}
146	}
147	}
148	newparts = mem::replace(&mut self.partitions, newparts);
149	newparts.clear();
150	}
151	}
152
153	// At this point, we now have a minimal partitioning of states, where
154	// each partition is an equivalence class of DFA states. Now we need to
155	// use this partitioning to update the DFA to only contain one state for
156	// each partition.
157
158	// Create a map from DFA state ID to the representative ID of the
159	// equivalence class to which it belongs. The representative ID of an
160	// equivalence class of states is the minimum ID in that class.
161	let mut state_to_part = vec![DEAD; self.dfa.state_len()];
162	for p in &self.partitions {
163	p.iter(\|id\| state_to_part[as_index(id)] = p.min());
164	}
165
166	// Generate a new contiguous sequence of IDs for minimal states, and
167	// create a map from equivalence IDs to the new IDs. Thus, the new
168	// minimal ID of any* state in the unminimized DFA can be obtained*
169	// with minimals_ids[state_to_part[old_id]].
170	let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
171	let mut new_index = `0`;
172	for state in self.dfa.states() {
173	if state_to_part[as_index(state.id())] == state.id() {
174	minimal_ids[as_index(state.id())] = as_state_id(new_index);
175	new_index += `1`;
176	}
177	}
178	// The total number of states in the minimal DFA.
179	let minimal_count = new_index;
180	// Convenience function for remapping state IDs. This takes an old ID,
181	// looks up its Hopcroft partition and then maps that to the new ID
182	// range.
183	let remap = \|old\| minimal_ids[as_index(state_to_part[as_index(old)])];
184
185	// Re-map this DFA in place such that the only states remaining
186	// correspond to the representative states of every equivalence class.
187	for id in (`0`..self.dfa.state_len()).map(as_state_id) {
188	// If this state isn't a representative for an equivalence class,
189	// then we skip it since it won't appear in the minimal DFA.
190	if state_to_part[as_index(id)] != id {
191	continue;
192	}
193	self.dfa.remap_state(id, remap);
194	self.dfa.swap_states(id, minimal_ids[as_index(id)]);
195	}
196	// Trim off all unused states from the pre-minimized DFA. This
197	// represents all states that were merged into a non-singleton
198	// equivalence class of states, and appeared after the first state
199	// in each such class. (Because the state with the smallest ID in each
200	// equivalence class is its representative ID.)
201	self.dfa.truncate_states(minimal_count);
202
203	// Update the new start states, which is now just the minimal ID of
204	// whatever state the old start state was collapsed into. Also, we
205	// collect everything before-hand to work around the borrow checker.
206	// We're already allocating so much that this is probably fine. If this
207	// turns out to be costly, then I guess add a `starts_mut` iterator.
208	let starts: Vec<_> = self.dfa.starts().collect();
209	for (old_start_id, anchored, start_type) in starts {
210	self.dfa.set_start_state(
211	anchored,
212	start_type,
213	remap(old_start_id),
214	);
215	}
216
217	// Update the match state pattern ID list for multi-regexes. All we
218	// need to do is remap the match state IDs. The pattern ID lists are
219	// always the same as they were since match states with distinct
220	// pattern ID lists are always considered distinct states.
221	let mut pmap = BTreeMap::new();
222	for (match_id, pattern_ids) in self.dfa.pattern_map() {
223	let new_id = remap(match_id);
224	pmap.insert(new_id, pattern_ids);
225	}
226	// This unwrap is OK because minimization never increases the number of
227	// match states or patterns in those match states. Since minimization
228	// runs after the pattern map has already been set at least once, we
229	// know that our match states cannot error.
230	self.dfa.set_pattern_map(&pmap).unwrap();
231
232	// In order to update the ID of the maximum match state, we need to
233	// find the maximum ID among all of the match states in the minimized
234	// DFA. This is not necessarily the new ID of the unminimized maximum
235	// match state, since that could have been collapsed with a much
236	// earlier match state. Therefore, to find the new max match state,
237	// we iterate over all previous match states, find their corresponding
238	// new minimal ID, and take the maximum of those.
239	let old = self.dfa.special().clone();
240	let new = self.dfa.special_mut();
241	// ... but only remap if we had match states.
242	if old.matches() {
243	new.min_match = StateID::MAX;
244	new.max_match = StateID::ZERO;
245	for i in as_index(old.min_match)..=as_index(old.max_match) {
246	let new_id = remap(as_state_id(i));
247	if new_id < new.min_match {
248	new.min_match = new_id;
249	}
250	if new_id > new.max_match {
251	new.max_match = new_id;
252	}
253	}
254	}
255	// ... same, but for start states.
256	if old.starts() {
257	new.min_start = StateID::MAX;
258	new.max_start = StateID::ZERO;
259	for i in as_index(old.min_start)..=as_index(old.max_start) {
260	let new_id = remap(as_state_id(i));
261	if new_id == DEAD {
262	continue;
263	}
264	if new_id < new.min_start {
265	new.min_start = new_id;
266	}
267	if new_id > new.max_start {
268	new.max_start = new_id;
269	}
270	}
271	if new.max_start == DEAD {
272	new.min_start = DEAD;
273	}
274	}
275	new.quit_id = remap(new.quit_id);
276	new.set_max();
277	}
278
279	fn find_waiting(&self, set: &StateSet) -> Option<usize> {
280	self.waiting.iter().position(\|s\| s == set)
281	}
282
283	fn find_incoming_to(
284	&self,
285	b: alphabet::Unit,
286	set: &StateSet,
287	incoming: &mut StateSet,
288	) {
289	incoming.clear();
290	set.iter(\|id\| {
291	for &inid in
292	&self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
293	{
294	incoming.add(inid);
295	}
296	});
297	incoming.canonicalize();
298	}
299
300	fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> {
301	// For match states, we know that two match states with different
302	// pattern ID lists will always* be distinct, so we can partition them*
303	// initially based on that.
304	let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new();
305	let mut is_quit = StateSet::empty();
306	let mut no_match = StateSet::empty();
307	for state in dfa.states() {
308	if dfa.is_match_state(state.id()) {
309	let mut pids = vec![];
310	for i in `0`..dfa.match_len(state.id()) {
311	pids.push(dfa.match_pattern(state.id(), i));
312	}
313	matching
314	.entry(pids)
315	.or_insert(StateSet::empty())
316	.add(state.id());
317	} else if dfa.is_quit_state(state.id()) {
318	is_quit.add(state.id());
319	} else {
320	no_match.add(state.id());
321	}
322	}
323
324	let mut sets: Vec<StateSet> =
325	matching.into_iter().map(\|(_, set)\| set).collect();
326	sets.push(no_match);
327	sets.push(is_quit);
328	sets
329	}
330
331	fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> {
332	let mut incoming = vec![];
333	for _ in dfa.states() {
334	incoming.push(vec![vec![]; dfa.alphabet_len()]);
335	}
336	for state in dfa.states() {
337	for (b, next) in state.transitions() {
338	incoming[dfa.to_index(next)][b.as_usize()].push(state.id());
339	}
340	}
341	incoming
342	}
343	}
344
345	impl StateSet {
346	fn empty() -> StateSet {
347	StateSet { ids: Rc::new(RefCell::new(vec![])) }
348	}
349
350	fn add(&mut self, id: StateID) {
351	self.ids.borrow_mut().push(id);
352	}
353
354	fn min(&self) -> StateID {
355	self.ids.borrow()[`0`]
356	}
357
358	fn canonicalize(&mut self) {
359	self.ids.borrow_mut().sort();
360	self.ids.borrow_mut().dedup();
361	}
362
363	fn clear(&mut self) {
364	self.ids.borrow_mut().clear();
365	}
366
367	fn len(&self) -> usize {
368	self.ids.borrow().len()
369	}
370
371	fn is_empty(&self) -> bool {
372	self.len() == `0`
373	}
374
375	fn deep_clone(&self) -> StateSet {
376	let ids = self.ids.borrow().iter().cloned().collect();
377	StateSet { ids: Rc::new(RefCell::new(ids)) }
378	}
379
380	fn iter<F: FnMut(StateID)>(&self, mut f: F) {
381	for &id in self.ids.borrow().iter() {
382	f(id);
383	}
384	}
385
386	fn intersection(&self, other: &StateSet, dest: &mut StateSet) {
387	dest.clear();
388	if self.is_empty() \|\| other.is_empty() {
389	return;
390	}
391
392	let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
393	let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
394	let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
395	loop {
396	if a == b {
397	dest.add(a);
398	a = match ita.next() {
399	None => break,
400	Some(a) => a,
401	};
402	b = match itb.next() {
403	None => break,
404	Some(b) => b,
405	};
406	} else if a < b {
407	a = match ita.next() {
408	None => break,
409	Some(a) => a,
410	};
411	} else {
412	b = match itb.next() {
413	None => break,
414	Some(b) => b,
415	};
416	}
417	}
418	}
419
420	fn subtract(&self, other: &StateSet, dest: &mut StateSet) {
421	dest.clear();
422	if self.is_empty() \|\| other.is_empty() {
423	self.iter(\|s\| dest.add(s));
424	return;
425	}
426
427	let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
428	let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
429	let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
430	loop {
431	if a == b {
432	a = match ita.next() {
433	None => break,
434	Some(a) => a,
435	};
436	b = match itb.next() {
437	None => {
438	dest.add(a);
439	break;
440	}
441	Some(b) => b,
442	};
443	} else if a < b {
444	dest.add(a);
445	a = match ita.next() {
446	None => break,
447	Some(a) => a,
448	};
449	} else {
450	b = match itb.next() {
451	None => {
452	dest.add(a);
453	break;
454	}
455	Some(b) => b,
456	};
457	}
458	}
459	for a in ita {
460	dest.add(a);
461	}
462	}
463	}
464

Provided by KDAB

Definitions