determinize.rs - Codebrowser

1	use alloc::{collections::BTreeMap, vec::Vec};
2
3	use crate::{
4	dfa::{
5	dense::{self, BuildError},
6	DEAD,
7	},
8	nfa::thompson,
9	util::{
10	self,
11	alphabet::{self, ByteSet},
12	determinize::{State, StateBuilderEmpty, StateBuilderNFA},
13	primitives::{PatternID, StateID},
14	search::{Anchored, MatchKind},
15	sparse_set::SparseSets,
16	start::Start,
17	},
18	};
19
20	/// A builder for configuring and running a DFA determinizer.
21	#[derive(Clone, Debug)]
22	pub(crate) struct Config {
23	match_kind: MatchKind,
24	quit: ByteSet,
25	dfa_size_limit: Option<usize>,
26	determinize_size_limit: Option<usize>,
27	}
28
29	impl Config {
30	/// Create a new default config for a determinizer. The determinizer may be
31	/// configured before calling `run`.
32	pub fn new() -> Config {
33	Config {
34	match_kind: MatchKind::LeftmostFirst,
35	quit: ByteSet::empty(),
36	dfa_size_limit: None,
37	determinize_size_limit: None,
38	}
39	}
40
41	/// Run determinization on the given NFA and write the resulting DFA into
42	/// the one given. The DFA given should be initialized but otherwise empty.
43	/// "Initialized" means that it is setup to handle the NFA's byte classes,
44	/// number of patterns and whether to build start states for each pattern.
45	pub fn run(
46	&self,
47	nfa: &thompson::NFA,
48	dfa: &mut dense::OwnedDFA,
49	) -> Result<(), BuildError> {
50	let dead = State::dead();
51	let quit = State::dead();
52	let mut cache = StateMap::default();
53	// We only insert the dead state here since its representation is
54	// identical to the quit state. And we never want anything pointing
55	// to the quit state other than specific transitions derived from the
56	// determinizer's configured "quit" bytes.
57	//
58	// We do put the quit state into 'builder_states' below. This ensures
59	// that a proper DFA state ID is allocated for it, and that no other
60	// DFA state uses the "location after the DEAD state." That is, it
61	// is assumed that the quit state is always the state immediately
62	// following the DEAD state.
63	cache.insert(dead.clone(), DEAD);
64
65	let runner = Runner {
66	config: self.clone(),
67	nfa,
68	dfa,
69	builder_states: alloc::vec![dead, quit],
70	cache,
71	memory_usage_state: `0`,
72	sparses: SparseSets::new(nfa.states().len()),
73	stack: alloc::vec![],
74	scratch_state_builder: StateBuilderEmpty::new(),
75	};
76	runner.run()
77	}
78
79	/// The match semantics to use for determinization.
80	///
81	/// MatchKind::All corresponds to the standard textbook construction.
82	/// All possible match states are represented in the DFA.
83	/// MatchKind::LeftmostFirst permits greediness and otherwise tries to
84	/// simulate the match semantics of backtracking regex engines. Namely,
85	/// only a subset of match states are built, and dead states are used to
86	/// stop searches with an unanchored prefix.
87	///
88	/// The default is MatchKind::LeftmostFirst.
89	pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
90	self.match_kind = kind;
91	self
92	}
93
94	/// The set of bytes to use that will cause the DFA to enter a quit state,
95	/// stop searching and return an error. By default, this is empty.
96	pub fn quit(&mut self, set: ByteSet) -> &mut Config {
97	self.quit = set;
98	self
99	}
100
101	/// The limit, in bytes of the heap, that the DFA is permitted to use. This
102	/// does not include the auxiliary heap storage used by determinization.
103	pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
104	self.dfa_size_limit = bytes;
105	self
106	}
107
108	/// The limit, in bytes of the heap, that determinization itself is allowed
109	/// to use. This does not include the size of the DFA being built.
110	pub fn determinize_size_limit(
111	&mut self,
112	bytes: Option<usize>,
113	) -> &mut Config {
114	self.determinize_size_limit = bytes;
115	self
116	}
117	}
118
119	/// The actual implementation of determinization that converts an NFA to a DFA
120	/// through powerset construction.
121	///
122	/// This determinizer roughly follows the typical powerset construction, where
123	/// each DFA state is comprised of one or more NFA states. In the worst case,
124	/// there is one DFA state for every possible combination of NFA states. In
125	/// practice, this only happens in certain conditions, typically when there are
126	/// bounded repetitions.
127	///
128	/// The main differences between this implementation and typical deteminization
129	/// are that this implementation delays matches by one state and hackily makes
130	/// look-around work. Comments below attempt to explain this.
131	///
132	/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
133	/// whichever is shorter.
134	#[derive(Debug)]
135	struct Runner<'a> {
136	/// The configuration used to initialize determinization.
137	config: Config,
138	/// The NFA we're converting into a DFA.
139	nfa: &'a thompson::NFA,
140	/// The DFA we're building.
141	dfa: &'a mut dense::OwnedDFA,
142	/// Each DFA state being built is defined as an ordered* set of NFA*
143	/// states, along with some meta facts about the ordered set of NFA states.
144	///
145	/// This is never empty. The first state is always a dummy state such that
146	/// a state id == 0 corresponds to a dead state. The second state is always
147	/// the quit state.
148	///
149	/// Why do we have states in both a `Vec` and in a cache map below?
150	/// Well, they serve two different roles based on access patterns.
151	/// `builder_states` is the canonical home of each state, and provides
152	/// constant random access by a DFA state's ID. The cache map below, on
153	/// the other hand, provides a quick way of searching for identical DFA
154	/// states by using the DFA state as a key in the map. Of course, we use
155	/// reference counting to avoid actually duplicating the state's data
156	/// itself. (Although this has never been benchmarked.) Note that the cache
157	/// map does not give us full minimization; it just lets us avoid some very
158	/// obvious redundant states.
159	///
160	/// Note that the index into this Vec isn't quite the DFA's state ID.
161	/// Rather, it's just an index. To get the state ID, you have to multiply
162	/// it by the DFA's stride. That's done by self.dfa.from_index. And the
163	/// inverse is self.dfa.to_index.
164	///
165	/// Moreover, DFA states don't usually retain the IDs assigned to them
166	/// by their position in this Vec. After determinization completes,
167	/// states are shuffled around to support other optimizations. See the
168	/// sibling 'special' module for more details on that. (The reason for
169	/// mentioning this is that if you print out the DFA for debugging during
170	/// determinization, and then print out the final DFA after it is fully
171	/// built, then the state IDs likely won't match up.)
172	builder_states: Vec<State>,
173	/// A cache of DFA states that already exist and can be easily looked up
174	/// via ordered sets of NFA states.
175	///
176	/// See `builder_states` docs for why we store states in two different
177	/// ways.
178	cache: StateMap,
179	/// The memory usage, in bytes, used by builder_states and cache. We track
180	/// this as new states are added since states use a variable amount of
181	/// heap. Tracking this as we add states makes it possible to compute the
182	/// total amount of memory used by the determinizer in constant time.
183	memory_usage_state: usize,
184	/// A pair of sparse sets for tracking ordered sets of NFA state IDs.
185	/// These are reused throughout determinization. A bounded sparse set
186	/// gives us constant time insertion, membership testing and clearing.
187	sparses: SparseSets,
188	/// Scratch space for a stack of NFA states to visit, for depth first
189	/// visiting without recursion.
190	stack: Vec<StateID>,
191	/// Scratch space for storing an ordered sequence of NFA states, for
192	/// amortizing allocation. This is principally useful for when we avoid
193	/// adding a new DFA state since it already exists. In order to detect this
194	/// case though, we still need an ordered set of NFA state IDs. So we use
195	/// this space to stage that ordered set before we know whether we need to
196	/// create a new DFA state or not.
197	scratch_state_builder: StateBuilderEmpty,
198	}
199
200	/// A map from states to state identifiers. When using std, we use a standard
201	/// hashmap, since it's a bit faster for this use case. (Other maps, like
202	/// one's based on FNV, have not yet been benchmarked.)
203	///
204	/// The main purpose of this map is to reuse states where possible. This won't
205	/// fully minimize the DFA, but it works well in a lot of cases.
206	#[cfg(feature = "std")]
207	type StateMap = std::collections::HashMap<State, StateID>;
208	#[cfg(not(feature = "std"))]
209	type StateMap = BTreeMap<State, StateID>;
210
211	impl<'a> Runner<'a> {
212	/// Build the DFA. If there was a problem constructing the DFA (e.g., if
213	/// the chosen state identifier representation is too small), then an error
214	/// is returned.
215	fn run(mut self) -> Result<(), BuildError> {
216	if self.nfa.look_set_any().contains_word_unicode()
217	&& !self.config.quit.contains_range(`0x80`, `0xFF`)
218	{
219	return Err(BuildError::unsupported_dfa_word_boundary_unicode());
220	}
221
222	// A sequence of "representative" bytes drawn from each equivalence
223	// class. These representative bytes are fed to the NFA to compute
224	// state transitions. This allows us to avoid re-computing state
225	// transitions for bytes that are guaranteed to produce identical
226	// results. Since computing the representatives needs to do a little
227	// work, we do it once here because we'll be iterating over them a lot.
228	let representatives: Vec<alphabet::Unit> =
229	self.dfa.byte_classes().representatives(..).collect();
230	// The set of all DFA state IDs that still need to have their
231	// transitions set. We start by seeding this with all starting states.
232	let mut uncompiled = alloc::vec![];
233	self.add_all_starts(&mut uncompiled)?;
234	while let Some(dfa_id) = uncompiled.pop() {
235	for &unit in &representatives {
236	if unit.as_u8().map_or(`false`, \|b\| self.config.quit.contains(b))
237	{
238	continue;
239	}
240	// In many cases, the state we transition to has already been
241	// computed. 'cached_state' will do the minimal amount of work
242	// to check this, and if it exists, immediately return an
243	// already existing state ID.
244	let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
245	self.dfa.set_transition(dfa_id, unit, next_dfa_id);
246	// If the state ID we got back is newly created, then we need
247	// to compile it, so add it to our uncompiled frontier.
248	if is_new {
249	uncompiled.push(next_dfa_id);
250	}
251	}
252	}
253	debug!(
254	"determinization complete, memory usage: {}, \
255	dense DFA size: {}, \
256	is reverse? {}",
257	self.memory_usage(),
258	self.dfa.memory_usage(),
259	self.nfa.is_reverse(),
260	);
261
262	// A map from DFA state ID to one or more NFA match IDs. Each NFA match
263	// ID corresponds to a distinct regex pattern that matches in the state
264	// corresponding to the key.
265	let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
266	self.cache.clear();
267	#[cfg(feature = "logging")]
268	let mut total_pat_len = `0`;
269	for (i, state) in self.builder_states.into_iter().enumerate() {
270	if let Some(pat_ids) = state.match_pattern_ids() {
271	let id = self.dfa.to_state_id(i);
272	log! {
273	total_pat_len += pat_ids.len();
274	}
275	matches.insert(id, pat_ids);
276	}
277	}
278	log! {
279	use core::mem::size_of;
280	let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
281	let pats = total_pat_len * size_of::<PatternID>();
282	let mem = (matches.len() * per_elem) + pats;
283	log::debug!("matches map built, memory usage: {}", mem);
284	}
285	// At this point, we shuffle the "special" states in the final DFA.
286	// This permits a DFA's match loop to detect a match condition (among
287	// other things) by merely inspecting the current state's identifier,
288	// and avoids the need for any additional auxiliary storage.
289	self.dfa.shuffle(matches)?;
290	Ok(())
291	}
292
293	/// Return the identifier for the next DFA state given an existing DFA
294	/// state and an input byte. If the next DFA state already exists, then
295	/// return its identifier from the cache. Otherwise, build the state, cache
296	/// it and return its identifier.
297	///
298	/// This routine returns a boolean indicating whether a new state was
299	/// built. If a new state is built, then the caller needs to add it to its
300	/// frontier of uncompiled DFA states to compute transitions for.
301	fn cached_state(
302	&mut self,
303	dfa_id: StateID,
304	unit: alphabet::Unit,
305	) -> Result<(StateID, bool), BuildError> {
306	// Compute the set of all reachable NFA states, including epsilons.
307	let empty_builder = self.get_state_builder();
308	let builder = util::determinize::next(
309	self.nfa,
310	self.config.match_kind,
311	&mut self.sparses,
312	&mut self.stack,
313	&self.builder_states[self.dfa.to_index(dfa_id)],
314	unit,
315	empty_builder,
316	);
317	self.maybe_add_state(builder)
318	}
319
320	/// Compute the set of DFA start states and add their identifiers in
321	/// 'dfa_state_ids' (no duplicates are added).
322	fn add_all_starts(
323	&mut self,
324	dfa_state_ids: &mut Vec<StateID>,
325	) -> Result<(), BuildError> {
326	// These should be the first states added.
327	assert!(dfa_state_ids.is_empty());
328	// We only want to add (un)anchored starting states that is consistent
329	// with our DFA's configuration. Unconditionally adding both (although
330	// it is the default) can make DFAs quite a bit bigger.
331	if self.dfa.start_kind().has_unanchored() {
332	self.add_start_group(Anchored::No, dfa_state_ids)?;
333	}
334	if self.dfa.start_kind().has_anchored() {
335	self.add_start_group(Anchored::Yes, dfa_state_ids)?;
336	}
337	// I previously has an 'assert' here checking that either
338	// 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it
339	// turns out this isn't always true. For example, the NFA might have
340	// one or more patterns but where all such patterns are just 'fail'
341	// states. These will ultimately just compile down to DFA dead states,
342	// and since the dead state was added earlier, no new DFA states are
343	// added. And thus, it is valid and okay for 'dfa_state_ids' to be
344	// empty even if there are a non-zero number of patterns in the NFA.
345
346	// We only need to compute anchored start states for each pattern if it
347	// was requested to do so.
348	if self.dfa.starts_for_each_pattern() {
349	for pid in self.nfa.patterns() {
350	self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?;
351	}
352	}
353	Ok(())
354	}
355
356	/// Add a group of start states for the given match pattern ID. Any new
357	/// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
358	/// pushed.)
359	///
360	/// When pattern_id is None, then this will compile a group of unanchored
361	/// start states (if the DFA is unanchored). When the pattern_id is
362	/// present, then this will compile a group of anchored start states that
363	/// only match the given pattern.
364	///
365	/// This panics if `anchored` corresponds to an invalid pattern ID.
366	fn add_start_group(
367	&mut self,
368	anchored: Anchored,
369	dfa_state_ids: &mut Vec<StateID>,
370	) -> Result<(), BuildError> {
371	let nfa_start = match anchored {
372	Anchored::No => self.nfa.start_unanchored(),
373	Anchored::Yes => self.nfa.start_anchored(),
374	Anchored::Pattern(pid) => {
375	self.nfa.start_pattern(pid).expect("valid pattern ID")
376	}
377	};
378
379	// When compiling start states, we're careful not to build additional
380	// states that aren't necessary. For example, if the NFA has no word
381	// boundary assertion, then there's no reason to have distinct start
382	// states for 'NonWordByte' and 'WordByte' starting configurations.
383	// Instead, the 'WordByte' starting configuration can just point
384	// directly to the start state for the 'NonWordByte' config.
385	//
386	// Note though that we only need to care about assertions in the prefix
387	// of an NFA since this only concerns the starting states. (Actually,
388	// the most precisely thing we could do it is look at the prefix
389	// assertions of each pattern when 'anchored == Anchored::Pattern',
390	// and then only compile extra states if the prefix is non-empty.) But
391	// we settle for simplicity here instead of absolute minimalism. It is
392	// somewhat rare, after all, for multiple patterns in the same regex to
393	// have different prefix look-arounds.
394
395	let (id, is_new) =
396	self.add_one_start(nfa_start, Start::NonWordByte)?;
397	self.dfa.set_start_state(anchored, Start::NonWordByte, id);
398	if is_new {
399	dfa_state_ids.push(id);
400	}
401
402	if !self.nfa.look_set_prefix_any().contains_word() {
403	self.dfa.set_start_state(anchored, Start::WordByte, id);
404	} else {
405	let (id, is_new) =
406	self.add_one_start(nfa_start, Start::WordByte)?;
407	self.dfa.set_start_state(anchored, Start::WordByte, id);
408	if is_new {
409	dfa_state_ids.push(id);
410	}
411	}
412	if !self.nfa.look_set_prefix_any().contains_anchor() {
413	self.dfa.set_start_state(anchored, Start::Text, id);
414	self.dfa.set_start_state(anchored, Start::LineLF, id);
415	self.dfa.set_start_state(anchored, Start::LineCR, id);
416	self.dfa.set_start_state(
417	anchored,
418	Start::CustomLineTerminator,
419	id,
420	);
421	} else {
422	let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
423	self.dfa.set_start_state(anchored, Start::Text, id);
424	if is_new {
425	dfa_state_ids.push(id);
426	}
427
428	let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?;
429	self.dfa.set_start_state(anchored, Start::LineLF, id);
430	if is_new {
431	dfa_state_ids.push(id);
432	}
433
434	let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?;
435	self.dfa.set_start_state(anchored, Start::LineCR, id);
436	if is_new {
437	dfa_state_ids.push(id);
438	}
439
440	let (id, is_new) =
441	self.add_one_start(nfa_start, Start::CustomLineTerminator)?;
442	self.dfa.set_start_state(
443	anchored,
444	Start::CustomLineTerminator,
445	id,
446	);
447	if is_new {
448	dfa_state_ids.push(id);
449	}
450	}
451
452	Ok(())
453	}
454
455	/// Add a new DFA start state corresponding to the given starting NFA
456	/// state, and the starting search configuration. (The starting search
457	/// configuration essentially tells us which look-behind assertions are
458	/// true for this particular state.)
459	///
460	/// The boolean returned indicates whether the state ID returned is a newly
461	/// created state, or a previously cached state.
462	fn add_one_start(
463	&mut self,
464	nfa_start: StateID,
465	start: Start,
466	) -> Result<(StateID, bool), BuildError> {
467	// Compute the look-behind assertions that are true in this starting
468	// configuration, and the determine the epsilon closure. While
469	// computing the epsilon closure, we only follow condiional epsilon
470	// transitions that satisfy the look-behind assertions in 'look_have'.
471	let mut builder_matches = self.get_state_builder().into_matches();
472	util::determinize::set_lookbehind_from_start(
473	self.nfa,
474	&start,
475	&mut builder_matches,
476	);
477	self.sparses.set1.clear();
478	util::determinize::epsilon_closure(
479	self.nfa,
480	nfa_start,
481	builder_matches.look_have(),
482	&mut self.stack,
483	&mut self.sparses.set1,
484	);
485	let mut builder = builder_matches.into_nfa();
486	util::determinize::add_nfa_states(
487	&self.nfa,
488	&self.sparses.set1,
489	&mut builder,
490	);
491	self.maybe_add_state(builder)
492	}
493
494	/// Adds the given state to the DFA being built depending on whether it
495	/// already exists in this determinizer's cache.
496	///
497	/// If it does exist, then the memory used by 'state' is put back into the
498	/// determinizer and the previously created state's ID is returned. (Along
499	/// with 'false', indicating that no new state was added.)
500	///
501	/// If it does not exist, then the state is added to the DFA being built
502	/// and a fresh ID is allocated (if ID allocation fails, then an error is
503	/// returned) and returned. (Along with 'true', indicating that a new state
504	/// was added.)
505	fn maybe_add_state(
506	&mut self,
507	builder: StateBuilderNFA,
508	) -> Result<(StateID, bool), BuildError> {
509	if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
510	// Since we have a cached state, put the constructed state's
511	// memory back into our scratch space, so that it can be reused.
512	self.put_state_builder(builder);
513	return Ok((cached_id, `false`));
514	}
515	self.add_state(builder).map(\|sid\| (sid, `true`))
516	}
517
518	/// Add the given state to the DFA and make it available in the cache.
519	///
520	/// The state initially has no transitions. That is, it transitions to the
521	/// dead state for all possible inputs, and transitions to the quit state
522	/// for all quit bytes.
523	///
524	/// If adding the state would exceed the maximum value for StateID, then an
525	/// error is returned.
526	fn add_state(
527	&mut self,
528	builder: StateBuilderNFA,
529	) -> Result<StateID, BuildError> {
530	let id = self.dfa.add_empty_state()?;
531	if !self.config.quit.is_empty() {
532	for b in self.config.quit.iter() {
533	self.dfa.set_transition(
534	id,
535	alphabet::Unit::u8(b),
536	self.dfa.quit_id(),
537	);
538	}
539	}
540	let state = builder.to_state();
541	// States use reference counting internally, so we only need to count
542	// their memory usage once.
543	self.memory_usage_state += state.memory_usage();
544	self.builder_states.push(state.clone());
545	self.cache.insert(state, id);
546	self.put_state_builder(builder);
547	if let Some(limit) = self.config.dfa_size_limit {
548	if self.dfa.memory_usage() > limit {
549	return Err(BuildError::dfa_exceeded_size_limit(limit));
550	}
551	}
552	if let Some(limit) = self.config.determinize_size_limit {
553	if self.memory_usage() > limit {
554	return Err(BuildError::determinize_exceeded_size_limit(
555	limit,
556	));
557	}
558	}
559	Ok(id)
560	}
561
562	/// Returns a state builder from this determinizer that might have existing
563	/// capacity. This helps avoid allocs in cases where a state is built that
564	/// turns out to already be cached.
565	///
566	/// Callers must put the state builder back with 'put_state_builder',
567	/// otherwise the allocation reuse won't work.
568	fn get_state_builder(&mut self) -> StateBuilderEmpty {
569	core::mem::replace(
570	&mut self.scratch_state_builder,
571	StateBuilderEmpty::new(),
572	)
573	}
574
575	/// Puts the given state builder back into this determinizer for reuse.
576	///
577	/// Note that building a 'State' from a builder always creates a new
578	/// alloc, so callers should always put the builder back.
579	fn put_state_builder(&mut self, builder: StateBuilderNFA) {
580	let _ = core::mem::replace(
581	&mut self.scratch_state_builder,
582	builder.clear(),
583	);
584	}
585
586	/// Return the memory usage, in bytes, of this determinizer at the current
587	/// point in time. This does not include memory used by the NFA or the
588	/// dense DFA itself.
589	fn memory_usage(&self) -> usize {
590	use core::mem::size_of;
591
592	self.builder_states.len() * size_of::<State>()
593	// Maps likely use more memory than this, but it's probably close.
594	+ self.cache.len() * (size_of::<State>() + size_of::<StateID>())
595	+ self.memory_usage_state
596	+ self.stack.capacity() * size_of::<StateID>()
597	+ self.scratch_state_builder.capacity()
598	}
599	}
600