1use alloc::{collections::BTreeMap, vec::Vec};
2
3use crate::{
4 dfa::{
5 dense::{self, BuildError},
6 DEAD,
7 },
8 nfa::thompson,
9 util::{
10 self,
11 alphabet::{self, ByteSet},
12 determinize::{State, StateBuilderEmpty, StateBuilderNFA},
13 primitives::{PatternID, StateID},
14 search::{Anchored, MatchKind},
15 sparse_set::SparseSets,
16 start::Start,
17 },
18};
19
20/// A builder for configuring and running a DFA determinizer.
21#[derive(Clone, Debug)]
22pub(crate) struct Config {
23 match_kind: MatchKind,
24 quit: ByteSet,
25 dfa_size_limit: Option<usize>,
26 determinize_size_limit: Option<usize>,
27}
28
29impl Config {
30 /// Create a new default config for a determinizer. The determinizer may be
31 /// configured before calling `run`.
32 pub fn new() -> Config {
33 Config {
34 match_kind: MatchKind::LeftmostFirst,
35 quit: ByteSet::empty(),
36 dfa_size_limit: None,
37 determinize_size_limit: None,
38 }
39 }
40
41 /// Run determinization on the given NFA and write the resulting DFA into
42 /// the one given. The DFA given should be initialized but otherwise empty.
43 /// "Initialized" means that it is setup to handle the NFA's byte classes,
44 /// number of patterns and whether to build start states for each pattern.
45 pub fn run(
46 &self,
47 nfa: &thompson::NFA,
48 dfa: &mut dense::OwnedDFA,
49 ) -> Result<(), BuildError> {
50 let dead = State::dead();
51 let quit = State::dead();
52 let mut cache = StateMap::default();
53 // We only insert the dead state here since its representation is
54 // identical to the quit state. And we never want anything pointing
55 // to the quit state other than specific transitions derived from the
56 // determinizer's configured "quit" bytes.
57 //
58 // We do put the quit state into 'builder_states' below. This ensures
59 // that a proper DFA state ID is allocated for it, and that no other
60 // DFA state uses the "location after the DEAD state." That is, it
61 // is assumed that the quit state is always the state immediately
62 // following the DEAD state.
63 cache.insert(dead.clone(), DEAD);
64
65 let runner = Runner {
66 config: self.clone(),
67 nfa,
68 dfa,
69 builder_states: alloc::vec![dead, quit],
70 cache,
71 memory_usage_state: 0,
72 sparses: SparseSets::new(nfa.states().len()),
73 stack: alloc::vec![],
74 scratch_state_builder: StateBuilderEmpty::new(),
75 };
76 runner.run()
77 }
78
79 /// The match semantics to use for determinization.
80 ///
81 /// MatchKind::All corresponds to the standard textbook construction.
82 /// All possible match states are represented in the DFA.
83 /// MatchKind::LeftmostFirst permits greediness and otherwise tries to
84 /// simulate the match semantics of backtracking regex engines. Namely,
85 /// only a subset of match states are built, and dead states are used to
86 /// stop searches with an unanchored prefix.
87 ///
88 /// The default is MatchKind::LeftmostFirst.
89 pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
90 self.match_kind = kind;
91 self
92 }
93
94 /// The set of bytes to use that will cause the DFA to enter a quit state,
95 /// stop searching and return an error. By default, this is empty.
96 pub fn quit(&mut self, set: ByteSet) -> &mut Config {
97 self.quit = set;
98 self
99 }
100
101 /// The limit, in bytes of the heap, that the DFA is permitted to use. This
102 /// does not include the auxiliary heap storage used by determinization.
103 pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
104 self.dfa_size_limit = bytes;
105 self
106 }
107
108 /// The limit, in bytes of the heap, that determinization itself is allowed
109 /// to use. This does not include the size of the DFA being built.
110 pub fn determinize_size_limit(
111 &mut self,
112 bytes: Option<usize>,
113 ) -> &mut Config {
114 self.determinize_size_limit = bytes;
115 self
116 }
117}
118
119/// The actual implementation of determinization that converts an NFA to a DFA
120/// through powerset construction.
121///
122/// This determinizer roughly follows the typical powerset construction, where
123/// each DFA state is comprised of one or more NFA states. In the worst case,
124/// there is one DFA state for every possible combination of NFA states. In
125/// practice, this only happens in certain conditions, typically when there are
126/// bounded repetitions.
127///
128/// The main differences between this implementation and typical deteminization
129/// are that this implementation delays matches by one state and hackily makes
130/// look-around work. Comments below attempt to explain this.
131///
132/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
133/// whichever is shorter.
134#[derive(Debug)]
135struct Runner<'a> {
136 /// The configuration used to initialize determinization.
137 config: Config,
138 /// The NFA we're converting into a DFA.
139 nfa: &'a thompson::NFA,
140 /// The DFA we're building.
141 dfa: &'a mut dense::OwnedDFA,
142 /// Each DFA state being built is defined as an *ordered* set of NFA
143 /// states, along with some meta facts about the ordered set of NFA states.
144 ///
145 /// This is never empty. The first state is always a dummy state such that
146 /// a state id == 0 corresponds to a dead state. The second state is always
147 /// the quit state.
148 ///
149 /// Why do we have states in both a `Vec` and in a cache map below?
150 /// Well, they serve two different roles based on access patterns.
151 /// `builder_states` is the canonical home of each state, and provides
152 /// constant random access by a DFA state's ID. The cache map below, on
153 /// the other hand, provides a quick way of searching for identical DFA
154 /// states by using the DFA state as a key in the map. Of course, we use
155 /// reference counting to avoid actually duplicating the state's data
156 /// itself. (Although this has never been benchmarked.) Note that the cache
157 /// map does not give us full minimization; it just lets us avoid some very
158 /// obvious redundant states.
159 ///
160 /// Note that the index into this Vec isn't quite the DFA's state ID.
161 /// Rather, it's just an index. To get the state ID, you have to multiply
162 /// it by the DFA's stride. That's done by self.dfa.from_index. And the
163 /// inverse is self.dfa.to_index.
164 ///
165 /// Moreover, DFA states don't usually retain the IDs assigned to them
166 /// by their position in this Vec. After determinization completes,
167 /// states are shuffled around to support other optimizations. See the
168 /// sibling 'special' module for more details on that. (The reason for
169 /// mentioning this is that if you print out the DFA for debugging during
170 /// determinization, and then print out the final DFA after it is fully
171 /// built, then the state IDs likely won't match up.)
172 builder_states: Vec<State>,
173 /// A cache of DFA states that already exist and can be easily looked up
174 /// via ordered sets of NFA states.
175 ///
176 /// See `builder_states` docs for why we store states in two different
177 /// ways.
178 cache: StateMap,
179 /// The memory usage, in bytes, used by builder_states and cache. We track
180 /// this as new states are added since states use a variable amount of
181 /// heap. Tracking this as we add states makes it possible to compute the
182 /// total amount of memory used by the determinizer in constant time.
183 memory_usage_state: usize,
184 /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
185 /// These are reused throughout determinization. A bounded sparse set
186 /// gives us constant time insertion, membership testing and clearing.
187 sparses: SparseSets,
188 /// Scratch space for a stack of NFA states to visit, for depth first
189 /// visiting without recursion.
190 stack: Vec<StateID>,
191 /// Scratch space for storing an ordered sequence of NFA states, for
192 /// amortizing allocation. This is principally useful for when we avoid
193 /// adding a new DFA state since it already exists. In order to detect this
194 /// case though, we still need an ordered set of NFA state IDs. So we use
195 /// this space to stage that ordered set before we know whether we need to
196 /// create a new DFA state or not.
197 scratch_state_builder: StateBuilderEmpty,
198}
199
200/// A map from states to state identifiers. When using std, we use a standard
201/// hashmap, since it's a bit faster for this use case. (Other maps, like
202/// one's based on FNV, have not yet been benchmarked.)
203///
204/// The main purpose of this map is to reuse states where possible. This won't
205/// fully minimize the DFA, but it works well in a lot of cases.
206#[cfg(feature = "std")]
207type StateMap = std::collections::HashMap<State, StateID>;
208#[cfg(not(feature = "std"))]
209type StateMap = BTreeMap<State, StateID>;
210
211impl<'a> Runner<'a> {
212 /// Build the DFA. If there was a problem constructing the DFA (e.g., if
213 /// the chosen state identifier representation is too small), then an error
214 /// is returned.
215 fn run(mut self) -> Result<(), BuildError> {
216 if self.nfa.look_set_any().contains_word_unicode()
217 && !self.config.quit.contains_range(0x80, 0xFF)
218 {
219 return Err(BuildError::unsupported_dfa_word_boundary_unicode());
220 }
221
222 // A sequence of "representative" bytes drawn from each equivalence
223 // class. These representative bytes are fed to the NFA to compute
224 // state transitions. This allows us to avoid re-computing state
225 // transitions for bytes that are guaranteed to produce identical
226 // results. Since computing the representatives needs to do a little
227 // work, we do it once here because we'll be iterating over them a lot.
228 let representatives: Vec<alphabet::Unit> =
229 self.dfa.byte_classes().representatives(..).collect();
230 // The set of all DFA state IDs that still need to have their
231 // transitions set. We start by seeding this with all starting states.
232 let mut uncompiled = alloc::vec![];
233 self.add_all_starts(&mut uncompiled)?;
234 while let Some(dfa_id) = uncompiled.pop() {
235 for &unit in &representatives {
236 if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
237 {
238 continue;
239 }
240 // In many cases, the state we transition to has already been
241 // computed. 'cached_state' will do the minimal amount of work
242 // to check this, and if it exists, immediately return an
243 // already existing state ID.
244 let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
245 self.dfa.set_transition(dfa_id, unit, next_dfa_id);
246 // If the state ID we got back is newly created, then we need
247 // to compile it, so add it to our uncompiled frontier.
248 if is_new {
249 uncompiled.push(next_dfa_id);
250 }
251 }
252 }
253 debug!(
254 "determinization complete, memory usage: {}, \
255 dense DFA size: {}, \
256 is reverse? {}",
257 self.memory_usage(),
258 self.dfa.memory_usage(),
259 self.nfa.is_reverse(),
260 );
261
262 // A map from DFA state ID to one or more NFA match IDs. Each NFA match
263 // ID corresponds to a distinct regex pattern that matches in the state
264 // corresponding to the key.
265 let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
266 self.cache.clear();
267 #[cfg(feature = "logging")]
268 let mut total_pat_len = 0;
269 for (i, state) in self.builder_states.into_iter().enumerate() {
270 if let Some(pat_ids) = state.match_pattern_ids() {
271 let id = self.dfa.to_state_id(i);
272 log! {
273 total_pat_len += pat_ids.len();
274 }
275 matches.insert(id, pat_ids);
276 }
277 }
278 log! {
279 use core::mem::size_of;
280 let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
281 let pats = total_pat_len * size_of::<PatternID>();
282 let mem = (matches.len() * per_elem) + pats;
283 log::debug!("matches map built, memory usage: {}", mem);
284 }
285 // At this point, we shuffle the "special" states in the final DFA.
286 // This permits a DFA's match loop to detect a match condition (among
287 // other things) by merely inspecting the current state's identifier,
288 // and avoids the need for any additional auxiliary storage.
289 self.dfa.shuffle(matches)?;
290 Ok(())
291 }
292
293 /// Return the identifier for the next DFA state given an existing DFA
294 /// state and an input byte. If the next DFA state already exists, then
295 /// return its identifier from the cache. Otherwise, build the state, cache
296 /// it and return its identifier.
297 ///
298 /// This routine returns a boolean indicating whether a new state was
299 /// built. If a new state is built, then the caller needs to add it to its
300 /// frontier of uncompiled DFA states to compute transitions for.
301 fn cached_state(
302 &mut self,
303 dfa_id: StateID,
304 unit: alphabet::Unit,
305 ) -> Result<(StateID, bool), BuildError> {
306 // Compute the set of all reachable NFA states, including epsilons.
307 let empty_builder = self.get_state_builder();
308 let builder = util::determinize::next(
309 self.nfa,
310 self.config.match_kind,
311 &mut self.sparses,
312 &mut self.stack,
313 &self.builder_states[self.dfa.to_index(dfa_id)],
314 unit,
315 empty_builder,
316 );
317 self.maybe_add_state(builder)
318 }
319
320 /// Compute the set of DFA start states and add their identifiers in
321 /// 'dfa_state_ids' (no duplicates are added).
322 fn add_all_starts(
323 &mut self,
324 dfa_state_ids: &mut Vec<StateID>,
325 ) -> Result<(), BuildError> {
326 // These should be the first states added.
327 assert!(dfa_state_ids.is_empty());
328 // We only want to add (un)anchored starting states that is consistent
329 // with our DFA's configuration. Unconditionally adding both (although
330 // it is the default) can make DFAs quite a bit bigger.
331 if self.dfa.start_kind().has_unanchored() {
332 self.add_start_group(Anchored::No, dfa_state_ids)?;
333 }
334 if self.dfa.start_kind().has_anchored() {
335 self.add_start_group(Anchored::Yes, dfa_state_ids)?;
336 }
337 // I previously has an 'assert' here checking that either
338 // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it
339 // turns out this isn't always true. For example, the NFA might have
340 // one or more patterns but where all such patterns are just 'fail'
341 // states. These will ultimately just compile down to DFA dead states,
342 // and since the dead state was added earlier, no new DFA states are
343 // added. And thus, it is valid and okay for 'dfa_state_ids' to be
344 // empty even if there are a non-zero number of patterns in the NFA.
345
346 // We only need to compute anchored start states for each pattern if it
347 // was requested to do so.
348 if self.dfa.starts_for_each_pattern() {
349 for pid in self.nfa.patterns() {
350 self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?;
351 }
352 }
353 Ok(())
354 }
355
356 /// Add a group of start states for the given match pattern ID. Any new
357 /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
358 /// pushed.)
359 ///
360 /// When pattern_id is None, then this will compile a group of unanchored
361 /// start states (if the DFA is unanchored). When the pattern_id is
362 /// present, then this will compile a group of anchored start states that
363 /// only match the given pattern.
364 ///
365 /// This panics if `anchored` corresponds to an invalid pattern ID.
366 fn add_start_group(
367 &mut self,
368 anchored: Anchored,
369 dfa_state_ids: &mut Vec<StateID>,
370 ) -> Result<(), BuildError> {
371 let nfa_start = match anchored {
372 Anchored::No => self.nfa.start_unanchored(),
373 Anchored::Yes => self.nfa.start_anchored(),
374 Anchored::Pattern(pid) => {
375 self.nfa.start_pattern(pid).expect("valid pattern ID")
376 }
377 };
378
379 // When compiling start states, we're careful not to build additional
380 // states that aren't necessary. For example, if the NFA has no word
381 // boundary assertion, then there's no reason to have distinct start
382 // states for 'NonWordByte' and 'WordByte' starting configurations.
383 // Instead, the 'WordByte' starting configuration can just point
384 // directly to the start state for the 'NonWordByte' config.
385 //
386 // Note though that we only need to care about assertions in the prefix
387 // of an NFA since this only concerns the starting states. (Actually,
388 // the most precisely thing we could do it is look at the prefix
389 // assertions of each pattern when 'anchored == Anchored::Pattern',
390 // and then only compile extra states if the prefix is non-empty.) But
391 // we settle for simplicity here instead of absolute minimalism. It is
392 // somewhat rare, after all, for multiple patterns in the same regex to
393 // have different prefix look-arounds.
394
395 let (id, is_new) =
396 self.add_one_start(nfa_start, Start::NonWordByte)?;
397 self.dfa.set_start_state(anchored, Start::NonWordByte, id);
398 if is_new {
399 dfa_state_ids.push(id);
400 }
401
402 if !self.nfa.look_set_prefix_any().contains_word() {
403 self.dfa.set_start_state(anchored, Start::WordByte, id);
404 } else {
405 let (id, is_new) =
406 self.add_one_start(nfa_start, Start::WordByte)?;
407 self.dfa.set_start_state(anchored, Start::WordByte, id);
408 if is_new {
409 dfa_state_ids.push(id);
410 }
411 }
412 if !self.nfa.look_set_prefix_any().contains_anchor() {
413 self.dfa.set_start_state(anchored, Start::Text, id);
414 self.dfa.set_start_state(anchored, Start::LineLF, id);
415 self.dfa.set_start_state(anchored, Start::LineCR, id);
416 self.dfa.set_start_state(
417 anchored,
418 Start::CustomLineTerminator,
419 id,
420 );
421 } else {
422 let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
423 self.dfa.set_start_state(anchored, Start::Text, id);
424 if is_new {
425 dfa_state_ids.push(id);
426 }
427
428 let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?;
429 self.dfa.set_start_state(anchored, Start::LineLF, id);
430 if is_new {
431 dfa_state_ids.push(id);
432 }
433
434 let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?;
435 self.dfa.set_start_state(anchored, Start::LineCR, id);
436 if is_new {
437 dfa_state_ids.push(id);
438 }
439
440 let (id, is_new) =
441 self.add_one_start(nfa_start, Start::CustomLineTerminator)?;
442 self.dfa.set_start_state(
443 anchored,
444 Start::CustomLineTerminator,
445 id,
446 );
447 if is_new {
448 dfa_state_ids.push(id);
449 }
450 }
451
452 Ok(())
453 }
454
455 /// Add a new DFA start state corresponding to the given starting NFA
456 /// state, and the starting search configuration. (The starting search
457 /// configuration essentially tells us which look-behind assertions are
458 /// true for this particular state.)
459 ///
460 /// The boolean returned indicates whether the state ID returned is a newly
461 /// created state, or a previously cached state.
462 fn add_one_start(
463 &mut self,
464 nfa_start: StateID,
465 start: Start,
466 ) -> Result<(StateID, bool), BuildError> {
467 // Compute the look-behind assertions that are true in this starting
468 // configuration, and the determine the epsilon closure. While
469 // computing the epsilon closure, we only follow condiional epsilon
470 // transitions that satisfy the look-behind assertions in 'look_have'.
471 let mut builder_matches = self.get_state_builder().into_matches();
472 util::determinize::set_lookbehind_from_start(
473 self.nfa,
474 &start,
475 &mut builder_matches,
476 );
477 self.sparses.set1.clear();
478 util::determinize::epsilon_closure(
479 self.nfa,
480 nfa_start,
481 builder_matches.look_have(),
482 &mut self.stack,
483 &mut self.sparses.set1,
484 );
485 let mut builder = builder_matches.into_nfa();
486 util::determinize::add_nfa_states(
487 &self.nfa,
488 &self.sparses.set1,
489 &mut builder,
490 );
491 self.maybe_add_state(builder)
492 }
493
494 /// Adds the given state to the DFA being built depending on whether it
495 /// already exists in this determinizer's cache.
496 ///
497 /// If it does exist, then the memory used by 'state' is put back into the
498 /// determinizer and the previously created state's ID is returned. (Along
499 /// with 'false', indicating that no new state was added.)
500 ///
501 /// If it does not exist, then the state is added to the DFA being built
502 /// and a fresh ID is allocated (if ID allocation fails, then an error is
503 /// returned) and returned. (Along with 'true', indicating that a new state
504 /// was added.)
505 fn maybe_add_state(
506 &mut self,
507 builder: StateBuilderNFA,
508 ) -> Result<(StateID, bool), BuildError> {
509 if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
510 // Since we have a cached state, put the constructed state's
511 // memory back into our scratch space, so that it can be reused.
512 self.put_state_builder(builder);
513 return Ok((cached_id, false));
514 }
515 self.add_state(builder).map(|sid| (sid, true))
516 }
517
518 /// Add the given state to the DFA and make it available in the cache.
519 ///
520 /// The state initially has no transitions. That is, it transitions to the
521 /// dead state for all possible inputs, and transitions to the quit state
522 /// for all quit bytes.
523 ///
524 /// If adding the state would exceed the maximum value for StateID, then an
525 /// error is returned.
526 fn add_state(
527 &mut self,
528 builder: StateBuilderNFA,
529 ) -> Result<StateID, BuildError> {
530 let id = self.dfa.add_empty_state()?;
531 if !self.config.quit.is_empty() {
532 for b in self.config.quit.iter() {
533 self.dfa.set_transition(
534 id,
535 alphabet::Unit::u8(b),
536 self.dfa.quit_id(),
537 );
538 }
539 }
540 let state = builder.to_state();
541 // States use reference counting internally, so we only need to count
542 // their memory usage once.
543 self.memory_usage_state += state.memory_usage();
544 self.builder_states.push(state.clone());
545 self.cache.insert(state, id);
546 self.put_state_builder(builder);
547 if let Some(limit) = self.config.dfa_size_limit {
548 if self.dfa.memory_usage() > limit {
549 return Err(BuildError::dfa_exceeded_size_limit(limit));
550 }
551 }
552 if let Some(limit) = self.config.determinize_size_limit {
553 if self.memory_usage() > limit {
554 return Err(BuildError::determinize_exceeded_size_limit(
555 limit,
556 ));
557 }
558 }
559 Ok(id)
560 }
561
562 /// Returns a state builder from this determinizer that might have existing
563 /// capacity. This helps avoid allocs in cases where a state is built that
564 /// turns out to already be cached.
565 ///
566 /// Callers must put the state builder back with 'put_state_builder',
567 /// otherwise the allocation reuse won't work.
568 fn get_state_builder(&mut self) -> StateBuilderEmpty {
569 core::mem::replace(
570 &mut self.scratch_state_builder,
571 StateBuilderEmpty::new(),
572 )
573 }
574
575 /// Puts the given state builder back into this determinizer for reuse.
576 ///
577 /// Note that building a 'State' from a builder always creates a new
578 /// alloc, so callers should always put the builder back.
579 fn put_state_builder(&mut self, builder: StateBuilderNFA) {
580 let _ = core::mem::replace(
581 &mut self.scratch_state_builder,
582 builder.clear(),
583 );
584 }
585
586 /// Return the memory usage, in bytes, of this determinizer at the current
587 /// point in time. This does not include memory used by the NFA or the
588 /// dense DFA itself.
589 fn memory_usage(&self) -> usize {
590 use core::mem::size_of;
591
592 self.builder_states.len() * size_of::<State>()
593 // Maps likely use more memory than this, but it's probably close.
594 + self.cache.len() * (size_of::<State>() + size_of::<StateID>())
595 + self.memory_usage_state
596 + self.stack.capacity() * size_of::<StateID>()
597 + self.scratch_state_builder.capacity()
598 }
599}
600