use alloc::{collections::BTreeMap, vec::Vec};

use crate::{
    dfa::{
        dense::{self, BuildError},
        DEAD,
    },
    nfa::thompson,
    util::{
        self,
        alphabet::{self, ByteSet},
        determinize::{State, StateBuilderEmpty, StateBuilderNFA},
        primitives::{PatternID, StateID},
        search::{Anchored, MatchKind},
        sparse_set::SparseSets,
        start::Start,
    },
};

/// A builder for configuring and running a DFA determinizer.
#[derive(Clone, Debug)]
pub(crate) struct Config {
    match_kind: MatchKind,
    quit: ByteSet,
    dfa_size_limit: Option<usize>,
    determinize_size_limit: Option<usize>,
}

impl Config {
    /// Create a new default config for a determinizer. The determinizer may be
    /// configured before calling `run`.
    pub fn new() -> Config {
        Config {
            match_kind: MatchKind::LeftmostFirst,
            quit: ByteSet::empty(),
            dfa_size_limit: None,
            determinize_size_limit: None,
        }
    }

    /// Run determinization on the given NFA and write the resulting DFA into
    /// the one given. The DFA given should be initialized but otherwise empty.
43 | /// "Initialized" means that it is setup to handle the NFA's byte classes, |
44 | /// number of patterns and whether to build start states for each pattern. |
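    ///
    /// A rough sketch of how a caller might drive this routine (the `nfa` and
    /// `dfa` bindings here are illustrative placeholders; in practice the
    /// dense DFA builder creates and initializes the DFA before calling this):
    ///
    /// ```ignore
    /// let nfa = thompson::NFA::new(r"[a-z]+")?;
    /// let mut dfa = /* an initialized but otherwise empty dense::OwnedDFA */;
    /// Config::new()
    ///     .match_kind(MatchKind::LeftmostFirst)
    ///     .run(&nfa, &mut dfa)?;
    /// ```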
    pub fn run(
        &self,
        nfa: &thompson::NFA,
        dfa: &mut dense::OwnedDFA,
    ) -> Result<(), BuildError> {
        let dead = State::dead();
        let quit = State::dead();
        let mut cache = StateMap::default();
        // We only insert the dead state here since its representation is
        // identical to the quit state. And we never want anything pointing
        // to the quit state other than specific transitions derived from the
        // determinizer's configured "quit" bytes.
        //
        // We do put the quit state into 'builder_states' below. This ensures
        // that a proper DFA state ID is allocated for it, and that no other
        // DFA state uses the "location after the DEAD state." That is, it
        // is assumed that the quit state is always the state immediately
        // following the DEAD state.
        cache.insert(dead.clone(), DEAD);

        let runner = Runner {
            config: self.clone(),
            nfa,
            dfa,
            builder_states: alloc::vec![dead, quit],
            cache,
            memory_usage_state: 0,
            sparses: SparseSets::new(nfa.states().len()),
            stack: alloc::vec![],
            scratch_state_builder: StateBuilderEmpty::new(),
        };
        runner.run()
    }

    /// The match semantics to use for determinization.
    ///
    /// MatchKind::All corresponds to the standard textbook construction.
    /// All possible match states are represented in the DFA.
    /// MatchKind::LeftmostFirst permits greediness and otherwise tries to
    /// simulate the match semantics of backtracking regex engines. Namely,
    /// only a subset of match states are built, and dead states are used to
    /// stop searches with an unanchored prefix.
    ///
    /// The default is MatchKind::LeftmostFirst.
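    ///
    /// As a rough illustration (not specific to this module): for the pattern
    /// `Sam|Samwise` searched against the haystack `Samwise`, leftmost-first
    /// semantics stop at the first alternative and report `Sam`, much like a
    /// backtracking engine would, while MatchKind::All retains the match
    /// states needed to also recognize `Samwise`.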
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
        self.match_kind = kind;
        self
    }

    /// The set of bytes to use that will cause the DFA to enter a quit state,
    /// stop searching and return an error. By default, this is empty.
    pub fn quit(&mut self, set: ByteSet) -> &mut Config {
        self.quit = set;
        self
    }

    /// The limit, in bytes of the heap, that the DFA is permitted to use. This
    /// does not include the auxiliary heap storage used by determinization.
    pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
        self.dfa_size_limit = bytes;
        self
    }

    /// The limit, in bytes of the heap, that determinization itself is allowed
    /// to use. This does not include the size of the DFA being built.
    pub fn determinize_size_limit(
        &mut self,
        bytes: Option<usize>,
    ) -> &mut Config {
        self.determinize_size_limit = bytes;
        self
    }
}

/// The actual implementation of determinization that converts an NFA to a DFA
/// through powerset construction.
///
/// This determinizer roughly follows the typical powerset construction, where
/// each DFA state is comprised of one or more NFA states. In the worst case,
/// there is one DFA state for every possible combination of NFA states. In
/// practice, this only happens in certain conditions, typically when there are
/// bounded repetitions.
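/// (A classic illustration, not specific to this crate: a regex like
/// `[01]*1[01]{20}` forces the DFA to remember the last 21 bits it has seen,
/// which requires a number of states exponential in the repetition bound.)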
///
/// The main differences between this implementation and typical
/// determinization are that this implementation delays matches by one state
/// and hackily makes look-around work. Comments below attempt to explain this.
///
/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
/// whichever is shorter.
#[derive(Debug)]
struct Runner<'a> {
    /// The configuration used to initialize determinization.
    config: Config,
    /// The NFA we're converting into a DFA.
    nfa: &'a thompson::NFA,
    /// The DFA we're building.
    dfa: &'a mut dense::OwnedDFA,
    /// Each DFA state being built is defined as an *ordered* set of NFA
    /// states, along with some meta facts about the ordered set of NFA states.
    ///
    /// This is never empty. The first state is always a dummy state such that
    /// a state id == 0 corresponds to a dead state. The second state is always
    /// the quit state.
    ///
    /// Why do we have states in both a `Vec` and in a cache map below?
    /// Well, they serve two different roles based on access patterns.
    /// `builder_states` is the canonical home of each state, and provides
    /// constant random access by a DFA state's ID. The cache map below, on
    /// the other hand, provides a quick way of searching for identical DFA
    /// states by using the DFA state as a key in the map. Of course, we use
    /// reference counting to avoid actually duplicating the state's data
    /// itself. (Although this has never been benchmarked.) Note that the cache
    /// map does not give us full minimization; it just lets us avoid some very
    /// obvious redundant states.
    ///
    /// Note that the index into this Vec isn't quite the DFA's state ID.
    /// Rather, it's just an index. To get the state ID, you have to multiply
    /// it by the DFA's stride. That's done by self.dfa.to_state_id (used
    /// below). And the inverse is self.dfa.to_index.
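    /// For example (with a made-up stride of 16, since the real stride
    /// depends on the DFA's byte classes), the state at index 3 in this Vec
    /// would have DFA state ID 3 * 16 = 48, and to_index would map 48 back
    /// to 3.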
    ///
    /// Moreover, DFA states don't usually retain the IDs assigned to them
    /// by their position in this Vec. After determinization completes,
    /// states are shuffled around to support other optimizations. See the
    /// sibling 'special' module for more details on that. (The reason for
    /// mentioning this is that if you print out the DFA for debugging during
    /// determinization, and then print out the final DFA after it is fully
    /// built, then the state IDs likely won't match up.)
    builder_states: Vec<State>,
    /// A cache of DFA states that already exist and can be easily looked up
    /// via ordered sets of NFA states.
    ///
    /// See `builder_states` docs for why we store states in two different
    /// ways.
    cache: StateMap,
    /// The memory usage, in bytes, used by builder_states and cache. We track
    /// this as new states are added since states use a variable amount of
    /// heap. Tracking this as we add states makes it possible to compute the
    /// total amount of memory used by the determinizer in constant time.
    memory_usage_state: usize,
    /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
    /// These are reused throughout determinization. A bounded sparse set
    /// gives us constant time insertion, membership testing and clearing.
    sparses: SparseSets,
    /// Scratch space for a stack of NFA states to visit, for depth first
    /// visiting without recursion.
    stack: Vec<StateID>,
    /// Scratch space for storing an ordered sequence of NFA states, for
    /// amortizing allocation. This is principally useful for when we avoid
    /// adding a new DFA state since it already exists. In order to detect this
    /// case though, we still need an ordered set of NFA state IDs. So we use
    /// this space to stage that ordered set before we know whether we need to
    /// create a new DFA state or not.
    scratch_state_builder: StateBuilderEmpty,
}

/// A map from states to state identifiers. When using std, we use a standard
/// hashmap, since it's a bit faster for this use case. (Other maps, like
/// ones based on FNV, have not yet been benchmarked.)
///
/// The main purpose of this map is to reuse states where possible. This won't
/// fully minimize the DFA, but it works well in a lot of cases.
#[cfg(feature = "std")]
type StateMap = std::collections::HashMap<State, StateID>;
#[cfg(not(feature = "std"))]
type StateMap = BTreeMap<State, StateID>;

impl<'a> Runner<'a> {
    /// Build the DFA. If there was a problem constructing the DFA (e.g., if
    /// the chosen state identifier representation is too small), then an error
    /// is returned.
    fn run(mut self) -> Result<(), BuildError> {
        if self.nfa.look_set_any().contains_word_unicode()
            && !self.config.quit.contains_range(0x80, 0xFF)
        {
            return Err(BuildError::unsupported_dfa_word_boundary_unicode());
        }

        // A sequence of "representative" bytes drawn from each equivalence
        // class. These representative bytes are fed to the NFA to compute
        // state transitions. This allows us to avoid re-computing state
        // transitions for bytes that are guaranteed to produce identical
        // results. Since computing the representatives needs to do a little
        // work, we do it once here because we'll be iterating over them a lot.
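        //
        // As an illustration (the exact classes depend on the regex that
        // produced this NFA): if every pattern treats the bytes 'a'..='z'
        // identically, those bytes all land in a single equivalence class,
        // and only one representative byte from that class is ever fed to
        // the NFA in the loop below.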
        let representatives: Vec<alphabet::Unit> =
            self.dfa.byte_classes().representatives(..).collect();
        // The set of all DFA state IDs that still need to have their
        // transitions set. We start by seeding this with all starting states.
        let mut uncompiled = alloc::vec![];
        self.add_all_starts(&mut uncompiled)?;
        while let Some(dfa_id) = uncompiled.pop() {
            for &unit in &representatives {
                if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
                {
                    continue;
                }
                // In many cases, the state we transition to has already been
                // computed. 'cached_state' will do the minimal amount of work
                // to check this, and if it exists, immediately return an
                // already existing state ID.
                let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
                self.dfa.set_transition(dfa_id, unit, next_dfa_id);
                // If the state ID we got back is newly created, then we need
                // to compile it, so add it to our uncompiled frontier.
                if is_new {
                    uncompiled.push(next_dfa_id);
                }
            }
        }
        debug!(
            "determinization complete, memory usage: {}, \
             dense DFA size: {}, \
             is reverse? {}",
            self.memory_usage(),
            self.dfa.memory_usage(),
            self.nfa.is_reverse(),
        );

        // A map from DFA state ID to one or more NFA match IDs. Each NFA match
        // ID corresponds to a distinct regex pattern that matches in the state
        // corresponding to the key.
        let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
        self.cache.clear();
        #[cfg(feature = "logging")]
        let mut total_pat_len = 0;
        for (i, state) in self.builder_states.into_iter().enumerate() {
            if let Some(pat_ids) = state.match_pattern_ids() {
                let id = self.dfa.to_state_id(i);
                log! {
                    total_pat_len += pat_ids.len();
                }
                matches.insert(id, pat_ids);
            }
        }
        log! {
            use core::mem::size_of;
            let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
            let pats = total_pat_len * size_of::<PatternID>();
            let mem = (matches.len() * per_elem) + pats;
            log::debug!("matches map built, memory usage: {}", mem);
        }
        // At this point, we shuffle the "special" states in the final DFA.
        // This permits a DFA's match loop to detect a match condition (among
        // other things) by merely inspecting the current state's identifier,
        // and avoids the need for any additional auxiliary storage.
        self.dfa.shuffle(matches)?;
        Ok(())
    }

    /// Return the identifier for the next DFA state given an existing DFA
    /// state and an input byte. If the next DFA state already exists, then
    /// return its identifier from the cache. Otherwise, build the state, cache
    /// it and return its identifier.
    ///
    /// This routine returns a boolean indicating whether a new state was
    /// built. If a new state is built, then the caller needs to add it to its
    /// frontier of uncompiled DFA states to compute transitions for.
    fn cached_state(
        &mut self,
        dfa_id: StateID,
        unit: alphabet::Unit,
    ) -> Result<(StateID, bool), BuildError> {
        // Compute the set of all reachable NFA states, including epsilons.
        let empty_builder = self.get_state_builder();
        let builder = util::determinize::next(
            self.nfa,
            self.config.match_kind,
            &mut self.sparses,
            &mut self.stack,
            &self.builder_states[self.dfa.to_index(dfa_id)],
            unit,
            empty_builder,
        );
        self.maybe_add_state(builder)
    }

    /// Compute the set of DFA start states and add their identifiers to
    /// 'dfa_state_ids' (no duplicates are added).
    fn add_all_starts(
        &mut self,
        dfa_state_ids: &mut Vec<StateID>,
    ) -> Result<(), BuildError> {
        // These should be the first states added.
        assert!(dfa_state_ids.is_empty());
        // We only want to add (un)anchored starting states that are consistent
        // with our DFA's configuration. Unconditionally adding both (although
        // it is the default) can make DFAs quite a bit bigger.
        if self.dfa.start_kind().has_unanchored() {
            self.add_start_group(Anchored::No, dfa_state_ids)?;
        }
        if self.dfa.start_kind().has_anchored() {
            self.add_start_group(Anchored::Yes, dfa_state_ids)?;
        }
        // I previously had an 'assert' here checking that either
        // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it
        // turns out this isn't always true. For example, the NFA might have
        // one or more patterns, but all of those patterns might be just 'fail'
        // states. These will ultimately just compile down to DFA dead states,
        // and since the dead state was added earlier, no new DFA states are
        // added. And thus, it is valid and okay for 'dfa_state_ids' to be
        // empty even if there are a non-zero number of patterns in the NFA.

        // We only need to compute anchored start states for each pattern if it
        // was requested to do so.
        if self.dfa.starts_for_each_pattern() {
            for pid in self.nfa.patterns() {
                self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?;
            }
        }
        Ok(())
    }

    /// Add a group of start states for the given starting configuration. Any
    /// new DFA states added are pushed on to 'dfa_state_ids'. (No duplicates
    /// are pushed.)
    ///
    /// When 'anchored' is Anchored::No, this will compile a group of
    /// unanchored start states (if the DFA is unanchored). When it is
    /// Anchored::Yes, this will compile a group of anchored start states, and
    /// when it is Anchored::Pattern(pid), the group of anchored start states
    /// will only match the given pattern.
    ///
    /// This panics if `anchored` corresponds to an invalid pattern ID.
    fn add_start_group(
        &mut self,
        anchored: Anchored,
        dfa_state_ids: &mut Vec<StateID>,
    ) -> Result<(), BuildError> {
        let nfa_start = match anchored {
            Anchored::No => self.nfa.start_unanchored(),
            Anchored::Yes => self.nfa.start_anchored(),
            Anchored::Pattern(pid) => {
                self.nfa.start_pattern(pid).expect("valid pattern ID")
            }
        };

        // When compiling start states, we're careful not to build additional
        // states that aren't necessary. For example, if the NFA has no word
        // boundary assertion, then there's no reason to have distinct start
        // states for 'NonWordByte' and 'WordByte' starting configurations.
        // Instead, the 'WordByte' starting configuration can just point
        // directly to the start state for the 'NonWordByte' config.
        //
        // Note though that we only need to care about assertions in the prefix
        // of an NFA since this only concerns the starting states. (Actually,
        // the most precise thing we could do is look at the prefix assertions
        // of each pattern when 'anchored == Anchored::Pattern', and then only
        // compile extra states if the prefix is non-empty.) But we settle for
        // simplicity here instead of absolute minimalism. It is somewhat rare,
        // after all, for multiple patterns in the same regex to have different
        // prefix look-arounds.
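        //
        // As a concrete illustration of the reuse below: for a regex whose
        // prefix has no look-around at all (say, plain 'foo'), the branches
        // that follow point every starting configuration at the single
        // 'NonWordByte' start state, so only one new start state is added
        // per mode handled by this call.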

        let (id, is_new) =
            self.add_one_start(nfa_start, Start::NonWordByte)?;
        self.dfa.set_start_state(anchored, Start::NonWordByte, id);
        if is_new {
            dfa_state_ids.push(id);
        }

        if !self.nfa.look_set_prefix_any().contains_word() {
            self.dfa.set_start_state(anchored, Start::WordByte, id);
        } else {
            let (id, is_new) =
                self.add_one_start(nfa_start, Start::WordByte)?;
            self.dfa.set_start_state(anchored, Start::WordByte, id);
            if is_new {
                dfa_state_ids.push(id);
            }
        }
        if !self.nfa.look_set_prefix_any().contains_anchor() {
            self.dfa.set_start_state(anchored, Start::Text, id);
            self.dfa.set_start_state(anchored, Start::LineLF, id);
            self.dfa.set_start_state(anchored, Start::LineCR, id);
            self.dfa.set_start_state(
                anchored,
                Start::CustomLineTerminator,
                id,
            );
        } else {
            let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
            self.dfa.set_start_state(anchored, Start::Text, id);
            if is_new {
                dfa_state_ids.push(id);
            }

            let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?;
            self.dfa.set_start_state(anchored, Start::LineLF, id);
            if is_new {
                dfa_state_ids.push(id);
            }

            let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?;
            self.dfa.set_start_state(anchored, Start::LineCR, id);
            if is_new {
                dfa_state_ids.push(id);
            }

            let (id, is_new) =
                self.add_one_start(nfa_start, Start::CustomLineTerminator)?;
            self.dfa.set_start_state(
                anchored,
                Start::CustomLineTerminator,
                id,
            );
            if is_new {
                dfa_state_ids.push(id);
            }
        }

        Ok(())
    }

    /// Add a new DFA start state corresponding to the given starting NFA
    /// state, and the starting search configuration. (The starting search
    /// configuration essentially tells us which look-behind assertions are
    /// true for this particular state.)
    ///
    /// The boolean returned indicates whether the state ID returned is a newly
    /// created state, or a previously cached state.
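    ///
    /// For example (just to illustrate the point above), a starting
    /// configuration of Start::LineLF says that the position immediately
    /// before where the search begins is a '\n', which is what makes
    /// '(?m:^)' assertions satisfiable in the epsilon closure computed below.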
    fn add_one_start(
        &mut self,
        nfa_start: StateID,
        start: Start,
    ) -> Result<(StateID, bool), BuildError> {
        // Compute the look-behind assertions that are true in this starting
        // configuration, and then determine the epsilon closure. While
        // computing the epsilon closure, we only follow conditional epsilon
        // transitions that satisfy the look-behind assertions in 'look_have'.
        let mut builder_matches = self.get_state_builder().into_matches();
        util::determinize::set_lookbehind_from_start(
            self.nfa,
            &start,
            &mut builder_matches,
        );
        self.sparses.set1.clear();
        util::determinize::epsilon_closure(
            self.nfa,
            nfa_start,
            builder_matches.look_have(),
            &mut self.stack,
            &mut self.sparses.set1,
        );
        let mut builder = builder_matches.into_nfa();
        util::determinize::add_nfa_states(
            &self.nfa,
            &self.sparses.set1,
            &mut builder,
        );
        self.maybe_add_state(builder)
    }

    /// Adds the given state to the DFA being built depending on whether it
    /// already exists in this determinizer's cache.
    ///
    /// If it does exist, then the memory used by 'state' is put back into the
    /// determinizer and the previously created state's ID is returned. (Along
    /// with 'false', indicating that no new state was added.)
    ///
    /// If it does not exist, then the state is added to the DFA being built
    /// and a fresh ID is allocated (if ID allocation fails, then an error is
    /// returned) and returned. (Along with 'true', indicating that a new state
    /// was added.)
    fn maybe_add_state(
        &mut self,
        builder: StateBuilderNFA,
    ) -> Result<(StateID, bool), BuildError> {
        if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
            // Since we have a cached state, put the constructed state's
            // memory back into our scratch space, so that it can be reused.
            self.put_state_builder(builder);
            return Ok((cached_id, false));
        }
        self.add_state(builder).map(|sid| (sid, true))
    }

    /// Add the given state to the DFA and make it available in the cache.
    ///
    /// The state initially has no transitions. That is, it transitions to the
    /// dead state for all possible inputs, and transitions to the quit state
    /// for all quit bytes.
    ///
    /// If adding the state would exceed the maximum value for StateID, then an
    /// error is returned.
    fn add_state(
        &mut self,
        builder: StateBuilderNFA,
    ) -> Result<StateID, BuildError> {
        let id = self.dfa.add_empty_state()?;
        if !self.config.quit.is_empty() {
            for b in self.config.quit.iter() {
                self.dfa.set_transition(
                    id,
                    alphabet::Unit::u8(b),
                    self.dfa.quit_id(),
                );
            }
        }
        let state = builder.to_state();
        // States use reference counting internally, so we only need to count
        // their memory usage once.
        self.memory_usage_state += state.memory_usage();
        self.builder_states.push(state.clone());
        self.cache.insert(state, id);
        self.put_state_builder(builder);
        if let Some(limit) = self.config.dfa_size_limit {
            if self.dfa.memory_usage() > limit {
                return Err(BuildError::dfa_exceeded_size_limit(limit));
            }
        }
        if let Some(limit) = self.config.determinize_size_limit {
            if self.memory_usage() > limit {
                return Err(BuildError::determinize_exceeded_size_limit(
                    limit,
                ));
            }
        }
        Ok(id)
    }

    /// Returns a state builder from this determinizer that might have existing
    /// capacity. This helps avoid allocs in cases where a state is built that
    /// turns out to already be cached.
    ///
    /// Callers must put the state builder back with 'put_state_builder',
    /// otherwise the allocation reuse won't work.
    fn get_state_builder(&mut self) -> StateBuilderEmpty {
        core::mem::replace(
            &mut self.scratch_state_builder,
            StateBuilderEmpty::new(),
        )
    }

    /// Puts the given state builder back into this determinizer for reuse.
    ///
    /// Note that building a 'State' from a builder always creates a new
    /// alloc, so callers should always put the builder back.
    fn put_state_builder(&mut self, builder: StateBuilderNFA) {
        let _ = core::mem::replace(
            &mut self.scratch_state_builder,
            builder.clear(),
        );
    }

    /// Return the memory usage, in bytes, of this determinizer at the current
    /// point in time. This does not include memory used by the NFA or the
    /// dense DFA itself.
    fn memory_usage(&self) -> usize {
        use core::mem::size_of;

        self.builder_states.len() * size_of::<State>()
            // Maps likely use more memory than this, but it's probably close.
            + self.cache.len() * (size_of::<State>() + size_of::<StateID>())
            + self.memory_usage_state
            + self.stack.capacity() * size_of::<StateID>()
            + self.scratch_state_builder.capacity()
    }
}