1/*!
2Types and routines specific to lazy DFAs.
3
4This module is the home of [`hybrid::dfa::DFA`](DFA).
5
6This module also contains a [`hybrid::dfa::Builder`](Builder) and a
7[`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA.
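
A quick example, mirroring the one on [`DFA`] below:

```
use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};

let dfa = DFA::new("foo[0-9]+")?;
let mut cache = dfa.create_cache();

let expected = Some(HalfMatch::must(0, 8));
assert_eq!(
    expected,
    dfa.try_search_fwd(&mut cache, &Input::new("foo12345"))?,
);
# Ok::<(), Box<dyn std::error::Error>>(())
```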
8*/
9
10use core::{iter, mem::size_of};
11
12use alloc::vec::Vec;
13
14use crate::{
15 hybrid::{
16 error::{BuildError, CacheError, StartError},
17 id::{LazyStateID, LazyStateIDError},
18 search,
19 },
20 nfa::thompson,
21 util::{
22 alphabet::{self, ByteClasses, ByteSet},
23 determinize::{self, State, StateBuilderEmpty, StateBuilderNFA},
24 empty,
25 prefilter::Prefilter,
26 primitives::{PatternID, StateID as NFAStateID},
27 search::{
28 Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet,
29 },
30 sparse_set::SparseSets,
31 start::{self, Start, StartByteMap},
32 },
33};
34
35/// The minimum number of states that a lazy DFA's cache size must support.
36///
37/// This is checked at time of construction to ensure that at least some small
38/// number of states can fit in the given capacity allotment. If we can't fit
39/// at least this number of states, then the thinking is that it's pretty
40/// senseless to use the lazy DFA. More to the point, parts of the code do
41/// assume that the cache can fit at least some small number of states.
42const MIN_STATES: usize = SENTINEL_STATES + 2;
43
44/// The number of "sentinel" states that get added to every lazy DFA.
45///
46/// These are special states indicating status conditions of a search: unknown,
47/// dead and quit. These states in particular also use zero NFA states, so
48/// their memory usage is quite small. This is relevant for computing the
49/// minimum memory needed for a lazy DFA cache.
50const SENTINEL_STATES: usize = 3;
51
52/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching.
53///
54/// A lazy DFA is a DFA that builds itself at search time. It otherwise has
55/// very similar characteristics as a [`dense::DFA`](crate::dfa::dense::DFA).
56/// Indeed, both support precisely the same regex features with precisely the
57/// same semantics.
58///
/// Whereas a `dense::DFA` must be completely built to handle any input before
60/// it may be used for search, a lazy DFA starts off effectively empty. During
61/// a search, a lazy DFA will build itself depending on whether it has already
62/// computed the next transition or not. If it has, then it looks a lot like
63/// a `dense::DFA` internally: it does a very fast table based access to find
64/// the next transition. Otherwise, if the state hasn't been computed, then it
65/// does determinization _for that specific transition_ to compute the next DFA
66/// state.
67///
68/// The main selling point of a lazy DFA is that, in practice, it has
69/// the performance profile of a `dense::DFA` without the weakness of it
70/// taking worst case exponential time to build. Indeed, for each byte of
/// input, the lazy DFA will construct at most one new DFA state. Thus, a
72/// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~
73/// pattern.len()` and `n ~ haystack.len()`).
74///
75/// The main downsides of a lazy DFA are:
76///
77/// 1. It requires mutable "cache" space during search. This is where the
78/// transition table, among other things, is stored.
79/// 2. In pathological cases (e.g., if the cache is too small), it will run
80/// out of room and either require a bigger cache capacity or will repeatedly
81/// clear the cache and thus repeatedly regenerate DFA states. Overall, this
82/// will tend to be slower than a typical NFA simulation.
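///
/// Both of these knobs can be adjusted via [`Config`]. The following is a
/// minimal sketch (the `cache_capacity` and `minimum_cache_clear_count`
/// options shown here are existing [`Config`] settings):
///
/// ```
/// use regex_automata::hybrid::dfa::DFA;
///
/// let dfa = DFA::builder()
///     .configure(DFA::config()
///         // Give the transition table cache more room.
///         .cache_capacity(10 * (1 << 20))
///         // Return an error if the cache must be cleared too many times.
///         .minimum_cache_clear_count(Some(5)))
///     .build("foo[0-9]+")?;
/// let _cache = dfa.create_cache();
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```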
83///
84/// # Capabilities
85///
86/// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following
87/// operations:
88///
89/// 1. Detection of a match.
90/// 2. Location of the end of a match.
91/// 3. In the case of a lazy DFA with multiple patterns, which pattern matched
92/// is reported as well.
93///
94/// A notable absence from the above list of capabilities is the location of
95/// the *start* of a match. In order to provide both the start and end of
96/// a match, *two* lazy DFAs are required. This functionality is provided by a
97/// [`Regex`](crate::hybrid::regex::Regex).
98///
99/// # Example
100///
101/// This shows how to build a lazy DFA with the default configuration and
102/// execute a search. Notice how, in contrast to a `dense::DFA`, we must create
103/// a cache and pass it to our search routine.
104///
105/// ```
106/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
107///
108/// let dfa = DFA::new("foo[0-9]+")?;
109/// let mut cache = dfa.create_cache();
110///
111/// let expected = Some(HalfMatch::must(0, 8));
112/// assert_eq!(expected, dfa.try_search_fwd(
113/// &mut cache, &Input::new("foo12345"))?,
114/// );
115/// # Ok::<(), Box<dyn std::error::Error>>(())
116/// ```
117#[derive(Clone, Debug)]
118pub struct DFA {
119 config: Config,
120 nfa: thompson::NFA,
121 stride2: usize,
122 start_map: StartByteMap,
123 classes: ByteClasses,
124 quitset: ByteSet,
125 cache_capacity: usize,
126}
127
128impl DFA {
129 /// Parse the given regular expression using a default configuration and
130 /// return the corresponding lazy DFA.
131 ///
132 /// If you want a non-default configuration, then use the [`Builder`] to
133 /// set your own configuration.
134 ///
135 /// # Example
136 ///
137 /// ```
138 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
139 ///
140 /// let dfa = DFA::new("foo[0-9]+bar")?;
141 /// let mut cache = dfa.create_cache();
142 ///
143 /// let expected = HalfMatch::must(0, 11);
144 /// assert_eq!(
145 /// Some(expected),
146 /// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
147 /// );
148 /// # Ok::<(), Box<dyn std::error::Error>>(())
149 /// ```
150 #[cfg(feature = "syntax")]
151 pub fn new(pattern: &str) -> Result<DFA, BuildError> {
152 DFA::builder().build(pattern)
153 }
154
155 /// Parse the given regular expressions using a default configuration and
156 /// return the corresponding lazy multi-DFA.
157 ///
158 /// If you want a non-default configuration, then use the [`Builder`] to
159 /// set your own configuration.
160 ///
161 /// # Example
162 ///
163 /// ```
164 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
165 ///
166 /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?;
167 /// let mut cache = dfa.create_cache();
168 ///
169 /// let expected = HalfMatch::must(1, 3);
170 /// assert_eq!(
171 /// Some(expected),
172 /// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
173 /// );
174 /// # Ok::<(), Box<dyn std::error::Error>>(())
175 /// ```
176 #[cfg(feature = "syntax")]
177 pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
178 DFA::builder().build_many(patterns)
179 }
180
181 /// Create a new lazy DFA that matches every input.
182 ///
183 /// # Example
184 ///
185 /// ```
186 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
187 ///
188 /// let dfa = DFA::always_match()?;
189 /// let mut cache = dfa.create_cache();
190 ///
191 /// let expected = HalfMatch::must(0, 0);
192 /// assert_eq!(Some(expected), dfa.try_search_fwd(
193 /// &mut cache, &Input::new(""))?,
194 /// );
195 /// assert_eq!(Some(expected), dfa.try_search_fwd(
196 /// &mut cache, &Input::new("foo"))?,
197 /// );
198 /// # Ok::<(), Box<dyn std::error::Error>>(())
199 /// ```
200 pub fn always_match() -> Result<DFA, BuildError> {
201 let nfa = thompson::NFA::always_match();
202 Builder::new().build_from_nfa(nfa)
203 }
204
205 /// Create a new lazy DFA that never matches any input.
206 ///
207 /// # Example
208 ///
209 /// ```
210 /// use regex_automata::{hybrid::dfa::DFA, Input};
211 ///
212 /// let dfa = DFA::never_match()?;
213 /// let mut cache = dfa.create_cache();
214 ///
215 /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new(""))?);
216 /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new("foo"))?);
217 /// # Ok::<(), Box<dyn std::error::Error>>(())
218 /// ```
219 pub fn never_match() -> Result<DFA, BuildError> {
220 let nfa = thompson::NFA::never_match();
221 Builder::new().build_from_nfa(nfa)
222 }
223
224 /// Return a default configuration for a `DFA`.
225 ///
226 /// This is a convenience routine to avoid needing to import the [`Config`]
227 /// type when customizing the construction of a lazy DFA.
228 ///
229 /// # Example
230 ///
231 /// This example shows how to build a lazy DFA that heuristically supports
232 /// Unicode word boundaries.
233 ///
234 /// ```
235 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
236 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError, Input};
237 ///
238 /// let re = DFA::builder()
239 /// .configure(DFA::config().unicode_word_boundary(true))
240 /// .build(r"\b\w+\b")?;
241 /// let mut cache = re.create_cache();
242 ///
    /// // Since our haystack is all ASCII, the DFA search sees that and knows
244 /// // it is legal to interpret Unicode word boundaries as ASCII word
245 /// // boundaries.
246 /// let input = Input::new("!!foo!!");
247 /// let expected = HalfMatch::must(0, 5);
248 /// assert_eq!(Some(expected), re.try_search_fwd(&mut cache, &input)?);
249 ///
250 /// // But if our haystack contains non-ASCII, then the search will fail
251 /// // with an error.
252 /// let input = Input::new("!!βββ!!");
253 /// let expected = MatchError::quit(b'\xCE', 2);
254 /// assert_eq!(Err(expected), re.try_search_fwd(&mut cache, &input));
255 ///
256 /// # Ok::<(), Box<dyn std::error::Error>>(())
257 /// ```
258 pub fn config() -> Config {
259 Config::new()
260 }
261
    /// Return a builder for configuring the construction of a lazy DFA.
263 ///
264 /// This is a convenience routine to avoid needing to import the
265 /// [`Builder`] type in common cases.
266 ///
267 /// # Example
268 ///
269 /// This example shows how to use the builder to disable UTF-8 mode
270 /// everywhere for lazy DFAs.
271 ///
272 /// ```
273 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
274 /// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input};
275 ///
276 /// let re = DFA::builder()
277 /// .syntax(syntax::Config::new().utf8(false))
278 /// .build(r"foo(?-u:[^b])ar.*")?;
279 /// let mut cache = re.create_cache();
280 ///
281 /// let input = Input::new(b"\xFEfoo\xFFarzz\xE2\x98\xFF\n");
282 /// let expected = Some(HalfMatch::must(0, 9));
283 /// let got = re.try_search_fwd(&mut cache, &input)?;
284 /// assert_eq!(expected, got);
285 ///
286 /// # Ok::<(), Box<dyn std::error::Error>>(())
287 /// ```
288 pub fn builder() -> Builder {
289 Builder::new()
290 }
291
292 /// Create a new cache for this lazy DFA.
293 ///
294 /// The cache returned should only be used for searches for this
295 /// lazy DFA. If you want to reuse the cache for another DFA, then
296 /// you must call [`Cache::reset`] with that DFA (or, equivalently,
297 /// [`DFA::reset_cache`]).
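    ///
    /// # Example
    ///
    /// A minimal sketch mirroring the search example on [`DFA`]: create a
    /// cache once, then reuse it for searches with this DFA.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// let dfa = DFA::new("foo[0-9]+")?;
    /// let mut cache = dfa.create_cache();
    ///
    /// assert_eq!(
    ///     Some(HalfMatch::must(0, 8)),
    ///     dfa.try_search_fwd(&mut cache, &Input::new("foo12345"))?,
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```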
298 pub fn create_cache(&self) -> Cache {
299 Cache::new(self)
300 }
301
    /// Reset the given cache such that it can be used for searching with
    /// this lazy DFA (and only this DFA).
304 ///
305 /// A cache reset permits reusing memory already allocated in this cache
306 /// with a different lazy DFA.
307 ///
308 /// Resetting a cache sets its "clear count" to 0. This is relevant if the
309 /// lazy DFA has been configured to "give up" after it has cleared the
310 /// cache a certain number of times.
311 ///
312 /// Any lazy state ID generated by the cache prior to resetting it is
313 /// invalid after the reset.
314 ///
315 /// # Example
316 ///
317 /// This shows how to re-purpose a cache for use with a different DFA.
318 ///
319 /// ```
320 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
321 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
322 ///
323 /// let dfa1 = DFA::new(r"\w")?;
324 /// let dfa2 = DFA::new(r"\W")?;
325 ///
326 /// let mut cache = dfa1.create_cache();
327 /// assert_eq!(
328 /// Some(HalfMatch::must(0, 2)),
329 /// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
330 /// );
331 ///
332 /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
333 /// // incorrect results. In order to re-purpose the cache, we must reset
334 /// // it with the DFA we'd like to use it with.
335 /// //
336 /// // Similarly, after this reset, using the cache with 'dfa1' is also not
337 /// // allowed.
338 /// dfa2.reset_cache(&mut cache);
339 /// assert_eq!(
340 /// Some(HalfMatch::must(0, 3)),
341 /// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
342 /// );
343 ///
344 /// # Ok::<(), Box<dyn std::error::Error>>(())
345 /// ```
346 pub fn reset_cache(&self, cache: &mut Cache) {
347 Lazy::new(self, cache).reset_cache()
348 }
349
350 /// Returns the total number of patterns compiled into this lazy DFA.
351 ///
352 /// In the case of a DFA that contains no patterns, this returns `0`.
353 ///
354 /// # Example
355 ///
356 /// This example shows the pattern length for a DFA that never matches:
357 ///
358 /// ```
359 /// use regex_automata::hybrid::dfa::DFA;
360 ///
361 /// let dfa = DFA::never_match()?;
362 /// assert_eq!(dfa.pattern_len(), 0);
363 /// # Ok::<(), Box<dyn std::error::Error>>(())
364 /// ```
365 ///
366 /// And another example for a DFA that matches at every position:
367 ///
368 /// ```
369 /// use regex_automata::hybrid::dfa::DFA;
370 ///
371 /// let dfa = DFA::always_match()?;
372 /// assert_eq!(dfa.pattern_len(), 1);
373 /// # Ok::<(), Box<dyn std::error::Error>>(())
374 /// ```
375 ///
376 /// And finally, a DFA that was constructed from multiple patterns:
377 ///
378 /// ```
379 /// use regex_automata::hybrid::dfa::DFA;
380 ///
381 /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
382 /// assert_eq!(dfa.pattern_len(), 3);
383 /// # Ok::<(), Box<dyn std::error::Error>>(())
384 /// ```
385 pub fn pattern_len(&self) -> usize {
386 self.nfa.pattern_len()
387 }
388
389 /// Returns the equivalence classes that make up the alphabet for this DFA.
390 ///
391 /// Unless [`Config::byte_classes`] was disabled, it is possible that
392 /// multiple distinct bytes are grouped into the same equivalence class
393 /// if it is impossible for them to discriminate between a match and a
394 /// non-match. This has the effect of reducing the overall alphabet size
395 /// and in turn potentially substantially reducing the size of the DFA's
396 /// transition table.
397 ///
398 /// The downside of using equivalence classes like this is that every state
399 /// transition will automatically use this map to convert an arbitrary
400 /// byte to its corresponding equivalence class. In practice this has a
401 /// negligible impact on performance.
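    ///
    /// # Example
    ///
    /// A minimal sketch (assuming the `alphabet_len` accessor on
    /// [`ByteClasses`]): grouping bytes into equivalence classes keeps the
    /// alphabet, and thus each state's transition row, at or below 257
    /// units (256 possible bytes plus the special end-of-input sentinel).
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::new("[a-z]+")?;
    /// assert!(dfa.byte_classes().alphabet_len() <= 257);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```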
402 pub fn byte_classes(&self) -> &ByteClasses {
403 &self.classes
404 }
405
406 /// Returns this lazy DFA's configuration.
407 pub fn get_config(&self) -> &Config {
408 &self.config
409 }
410
411 /// Returns a reference to the underlying NFA.
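    ///
    /// # Example
    ///
    /// A small sketch: the number of patterns reported by the lazy DFA is
    /// the number of patterns in the underlying NFA.
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?;
    /// assert_eq!(dfa.get_nfa().pattern_len(), dfa.pattern_len());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```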
412 pub fn get_nfa(&self) -> &thompson::NFA {
413 &self.nfa
414 }
415
416 /// Returns the stride, as a base-2 exponent, required for these
417 /// equivalence classes.
418 ///
419 /// The stride is always the smallest power of 2 that is greater than or
420 /// equal to the alphabet length. This is done so that converting between
421 /// state IDs and indices can be done with shifts alone, which is much
422 /// faster than integer division.
423 fn stride2(&self) -> usize {
424 self.stride2
425 }
426
427 /// Returns the total stride for every state in this lazy DFA. This
428 /// corresponds to the total number of transitions used by each state in
429 /// this DFA's transition table.
430 fn stride(&self) -> usize {
431 1 << self.stride2()
432 }
433
434 /// Returns the memory usage, in bytes, of this lazy DFA.
435 ///
436 /// This does **not** include the stack size used up by this lazy DFA. To
437 /// compute that, use `std::mem::size_of::<DFA>()`. This also does not
438 /// include the size of the `Cache` used.
439 ///
440 /// This also does not include any heap memory used by the NFA inside of
441 /// this hybrid NFA/DFA. This is because the NFA's ownership is shared, and
442 /// thus not owned by this hybrid NFA/DFA. More practically, several regex
443 /// engines in this crate embed an NFA, and reporting the NFA's memory
444 /// usage in all of them would likely result in reporting higher heap
445 /// memory than is actually used.
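    ///
    /// # Example
    ///
    /// A minimal sketch: since the transition table lives in the [`Cache`]
    /// (see `Cache::memory_usage`) and the NFA is excluded as described
    /// above, the lazy DFA itself reports no heap usage of its own.
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::new("foo[0-9]+")?;
    /// assert_eq!(dfa.memory_usage(), 0);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```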
446 pub fn memory_usage(&self) -> usize {
447 // The only thing that uses heap memory in a DFA is the NFA. But the
448 // NFA has shared ownership, so reporting its memory as part of the
449 // hybrid DFA is likely to lead to double-counting the NFA memory
450 // somehow. In particular, this DFA does not really own an NFA, so
451 // including it in the DFA's memory usage doesn't seem semantically
452 // correct.
453 0
454 }
455}
456
457impl DFA {
458 /// Executes a forward search and returns the end position of the leftmost
459 /// match that is found. If no match exists, then `None` is returned.
460 ///
461 /// In particular, this method continues searching even after it enters
462 /// a match state. The search only terminates once it has reached the
463 /// end of the input or when it has entered a dead or quit state. Upon
464 /// termination, the position of the last byte seen while still in a match
465 /// state is returned.
466 ///
467 /// # Errors
468 ///
469 /// This routine errors if the search could not complete. This can occur
470 /// in a number of circumstances:
471 ///
472 /// * The configuration of the lazy DFA may permit it to "quit" the search.
473 /// For example, setting quit bytes or enabling heuristic support for
474 /// Unicode word boundaries. The default configuration does not enable any
475 /// option that could result in the lazy DFA quitting.
476 /// * The configuration of the lazy DFA may also permit it to "give up"
477 /// on a search if it makes ineffective use of its transition table
    /// cache. The default configuration does not enable this, although it
    /// is typically a good idea to do so.
480 /// * When the provided `Input` configuration is not supported. For
481 /// example, by providing an unsupported anchor mode.
482 ///
483 /// When a search returns an error, callers cannot know whether a match
484 /// exists or not.
485 ///
486 /// # Example
487 ///
488 /// This example shows how to run a basic search.
489 ///
490 /// ```
491 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
492 ///
493 /// let dfa = DFA::new("foo[0-9]+")?;
494 /// let mut cache = dfa.create_cache();
495 /// let expected = HalfMatch::must(0, 8);
496 /// assert_eq!(Some(expected), dfa.try_search_fwd(
497 /// &mut cache, &Input::new("foo12345"))?,
498 /// );
499 ///
500 /// // Even though a match is found after reading the first byte (`a`),
501 /// // the leftmost first match semantics demand that we find the earliest
502 /// // match that prefers earlier parts of the pattern over later parts.
503 /// let dfa = DFA::new("abc|a")?;
504 /// let mut cache = dfa.create_cache();
505 /// let expected = HalfMatch::must(0, 3);
506 /// assert_eq!(Some(expected), dfa.try_search_fwd(
507 /// &mut cache, &Input::new("abc"))?,
508 /// );
509 ///
510 /// # Ok::<(), Box<dyn std::error::Error>>(())
511 /// ```
512 ///
513 /// # Example: specific pattern search
514 ///
515 /// This example shows how to build a lazy multi-DFA that permits searching
516 /// for specific patterns.
517 ///
518 /// ```
519 /// use regex_automata::{
520 /// hybrid::dfa::DFA,
521 /// Anchored, HalfMatch, PatternID, Input,
522 /// };
523 ///
524 /// let dfa = DFA::builder()
525 /// .configure(DFA::config().starts_for_each_pattern(true))
526 /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
527 /// let mut cache = dfa.create_cache();
528 /// let haystack = "foo123";
529 ///
530 /// // Since we are using the default leftmost-first match and both
531 /// // patterns match at the same starting position, only the first pattern
532 /// // will be returned in this case when doing a search for any of the
533 /// // patterns.
534 /// let expected = Some(HalfMatch::must(0, 6));
535 /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
536 /// assert_eq!(expected, got);
537 ///
538 /// // But if we want to check whether some other pattern matches, then we
539 /// // can provide its pattern ID.
540 /// let expected = Some(HalfMatch::must(1, 6));
541 /// let input = Input::new(haystack)
542 /// .anchored(Anchored::Pattern(PatternID::must(1)));
543 /// let got = dfa.try_search_fwd(&mut cache, &input)?;
544 /// assert_eq!(expected, got);
545 ///
546 /// # Ok::<(), Box<dyn std::error::Error>>(())
547 /// ```
548 ///
549 /// # Example: specifying the bounds of a search
550 ///
551 /// This example shows how providing the bounds of a search can produce
552 /// different results than simply sub-slicing the haystack.
553 ///
554 /// ```
555 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
556 ///
557 /// // N.B. We disable Unicode here so that we use a simple ASCII word
558 /// // boundary. Alternatively, we could enable heuristic support for
559 /// // Unicode word boundaries since our haystack is pure ASCII.
560 /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?;
561 /// let mut cache = dfa.create_cache();
562 /// let haystack = "foo123bar";
563 ///
564 /// // Since we sub-slice the haystack, the search doesn't know about the
565 /// // larger context and assumes that `123` is surrounded by word
566 /// // boundaries. And of course, the match position is reported relative
567 /// // to the sub-slice as well, which means we get `3` instead of `6`.
568 /// let expected = Some(HalfMatch::must(0, 3));
569 /// let got = dfa.try_search_fwd(
570 /// &mut cache,
571 /// &Input::new(&haystack[3..6]),
572 /// )?;
573 /// assert_eq!(expected, got);
574 ///
575 /// // But if we provide the bounds of the search within the context of the
576 /// // entire haystack, then the search can take the surrounding context
577 /// // into account. (And if we did find a match, it would be reported
578 /// // as a valid offset into `haystack` instead of its sub-slice.)
579 /// let expected = None;
580 /// let got = dfa.try_search_fwd(
581 /// &mut cache,
582 /// &Input::new(haystack).range(3..6),
583 /// )?;
584 /// assert_eq!(expected, got);
585 ///
586 /// # Ok::<(), Box<dyn std::error::Error>>(())
587 /// ```
588 #[inline]
589 pub fn try_search_fwd(
590 &self,
591 cache: &mut Cache,
592 input: &Input<'_>,
593 ) -> Result<Option<HalfMatch>, MatchError> {
594 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
595 let hm = match search::find_fwd(self, cache, input)? {
596 None => return Ok(None),
597 Some(hm) if !utf8empty => return Ok(Some(hm)),
598 Some(hm) => hm,
599 };
600 // We get to this point when we know our DFA can match the empty string
601 // AND when UTF-8 mode is enabled. In this case, we skip any matches
602 // whose offset splits a codepoint. Such a match is necessarily a
603 // zero-width match, because UTF-8 mode requires the underlying NFA
604 // to be built such that all non-empty matches span valid UTF-8.
605 // Therefore, any match that ends in the middle of a codepoint cannot
606 // be part of a span of valid UTF-8 and thus must be an empty match.
607 // In such cases, we skip it, so as not to report matches that split a
608 // codepoint.
609 //
610 // Note that this is not a checked assumption. Callers *can* provide an
611 // NFA with UTF-8 mode enabled but produces non-empty matches that span
612 // invalid UTF-8. But doing so is documented to result in unspecified
613 // behavior.
614 empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
615 let got = search::find_fwd(self, cache, input)?;
616 Ok(got.map(|hm| (hm, hm.offset())))
617 })
618 }
619
    /// Executes a reverse search and returns the start position of the
    /// leftmost match that is found. If no match exists, then `None` is
    /// returned.
623 ///
624 /// # Errors
625 ///
626 /// This routine errors if the search could not complete. This can occur
627 /// in a number of circumstances:
628 ///
629 /// * The configuration of the lazy DFA may permit it to "quit" the search.
630 /// For example, setting quit bytes or enabling heuristic support for
631 /// Unicode word boundaries. The default configuration does not enable any
632 /// option that could result in the lazy DFA quitting.
633 /// * The configuration of the lazy DFA may also permit it to "give up"
634 /// on a search if it makes ineffective use of its transition table
    /// cache. The default configuration does not enable this, although it
    /// is typically a good idea to do so.
637 /// * When the provided `Input` configuration is not supported. For
638 /// example, by providing an unsupported anchor mode.
639 ///
640 /// When a search returns an error, callers cannot know whether a match
641 /// exists or not.
642 ///
643 /// # Example
644 ///
645 /// This routine is principally useful when used in
646 /// conjunction with the
647 /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
648 /// configuration. In general, it's unlikely to be correct to use both
649 /// `try_search_fwd` and `try_search_rev` with the same DFA since any
650 /// particular DFA will only support searching in one direction with
651 /// respect to the pattern.
652 ///
653 /// ```
654 /// use regex_automata::{
655 /// nfa::thompson,
656 /// hybrid::dfa::DFA,
657 /// HalfMatch, Input,
658 /// };
659 ///
660 /// let dfa = DFA::builder()
661 /// .thompson(thompson::Config::new().reverse(true))
662 /// .build("foo[0-9]+")?;
663 /// let mut cache = dfa.create_cache();
664 /// let expected = HalfMatch::must(0, 0);
665 /// assert_eq!(
666 /// Some(expected),
667 /// dfa.try_search_rev(&mut cache, &Input::new("foo12345"))?,
668 /// );
669 ///
670 /// // Even though a match is found after reading the last byte (`c`),
671 /// // the leftmost first match semantics demand that we find the earliest
    /// // match that prefers earlier parts of the pattern over later parts.
673 /// let dfa = DFA::builder()
674 /// .thompson(thompson::Config::new().reverse(true))
675 /// .build("abc|c")?;
676 /// let mut cache = dfa.create_cache();
677 /// let expected = HalfMatch::must(0, 0);
678 /// assert_eq!(Some(expected), dfa.try_search_rev(
679 /// &mut cache, &Input::new("abc"))?,
680 /// );
681 ///
682 /// # Ok::<(), Box<dyn std::error::Error>>(())
683 /// ```
684 ///
685 /// # Example: UTF-8 mode
686 ///
    /// This example demonstrates that UTF-8 mode applies to reverse
688 /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
689 /// matches reported must correspond to valid UTF-8 spans. This includes
690 /// prohibiting zero-width matches that split a codepoint.
691 ///
692 /// UTF-8 mode is enabled by default. Notice below how the only zero-width
693 /// matches reported are those at UTF-8 boundaries:
694 ///
695 /// ```
696 /// use regex_automata::{
697 /// hybrid::dfa::DFA,
698 /// nfa::thompson,
699 /// HalfMatch, Input, MatchKind,
700 /// };
701 ///
702 /// let dfa = DFA::builder()
703 /// .thompson(thompson::Config::new().reverse(true))
704 /// .build(r"")?;
705 /// let mut cache = dfa.create_cache();
706 ///
707 /// // Run the reverse DFA to collect all matches.
708 /// let mut input = Input::new("☃");
709 /// let mut matches = vec![];
710 /// loop {
711 /// match dfa.try_search_rev(&mut cache, &input)? {
712 /// None => break,
713 /// Some(hm) => {
714 /// matches.push(hm);
715 /// if hm.offset() == 0 || input.end() == 0 {
716 /// break;
717 /// } else if hm.offset() < input.end() {
718 /// input.set_end(hm.offset());
719 /// } else {
720 /// // This is only necessary to handle zero-width
721 /// // matches, which of course occur in this example.
722 /// // Without this, the search would never advance
723 /// // backwards beyond the initial match.
724 /// input.set_end(input.end() - 1);
725 /// }
726 /// }
727 /// }
728 /// }
729 ///
730 /// // No matches split a codepoint.
731 /// let expected = vec![
732 /// HalfMatch::must(0, 3),
733 /// HalfMatch::must(0, 0),
734 /// ];
735 /// assert_eq!(expected, matches);
736 ///
737 /// # Ok::<(), Box<dyn std::error::Error>>(())
738 /// ```
739 ///
740 /// Now let's look at the same example, but with UTF-8 mode on the
741 /// underlying NFA disabled:
742 ///
743 /// ```
744 /// use regex_automata::{
745 /// hybrid::dfa::DFA,
746 /// nfa::thompson,
747 /// HalfMatch, Input, MatchKind,
748 /// };
749 ///
750 /// let dfa = DFA::builder()
751 /// .thompson(thompson::Config::new().reverse(true).utf8(false))
752 /// .build(r"")?;
753 /// let mut cache = dfa.create_cache();
754 ///
755 /// // Run the reverse DFA to collect all matches.
756 /// let mut input = Input::new("☃");
757 /// let mut matches = vec![];
758 /// loop {
759 /// match dfa.try_search_rev(&mut cache, &input)? {
760 /// None => break,
761 /// Some(hm) => {
762 /// matches.push(hm);
763 /// if hm.offset() == 0 || input.end() == 0 {
764 /// break;
765 /// } else if hm.offset() < input.end() {
766 /// input.set_end(hm.offset());
767 /// } else {
768 /// // This is only necessary to handle zero-width
769 /// // matches, which of course occur in this example.
770 /// // Without this, the search would never advance
771 /// // backwards beyond the initial match.
772 /// input.set_end(input.end() - 1);
773 /// }
774 /// }
775 /// }
776 /// }
777 ///
    /// // Now matches are reported at every position, even within a
    /// // codepoint, because we lifted the requirement that matches
    /// // correspond to valid UTF-8 spans.
779 /// let expected = vec![
780 /// HalfMatch::must(0, 3),
781 /// HalfMatch::must(0, 2),
782 /// HalfMatch::must(0, 1),
783 /// HalfMatch::must(0, 0),
784 /// ];
785 /// assert_eq!(expected, matches);
786 ///
787 /// # Ok::<(), Box<dyn std::error::Error>>(())
788 /// ```
789 #[inline]
790 pub fn try_search_rev(
791 &self,
792 cache: &mut Cache,
793 input: &Input<'_>,
794 ) -> Result<Option<HalfMatch>, MatchError> {
795 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
796 let hm = match search::find_rev(self, cache, input)? {
797 None => return Ok(None),
798 Some(hm) if !utf8empty => return Ok(Some(hm)),
799 Some(hm) => hm,
800 };
801 empty::skip_splits_rev(input, hm, hm.offset(), |input| {
802 let got = search::find_rev(self, cache, input)?;
803 Ok(got.map(|hm| (hm, hm.offset())))
804 })
805 }
806
    /// Executes an overlapping forward search. Matches, as they are found,
    /// are made available via [`OverlappingState::get_match`] on the given
    /// `state`; the reported offset is the end of the match. When no
    /// further match exists, `get_match` returns `None`.
809 ///
810 /// This routine is principally only useful when searching for multiple
811 /// patterns on inputs where multiple patterns may match the same regions
812 /// of text. In particular, callers must preserve the automaton's search
813 /// state from prior calls so that the implementation knows where the last
814 /// match occurred.
815 ///
816 /// When using this routine to implement an iterator of overlapping
817 /// matches, the `start` of the search should remain invariant throughout
818 /// iteration. The `OverlappingState` given to the search will keep track
819 /// of the current position of the search. (This is because multiple
820 /// matches may be reported at the same position, so only the search
821 /// implementation itself knows when to advance the position.)
822 ///
823 /// If for some reason you want the search to forget about its previous
824 /// state and restart the search at a particular position, then setting the
825 /// state to [`OverlappingState::start`] will accomplish that.
826 ///
827 /// # Errors
828 ///
829 /// This routine errors if the search could not complete. This can occur
830 /// in a number of circumstances:
831 ///
832 /// * The configuration of the lazy DFA may permit it to "quit" the search.
833 /// For example, setting quit bytes or enabling heuristic support for
834 /// Unicode word boundaries. The default configuration does not enable any
835 /// option that could result in the lazy DFA quitting.
836 /// * The configuration of the lazy DFA may also permit it to "give up"
837 /// on a search if it makes ineffective use of its transition table
    /// cache. The default configuration does not enable this, although it
    /// is typically a good idea to do so.
840 /// * When the provided `Input` configuration is not supported. For
841 /// example, by providing an unsupported anchor mode.
842 ///
843 /// When a search returns an error, callers cannot know whether a match
844 /// exists or not.
845 ///
846 /// # Example
847 ///
848 /// This example shows how to run a basic overlapping search. Notice
849 /// that we build the automaton with a `MatchKind::All` configuration.
850 /// Overlapping searches are unlikely to work as one would expect when
851 /// using the default `MatchKind::LeftmostFirst` match semantics, since
852 /// leftmost-first matching is fundamentally incompatible with overlapping
853 /// searches. Namely, overlapping searches need to report matches as they
    /// are seen, whereas leftmost-first searches will continue searching even
855 /// after a match has been observed in order to find the conventional end
856 /// position of the match. More concretely, leftmost-first searches use
857 /// dead states to terminate a search after a specific match can no longer
858 /// be extended. Overlapping searches instead do the opposite by continuing
859 /// the search to find totally new matches (potentially of other patterns).
860 ///
861 /// ```
862 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
863 /// use regex_automata::{
864 /// hybrid::dfa::{DFA, OverlappingState},
865 /// HalfMatch, Input, MatchKind,
866 /// };
867 ///
868 /// let dfa = DFA::builder()
869 /// .configure(DFA::config().match_kind(MatchKind::All))
870 /// .build_many(&[r"\w+$", r"\S+$"])?;
871 /// let mut cache = dfa.create_cache();
872 ///
873 /// let haystack = "@foo";
874 /// let mut state = OverlappingState::start();
875 ///
876 /// let expected = Some(HalfMatch::must(1, 4));
877 /// dfa.try_search_overlapping_fwd(
878 /// &mut cache, &Input::new(haystack), &mut state,
879 /// )?;
880 /// assert_eq!(expected, state.get_match());
881 ///
882 /// // The first pattern also matches at the same position, so re-running
883 /// // the search will yield another match. Notice also that the first
884 /// // pattern is returned after the second. This is because the second
    /// // pattern begins its match before the first, and is therefore an
    /// // earlier match that is reported first.
887 /// let expected = Some(HalfMatch::must(0, 4));
888 /// dfa.try_search_overlapping_fwd(
889 /// &mut cache, &Input::new(haystack), &mut state,
890 /// )?;
891 /// assert_eq!(expected, state.get_match());
892 ///
893 /// # Ok::<(), Box<dyn std::error::Error>>(())
894 /// ```
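    ///
    /// The same search can be driven to exhaustion with a loop. This is a
    /// sketch that mirrors the reverse overlapping example further below;
    /// once `get_match` returns `None`, there are no more matches.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     hybrid::dfa::{DFA, OverlappingState},
    ///     HalfMatch, Input, MatchKind,
    /// };
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().match_kind(MatchKind::All))
    ///     .build_many(&[r"\w+$", r"\S+$"])?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let input = Input::new("@foo");
    /// let mut state = OverlappingState::start();
    /// let mut matches = vec![];
    /// loop {
    ///     dfa.try_search_overlapping_fwd(&mut cache, &input, &mut state)?;
    ///     match state.get_match() {
    ///         None => break,
    ///         Some(hm) => matches.push(hm),
    ///     }
    /// }
    ///
    /// // Both patterns match at the same end position.
    /// let expected = vec![HalfMatch::must(1, 4), HalfMatch::must(0, 4)];
    /// assert_eq!(expected, matches);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```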
895 #[inline]
896 pub fn try_search_overlapping_fwd(
897 &self,
898 cache: &mut Cache,
899 input: &Input<'_>,
900 state: &mut OverlappingState,
901 ) -> Result<(), MatchError> {
902 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
903 search::find_overlapping_fwd(self, cache, input, state)?;
904 match state.get_match() {
905 None => Ok(()),
906 Some(_) if !utf8empty => Ok(()),
907 Some(_) => skip_empty_utf8_splits_overlapping(
908 input,
909 state,
910 |input, state| {
911 search::find_overlapping_fwd(self, cache, input, state)
912 },
913 ),
914 }
915 }
916
    /// Executes an overlapping reverse search. Matches, as they are found,
    /// are made available via [`OverlappingState::get_match`] on the given
    /// `state`; the reported offset is the start of the match. When no
    /// further match exists, `get_match` returns `None`.
920 ///
921 /// When using this routine to implement an iterator of overlapping
922 /// matches, the `start` of the search should remain invariant throughout
923 /// iteration. The `OverlappingState` given to the search will keep track
924 /// of the current position of the search. (This is because multiple
925 /// matches may be reported at the same position, so only the search
926 /// implementation itself knows when to advance the position.)
927 ///
928 /// If for some reason you want the search to forget about its previous
929 /// state and restart the search at a particular position, then setting the
930 /// state to [`OverlappingState::start`] will accomplish that.
931 ///
932 /// # Errors
933 ///
934 /// This routine errors if the search could not complete. This can occur
935 /// in a number of circumstances:
936 ///
937 /// * The configuration of the lazy DFA may permit it to "quit" the search.
938 /// For example, setting quit bytes or enabling heuristic support for
939 /// Unicode word boundaries. The default configuration does not enable any
940 /// option that could result in the lazy DFA quitting.
941 /// * The configuration of the lazy DFA may also permit it to "give up"
942 /// on a search if it makes ineffective use of its transition table
    /// cache. The default configuration does not enable this, although it
    /// is typically a good idea to do so.
945 /// * When the provided `Input` configuration is not supported. For
946 /// example, by providing an unsupported anchor mode.
947 ///
948 /// When a search returns an error, callers cannot know whether a match
949 /// exists or not.
950 ///
951 /// # Example: UTF-8 mode
952 ///
    /// This example demonstrates that UTF-8 mode applies to reverse
954 /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
955 /// matches reported must correspond to valid UTF-8 spans. This includes
956 /// prohibiting zero-width matches that split a codepoint.
957 ///
958 /// UTF-8 mode is enabled by default. Notice below how the only zero-width
959 /// matches reported are those at UTF-8 boundaries:
960 ///
961 /// ```
962 /// use regex_automata::{
963 /// hybrid::dfa::{DFA, OverlappingState},
964 /// nfa::thompson,
965 /// HalfMatch, Input, MatchKind,
966 /// };
967 ///
968 /// let dfa = DFA::builder()
969 /// .configure(DFA::config().match_kind(MatchKind::All))
970 /// .thompson(thompson::Config::new().reverse(true))
971 /// .build_many(&[r"", r"☃"])?;
972 /// let mut cache = dfa.create_cache();
973 ///
974 /// // Run the reverse DFA to collect all matches.
975 /// let input = Input::new("☃");
976 /// let mut state = OverlappingState::start();
977 /// let mut matches = vec![];
978 /// loop {
979 /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
980 /// match state.get_match() {
981 /// None => break,
982 /// Some(hm) => matches.push(hm),
983 /// }
984 /// }
985 ///
986 /// // No matches split a codepoint.
987 /// let expected = vec![
988 /// HalfMatch::must(0, 3),
989 /// HalfMatch::must(1, 0),
990 /// HalfMatch::must(0, 0),
991 /// ];
992 /// assert_eq!(expected, matches);
993 ///
994 /// # Ok::<(), Box<dyn std::error::Error>>(())
995 /// ```
996 ///
997 /// Now let's look at the same example, but with UTF-8 mode on the
998 /// underlying NFA disabled:
999 ///
1000 /// ```
1001 /// use regex_automata::{
1002 /// hybrid::dfa::{DFA, OverlappingState},
1003 /// nfa::thompson,
1004 /// HalfMatch, Input, MatchKind,
1005 /// };
1006 ///
1007 /// let dfa = DFA::builder()
1008 /// .configure(DFA::config().match_kind(MatchKind::All))
1009 /// .thompson(thompson::Config::new().reverse(true).utf8(false))
1010 /// .build_many(&[r"", r"☃"])?;
1011 /// let mut cache = dfa.create_cache();
1012 ///
1013 /// // Run the reverse DFA to collect all matches.
1014 /// let input = Input::new("☃");
1015 /// let mut state = OverlappingState::start();
1016 /// let mut matches = vec![];
1017 /// loop {
1018 /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
1019 /// match state.get_match() {
1020 /// None => break,
1021 /// Some(hm) => matches.push(hm),
1022 /// }
1023 /// }
1024 ///
1025 /// // Now *all* positions match, even within a codepoint,
1026 /// // because we lifted the requirement that matches
1027 /// // correspond to valid UTF-8 spans.
1028 /// let expected = vec![
1029 /// HalfMatch::must(0, 3),
1030 /// HalfMatch::must(0, 2),
1031 /// HalfMatch::must(0, 1),
1032 /// HalfMatch::must(1, 0),
1033 /// HalfMatch::must(0, 0),
1034 /// ];
1035 /// assert_eq!(expected, matches);
1036 ///
1037 /// # Ok::<(), Box<dyn std::error::Error>>(())
1038 /// ```
1039 #[inline]
1040 pub fn try_search_overlapping_rev(
1041 &self,
1042 cache: &mut Cache,
1043 input: &Input<'_>,
1044 state: &mut OverlappingState,
1045 ) -> Result<(), MatchError> {
1046 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1047 search::find_overlapping_rev(self, cache, input, state)?;
1048 match state.get_match() {
1049 None => Ok(()),
1050 Some(_) if !utf8empty => Ok(()),
1051 Some(_) => skip_empty_utf8_splits_overlapping(
1052 input,
1053 state,
1054 |input, state| {
1055 search::find_overlapping_rev(self, cache, input, state)
1056 },
1057 ),
1058 }
1059 }
1060
1061 /// Writes the set of patterns that match anywhere in the given search
1062 /// configuration to `patset`. If multiple patterns match at the same
1063 /// position and the underlying DFA supports overlapping matches, then all
1064 /// matching patterns are written to the given set.
1065 ///
    /// Unless all of the patterns in this DFA are anchored, this will,
    /// generally speaking, visit every byte in the haystack.
1068 ///
1069 /// This search routine *does not* clear the pattern set. This gives some
1070 /// flexibility to the caller (e.g., running multiple searches with the
1071 /// same pattern set), but does make the API bug-prone if you're reusing
1072 /// the same pattern set for multiple searches but intended them to be
1073 /// independent.
1074 ///
1075 /// If a pattern ID matched but the given `PatternSet` does not have
1076 /// sufficient capacity to store it, then it is not inserted and silently
1077 /// dropped.
1078 ///
1079 /// # Errors
1080 ///
1081 /// This routine errors if the search could not complete. This can occur
1082 /// in a number of circumstances:
1083 ///
1084 /// * The configuration of the lazy DFA may permit it to "quit" the search.
1085 /// For example, setting quit bytes or enabling heuristic support for
1086 /// Unicode word boundaries. The default configuration does not enable any
1087 /// option that could result in the lazy DFA quitting.
1088 /// * The configuration of the lazy DFA may also permit it to "give up"
1089 /// on a search if it makes ineffective use of its transition table
    /// cache. The default configuration does not enable this, although it
    /// is typically a good idea to do so.
1092 /// * When the provided `Input` configuration is not supported. For
1093 /// example, by providing an unsupported anchor mode.
1094 ///
1095 /// When a search returns an error, callers cannot know whether a match
1096 /// exists or not.
1097 ///
1098 /// # Example
1099 ///
1100 /// This example shows how to find all matching patterns in a haystack,
1101 /// even when some patterns match at the same position as other patterns.
1102 ///
1103 /// ```
1104 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1105 /// use regex_automata::{
1106 /// hybrid::dfa::DFA,
1107 /// Input, MatchKind, PatternSet,
1108 /// };
1109 ///
1110 /// let patterns = &[
1111 /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
1112 /// ];
1113 /// let dfa = DFA::builder()
1114 /// .configure(DFA::config().match_kind(MatchKind::All))
1115 /// .build_many(patterns)?;
1116 /// let mut cache = dfa.create_cache();
1117 ///
1118 /// let input = Input::new("foobar");
1119 /// let mut patset = PatternSet::new(dfa.pattern_len());
1120 /// dfa.try_which_overlapping_matches(&mut cache, &input, &mut patset)?;
1121 /// let expected = vec![0, 2, 3, 4, 6];
1122 /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
1123 /// assert_eq!(expected, got);
1124 ///
1125 /// # Ok::<(), Box<dyn std::error::Error>>(())
1126 /// ```
1127 #[inline]
1128 pub fn try_which_overlapping_matches(
1129 &self,
1130 cache: &mut Cache,
1131 input: &Input<'_>,
1132 patset: &mut PatternSet,
1133 ) -> Result<(), MatchError> {
1134 let mut state = OverlappingState::start();
1135 while let Some(m) = {
1136 self.try_search_overlapping_fwd(cache, input, &mut state)?;
1137 state.get_match()
1138 } {
1139 let _ = patset.try_insert(m.pattern());
1140 // There's nothing left to find, so we can stop. Or the caller
1141 // asked us to.
1142 if patset.is_full() || input.get_earliest() {
1143 break;
1144 }
1145 }
1146 Ok(())
1147 }
1148}
1149
1150impl DFA {
1151 /// Transitions from the current state to the next state, given the next
1152 /// byte of input.
1153 ///
1154 /// The given cache is used to either reuse pre-computed state
1155 /// transitions, or to store this newly computed transition for future
1156 /// reuse. Thus, this routine guarantees that it will never return a state
1157 /// ID that has an "unknown" tag.
1158 ///
1159 /// # State identifier validity
1160 ///
1161 /// The only valid value for `current` is the lazy state ID returned
1162 /// by the most recent call to `next_state`, `next_state_untagged`,
1163 /// `next_state_untagged_unchecked`, `start_state_forward` or
    /// `start_state_reverse` for the given `cache`. Any state ID returned from
1165 /// prior calls to these routines (with the same `cache`) is considered
1166 /// invalid (even if it gives an appearance of working). State IDs returned
1167 /// from _any_ prior call for different `cache` values are also always
1168 /// invalid.
1169 ///
1170 /// The returned ID is always a valid ID when `current` refers to a valid
1171 /// ID. Moreover, this routine is defined for all possible values of
1172 /// `input`.
1173 ///
1174 /// These validity rules are not checked, even in debug mode. Callers are
1175 /// required to uphold these rules themselves.
1176 ///
1177 /// Violating these state ID validity rules will not sacrifice memory
1178 /// safety, but _may_ produce an incorrect result or a panic.
1179 ///
1180 /// # Panics
1181 ///
1182 /// If the given ID does not refer to a valid state, then this routine
1183 /// may panic but it also may not panic and instead return an invalid or
1184 /// incorrect ID.
1185 ///
1186 /// # Example
1187 ///
1188 /// This shows a simplistic example for walking a lazy DFA for a given
1189 /// haystack by using the `next_state` method.
1190 ///
1191 /// ```
1192 /// use regex_automata::{hybrid::dfa::DFA, Input};
1193 ///
1194 /// let dfa = DFA::new(r"[a-z]+r")?;
1195 /// let mut cache = dfa.create_cache();
1196 /// let haystack = "bar".as_bytes();
1197 ///
1198 /// // The start state is determined by inspecting the position and the
1199 /// // initial bytes of the haystack.
1200 /// let mut sid = dfa.start_state_forward(
1201 /// &mut cache, &Input::new(haystack),
1202 /// )?;
1203 /// // Walk all the bytes in the haystack.
1204 /// for &b in haystack {
1205 /// sid = dfa.next_state(&mut cache, sid, b)?;
1206 /// }
1207 /// // Matches are always delayed by 1 byte, so we must explicitly walk the
1208 /// // special "EOI" transition at the end of the search.
1209 /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1210 /// assert!(sid.is_match());
1211 ///
1212 /// # Ok::<(), Box<dyn std::error::Error>>(())
1213 /// ```
1214 #[inline]
1215 pub fn next_state(
1216 &self,
1217 cache: &mut Cache,
1218 current: LazyStateID,
1219 input: u8,
1220 ) -> Result<LazyStateID, CacheError> {
1221 let class = usize::from(self.classes.get(input));
1222 let offset = current.as_usize_untagged() + class;
1223 let sid = cache.trans[offset];
1224 if !sid.is_unknown() {
1225 return Ok(sid);
1226 }
1227 let unit = alphabet::Unit::u8(input);
1228 Lazy::new(self, cache).cache_next_state(current, unit)
1229 }
1230
1231 /// Transitions from the current state to the next state, given the next
1232 /// byte of input and a state ID that is not tagged.
1233 ///
1234 /// The only reason to use this routine is performance. In particular, the
1235 /// `next_state` method needs to do some additional checks, among them is
1236 /// to account for identifiers to states that are not yet computed. In
1237 /// such a case, the transition is computed on the fly. However, if it is
1238 /// known that the `current` state ID is untagged, then these checks can be
1239 /// omitted.
1240 ///
1241 /// Since this routine does not compute states on the fly, it does not
1242 /// modify the cache and thus cannot return an error. Consequently, `cache`
1243 /// does not need to be mutable and it is possible for this routine to
1244 /// return a state ID corresponding to the special "unknown" state. In
1245 /// this case, it is the caller's responsibility to use the prior state
1246 /// ID and `input` with `next_state` in order to force the computation of
1247 /// the unknown transition. Otherwise, trying to use the "unknown" state
1248 /// ID will just result in transitioning back to itself, and thus never
1249 /// terminating. (This is technically a special exemption to the state ID
    /// validity rules, but is permissible since this routine is guaranteed to
1251 /// never mutate the given `cache`, and thus the identifier is guaranteed
1252 /// to remain valid.)
1253 ///
1254 /// See [`LazyStateID`] for more details on what it means for a state ID
1255 /// to be tagged. Also, see
1256 /// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked)
1257 /// for this same idea, but with bounds checks forcefully elided.
1258 ///
1259 /// # State identifier validity
1260 ///
1261 /// The only valid value for `current` is an **untagged** lazy
1262 /// state ID returned by the most recent call to `next_state`,
1263 /// `next_state_untagged`, `next_state_untagged_unchecked`,
    /// `start_state_forward` or `start_state_reverse` for the given `cache`.
1265 /// Any state ID returned from prior calls to these routines (with the
1266 /// same `cache`) is considered invalid (even if it gives an appearance
1267 /// of working). State IDs returned from _any_ prior call for different
1268 /// `cache` values are also always invalid.
1269 ///
1270 /// The returned ID is always a valid ID when `current` refers to a valid
1271 /// ID, although it may be tagged. Moreover, this routine is defined for
1272 /// all possible values of `input`.
1273 ///
1274 /// Not all validity rules are checked, even in debug mode. Callers are
1275 /// required to uphold these rules themselves.
1276 ///
1277 /// Violating these state ID validity rules will not sacrifice memory
1278 /// safety, but _may_ produce an incorrect result or a panic.
1279 ///
1280 /// # Panics
1281 ///
1282 /// If the given ID does not refer to a valid state, then this routine
1283 /// may panic but it also may not panic and instead return an invalid or
1284 /// incorrect ID.
1285 ///
1286 /// # Example
1287 ///
1288 /// This shows a simplistic example for walking a lazy DFA for a given
1289 /// haystack by using the `next_state_untagged` method where possible.
1290 ///
1291 /// ```
1292 /// use regex_automata::{hybrid::dfa::DFA, Input};
1293 ///
1294 /// let dfa = DFA::new(r"[a-z]+r")?;
1295 /// let mut cache = dfa.create_cache();
1296 /// let haystack = "bar".as_bytes();
1297 ///
1298 /// // The start state is determined by inspecting the position and the
1299 /// // initial bytes of the haystack.
1300 /// let mut sid = dfa.start_state_forward(
1301 /// &mut cache, &Input::new(haystack),
1302 /// )?;
1303 /// // Walk all the bytes in the haystack.
1304 /// let mut at = 0;
1305 /// while at < haystack.len() {
1306 /// if sid.is_tagged() {
1307 /// sid = dfa.next_state(&mut cache, sid, haystack[at])?;
1308 /// } else {
1309 /// let mut prev_sid = sid;
1310 /// // We attempt to chew through as much as we can while moving
1311 /// // through untagged state IDs. Thus, the transition function
1312 /// // does less work on average per byte. (Unrolling this loop
1313 /// // may help even more.)
1314 /// while at < haystack.len() {
1315 /// prev_sid = sid;
1316 /// sid = dfa.next_state_untagged(
1317 /// &mut cache, sid, haystack[at],
1318 /// );
1319 /// at += 1;
1320 /// if sid.is_tagged() {
1321 /// break;
1322 /// }
1323 /// }
1324 /// // We must ensure that we never proceed to the next iteration
1325 /// // with an unknown state ID. If we don't account for this
1326 /// // case, then search isn't guaranteed to terminate since all
1327 /// // transitions on unknown states loop back to itself.
1328 /// if sid.is_unknown() {
1329 /// sid = dfa.next_state(
1330 /// &mut cache, prev_sid, haystack[at - 1],
1331 /// )?;
1332 /// }
1333 /// }
1334 /// }
1335 /// // Matches are always delayed by 1 byte, so we must explicitly walk the
1336 /// // special "EOI" transition at the end of the search.
1337 /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1338 /// assert!(sid.is_match());
1339 ///
1340 /// # Ok::<(), Box<dyn std::error::Error>>(())
1341 /// ```
1342 #[inline]
1343 pub fn next_state_untagged(
1344 &self,
1345 cache: &Cache,
1346 current: LazyStateID,
1347 input: u8,
1348 ) -> LazyStateID {
1349 debug_assert!(!current.is_tagged());
1350 let class = usize::from(self.classes.get(input));
1351 let offset = current.as_usize_unchecked() + class;
1352 cache.trans[offset]
1353 }
1354
1355 /// Transitions from the current state to the next state, eliding bounds
1356 /// checks, given the next byte of input and a state ID that is not tagged.
1357 ///
1358 /// The only reason to use this routine is performance. In particular, the
1359 /// `next_state` method needs to do some additional checks, among them is
1360 /// to account for identifiers to states that are not yet computed. In
1361 /// such a case, the transition is computed on the fly. However, if it is
1362 /// known that the `current` state ID is untagged, then these checks can be
1363 /// omitted.
1364 ///
1365 /// Since this routine does not compute states on the fly, it does not
1366 /// modify the cache and thus cannot return an error. Consequently, `cache`
1367 /// does not need to be mutable and it is possible for this routine to
1368 /// return a state ID corresponding to the special "unknown" state. In
1369 /// this case, it is the caller's responsibility to use the prior state
1370 /// ID and `input` with `next_state` in order to force the computation of
1371 /// the unknown transition. Otherwise, trying to use the "unknown" state
1372 /// ID will just result in transitioning back to itself, and thus never
1373 /// terminating. (This is technically a special exemption to the state ID
    /// validity rules, but is permissible since this routine is guaranteed to
1375 /// never mutate the given `cache`, and thus the identifier is guaranteed
1376 /// to remain valid.)
1377 ///
1378 /// See [`LazyStateID`] for more details on what it means for a state ID
1379 /// to be tagged. Also, see
1380 /// [`next_state_untagged`](DFA::next_state_untagged)
1381 /// for this same idea, but with memory safety guaranteed by retaining
1382 /// bounds checks.
1383 ///
1384 /// # State identifier validity
1385 ///
1386 /// The only valid value for `current` is an **untagged** lazy
1387 /// state ID returned by the most recent call to `next_state`,
1388 /// `next_state_untagged`, `next_state_untagged_unchecked`,
    /// `start_state_forward` or `start_state_reverse` for the given `cache`.
1390 /// Any state ID returned from prior calls to these routines (with the
1391 /// same `cache`) is considered invalid (even if it gives an appearance
1392 /// of working). State IDs returned from _any_ prior call for different
1393 /// `cache` values are also always invalid.
1394 ///
1395 /// The returned ID is always a valid ID when `current` refers to a valid
1396 /// ID, although it may be tagged. Moreover, this routine is defined for
1397 /// all possible values of `input`.
1398 ///
1399 /// Not all validity rules are checked, even in debug mode. Callers are
1400 /// required to uphold these rules themselves.
1401 ///
1402 /// Violating these state ID validity rules will not sacrifice memory
1403 /// safety, but _may_ produce an incorrect result or a panic.
1404 ///
1405 /// # Safety
1406 ///
1407 /// Callers of this method must guarantee that `current` refers to a valid
1408 /// state ID according to the rules described above. If `current` is not a
1409 /// valid state ID for this automaton, then calling this routine may result
1410 /// in undefined behavior.
1411 ///
1412 /// If `current` is valid, then the ID returned is valid for all possible
1413 /// values of `input`.
1414 #[inline]
1415 pub unsafe fn next_state_untagged_unchecked(
1416 &self,
1417 cache: &Cache,
1418 current: LazyStateID,
1419 input: u8,
1420 ) -> LazyStateID {
1421 debug_assert!(!current.is_tagged());
1422 let class = usize::from(self.classes.get(input));
1423 let offset = current.as_usize_unchecked() + class;
1424 *cache.trans.get_unchecked(offset)
1425 }
1426
1427 /// Transitions from the current state to the next state for the special
1428 /// EOI symbol.
1429 ///
1430 /// The given cache is used to either reuse pre-computed state
1431 /// transitions, or to store this newly computed transition for future
1432 /// reuse. Thus, this routine guarantees that it will never return a state
1433 /// ID that has an "unknown" tag.
1434 ///
1435 /// A correct search implementation must call this routine at the end of
1436 /// every search. Namely, lazy DFAs in this crate delay matches
1437 /// by one byte in order to support look-around operators. Thus, after
1438 /// reaching the end of a haystack, a search implementation must follow one
1439 /// last EOI transition.
1440 ///
1441 /// It is best to think of EOI as an additional symbol in the alphabet of a
1442 /// DFA that is distinct from every other symbol. That is, the alphabet of
1443 /// lazy DFAs in this crate has a logical size of 257 instead of 256, where
1444 /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
1445 /// physical alphabet size may be smaller because of alphabet compression
1446 /// via equivalence classes, but EOI is always represented somehow in the
1447 /// alphabet.)
1448 ///
1449 /// # State identifier validity
1450 ///
1451 /// The only valid value for `current` is the lazy state ID returned
1452 /// by the most recent call to `next_state`, `next_state_untagged`,
1453 /// `next_state_untagged_unchecked`, `start_state_forward` or
1454 /// `start_state_reverse` for the given `cache`. Any state ID returned from
1455 /// prior calls to these routines (with the same `cache`) is considered
1456 /// invalid (even if it gives an appearance of working). State IDs returned
1457 /// from _any_ prior call for different `cache` values are also always
1458 /// invalid.
1459 ///
1460 /// The returned ID is always a valid ID when `current` refers to a valid
1461 /// ID.
1462 ///
1463 /// These validity rules are not checked, even in debug mode. Callers are
1464 /// required to uphold these rules themselves.
1465 ///
1466 /// Violating these state ID validity rules will not sacrifice memory
1467 /// safety, but _may_ produce an incorrect result or a panic.
1468 ///
1469 /// # Panics
1470 ///
1471 /// If the given ID does not refer to a valid state, then this routine
1472 /// may panic but it also may not panic and instead return an invalid or
1473 /// incorrect ID.
1474 ///
1475 /// # Example
1476 ///
1477 /// This shows a simplistic example for walking a DFA for a given haystack,
1478 /// and then finishing the search with the final EOI transition.
1479 ///
1480 /// ```
1481 /// use regex_automata::{hybrid::dfa::DFA, Input};
1482 ///
1483 /// let dfa = DFA::new(r"[a-z]+r")?;
1484 /// let mut cache = dfa.create_cache();
1485 /// let haystack = "bar".as_bytes();
1486 ///
1487 /// // The start state is determined by inspecting the position and the
1488 /// // initial bytes of the haystack.
1489 /// let mut sid = dfa.start_state_forward(
1490 /// &mut cache, &Input::new(haystack),
1491 /// )?;
1492 /// // Walk all the bytes in the haystack.
1493 /// for &b in haystack {
1494 /// sid = dfa.next_state(&mut cache, sid, b)?;
1495 /// }
1496 /// // Matches are always delayed by 1 byte, so we must explicitly walk
1497 /// // the special "EOI" transition at the end of the search. Without this
1498 /// // final transition, the assert below will fail since the DFA will not
1499 /// // have entered a match state yet!
1500 /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1501 /// assert!(sid.is_match());
1502 ///
1503 /// # Ok::<(), Box<dyn std::error::Error>>(())
1504 /// ```
1505 #[inline]
1506 pub fn next_eoi_state(
1507 &self,
1508 cache: &mut Cache,
1509 current: LazyStateID,
1510 ) -> Result<LazyStateID, CacheError> {
1511 let eoi = self.classes.eoi().as_usize();
1512 let offset = current.as_usize_untagged() + eoi;
1513 let sid = cache.trans[offset];
1514 if !sid.is_unknown() {
1515 return Ok(sid);
1516 }
1517 let unit = self.classes.eoi();
1518 Lazy::new(self, cache).cache_next_state(current, unit)
1519 }
1520
1521 /// Return the ID of the start state for this lazy DFA for the given
1522 /// starting configuration.
1523 ///
1524 /// Unlike typical DFA implementations, the start state for DFAs in this
1525 /// crate is dependent on a few different factors:
1526 ///
1527 /// * The [`Anchored`] mode of the search. Unanchored, anchored and
1528 /// anchored searches for a specific [`PatternID`] all use different start
1529 /// states.
1530 /// * Whether a "look-behind" byte exists. For example, the `^` anchor
1531 /// matches if and only if there is no look-behind byte.
1532 /// * The specific value of that look-behind byte. For example, a `(?m:^)`
1533 /// assertion only matches when there is either no look-behind byte, or
1534 /// when the look-behind byte is a line terminator.
1535 ///
1536 /// The [starting configuration](start::Config) provides the above
1537 /// information.
1538 ///
1539 /// This routine can be used for either forward or reverse searches.
1540 /// That said, as a convenience, if you have an [`Input`], then it
1541 /// may be more succinct to use [`DFA::start_state_forward`] or
1542 /// [`DFA::start_state_reverse`]. Note, for example, that the convenience
1543 /// routines return a [`MatchError`] on failure, whereas this routine
1544 /// returns a [`StartError`].
1545 ///
1546 /// # Errors
1547 ///
1548 /// This may return a [`StartError`] if the search needs to give up when
1549 /// determining the start state (for example, if it sees a "quit" byte
1550 /// or if the cache has become inefficient). This can also return an
1551 /// error if the given configuration contains an unsupported [`Anchored`]
1552 /// configuration.
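///
/// # Example
///
/// A minimal sketch (assuming the `start::Config::new` and `anchored`
/// builder methods shown here) that requests the unanchored start state
/// corresponding to the beginning of a haystack, i.e., with no look-behind
/// byte:
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, util::start, Anchored};
///
/// let dfa = DFA::new(r"[a-z]+")?;
/// let mut cache = dfa.create_cache();
///
/// // No look-behind byte and an unanchored search.
/// let config = start::Config::new().anchored(Anchored::No);
/// let sid = dfa.start_state(&mut cache, &config)?;
/// // Start states are computed before being returned, so the returned ID
/// // is never "unknown."
/// assert!(!sid.is_unknown());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```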
1553 #[cfg_attr(feature = "perf-inline", inline(always))]
1554 pub fn start_state(
1555 &self,
1556 cache: &mut Cache,
1557 config: &start::Config,
1558 ) -> Result<LazyStateID, StartError> {
1559 let lazy = LazyRef::new(self, cache);
1560 let anchored = config.get_anchored();
1561 let start = match config.get_look_behind() {
1562 None => Start::Text,
1563 Some(byte) => {
1564 if !self.quitset.is_empty() && self.quitset.contains(byte) {
1565 return Err(StartError::quit(byte));
1566 }
1567 self.start_map.get(byte)
1568 }
1569 };
1570 let start_id = lazy.get_cached_start_id(anchored, start)?;
1571 if !start_id.is_unknown() {
1572 return Ok(start_id);
1573 }
1574 Lazy::new(self, cache).cache_start_group(anchored, start)
1575 }
1576
1577 /// Return the ID of the start state for this lazy DFA when executing a
1578 /// forward search.
1579 ///
1580 /// This is a convenience routine for calling [`DFA::start_state`] that
1581 /// converts the given [`Input`] to a [start configuration](start::Config).
1582 /// Additionally, if an error occurs, it is converted from a [`StartError`]
1583 /// to a [`MatchError`] using the offset information in the given
1584 /// [`Input`].
1585 ///
1586 /// # Errors
1587 ///
1588 /// This may return a [`MatchError`] if the search needs to give up when
1589 /// determining the start state (for example, if it sees a "quit" byte or
1590 /// if the cache has become inefficient). This can also return an error if
1591 /// the given `Input` contains an unsupported [`Anchored`] configuration.
1592 #[cfg_attr(feature = "perf-inline", inline(always))]
1593 pub fn start_state_forward(
1594 &self,
1595 cache: &mut Cache,
1596 input: &Input<'_>,
1597 ) -> Result<LazyStateID, MatchError> {
1598 let config = start::Config::from_input_forward(input);
1599 self.start_state(cache, &config).map_err(|err| match err {
1600 StartError::Cache { .. } => MatchError::gave_up(input.start()),
1601 StartError::Quit { byte } => {
1602 let offset = input
1603 .start()
1604 .checked_sub(1)
1605 .expect("no quit in start without look-behind");
1606 MatchError::quit(byte, offset)
1607 }
1608 StartError::UnsupportedAnchored { mode } => {
1609 MatchError::unsupported_anchored(mode)
1610 }
1611 })
1612 }
1613
1614 /// Return the ID of the start state for this lazy DFA when executing a
1615 /// reverse search.
1616 ///
1617 /// This is a convenience routine for calling [`DFA::start_state`] that
1618 /// converts the given [`Input`] to a [start configuration](start::Config).
1619 /// Additionally, if an error occurs, it is converted from a [`StartError`]
1620 /// to a [`MatchError`] using the offset information in the given
1621 /// [`Input`].
1622 ///
1623 /// # Errors
1624 ///
1625 /// This may return a [`MatchError`] if the search needs to give up when
1626 /// determining the start state (for example, if it sees a "quit" byte or
1627 /// if the cache has become inefficient). This can also return an error if
1628 /// the given `Input` contains an unsupported [`Anchored`] configuration.
1629 #[cfg_attr(feature = "perf-inline", inline(always))]
1630 pub fn start_state_reverse(
1631 &self,
1632 cache: &mut Cache,
1633 input: &Input<'_>,
1634 ) -> Result<LazyStateID, MatchError> {
1635 let config = start::Config::from_input_reverse(input);
1636 self.start_state(cache, &config).map_err(|err| match err {
1637 StartError::Cache { .. } => MatchError::gave_up(input.end()),
1638 StartError::Quit { byte } => {
1639 let offset = input.end();
1640 MatchError::quit(byte, offset)
1641 }
1642 StartError::UnsupportedAnchored { mode } => {
1643 MatchError::unsupported_anchored(mode)
1644 }
1645 })
1646 }
1647
1648 /// Returns the total number of patterns that match in this state.
1649 ///
1650 /// If the lazy DFA was compiled with one pattern, then this must
1651 /// necessarily always return `1` for all match states.
1652 ///
1653 /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with
1654 /// indices up to (but not including) the length returned by this routine
1655 /// without panicking.
1656 ///
1657 /// # Panics
1658 ///
1659 /// If the given state is not a match state, then this may either panic
1660 /// or return an incorrect result.
1661 ///
1662 /// # Example
1663 ///
1664 /// This example shows a simple instance of implementing overlapping
1665 /// matches. In particular, it shows not only how to determine how many
1666 /// patterns have matched in a particular state, but also how to access
1667 /// which specific patterns have matched.
1668 ///
1669 /// Notice that we must use [`MatchKind::All`] when building the DFA. If we
1670 /// used [`MatchKind::LeftmostFirst`] instead, then the DFA would not be
1671 /// constructed in a way that supports overlapping matches. (It would only
1672 /// report a single pattern that matches at any particular point in time.)
1673 ///
1674 /// Another thing to take note of is the patterns used and the order in
1675 /// which the pattern IDs are reported. In the example below, pattern `3`
1676 /// is yielded first. Why? Because it corresponds to the match that
1677 /// appears first. Namely, the `@` symbol is part of `\S+` but not part
1678 /// of any of the other patterns. Since the `\S+` pattern has a match that
1679 /// starts to the left of any other pattern, its ID is returned before any
1680 /// other.
1681 ///
1682 /// ```
1683 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1684 /// use regex_automata::{hybrid::dfa::DFA, Input, MatchKind};
1685 ///
1686 /// let dfa = DFA::builder()
1687 /// .configure(DFA::config().match_kind(MatchKind::All))
1688 /// .build_many(&[
1689 /// r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+",
1690 /// ])?;
1691 /// let mut cache = dfa.create_cache();
1692 /// let haystack = "@bar".as_bytes();
1693 ///
1694 /// // The start state is determined by inspecting the position and the
1695 /// // initial bytes of the haystack.
1696 /// let mut sid = dfa.start_state_forward(
1697 /// &mut cache, &Input::new(haystack),
1698 /// )?;
1699 /// // Walk all the bytes in the haystack.
1700 /// for &b in haystack {
1701 /// sid = dfa.next_state(&mut cache, sid, b)?;
1702 /// }
1703 /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1704 ///
1705 /// assert!(sid.is_match());
1706 /// assert_eq!(dfa.match_len(&mut cache, sid), 3);
1707 /// // The following calls are guaranteed to not panic since `match_len`
1708 /// // returned `3` above.
1709 /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3);
1710 /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0);
1711 /// assert_eq!(dfa.match_pattern(&mut cache, sid, 2).as_usize(), 1);
1712 ///
1713 /// # Ok::<(), Box<dyn std::error::Error>>(())
1714 /// ```
1715 #[inline]
1716 pub fn match_len(&self, cache: &Cache, id: LazyStateID) -> usize {
1717 assert!(id.is_match());
1718 LazyRef::new(self, cache).get_cached_state(id).match_len()
1719 }
1720
1721 /// Returns the pattern ID corresponding to the given match index in the
1722 /// given state.
1723 ///
1724 /// See [`DFA::match_len`] for an example of how to use this method
1725 /// correctly. Note that if you know your lazy DFA is configured with a
1726 /// single pattern, then this routine is never necessary since it will
1727 /// always return a pattern ID of `0` for an index of `0` when `id`
1728 /// corresponds to a match state.
1729 ///
1730 /// Typically, this routine is used when implementing an overlapping
1731 /// search, as the example for `DFA::match_len` does.
1732 ///
1733 /// # Panics
1734 ///
1735 /// If the state ID is not a match state or if the match index is out
1736 /// of bounds for the given state, then this routine may either panic
1737 /// or produce an incorrect result. If the state ID is correct and the
1738 /// match index is correct, then this routine always produces a valid
1739 /// `PatternID`.
1740 #[inline]
1741 pub fn match_pattern(
1742 &self,
1743 cache: &Cache,
1744 id: LazyStateID,
1745 match_index: usize,
1746 ) -> PatternID {
1747 // This is an optimization for the very common case of a DFA with a
1748 // single pattern. This conditional avoids a somewhat more costly path
1749 // that finds the pattern ID from the corresponding `State`, which
1750 // requires a bit of slicing/pointer-chasing. This optimization tends
1751 // to only matter when matches are frequent.
1752 if self.pattern_len() == 1 {
1753 return PatternID::ZERO;
1754 }
1755 LazyRef::new(self, cache)
1756 .get_cached_state(id)
1757 .match_pattern(match_index)
1758 }
1759}
1760
1761/// A cache represents a partially computed DFA.
1762///
1763/// A cache is the key component that differentiates a classical DFA and a
1764/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
1765/// complete transition table that can handle all possible inputs, a hybrid
1766/// NFA/DFA starts with an empty transition table and builds only the parts
1767/// required during search. The parts that are built are stored in a cache. For
1768/// this reason, a cache is a required parameter for nearly every operation on
1769/// a [`DFA`].
1770///
1771/// Caches can be created from their corresponding DFA via
1772/// [`DFA::create_cache`]. A cache can only be used with either the DFA that
1773/// created it, or the DFA that was most recently used to reset it with
1774/// [`Cache::reset`]. Using a cache with any other DFA may result in panics
1775/// or incorrect results.
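///
/// # Example
///
/// A minimal sketch of creating a cache and using it with the DFA that
/// created it:
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::new(r"[0-9]+")?;
/// // Nearly every search routine requires a (mutable) cache.
/// let mut cache = dfa.create_cache();
/// assert_eq!(
///     Some(HalfMatch::must(0, 6)),
///     dfa.try_search_fwd(&mut cache, &Input::new("abc123"))?,
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```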
1776#[derive(Clone, Debug)]
1777pub struct Cache {
1778 // N.B. If you're looking to understand how determinization works, it
1779 // is probably simpler to first grok src/dfa/determinize.rs, since that
1780 // doesn't have the "laziness" component.
1781 /// The transition table.
1782 ///
1783 /// Given a `current` LazyStateID and an `input` byte, the next state can
1784 /// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice
1785 /// that no multiplication is used. That's because state identifiers are
1786 /// "premultiplied."
1787 ///
1788 /// Note that the next state may be the "unknown" state. In this case, the
1789 /// next state is not known and determinization for `current` on `input`
1790 /// must be performed.
1791 trans: Vec<LazyStateID>,
1792 /// The starting states for this DFA.
1793 ///
1794 /// These are computed lazily. Initially, these are all set to "unknown"
1795 /// lazy state IDs.
1796 ///
1797 /// When 'starts_for_each_pattern' is disabled (the default), then the size
1798 /// of this is constrained to the possible starting configurations based
1799 /// on the search parameters. (At time of writing, that's 4.) However,
1800 /// when starting states for each pattern is enabled, then there are N
1801 /// additional groups of starting states, where each group reflects the
1802 /// different possible configurations and N is the number of patterns.
1803 starts: Vec<LazyStateID>,
1804 /// A sequence of NFA/DFA powerset states that have been computed for this
1805 /// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every
1806 /// tagged LazyStateID can be used to index this sequence by converting it
1807 /// to its untagged form.)
1808 states: Vec<State>,
1809 /// A map from states to their corresponding IDs. This map may be accessed
1810 /// via the raw byte representation of a state, which means that a `State`
1811 /// does not need to be allocated to determine whether it already exists
1812 /// in this map. Indeed, the existence of such a state is what determines
1813 /// whether we allocate a new `State` or not.
1814 ///
1815 /// The higher level idea here is that we do just enough determinization
1816 /// for a state to check whether we've already computed it. If we have,
1817 /// then we can save a little (albeit not much) work. The real savings is
1818 /// in memory usage. If we never checked for trivially duplicate states,
1819 /// then our memory usage would explode to unreasonable levels.
1820 states_to_id: StateMap,
1821 /// Sparse sets used to track which NFA states have been visited during
1822 /// various traversals.
1823 sparses: SparseSets,
1824 /// Scratch space for traversing the NFA graph. (We use space on the heap
1825 /// instead of the call stack.)
1826 stack: Vec<NFAStateID>,
1827 /// Scratch space for building a NFA/DFA powerset state. This is used to
1828 /// help amortize allocation since not every powerset state generated is
1829 /// added to the cache. In particular, if it already exists in the cache,
1830 /// then there is no need to allocate a new `State` for it.
1831 scratch_state_builder: StateBuilderEmpty,
1832 /// A simple abstraction for handling the saving of at most a single state
1833 /// across a cache clearing. This is required for correctness. Namely, if
1834 /// adding a new state after clearing the cache fails, then the caller
1835 /// must retain the ability to continue using the state ID given. The
1836 /// state corresponding to the state ID is what we preserve across cache
1837 /// clearings.
1838 state_saver: StateSaver,
1839 /// The memory usage, in bytes, used by 'states' and 'states_to_id'. We
1840 /// track this as new states are added since states use a variable amount
1841 /// of heap. Tracking this as we add states makes it possible to compute
1842 /// the total amount of memory used by the determinizer in constant time.
1843 memory_usage_state: usize,
1844 /// The number of times the cache has been cleared. When a minimum cache
1845 /// clear count is set, then the cache will return an error instead of
1846 /// clearing the cache if the count has been exceeded.
1847 clear_count: usize,
1848 /// The total number of bytes searched since the last time this cache was
1849 /// cleared, not including the current search.
1850 ///
1851 /// This can be added to the length of the current search to get the true
1852 /// total number of bytes searched.
1853 ///
1854 /// This is generally only non-zero when the
1855 /// `Cache::search_{start,update,finish}` APIs are used to track search
1856 /// progress.
1857 bytes_searched: usize,
1858 /// The progress of the current search.
1859 ///
1860 /// This is only non-`None` when callers utilize the `Cache::search_start`,
1861 /// `Cache::search_update` and `Cache::search_finish` APIs.
1862 ///
1863 /// The purpose of recording search progress is to be able to make a
1864 /// determination about the efficiency of the cache. Namely, by keeping track
1865 /// of the number of bytes searched relative to the number of states added, the lazy DFA can decide whether to give up or keep going (see `Config::minimum_bytes_per_state`).
1866 progress: Option<SearchProgress>,
1867}
1868
1869impl Cache {
1870 /// Create a new cache for the given lazy DFA.
1871 ///
1872 /// The cache returned should only be used for searches for the given DFA.
1873 /// If you want to reuse the cache for another DFA, then you must call
1874 /// [`Cache::reset`] with that DFA.
1875 pub fn new(dfa: &DFA) -> Cache {
1876 let mut cache = Cache {
1877 trans: alloc::vec![],
1878 starts: alloc::vec![],
1879 states: alloc::vec![],
1880 states_to_id: StateMap::new(),
1881 sparses: SparseSets::new(dfa.get_nfa().states().len()),
1882 stack: alloc::vec![],
1883 scratch_state_builder: StateBuilderEmpty::new(),
1884 state_saver: StateSaver::none(),
1885 memory_usage_state: 0,
1886 clear_count: 0,
1887 bytes_searched: 0,
1888 progress: None,
1889 };
1890 debug!("pre-init lazy DFA cache size: {}", cache.memory_usage());
1891 Lazy { dfa, cache: &mut cache }.init_cache();
1892 debug!("post-init lazy DFA cache size: {}", cache.memory_usage());
1893 cache
1894 }
1895
1896 /// Reset this cache such that it can be used for searching with the given
1897 /// lazy DFA (and only that DFA).
1898 ///
1899 /// A cache reset permits reusing memory already allocated in this cache
1900 /// with a different lazy DFA.
1901 ///
1902 /// Resetting a cache sets its "clear count" to 0. This is relevant if the
1903 /// lazy DFA has been configured to "give up" after it has cleared the
1904 /// cache a certain number of times.
1905 ///
1906 /// Any lazy state ID generated by the cache prior to resetting it is
1907 /// invalid after the reset.
1908 ///
1909 /// # Example
1910 ///
1911 /// This shows how to re-purpose a cache for use with a different DFA.
1912 ///
1913 /// ```
1914 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1915 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
1916 ///
1917 /// let dfa1 = DFA::new(r"\w")?;
1918 /// let dfa2 = DFA::new(r"\W")?;
1919 ///
1920 /// let mut cache = dfa1.create_cache();
1921 /// assert_eq!(
1922 /// Some(HalfMatch::must(0, 2)),
1923 /// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
1924 /// );
1925 ///
1926 /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
1927 /// // incorrect results. In order to re-purpose the cache, we must reset
1928 /// // it with the DFA we'd like to use it with.
1929 /// //
1930 /// // Similarly, after this reset, using the cache with 'dfa1' is also not
1931 /// // allowed.
1932 /// cache.reset(&dfa2);
1933 /// assert_eq!(
1934 /// Some(HalfMatch::must(0, 3)),
1935 /// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
1936 /// );
1937 ///
1938 /// # Ok::<(), Box<dyn std::error::Error>>(())
1939 /// ```
1940 pub fn reset(&mut self, dfa: &DFA) {
1941 Lazy::new(dfa, self).reset_cache()
1942 }
1943
1944 /// Initializes a new search starting at the given position.
1945 ///
1946 /// If a previous search was unfinished, then it is finished automatically
1947 /// and a new search is begun.
1948 ///
1949 /// Note that keeping track of search progress is _not necessary_
1950 /// for correct implementations of search using a lazy DFA. Keeping
1951 /// track of search progress is only necessary if you want the
1952 /// [`Config::minimum_bytes_per_state`] configuration knob to work.
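///
/// # Example
///
/// A minimal sketch (not part of the crate's own documentation) of
/// wrapping a hand-rolled search loop with the progress tracking APIs:
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, Input};
///
/// let dfa = DFA::new(r"[a-z]+r")?;
/// let mut cache = dfa.create_cache();
/// let haystack = "bar".as_bytes();
///
/// let mut sid = dfa.start_state_forward(
///     &mut cache, &Input::new(haystack),
/// )?;
/// // Mark the beginning of the search...
/// cache.search_start(0);
/// for (i, &b) in haystack.iter().enumerate() {
///     sid = dfa.next_state(&mut cache, sid, b)?;
///     // ... record progress as bytes are consumed ...
///     cache.search_update(i);
/// }
/// sid = dfa.next_eoi_state(&mut cache, sid)?;
/// // ... and mark the end of the search.
/// cache.search_finish(haystack.len());
/// assert!(sid.is_match());
/// // Assuming the cache was never cleared during this search, the total
/// // number of bytes searched is the haystack length.
/// assert_eq!(haystack.len(), cache.search_total_len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```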
1953 #[inline]
1954 pub fn search_start(&mut self, at: usize) {
1955 // If a previous search wasn't marked as finished, then finish it
1956 // now automatically.
1957 if let Some(p) = self.progress.take() {
1958 self.bytes_searched += p.len();
1959 }
1960 self.progress = Some(SearchProgress { start: at, at });
1961 }
1962
1963 /// Updates the current search to indicate that it has searched to the
1964 /// given position.
1965 ///
1966 /// No special care needs to be taken for reverse searches. Namely, the
1967 /// position given may be _less than_ the starting position of the search.
1968 ///
1969 /// # Panics
1970 ///
1971 /// This panics if no search has been started by [`Cache::search_start`].
1972 #[inline]
1973 pub fn search_update(&mut self, at: usize) {
1974 let p =
1975 self.progress.as_mut().expect("no in-progress search to update");
1976 p.at = at;
1977 }
1978
1979 /// Indicates that a search has finished at the given position.
1980 ///
1981 /// # Panics
1982 ///
1983 /// This panics if no search has been started by [`Cache::search_start`].
1984 #[inline]
1985 pub fn search_finish(&mut self, at: usize) {
1986 let mut p =
1987 self.progress.take().expect("no in-progress search to finish");
1988 p.at = at;
1989 self.bytes_searched += p.len();
1990 }
1991
1992 /// Returns the total number of bytes that have been searched since this
1993 /// cache was last cleared.
1994 ///
1995 /// This is useful for determining the efficiency of the cache. For
1996 /// example, the lazy DFA uses this value in conjunction with the
1997 /// [`Config::minimum_bytes_per_state`] knob to help determine whether it
1998 /// should quit searching.
1999 ///
2000 /// This always returns `0` if search progress isn't being tracked. Note
2001 /// that the lazy DFA search routines in this crate always track search
2002 /// progress.
2003 pub fn search_total_len(&self) -> usize {
2004 self.bytes_searched + self.progress.as_ref().map_or(0, |p| p.len())
2005 }
2006
2007 /// Returns the total number of times this cache has been cleared since it
2008 /// was either created or last reset.
2009 ///
2010 /// This is useful for informational purposes or if you want to change
2011 /// search strategies based on the number of times the cache has been
2012 /// cleared.
2013 pub fn clear_count(&self) -> usize {
2014 self.clear_count
2015 }
2016
2017 /// Returns the heap memory usage, in bytes, of this cache.
2018 ///
2019 /// This does **not** include the stack size used up by this cache. To
2020 /// compute that, use `std::mem::size_of::<Cache>()`.
2021 pub fn memory_usage(&self) -> usize {
2022 const ID_SIZE: usize = size_of::<LazyStateID>();
2023 const STATE_SIZE: usize = size_of::<State>();
2024
2025 // NOTE: If you make changes to the below, then
2026 // 'minimum_cache_capacity' should be updated correspondingly.
2027
2028 self.trans.len() * ID_SIZE
2029 + self.starts.len() * ID_SIZE
2030 + self.states.len() * STATE_SIZE
2031 // Maps likely use more memory than this, but it's probably close.
2032 + self.states_to_id.len() * (STATE_SIZE + ID_SIZE)
2033 + self.sparses.memory_usage()
2034 + self.stack.capacity() * ID_SIZE
2035 + self.scratch_state_builder.capacity()
2036 // Heap memory used by 'State' in both 'states' and 'states_to_id'.
2037 + self.memory_usage_state
2038 }
2039}
2040
2041/// Keeps track of the progress of the current search.
2042///
2043/// This is updated via the `Cache::search_{start,update,finish}` APIs to
2044/// record how many bytes have been searched. This permits computing a
2045/// heuristic that represents the efficiency of a cache, and thus helps inform
2046/// whether the lazy DFA should give up or not.
2047#[derive(Clone, Debug)]
2048struct SearchProgress {
2049 start: usize,
2050 at: usize,
2051}
2052
2053impl SearchProgress {
2054 /// Returns the length, in bytes, of this search so far.
2055 ///
2056 /// This automatically handles the case of a reverse search, where `at`
2057 /// is likely to be less than `start`.
2058 fn len(&self) -> usize {
2059 if self.start <= self.at {
2060 self.at - self.start
2061 } else {
2062 self.start - self.at
2063 }
2064 }
2065}
2066
2067/// A map from states to state identifiers. When using std, we use a standard
2068/// hashmap, since it's a bit faster for this use case. (Other maps, like
2069 /// ones based on FNV, have not yet been benchmarked.)
2070///
2071/// The main purpose of this map is to reuse states where possible. This won't
2072/// fully minimize the DFA, but it works well in a lot of cases.
2073#[cfg(feature = "std")]
2074type StateMap = std::collections::HashMap<State, LazyStateID>;
2075#[cfg(not(feature = "std"))]
2076type StateMap = alloc::collections::BTreeMap<State, LazyStateID>;
2077
2078/// A type that groups methods that require the base NFA/DFA and writable
2079/// access to the cache.
2080#[derive(Debug)]
2081struct Lazy<'i, 'c> {
2082 dfa: &'i DFA,
2083 cache: &'c mut Cache,
2084}
2085
2086impl<'i, 'c> Lazy<'i, 'c> {
2087 /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
2088 fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> {
2089 Lazy { dfa, cache }
2090 }
2091
2092 /// Return an immutable view by downgrading a writable cache to a read-only
2093 /// cache.
2094 fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> {
2095 LazyRef::new(self.dfa, self.cache)
2096 }
2097
2098 /// This is marked as 'inline(never)' to avoid bloating methods on 'DFA'
2099 /// like 'next_state' and 'next_eoi_state' that are called in critical
2100 /// areas. The idea is to let the optimizer focus on the other areas of
2101 /// those methods as the hot path.
2102 ///
2103 /// Here's an example that justifies 'inline(never)'
2104 ///
2105 /// ```ignore
2106 /// regex-cli find match hybrid \
2107 /// --cache-capacity 100000000 \
2108 /// -p '\pL{100}' \
2109 /// all-codepoints-utf8-100x
2110 /// ```
2111 ///
2112 /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
2113 /// codepoint, in sequence, repeated 100 times.
2114 ///
2115 /// With 'inline(never)' hyperfine reports 1.1s per run. With
2116 /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement.
2117 #[cold]
2118 #[inline(never)]
2119 fn cache_next_state(
2120 &mut self,
2121 mut current: LazyStateID,
2122 unit: alphabet::Unit,
2123 ) -> Result<LazyStateID, CacheError> {
2124 let stride2 = self.dfa.stride2();
2125 let empty_builder = self.get_state_builder();
2126 let builder = determinize::next(
2127 self.dfa.get_nfa(),
2128 self.dfa.get_config().get_match_kind(),
2129 &mut self.cache.sparses,
2130 &mut self.cache.stack,
2131 &self.cache.states[current.as_usize_untagged() >> stride2],
2132 unit,
2133 empty_builder,
2134 );
2135 let save_state = !self.as_ref().state_builder_fits_in_cache(&builder);
2136 if save_state {
2137 self.save_state(current);
2138 }
2139 let next = self.add_builder_state(builder, |sid| sid)?;
2140 if save_state {
2141 current = self.saved_state_id();
2142 }
2143 // This is the payoff. The next time 'next_state' is called with this
2144 // state and alphabet unit, it will find this transition and avoid
2145 // having to re-determinize this transition.
2146 self.set_transition(current, unit, next);
2147 Ok(next)
2148 }
2149
2150 /// Compute and cache the starting state for the given pattern ID (if
2151 /// present) and the starting configuration.
2152 ///
2153 /// This panics if a pattern ID is given and the DFA isn't configured to
2154 /// build anchored start states for each pattern.
2155 ///
2156 /// This will never return an unknown lazy state ID.
2157 ///
2158 /// If caching this state would otherwise result in a cache that has been
2159 /// cleared too many times, then an error is returned.
2160 #[cold]
2161 #[inline(never)]
2162 fn cache_start_group(
2163 &mut self,
2164 anchored: Anchored,
2165 start: Start,
2166 ) -> Result<LazyStateID, StartError> {
2167 let nfa_start_id = match anchored {
2168 Anchored::No => self.dfa.get_nfa().start_unanchored(),
2169 Anchored::Yes => self.dfa.get_nfa().start_anchored(),
2170 Anchored::Pattern(pid) => {
2171 if !self.dfa.get_config().get_starts_for_each_pattern() {
2172 return Err(StartError::unsupported_anchored(anchored));
2173 }
2174 match self.dfa.get_nfa().start_pattern(pid) {
2175 None => return Ok(self.as_ref().dead_id()),
2176 Some(sid) => sid,
2177 }
2178 }
2179 };
2180
2181 let id = self
2182 .cache_start_one(nfa_start_id, start)
2183 .map_err(StartError::cache)?;
2184 self.set_start_state(anchored, start, id);
2185 Ok(id)
2186 }
2187
2188 /// Compute and cache the starting state for the given NFA state ID and the
2189 /// starting configuration. The NFA state ID might be one of the following:
2190 ///
2191 /// 1) An unanchored start state to match any pattern.
2192 /// 2) An anchored start state to match any pattern.
2193 /// 3) An anchored start state for a particular pattern.
2194 ///
2195 /// This will never return an unknown lazy state ID.
2196 ///
2197 /// If caching this state would otherwise result in a cache that has been
2198 /// cleared too many times, then an error is returned.
2199 fn cache_start_one(
2200 &mut self,
2201 nfa_start_id: NFAStateID,
2202 start: Start,
2203 ) -> Result<LazyStateID, CacheError> {
2204 let mut builder_matches = self.get_state_builder().into_matches();
2205 determinize::set_lookbehind_from_start(
2206 self.dfa.get_nfa(),
2207 &start,
2208 &mut builder_matches,
2209 );
2210 self.cache.sparses.set1.clear();
2211 determinize::epsilon_closure(
2212 self.dfa.get_nfa(),
2213 nfa_start_id,
2214 builder_matches.look_have(),
2215 &mut self.cache.stack,
2216 &mut self.cache.sparses.set1,
2217 );
2218 let mut builder = builder_matches.into_nfa();
2219 determinize::add_nfa_states(
2220 &self.dfa.get_nfa(),
2221 &self.cache.sparses.set1,
2222 &mut builder,
2223 );
2224 let tag_starts = self.dfa.get_config().get_specialize_start_states();
2225 self.add_builder_state(builder, |id| {
2226 if tag_starts {
2227 id.to_start()
2228 } else {
2229 id
2230 }
2231 })
2232 }
2233
2234 /// Either add the given builder state to this cache, or return an ID to an
2235 /// equivalent state already in this cache.
2236 ///
2237 /// In the case where no equivalent state exists, the idmap function given
2238 /// may be used to transform the identifier allocated. This is useful if
2239 /// the caller needs to tag the ID with additional information.
2240 ///
2241 /// This will never return an unknown lazy state ID.
2242 ///
2243 /// If caching this state would otherwise result in a cache that has been
2244 /// cleared too many times, then an error is returned.
2245 fn add_builder_state(
2246 &mut self,
2247 builder: StateBuilderNFA,
2248 idmap: impl Fn(LazyStateID) -> LazyStateID,
2249 ) -> Result<LazyStateID, CacheError> {
2250 if let Some(&cached_id) =
2251 self.cache.states_to_id.get(builder.as_bytes())
2252 {
2253 // Since we have a cached state, put the constructed state's
2254 // memory back into our scratch space, so that it can be reused.
2255 self.put_state_builder(builder);
2256 return Ok(cached_id);
2257 }
2258 let result = self.add_state(builder.to_state(), idmap);
2259 self.put_state_builder(builder);
2260 result
2261 }
2262
2263 /// Allocate a new state ID and add the given state to this cache.
2264 ///
2265 /// The idmap function given may be used to transform the identifier
2266 /// allocated. This is useful if the caller needs to tag the ID with
2267 /// additional information.
2268 ///
2269 /// This will never return an unknown lazy state ID.
2270 ///
2271 /// If caching this state would otherwise result in a cache that has been
2272 /// cleared too many times, then an error is returned.
2273 fn add_state(
2274 &mut self,
2275 state: State,
2276 idmap: impl Fn(LazyStateID) -> LazyStateID,
2277 ) -> Result<LazyStateID, CacheError> {
2278 if !self.as_ref().state_fits_in_cache(&state) {
2279 self.try_clear_cache()?;
2280 }
2281 // It's important for this to come second, since the above may clear
2282 // the cache. If we clear the cache after ID generation, then the ID
2283 // is likely bunk since it would have been generated based on a larger
2284 // transition table.
2285 let mut id = idmap(self.next_state_id()?);
2286 if state.is_match() {
2287 id = id.to_match();
2288 }
2289 // Add room in the transition table. Since this is a fresh state, all
2290 // of its transitions are unknown.
2291 self.cache.trans.extend(
2292 iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()),
2293 );
2294 // When we add a sentinel state, we never want to set any quit
2295 // transitions. Technically, this is harmless, since sentinel states
2296 // have all of their transitions set to loop back to themselves. But
2297 // when creating sentinel states before the quit sentinel state,
2298 // this will try to call 'set_transition' on a state ID that doesn't
2299 // actually exist yet, which isn't allowed. So we just skip doing so
2300 // entirely.
2301 if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) {
2302 let quit_id = self.as_ref().quit_id();
2303 for b in self.dfa.quitset.iter() {
2304 self.set_transition(id, alphabet::Unit::u8(b), quit_id);
2305 }
2306 }
2307 self.cache.memory_usage_state += state.memory_usage();
2308 self.cache.states.push(state.clone());
2309 self.cache.states_to_id.insert(state, id);
2310 Ok(id)
2311 }
2312
2313 /// Allocate a new state ID.
2314 ///
2315 /// This will never return an unknown lazy state ID.
2316 ///
2317 /// If caching this state would otherwise result in a cache that has been
2318 /// cleared too many times, then an error is returned.
2319 fn next_state_id(&mut self) -> Result<LazyStateID, CacheError> {
2320 let sid = match LazyStateID::new(self.cache.trans.len()) {
2321 Ok(sid) => sid,
2322 Err(_) => {
2323 self.try_clear_cache()?;
2324 // This has to pass since we check at construction time that the
2325 // ID capacity can fit at least MIN_STATES states.
2326 LazyStateID::new(self.cache.trans.len()).unwrap()
2327 }
2328 };
2329 Ok(sid)
2330 }
2331
2332 /// Attempt to clear the cache used by this lazy DFA.
2333 ///
2334 /// If clearing the cache exceeds the minimum number of required cache
2335 /// clearings, then this will return a cache error. In this case,
2336 /// callers should bubble this up as the cache can't be used until it is
2337 /// reset. Implementations of search should convert this error into a
2338 /// [`MatchError::gave_up`].
2339 ///
2340 /// If 'self.state_saver' is set to save a state, then this state is
2341 /// persisted through cache clearing. Otherwise, the cache is returned to
2342 /// its state after initialization with two exceptions: its clear count
2343 /// is incremented and some of its memory likely has additional capacity.
2344 /// That is, clearing a cache does _not_ release memory.
2345 ///
2346 /// Otherwise, any lazy state ID generated by the cache prior to resetting
2347 /// it is invalid after the reset.
2348 fn try_clear_cache(&mut self) -> Result<(), CacheError> {
2349 let c = self.dfa.get_config();
2350 if let Some(min_count) = c.get_minimum_cache_clear_count() {
2351 if self.cache.clear_count >= min_count {
2352 if let Some(min_bytes_per) = c.get_minimum_bytes_per_state() {
2353 let len = self.cache.search_total_len();
2354 let min_bytes =
2355 min_bytes_per.saturating_mul(self.cache.states.len());
2356 // If we've searched 0 bytes then probably something has
2357 // gone wrong and the lazy DFA search implementation isn't
2358 // correctly updating the search progress state.
2359 if len == 0 {
2360 trace!(
2361 "number of bytes searched is 0, but \
2362 a minimum bytes per state searched ({}) is \
2363 enabled, maybe Cache::search_update \
2364 is not being used?",
2365 min_bytes_per,
2366 );
2367 }
2368 if len < min_bytes {
2369 trace!(
2370 "lazy DFA cache has been cleared {} times, \
2371 which exceeds the limit of {}, \
2372 AND its bytes searched per state is less \
2373 than the configured minimum of {}, \
2374 therefore lazy DFA is giving up \
2375 (bytes searched since cache clear = {}, \
2376 number of states = {})",
2377 self.cache.clear_count,
2378 min_count,
2379 min_bytes_per,
2380 len,
2381 self.cache.states.len(),
2382 );
2383 return Err(CacheError::bad_efficiency());
2384 } else {
2385 trace!(
2386 "lazy DFA cache has been cleared {} times, \
2387 which exceeds the limit of {}, \
2388 AND its bytes searched per state is greater \
2389 than the configured minimum of {}, \
2390 therefore lazy DFA is continuing! \
2391 (bytes searched since cache clear = {}, \
2392 number of states = {})",
2393 self.cache.clear_count,
2394 min_count,
2395 min_bytes_per,
2396 len,
2397 self.cache.states.len(),
2398 );
2399 }
2400 } else {
2401 trace!(
2402 "lazy DFA cache has been cleared {} times, \
2403 which exceeds the limit of {}, \
2404 since there is no configured bytes per state \
2405 minimum, lazy DFA is giving up",
2406 self.cache.clear_count,
2407 min_count,
2408 );
2409 return Err(CacheError::too_many_cache_clears());
2410 }
2411 }
2412 }
2413 self.clear_cache();
2414 Ok(())
2415 }
2416
2417 /// Clears _and_ resets the cache. Resetting the cache means that no
2418 /// states are persisted and the clear count is reset to 0. No heap memory
2419 /// is released.
2420 ///
2421 /// Note that the caller may reset a cache with a different DFA than what
2422 /// it was created from. In which case, the cache can now be used with the
2423 /// new DFA (and not the old DFA).
2424 fn reset_cache(&mut self) {
2425 self.cache.state_saver = StateSaver::none();
2426 self.clear_cache();
2427 // If a new DFA is used, it might have a different number of NFA
2428 // states, so we need to make sure our sparse sets have the appropriate
2429 // size.
2430 self.cache.sparses.resize(self.dfa.get_nfa().states().len());
2431 self.cache.clear_count = 0;
2432 self.cache.progress = None;
2433 }
2434
2435 /// Clear the cache used by this lazy DFA.
2436 ///
2437 /// If 'self.state_saver' is set to save a state, then this state is
2438 /// persisted through cache clearing. Otherwise, the cache is returned to
2439 /// its state after initialization with two exceptions: its clear count
2440 /// is incremented and some of its memory likely has additional capacity.
2441 /// That is, clearing a cache does _not_ release memory.
2442 ///
2443 /// Otherwise, any lazy state ID generated by the cache prior to resetting
2444 /// it is invalid after the reset.
2445 fn clear_cache(&mut self) {
2446 self.cache.trans.clear();
2447 self.cache.starts.clear();
2448 self.cache.states.clear();
2449 self.cache.states_to_id.clear();
2450 self.cache.memory_usage_state = 0;
2451 self.cache.clear_count += 1;
2452 self.cache.bytes_searched = 0;
2453 if let Some(ref mut progress) = self.cache.progress {
2454 progress.start = progress.at;
2455 }
2456 trace!(
2457 "lazy DFA cache has been cleared (count: {})",
2458 self.cache.clear_count
2459 );
2460 self.init_cache();
2461 // If the state we want to save is one of the sentinel
2462 // (unknown/dead/quit) states, then 'init_cache' adds those back, and
2463 // their identifier values remain invariant. So there's no need to add
2464 // it again. (And indeed, doing so would be incorrect!)
2465 if let Some((old_id, state)) = self.cache.state_saver.take_to_save() {
2466 // If the state is one of the special sentinel states, then it is
2467 // automatically added by cache initialization and its ID always
2468 // remains the same. With that said, this should never occur since
2469 // the sentinel states are all loop states back to themselves. So
2470 // we should never be in a position where we're attempting to save
2471 // a sentinel state since we never compute transitions out of a
2472 // sentinel state.
2473 assert!(
2474 !self.as_ref().is_sentinel(old_id),
2475 "cannot save sentinel state"
2476 );
2477 let new_id = self
2478 .add_state(state, |id| {
2479 if old_id.is_start() {
2480 // We don't need to consult the
2481 // 'specialize_start_states' config knob here, because
2482 // if it's disabled, old_id.is_start() will never
2483 // return true.
2484 id.to_start()
2485 } else {
2486 id
2487 }
2488 })
2489 // The unwrap here is OK because lazy DFA creation ensures that
2490 // we have room in the cache to add MIN_STATES states. Since
2491 // 'init_cache' above adds 3, this adds a 4th.
2492 .expect("adding one state after cache clear must work");
2493 self.cache.state_saver = StateSaver::Saved(new_id);
2494 }
2495 }
2496
2497 /// Initialize this cache from emptiness to a place where it can be used
2498 /// for search.
2499 ///
2500 /// This is called both at cache creation time and after the cache has been
2501 /// cleared.
2502 ///
2503 /// Primarily, this adds the three sentinel states and allocates some
2504 /// initial memory.
2505 fn init_cache(&mut self) {
2506 // Why multiply by 2 here? Because we make room for both the unanchored
2507 // and anchored start states. Unanchored is first and then anchored.
2508 let mut starts_len = Start::len().checked_mul(2).unwrap();
2509 // ... but if we also want start states for every pattern, we make room
2510 // for that too.
2511 if self.dfa.get_config().get_starts_for_each_pattern() {
2512 starts_len += Start::len() * self.dfa.pattern_len();
2513 }
2514 self.cache
2515 .starts
2516 .extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len));
2517 // This is the set of NFA states that corresponds to each of our three
2518 // sentinel states: the empty set.
2519 let dead = State::dead();
2520 // This sets up some states that we use as sentinels that are present
2521 // in every DFA. While it would be technically possible to implement
2522 // this DFA without explicitly putting these states in the transition
2523 // table, this is convenient to do to make `next_state` correct for all
2524 // valid state IDs without needing explicit conditionals to special
2525 // case these sentinel states.
2526 //
2527 // All three of these states are "dead" states. That is, all of
2528 // them transition only to themselves. So once you enter one of
2529 // these states, it's impossible to leave them. Thus, any correct
2530 // search routine must explicitly check for these state types. (Sans
2531 // `unknown`, since that is only used internally to represent missing
2532 // states.)
2533 let unk_id =
2534 self.add_state(dead.clone(), |id| id.to_unknown()).unwrap();
2535 let dead_id = self.add_state(dead.clone(), |id| id.to_dead()).unwrap();
2536 let quit_id = self.add_state(dead.clone(), |id| id.to_quit()).unwrap();
2537 assert_eq!(unk_id, self.as_ref().unknown_id());
2538 assert_eq!(dead_id, self.as_ref().dead_id());
2539 assert_eq!(quit_id, self.as_ref().quit_id());
2540 // The idea here is that if you start in an unknown/dead/quit state and
2541 // try to transition on them, then you should end up where you started.
2542 self.set_all_transitions(unk_id, unk_id);
2543 self.set_all_transitions(dead_id, dead_id);
2544 self.set_all_transitions(quit_id, quit_id);
2545 // All of these states are technically equivalent from the FSM
2546 // perspective, so putting all three of them in the cache isn't
2547 // possible. (They are distinct merely because we use their
2548 // identifiers as sentinels to mean something, as indicated by the
2549 // names.) Moreover, we wouldn't want to do that. Unknown and quit
2550 // states are special in that they are artificial constructions
2551 // of this implementation. But dead states are a natural part of
2552 // determinization. When you reach a point in the NFA where you cannot
2553 // go anywhere else, a dead state will naturally arise and we MUST
2554 // reuse the canonical dead state that we've created here. Why? Because
2555 // it is the state ID that tells the search routine whether a state is
2556 // dead or not, and thus, whether to stop the search. Having a bunch of
2557 // distinct dead states would be quite wasteful!
2558 self.cache.states_to_id.insert(dead, dead_id);
2559 }
2560
2561 /// Save the state corresponding to the ID given such that the state
2562 /// persists through a cache clearing.
2563 ///
2564 /// While the state may persist, the ID may not. In order to discover the
2565 /// new state ID, one must call 'saved_state_id' after a cache clearing.
2566 fn save_state(&mut self, id: LazyStateID) {
2567 let state = self.as_ref().get_cached_state(id).clone();
2568 self.cache.state_saver = StateSaver::ToSave { id, state };
2569 }
2570
2571 /// Returns the updated lazy state ID for a state that was persisted
2572 /// through a cache clearing.
2573 ///
2574 /// It is only correct to call this routine when both a state has been
2575 /// saved and the cache has just been cleared. Otherwise, this panics.
2576 fn saved_state_id(&mut self) -> LazyStateID {
2577 self.cache
2578 .state_saver
2579 .take_saved()
2580 .expect("state saver does not have saved state ID")
2581 }
2582
2583 /// Set all transitions on the state 'from' to 'to'.
2584 fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) {
2585 for unit in self.dfa.classes.representatives(..) {
2586 self.set_transition(from, unit, to);
2587 }
2588 }
2589
2590 /// Set the transition on 'from' for 'unit' to 'to'.
2591 ///
2592 /// This panics if either 'from' or 'to' is invalid.
2593 ///
2594 /// All unit values are OK.
2595 fn set_transition(
2596 &mut self,
2597 from: LazyStateID,
2598 unit: alphabet::Unit,
2599 to: LazyStateID,
2600 ) {
2601 assert!(self.as_ref().is_valid(from), "invalid 'from' id: {:?}", from);
2602 assert!(self.as_ref().is_valid(to), "invalid 'to' id: {:?}", to);
2603 let offset =
2604 from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit);
2605 self.cache.trans[offset] = to;
2606 }
2607
2608 /// Set the start ID for the given pattern ID (if given) and starting
2609 /// configuration to the ID given.
2610 ///
2611 /// This panics if 'id' is not valid or if a pattern ID is given and
2612 /// 'starts_for_each_pattern' is not enabled.
2613 fn set_start_state(
2614 &mut self,
2615 anchored: Anchored,
2616 start: Start,
2617 id: LazyStateID,
2618 ) {
2619 assert!(self.as_ref().is_valid(id));
2620 let start_index = start.as_usize();
2621 let index = match anchored {
2622 Anchored::No => start_index,
2623 Anchored::Yes => Start::len() + start_index,
2624 Anchored::Pattern(pid) => {
2625 assert!(
2626 self.dfa.get_config().get_starts_for_each_pattern(),
2627 "attempted to search for a specific pattern \
2628 without enabling starts_for_each_pattern",
2629 );
2630 let pid = pid.as_usize();
2631 (2 * Start::len()) + (Start::len() * pid) + start_index
2632 }
2633 };
2634 self.cache.starts[index] = id;
2635 }
2636
2637 /// Returns a state builder from this DFA that might have existing
2638 /// capacity. This helps avoid allocs in cases where a state is built that
2639 /// turns out to already be cached.
2640 ///
2641 /// Callers must put the state builder back with 'put_state_builder',
2642 /// otherwise the allocation reuse won't work.
2643 fn get_state_builder(&mut self) -> StateBuilderEmpty {
2644 core::mem::replace(
2645 &mut self.cache.scratch_state_builder,
2646 StateBuilderEmpty::new(),
2647 )
2648 }
2649
2650 /// Puts the given state builder back into this DFA for reuse.
2651 ///
2652 /// Note that building a 'State' from a builder always creates a new alloc,
2653 /// so callers should always put the builder back.
2654 fn put_state_builder(&mut self, builder: StateBuilderNFA) {
2655 let _ = core::mem::replace(
2656 &mut self.cache.scratch_state_builder,
2657 builder.clear(),
2658 );
2659 }
2660}
2661
2662/// A type that groups methods that require the base NFA/DFA and read-only
2663/// access to the cache.
2664#[derive(Debug)]
2665struct LazyRef<'i, 'c> {
2666 dfa: &'i DFA,
2667 cache: &'c Cache,
2668}
2669
2670impl<'i, 'c> LazyRef<'i, 'c> {
2671 /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
2672 fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> {
2673 LazyRef { dfa, cache }
2674 }
2675
2676 /// Return the ID of the start state for the given configuration.
2677 ///
2678 /// If the start state has not yet been computed, then this returns an
2679 /// unknown lazy state ID.
2680 #[cfg_attr(feature = "perf-inline", inline(always))]
2681 fn get_cached_start_id(
2682 &self,
2683 anchored: Anchored,
2684 start: Start,
2685 ) -> Result<LazyStateID, StartError> {
2686 let start_index = start.as_usize();
2687 let index = match anchored {
2688 Anchored::No => start_index,
2689 Anchored::Yes => Start::len() + start_index,
2690 Anchored::Pattern(pid) => {
2691 if !self.dfa.get_config().get_starts_for_each_pattern() {
2692 return Err(StartError::unsupported_anchored(anchored));
2693 }
2694 if pid.as_usize() >= self.dfa.pattern_len() {
2695 return Ok(self.dead_id());
2696 }
2697 (2 * Start::len())
2698 + (Start::len() * pid.as_usize())
2699 + start_index
2700 }
2701 };
2702 Ok(self.cache.starts[index])
2703 }
2704
2705 /// Return the cached NFA/DFA powerset state for the given ID.
2706 ///
2707 /// This panics if the given ID does not address a valid state.
2708 fn get_cached_state(&self, sid: LazyStateID) -> &State {
2709 let index = sid.as_usize_untagged() >> self.dfa.stride2();
2710 &self.cache.states[index]
2711 }
2712
2713 /// Returns true if and only if the given ID corresponds to a "sentinel"
2714 /// state.
2715 ///
2716 /// A sentinel state is a state that signifies a special condition of
2717 /// search, and where every transition maps back to itself. See LazyStateID
2718 /// for more details. Note that start and match states are _not_ sentinels
2719 /// since they may otherwise be real states with non-trivial transitions.
2720 /// The purpose of sentinel states is purely to indicate something. Their
2721 /// transitions are not meant to be followed.
2722 fn is_sentinel(&self, id: LazyStateID) -> bool {
2723 id == self.unknown_id() || id == self.dead_id() || id == self.quit_id()
2724 }
2725
2726 /// Returns the ID of the unknown state for this lazy DFA.
2727 fn unknown_id(&self) -> LazyStateID {
2728 // This unwrap is OK since 0 is always a valid state ID.
2729 LazyStateID::new(0).unwrap().to_unknown()
2730 }
2731
2732 /// Returns the ID of the dead state for this lazy DFA.
2733 fn dead_id(&self) -> LazyStateID {
2734 // This unwrap is OK since the maximum value here is 1 * 512 = 512,
2735 // which is <= 2047 (the maximum state ID on 16-bit systems). Where
2736 // 512 is the worst case for our equivalence classes (every byte is a
2737 // distinct class).
2738 LazyStateID::new(1 << self.dfa.stride2()).unwrap().to_dead()
2739 }
2740
2741 /// Returns the ID of the quit state for this lazy DFA.
2742 fn quit_id(&self) -> LazyStateID {
2743 // This unwrap is OK since the maximum value here is 2 * 512 = 1024,
2744 // which is <= 2047 (the maximum state ID on 16-bit systems). Where
2745 // 512 is the worst case for our equivalence classes (every byte is a
2746 // distinct class).
2747 LazyStateID::new(2 << self.dfa.stride2()).unwrap().to_quit()
2748 }
2749
2750 /// Returns true if and only if the given ID is valid.
2751 ///
2752 /// An ID is valid if it is both a valid index into the transition table
2753 /// and is a multiple of the DFA's stride.
2754 fn is_valid(&self, id: LazyStateID) -> bool {
2755 let id = id.as_usize_untagged();
2756 id < self.cache.trans.len() && id % self.dfa.stride() == 0
2757 }
2758
2759 /// Returns true if adding the state given would fit in this cache.
2760 fn state_fits_in_cache(&self, state: &State) -> bool {
2761 let needed = self.cache.memory_usage()
2762 + self.memory_usage_for_one_more_state(state.memory_usage());
2763 trace!(
2764 "lazy DFA cache capacity check: {:?} ?<=? {:?}",
2765 needed,
2766 self.dfa.cache_capacity
2767 );
2768 needed <= self.dfa.cache_capacity
2769 }
2770
2771 /// Returns true if adding the state to be built by the given builder would
2772 /// fit in this cache.
2773 fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool {
2774 let needed = self.cache.memory_usage()
2775 + self.memory_usage_for_one_more_state(state.as_bytes().len());
2776 needed <= self.dfa.cache_capacity
2777 }
2778
2779 /// Returns the additional memory usage, in bytes, required to add one more
2780 /// state to this cache. The given size should be the heap size, in bytes,
2781 /// that would be used by the new state being added.
2782 fn memory_usage_for_one_more_state(
2783 &self,
2784 state_heap_size: usize,
2785 ) -> usize {
2786 const ID_SIZE: usize = size_of::<LazyStateID>();
2787 const STATE_SIZE: usize = size_of::<State>();
2788
2789 self.dfa.stride() * ID_SIZE // additional space needed in trans table
2790 + STATE_SIZE // space in cache.states
2791 + (STATE_SIZE + ID_SIZE) // space in cache.states_to_id
2792 + state_heap_size // heap memory used by state itself
2793 }
2794}
2795
2796/// A simple type that encapsulates the saving of a state ID through a cache
2797/// clearing.
2798///
2799/// A state ID can be marked for saving with ToSave, while a state ID can be
2800/// saved itself with Saved.
2801#[derive(Clone, Debug)]
2802enum StateSaver {
2803 /// An empty state saver. In this case, no states (other than the special
2804 /// sentinel states) are preserved after clearing the cache.
2805 None,
2806 /// An ID of a state (and the state itself) that should be preserved after
2807 /// the lazy DFA's cache has been cleared. After clearing, the updated ID
2808 /// is stored in 'Saved' since it may have changed.
2809 ToSave { id: LazyStateID, state: State },
2810 /// The ID of a state that has been persisted through a lazy DFA
2811 /// cache clearing. The ID recorded here corresponds to an ID that was
2812 /// once marked as ToSave. The IDs are likely not equivalent even though
2813 /// the states they point to are.
2814 Saved(LazyStateID),
2815}
2816
2817impl StateSaver {
2818 /// Create an empty state saver.
2819 fn none() -> StateSaver {
2820 StateSaver::None
2821 }
2822
2823 /// Replace this state saver with an empty saver, and if this saver is a
2824 /// request to save a state, return that request.
2825 fn take_to_save(&mut self) -> Option<(LazyStateID, State)> {
2826 match core::mem::replace(self, StateSaver::None) {
2827 StateSaver::None | StateSaver::Saved(_) => None,
2828 StateSaver::ToSave { id, state } => Some((id, state)),
2829 }
2830 }
2831
2832 /// Replace this state saver with an empty saver, and if this saver is a
2833 /// saved state (or a request to save a state), return that state's ID.
2834 ///
2835 /// The idea here is that a request to save a state isn't necessarily
2836 /// honored because it might not be needed. For example, some higher level code
2837 /// might request a state to be saved on the off chance that the cache gets
2838 /// cleared when a new state is added at a lower level. But if that new
2839 /// state is never added, then the cache is never cleared and the state and
2840 /// its ID remain unchanged.
2841 fn take_saved(&mut self) -> Option<LazyStateID> {
2842 match core::mem::replace(self, StateSaver::None) {
2843 StateSaver::None => None,
2844 StateSaver::Saved(id) | StateSaver::ToSave { id, .. } => Some(id),
2845 }
2846 }
2847}
2848
2849/// The configuration used for building a lazy DFA.
2850///
2851/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The
2852/// advantage of the former is that it often lets you avoid importing the
2853/// `Config` type directly.
2854///
2855/// A lazy DFA configuration is a simple data object that is typically used
2856/// with [`Builder::configure`].
2857///
2858/// The default configuration guarantees that a search will never return a
2859/// "gave up" or "quit" error, although it is possible for a search to fail
2860/// if [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by
2861/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`].
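///
/// # Example
///
/// This is a minimal sketch of the typical flow: build a configuration with
/// [`DFA::config`], hand it to [`Builder::configure`] and then search.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input, MatchKind};
///
/// let dfa = DFA::builder()
///     .configure(DFA::config().match_kind(MatchKind::LeftmostFirst))
///     .build(r"foo|foobar")?;
/// let mut cache = dfa.create_cache();
///
/// // Leftmost-first semantics: the first alternative wins, so the match
/// // ends after "foo" even though "foobar" could also match.
/// let got = dfa.try_search_fwd(&mut cache, &Input::new("foobar"))?;
/// assert_eq!(Some(HalfMatch::must(0, 3)), got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```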
2862#[derive(Clone, Debug, Default)]
2863pub struct Config {
2864 // As with other configuration types in this crate, we put all our knobs
2865 // in options so that we can distinguish between "default" and "not set."
2866 // This makes it possible to easily combine multiple configurations
2867 // without default values overwriting explicitly specified values. See the
2868 // 'overwrite' method.
2869 //
2870 // For docs on the fields below, see the corresponding method setters.
2871 match_kind: Option<MatchKind>,
2872 pre: Option<Option<Prefilter>>,
2873 starts_for_each_pattern: Option<bool>,
2874 byte_classes: Option<bool>,
2875 unicode_word_boundary: Option<bool>,
2876 quitset: Option<ByteSet>,
2877 specialize_start_states: Option<bool>,
2878 cache_capacity: Option<usize>,
2879 skip_cache_capacity_check: Option<bool>,
2880 minimum_cache_clear_count: Option<Option<usize>>,
2881 minimum_bytes_per_state: Option<Option<usize>>,
2882}
2883
2884impl Config {
2885 /// Return a new default lazy DFA builder configuration.
2886 pub fn new() -> Config {
2887 Config::default()
2888 }
2889
2890 /// Set the desired match semantics.
2891 ///
2892 /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
2893 /// match semantics of Perl-like regex engines. That is, when multiple
2894 /// patterns would match at the same leftmost position, the pattern that
2895 /// appears first in the concrete syntax is chosen.
2896 ///
2897 /// Currently, the only other kind of match semantics supported is
2898 /// [`MatchKind::All`]. This corresponds to classical DFA construction
2899 /// where all possible matches are added to the lazy DFA.
2900 ///
2901 /// Typically, `All` is used when one wants to execute an overlapping
2902 /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
2903 /// sense to use `All` with the various "leftmost" find routines, since the
2904 /// leftmost routines depend on the `LeftmostFirst` automata construction
2905 /// strategy. Specifically, `LeftmostFirst` adds dead states to the
2906 /// lazy DFA as a way to terminate the search and report a match.
2907 /// `LeftmostFirst` also supports non-greedy matches using this strategy
2908 /// whereas `All` does not.
2909 ///
2910 /// # Example: overlapping search
2911 ///
2912 /// This example shows the typical use of `MatchKind::All`, which is to
2913 /// report overlapping matches.
2914 ///
2915 /// ```
2916 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
2917 /// use regex_automata::{
2918 /// hybrid::dfa::{DFA, OverlappingState},
2919 /// HalfMatch, Input, MatchKind,
2920 /// };
2921 ///
2922 /// let dfa = DFA::builder()
2923 /// .configure(DFA::config().match_kind(MatchKind::All))
2924 /// .build_many(&[r"\w+$", r"\S+$"])?;
2925 /// let mut cache = dfa.create_cache();
2926 /// let haystack = "@foo";
2927 /// let mut state = OverlappingState::start();
2928 ///
2929 /// let expected = Some(HalfMatch::must(1, 4));
2930 /// dfa.try_search_overlapping_fwd(
2931 /// &mut cache, &Input::new(haystack), &mut state,
2932 /// )?;
2933 /// assert_eq!(expected, state.get_match());
2934 ///
2935 /// // The first pattern also matches at the same position, so re-running
2936 /// // the search will yield another match. Notice also that the first
2937 /// // pattern is returned after the second. This is because the second
2938 /// // pattern begins its match before the first, is therefore an earlier
2939 /// // match and is thus reported first.
2940 /// let expected = Some(HalfMatch::must(0, 4));
2941 /// dfa.try_search_overlapping_fwd(
2942 /// &mut cache, &Input::new(haystack), &mut state,
2943 /// )?;
2944 /// assert_eq!(expected, state.get_match());
2945 ///
2946 /// # Ok::<(), Box<dyn std::error::Error>>(())
2947 /// ```
2948 ///
2949 /// # Example: reverse automaton to find start of match
2950 ///
2951 /// Another example for using `MatchKind::All` is for constructing a
2952 /// reverse automaton to find the start of a match. `All` semantics are
2953 /// used for this in order to find the longest possible match, which
2954 /// corresponds to the leftmost starting position.
2955 ///
2956 /// Note that if you need the starting position then
2957 /// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this
2958 /// for you, so it's usually not necessary to do this yourself.
2959 ///
2960 /// ```
2961 /// use regex_automata::{
2962 /// hybrid::dfa::DFA,
2963 /// nfa::thompson::NFA,
2964 /// Anchored, HalfMatch, Input, MatchKind,
2965 /// };
2966 ///
2967 /// let input = Input::new("123foobar456");
2968 /// let pattern = r"[a-z]+r";
2969 ///
2970 /// let dfa_fwd = DFA::new(pattern)?;
2971 /// let dfa_rev = DFA::builder()
2972 /// .thompson(NFA::config().reverse(true))
2973 /// .configure(DFA::config().match_kind(MatchKind::All))
2974 /// .build(pattern)?;
2975 /// let mut cache_fwd = dfa_fwd.create_cache();
2976 /// let mut cache_rev = dfa_rev.create_cache();
2977 ///
2978 /// let expected_fwd = HalfMatch::must(0, 9);
2979 /// let expected_rev = HalfMatch::must(0, 3);
2980 /// let got_fwd = dfa_fwd.try_search_fwd(&mut cache_fwd, &input)?.unwrap();
2981 /// // Here we don't specify the pattern to search for since there's only
2982 /// // one pattern and we're doing a leftmost search. But if this were an
2983 /// // overlapping search, you'd need to specify the pattern that matched
2984 /// // in the forward direction. (Otherwise, you might wind up finding the
2985 /// // starting position of a match of some other pattern.) That in turn
2986 /// // requires building the reverse automaton with starts_for_each_pattern
2987 /// // enabled.
2988 /// let input = input
2989 /// .clone()
2990 /// .range(..got_fwd.offset())
2991 /// .anchored(Anchored::Yes);
2992 /// let got_rev = dfa_rev.try_search_rev(&mut cache_rev, &input)?.unwrap();
2993 /// assert_eq!(expected_fwd, got_fwd);
2994 /// assert_eq!(expected_rev, got_rev);
2995 ///
2996 /// # Ok::<(), Box<dyn std::error::Error>>(())
2997 /// ```
2998 pub fn match_kind(mut self, kind: MatchKind) -> Config {
2999 self.match_kind = Some(kind);
3000 self
3001 }
3002
3003 /// Set a prefilter to be used whenever a start state is entered.
3004 ///
3005 /// A [`Prefilter`] in this context is meant to accelerate searches by
3006 /// looking for literal prefixes that every match for the corresponding
3007 /// pattern (or patterns) must start with. Once a prefilter produces a
3008 /// match, the underlying search routine continues on to try and confirm
3009 /// the match.
3010 ///
3011 /// Be warned that setting a prefilter does not guarantee that the search
3012 /// will be faster. While it's usually a good bet, if the prefilter
3013 /// produces a lot of false positive candidates (i.e., positions matched
3014 /// by the prefilter but not by the regex), then the overall result can
3015 /// be slower than if you had just executed the regex engine without any
3016 /// prefilters.
3017 ///
3018 /// Note that unless [`Config::specialize_start_states`] has been
3019 /// explicitly set, then setting this will also enable (when `pre` is
3020 /// `Some`) or disable (when `pre` is `None`) start state specialization.
3021 /// This occurs because without start state specialization, a prefilter
3022 /// is likely to be less effective. And without a prefilter, start state
3023 /// specialization is usually pointless.
3024 ///
3025 /// By default no prefilter is set.
3026 ///
3027 /// # Example
3028 ///
3029 /// ```
3030 /// use regex_automata::{
3031 /// hybrid::dfa::DFA,
3032 /// util::prefilter::Prefilter,
3033 /// Input, HalfMatch, MatchKind,
3034 /// };
3035 ///
3036 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
3037 /// let re = DFA::builder()
3038 /// .configure(DFA::config().prefilter(pre))
3039 /// .build(r"(foo|bar)[a-z]+")?;
3040 /// let mut cache = re.create_cache();
3041 /// let input = Input::new("foo1 barfox bar");
3042 /// assert_eq!(
3043 /// Some(HalfMatch::must(0, 11)),
3044 /// re.try_search_fwd(&mut cache, &input)?,
3045 /// );
3046 ///
3047 /// # Ok::<(), Box<dyn std::error::Error>>(())
3048 /// ```
3049 ///
3050 /// Be warned though that an incorrect prefilter can lead to incorrect
3051 /// results!
3052 ///
3053 /// ```
3054 /// use regex_automata::{
3055 /// hybrid::dfa::DFA,
3056 /// util::prefilter::Prefilter,
3057 /// Input, HalfMatch, MatchKind,
3058 /// };
3059 ///
3060 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
3061 /// let re = DFA::builder()
3062 /// .configure(DFA::config().prefilter(pre))
3063 /// .build(r"(foo|bar)[a-z]+")?;
3064 /// let mut cache = re.create_cache();
3065 /// let input = Input::new("foo1 barfox bar");
3066 /// assert_eq!(
3067 /// // No match reported even though there clearly is one!
3068 /// None,
3069 /// re.try_search_fwd(&mut cache, &input)?,
3070 /// );
3071 ///
3072 /// # Ok::<(), Box<dyn std::error::Error>>(())
3073 /// ```
3074 pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
3075 self.pre = Some(pre);
3076 if self.specialize_start_states.is_none() {
3077 self.specialize_start_states =
3078 Some(self.get_prefilter().is_some());
3079 }
3080 self
3081 }
3082
3083 /// Whether to compile a separate start state for each pattern in the
3084 /// lazy DFA.
3085 ///
3086 /// When enabled, a separate **anchored** start state is added for each
3087 /// pattern in the lazy DFA. When this start state is used, then the DFA
3088 /// will only search for matches for the pattern specified, even if there
3089 /// are other patterns in the DFA.
3090 ///
3091 /// The main downside of this option is that it can potentially increase
3092 /// the size of the DFA and/or increase the time it takes to build the
3093 /// DFA at search time. However, since this is configuration for a lazy
3094 /// DFA, these states aren't actually built unless they're used. Enabling
3095 /// this isn't necessarily free, however, as it may result in higher cache
3096 /// usage.
3097 ///
3098 /// There are a few reasons one might want to enable this (it's disabled
3099 /// by default):
3100 ///
3101 /// 1. When looking for the start of an overlapping match (using a reverse
3102 /// DFA), doing it correctly requires starting the reverse search using the
3103 /// starting state of the pattern that matched in the forward direction.
3104 /// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it
3105 /// will automatically enable this option when building the reverse DFA
3106 /// internally.
3107 /// 2. When you want to use a DFA with multiple patterns to search both
3108 /// for matches of any pattern and for anchored matches of one
3109 /// particular pattern while using the same DFA. (Otherwise, you would need
3110 /// to compile a new DFA for each pattern.)
3111 ///
3112 /// By default this is disabled.
3113 ///
3114 /// # Example
3115 ///
3116 /// This example shows how to use this option to permit the same lazy DFA
3117 /// to run both general searches for any pattern and anchored searches for
3118 /// a specific pattern.
3119 ///
3120 /// ```
3121 /// use regex_automata::{
3122 /// hybrid::dfa::DFA,
3123 /// Anchored, HalfMatch, Input, PatternID,
3124 /// };
3125 ///
3126 /// let dfa = DFA::builder()
3127 /// .configure(DFA::config().starts_for_each_pattern(true))
3128 /// .build_many(&[r"[a-z0-9]{6}", r"[a-z][a-z0-9]{5}"])?;
3129 /// let mut cache = dfa.create_cache();
3130 /// let haystack = "bar foo123";
3131 ///
3132 /// // Here's a normal unanchored search that looks for any pattern.
3133 /// let expected = HalfMatch::must(0, 10);
3134 /// let input = Input::new(haystack);
3135 /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3136 /// // We can also do a normal anchored search for any pattern. Since it's
3137 /// // an anchored search, we position the start of the search where we
3138 /// // know the match will begin.
3139 /// let expected = HalfMatch::must(0, 10);
3140 /// let input = Input::new(haystack).range(4..);
3141 /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3142 /// // Since we compiled anchored start states for each pattern, we can
3143 /// // also look for matches of other patterns explicitly, even if a
3144 /// // different pattern would have normally matched.
3145 /// let expected = HalfMatch::must(1, 10);
3146 /// let input = Input::new(haystack)
3147 /// .range(4..)
3148 /// .anchored(Anchored::Pattern(PatternID::must(1)));
3149 /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3150 ///
3151 /// # Ok::<(), Box<dyn std::error::Error>>(())
3152 /// ```
3153 pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
3154 self.starts_for_each_pattern = Some(yes);
3155 self
3156 }
3157
3158 /// Whether to attempt to shrink the size of the lazy DFA's alphabet or
3159 /// not.
3160 ///
3161 /// This option is enabled by default and should never be disabled unless
3162 /// one is debugging the lazy DFA.
3163 ///
3164 /// When enabled, the lazy DFA will use a map from all possible bytes
3165 /// to their corresponding equivalence class. Each equivalence class
3166 /// represents a set of bytes that does not discriminate between a match
3167 /// and a non-match in the DFA. For example, the pattern `[ab]+` has at
3168 /// least two equivalence classes: a set containing `a` and `b` and a set
3169 /// containing every byte except for `a` and `b`. `a` and `b` are in the
3170 /// same equivalence class because they never discriminate between a
3171 /// match and a non-match.
3172 ///
3173 /// The advantage of this map is that the size of the transition table
3174 /// can be reduced drastically from `#states * 256 * sizeof(LazyStateID)`
3175 /// to `#states * k * sizeof(LazyStateID)` where `k` is the number of
3176 /// equivalence classes (rounded up to the nearest power of 2). As a
3177 /// result, total space usage can decrease substantially. Moreover, since a
3178 /// smaller alphabet is used, DFA compilation during search becomes faster
3179 /// as well since it will potentially be able to reuse a single transition
3180 /// for multiple bytes.
3181 ///
3182 /// **WARNING:** Disabling this option is only useful for debugging lazy
3183 /// DFAs; it does not yield any speed advantages. Namely, even when this is
3184 /// disabled, a byte class map is still used while searching. The only
3185 /// difference is that every byte will be forced into its own distinct
3186 /// equivalence class. This is useful for debugging the actual generated
3187 /// transitions because it lets one see the transitions defined on actual
3188 /// bytes instead of the equivalence classes.
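///
/// # Example
///
/// As a minimal sketch, disabling byte classes does not change search
/// results; it only changes the lazy DFA's internal alphabet representation
/// (and thus its memory usage).
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::builder()
///     .configure(DFA::config().byte_classes(false))
///     .build(r"[ab]+")?;
/// let mut cache = dfa.create_cache();
///
/// let got = dfa.try_search_fwd(&mut cache, &Input::new("xxaabbxx"))?;
/// assert_eq!(Some(HalfMatch::must(0, 6)), got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```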
3189 pub fn byte_classes(mut self, yes: bool) -> Config {
3190 self.byte_classes = Some(yes);
3191 self
3192 }
3193
3194 /// Heuristically enable Unicode word boundaries.
3195 ///
3196 /// When set, this will attempt to implement Unicode word boundaries as if
3197 /// they were ASCII word boundaries. This only works when the search input
3198 /// is ASCII only. If a non-ASCII byte is observed while searching, then a
3199 /// [`MatchError::quit`] error is returned.
3200 ///
3201 /// A possible alternative to enabling this option is to simply use an
3202 /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
3203 /// option is if you absolutely need Unicode support. This option lets one
3204 /// use a fast search implementation (a DFA) for some potentially very
3205 /// common cases, while providing the option to fall back to some other
3206 /// regex engine to handle the general case when an error is returned.
3207 ///
3208 /// If the pattern provided has no Unicode word boundary in it, then this
3209 /// option has no effect. (That is, quitting on a non-ASCII byte only
3210 /// occurs when this option is enabled _and_ a Unicode word boundary is
3211 /// present in the pattern.)
3212 ///
3213 /// This is almost equivalent to setting all non-ASCII bytes to be quit
3214 /// bytes. The only difference is that this will cause non-ASCII bytes to
3215 /// be quit bytes _only_ when a Unicode word boundary is present in the
3216 /// pattern.
3217 ///
3218 /// When enabling this option, callers _must_ be prepared to
3219 /// handle a [`MatchError`] error during search. When using a
3220 /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
3221 /// `try_` suite of methods. Alternatively, if callers can guarantee that
3222 /// their input is ASCII only, then a [`MatchError::quit`] error will never
3223 /// be returned while searching.
3224 ///
3225 /// This is disabled by default.
3226 ///
3227 /// # Example
3228 ///
3229 /// This example shows how to heuristically enable Unicode word boundaries
3230 /// in a pattern. It also shows what happens when a search comes across a
3231 /// non-ASCII byte.
3232 ///
3233 /// ```
3234 /// use regex_automata::{
3235 /// hybrid::dfa::DFA,
3236 /// HalfMatch, Input, MatchError,
3237 /// };
3238 ///
3239 /// let dfa = DFA::builder()
3240 /// .configure(DFA::config().unicode_word_boundary(true))
3241 /// .build(r"\b[0-9]+\b")?;
3242 /// let mut cache = dfa.create_cache();
3243 ///
3244 /// // The match occurs before the search ever observes the snowman
3245 /// // character, so no error occurs.
3246 /// let haystack = "foo 123 ☃";
3247 /// let expected = Some(HalfMatch::must(0, 7));
3248 /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
3249 /// assert_eq!(expected, got);
3250 ///
3251 /// // Notice that this search fails, even though the snowman character
3252 /// // occurs after the ending match offset. This is because search
3253 /// // routines read one byte past the end of the search to account for
3254 /// // look-around, and indeed, this is required here to determine whether
3255 /// // the trailing \b matches.
3256 /// let haystack = "foo 123 ☃";
3257 /// let expected = MatchError::quit(0xE2, 8);
3258 /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack));
3259 /// assert_eq!(Err(expected), got);
3260 ///
3261 /// // Another example is executing a search where the span of the haystack
3262 /// // we specify is all ASCII, but there is non-ASCII just before it. This
3263 /// // correctly also reports an error.
3264 /// let input = Input::new("β123").range(2..);
3265 /// let expected = MatchError::quit(0xB2, 1);
3266 /// let got = dfa.try_search_fwd(&mut cache, &input);
3267 /// assert_eq!(Err(expected), got);
3268 ///
3269 /// // And similarly for the trailing word boundary.
3270 /// let input = Input::new("123β").range(..3);
3271 /// let expected = MatchError::quit(0xCE, 3);
3272 /// let got = dfa.try_search_fwd(&mut cache, &input);
3273 /// assert_eq!(Err(expected), got);
3274 ///
3275 /// # Ok::<(), Box<dyn std::error::Error>>(())
3276 /// ```
3277 pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
3278 // We have a separate option for this instead of just setting the
3279 // appropriate quit bytes here because we don't want to set quit bytes
3280 // for every regex. We only want to set them when the regex contains a
3281 // Unicode word boundary.
3282 self.unicode_word_boundary = Some(yes);
3283 self
3284 }
3285
3286 /// Add a "quit" byte to the lazy DFA.
3287 ///
3288 /// When a quit byte is seen during search time, then search will return a
3289 /// [`MatchError::quit`] error indicating the offset at which the search
3290 /// stopped.
3291 ///
3292 /// A quit byte will always overrule any other aspects of a regex. For
3293 /// example, if the `x` byte is added as a quit byte and the regex `\w` is
3294 /// used, then observing `x` will cause the search to quit immediately
3295 /// despite the fact that `x` is in the `\w` class.
3296 ///
3297 /// This mechanism is primarily useful for heuristically enabling certain
3298 /// features like Unicode word boundaries in a DFA. Namely, if the input
3299 /// to search is ASCII, then a Unicode word boundary can be implemented
3300 /// via an ASCII word boundary with no change in semantics. Thus, a DFA
3301 /// can attempt to match a Unicode word boundary but give up as soon as it
3302 /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
3303 /// to be quit bytes, then Unicode word boundaries will be permitted when
3304 /// building lazy DFAs. Of course, callers should enable
3305 /// [`Config::unicode_word_boundary`] if they want this behavior instead.
3306 /// (The advantage being that non-ASCII quit bytes will only be added if a
3307 /// Unicode word boundary is in the pattern.)
3308 ///
3309 /// When enabling this option, callers _must_ be prepared to
3310 /// handle a [`MatchError`] error during search. When using a
3311 /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
3312 /// `try_` suite of methods.
3313 ///
3314 /// By default, there are no quit bytes set.
3315 ///
3316 /// # Panics
3317 ///
3318 /// This panics if heuristic Unicode word boundaries are enabled and any
3319 /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
3320 /// Unicode word boundaries requires setting every non-ASCII byte to a quit
3321 /// byte. So if the caller attempts to undo any of that, then this will
3322 /// panic.
3323 ///
3324 /// # Example
3325 ///
3326 /// This example shows how to cause a search to terminate if it sees a
3327 /// `\n` byte. This could be useful if, for example, you wanted to prevent
3328 /// a user supplied pattern from matching across a line boundary.
3329 ///
3330 /// ```
3331 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3332 /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3333 ///
3334 /// let dfa = DFA::builder()
3335 /// .configure(DFA::config().quit(b'\n', true))
3336 /// .build(r"foo\p{any}+bar")?;
3337 /// let mut cache = dfa.create_cache();
3338 ///
3339 /// let haystack = "foo\nbar";
3340 /// // Normally this would produce a match, since \p{any} contains '\n'.
3341 /// // But since we instructed the automaton to enter a quit state if a
3342 /// // '\n' is observed, this produces a match error instead.
3343 /// let expected = MatchError::quit(b'\n', 3);
3344 /// let got = dfa.try_search_fwd(
3345 /// &mut cache,
3346 /// &Input::new(haystack),
3347 /// ).unwrap_err();
3348 /// assert_eq!(expected, got);
3349 ///
3350 /// # Ok::<(), Box<dyn std::error::Error>>(())
3351 /// ```
3352 pub fn quit(mut self, byte: u8, yes: bool) -> Config {
3353 if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
3354 panic!(
3355 "cannot set non-ASCII byte to be non-quit when \
3356 Unicode word boundaries are enabled"
3357 );
3358 }
3359 if self.quitset.is_none() {
3360 self.quitset = Some(ByteSet::empty());
3361 }
3362 if yes {
3363 self.quitset.as_mut().unwrap().add(byte);
3364 } else {
3365 self.quitset.as_mut().unwrap().remove(byte);
3366 }
3367 self
3368 }
3369
3370 /// Enable specializing start states in the lazy DFA.
3371 ///
3372 /// When start states are specialized, an implementor of a search routine
3373 /// using a lazy DFA can tell when the search has entered a starting state.
3374 /// When start states aren't specialized, then it is impossible to know
3375 /// whether the search has entered a start state.
3376 ///
3377 /// Ideally, this option wouldn't need to exist and we could always
3378 /// specialize start states. The problem is that start states can be quite
3379 /// active. This in turn means that an efficient search routine is likely
3380 /// to ping-pong between a heavily optimized hot loop that handles most
3381 /// states and to a less optimized specialized handling of start states.
3382 /// This causes branches to get heavily mispredicted and overall can
3383 /// materially decrease throughput. Therefore, specializing start states
3384 /// should only be enabled when it is needed.
3385 ///
3386 /// Knowing whether a search is in a start state is typically useful when a
3387 /// prefilter is active for the search. A prefilter is typically only run
3388 /// when in a start state and a prefilter can greatly accelerate a search.
3389 /// Therefore, the possible cost of specializing start states is worth it
3390 /// in this case. Otherwise, if you have no prefilter, there is likely no
3391 /// reason to specialize start states.
3392 ///
3393 /// This is disabled by default, but note that it is automatically
3394 /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless
3395 /// `specialize_start_states` has already been set, [`Config::prefilter`]
3396 /// will automatically enable or disable it based on whether a prefilter
3397 /// is present or not, respectively. This is done because a prefilter's
3398 /// effectiveness is rooted in being executed whenever the DFA is in a
3399 /// start state, and that's only possible to do when they are specialized.
3400 ///
3401 /// Note that it is plausibly reasonable to _disable_ this option
3402 /// explicitly while _enabling_ a prefilter. In that case, a prefilter
3403 /// will still be run at the beginning of a search, but never again. This
3404 /// in theory could strike a good balance if you're in a situation where a
3405 /// prefilter is likely to produce many false positive candidates.
3406 ///
3407 /// # Example
3408 ///
3409 /// This example shows how to enable start state specialization and then
3410 /// shows how to check whether a state is a start state or not.
3411 ///
3412 /// ```
3413 /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3414 ///
3415 /// let dfa = DFA::builder()
3416 /// .configure(DFA::config().specialize_start_states(true))
3417 /// .build(r"[a-z]+")?;
3418 /// let mut cache = dfa.create_cache();
3419 ///
3420 /// let haystack = "123 foobar 4567".as_bytes();
3421 /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?;
3422 /// // The ID returned by 'start_state_forward' will always be tagged as
3423 /// // a start state when start state specialization is enabled.
3424 /// assert!(sid.is_tagged());
3425 /// assert!(sid.is_start());
3426 ///
3427 /// # Ok::<(), Box<dyn std::error::Error>>(())
3428 /// ```
3429 ///
3430 /// Compare the above with the default lazy DFA configuration where
3431 /// start states are _not_ specialized. In this case, the start state
3432 /// is not tagged and `sid.is_start()` returns false.
3433 ///
3434 /// ```
3435 /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3436 ///
3437 /// let dfa = DFA::new(r"[a-z]+")?;
3438 /// let mut cache = dfa.create_cache();
3439 ///
3440 /// let haystack = "123 foobar 4567".as_bytes();
3441 /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?;
3442 /// // Start states are not tagged in the default configuration!
3443 /// assert!(!sid.is_tagged());
3444 /// assert!(!sid.is_start());
3445 ///
3446 /// # Ok::<(), Box<dyn std::error::Error>>(())
3447 /// ```
3448 pub fn specialize_start_states(mut self, yes: bool) -> Config {
3449 self.specialize_start_states = Some(yes);
3450 self
3451 }
3452
3453 /// Sets the maximum amount of heap memory, in bytes, to allocate to the
3454 /// cache for use during a lazy DFA search. If the lazy DFA would otherwise
3455 /// use more heap memory, then, depending on other configuration knobs,
3456 /// either stop the search and return an error or clear the cache and
3457 /// continue the search.
3458 ///
3459 /// The default cache capacity is some "reasonable" number that will
3460 /// accommodate most regular expressions. If you need to build a large DFA,
3461 /// then it may be necessary to increase the cache
3462 /// capacity.
3463 ///
3464 /// Note that while building a lazy DFA will do a "minimum" check to ensure
3465 /// the capacity is big enough, this is more or less about correctness.
3466 /// If the cache is bigger than the minimum but still "too small," then the
3467 /// lazy DFA could wind up spending a lot of time clearing the cache and
3468 /// recomputing transitions, thus negating the performance benefits of a
3469 /// lazy DFA. Thus, setting the cache capacity is mostly an experimental
3470 /// endeavor. For most common patterns, however, the default should be
3471 /// sufficient.
3472 ///
3473 /// For more details on how the lazy DFA's cache is used, see the
3474 /// documentation for [`Cache`].
3475 ///
3476 /// # Example
3477 ///
3478 /// This example shows what happens if the configured cache capacity is
3479 /// too small. In such cases, one can override the cache capacity to make
3480 /// it bigger. Alternatively, one might want to use less memory by setting
3481 /// a smaller cache capacity.
3482 ///
3483 /// ```
3484 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3485 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
3486 ///
3487 /// let pattern = r"\p{L}{1000}";
3488 ///
3489 /// // The default cache capacity is likely too small to deal with regexes
3490 /// // that are very large. Large repetitions of large Unicode character
3491 /// // classes are a common way to make very large regexes.
3492 /// let _ = DFA::new(pattern).unwrap_err();
3493 /// // Bump up the capacity to something bigger.
3494 /// let dfa = DFA::builder()
3495 /// .configure(DFA::config().cache_capacity(100 * (1<<20))) // 100 MB
3496 /// .build(pattern)?;
3497 /// let mut cache = dfa.create_cache();
3498 ///
3499 /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
3500 /// let expected = Some(HalfMatch::must(0, 2000));
3501 /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
3502 /// assert_eq!(expected, got);
3503 ///
3504 /// # Ok::<(), Box<dyn std::error::Error>>(())
3505 /// ```
3506 pub fn cache_capacity(mut self, bytes: usize) -> Config {
3507 self.cache_capacity = Some(bytes);
3508 self
3509 }
3510
3511 /// Configures construction of a lazy DFA to use the minimum cache capacity
3512 /// if the configured capacity is otherwise too small for the provided NFA.
3513 ///
3514 /// This is useful if you never want lazy DFA construction to fail because
3515 /// of a capacity that is too small.
3516 ///
3517 /// In general, this option is typically not a good idea. In particular,
3518 /// while a minimum cache capacity does permit the lazy DFA to function
3519 /// where it otherwise couldn't, it's plausible that it may not function
3520 /// well if it's constantly running out of room. In that case, the speed
3521 /// advantages of the lazy DFA may be negated. On the other hand, the
3522 /// "minimum" cache capacity computed may not be completely accurate and
3523 /// could actually be bigger than what is really necessary. Therefore, it
3524 /// is plausible that using the minimum cache capacity could still result
3525 /// in very good performance.
3526 ///
3527 /// This is disabled by default.
3528 ///
3529 /// # Example
3530 ///
3531 /// This example shows what happens if the configured cache capacity is
3532 /// too small. In such cases, one could override the capacity explicitly.
3533 /// An alternative, demonstrated here, lets us force construction to use
3534 /// the minimum cache capacity if the configured capacity is otherwise
3535 /// too small.
3536 ///
3537 /// ```
3538 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3539 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
3540 ///
3541 /// let pattern = r"\p{L}{1000}";
3542 ///
3543 /// // The default cache capacity is likely too small to deal with regexes
3544 /// // that are very large. Large repetitions of large Unicode character
3545 /// // classes are a common way to make very large regexes.
3546 /// let _ = DFA::new(pattern).unwrap_err();
3547 /// // Configure construction such that it automatically selects the minimum
3548 /// // cache capacity if it would otherwise be too small.
3549 /// let dfa = DFA::builder()
3550 /// .configure(DFA::config().skip_cache_capacity_check(true))
3551 /// .build(pattern)?;
3552 /// let mut cache = dfa.create_cache();
3553 ///
3554 /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
3555 /// let expected = Some(HalfMatch::must(0, 2000));
3556 /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
3557 /// assert_eq!(expected, got);
3558 ///
3559 /// # Ok::<(), Box<dyn std::error::Error>>(())
3560 /// ```
3561 pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config {
3562 self.skip_cache_capacity_check = Some(yes);
3563 self
3564 }
3565
3566 /// Configure a lazy DFA search to quit after a certain number of cache
3567 /// clearings.
3568 ///
3569 /// When a minimum is set, then a lazy DFA search will *possibly* "give
3570 /// up" after the minimum number of cache clearings has occurred. This is
3571 /// typically useful in scenarios where callers want to detect whether the
3572 /// lazy DFA search is "efficient" or not. If the cache is cleared too many
3573 /// times, this is a good indicator that it is not efficient, and thus, the
3574 /// caller may wish to use some other regex engine.
3575 ///
3576 /// Note that the number of times a cache is cleared is a property of
3577 /// the cache itself. Thus, if a cache is used in a subsequent search
3578 /// with a similarly configured lazy DFA, then it could cause the
3579 /// search to "give up" if the cache needed to be cleared, depending
3580 /// on its internal count and configured minimum. The cache clear
3581 /// count can only be reset to `0` via [`DFA::reset_cache`] (or
3582 /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if
3583 /// you're using the `Regex` API).
3584 ///
3585 /// By default, no minimum is configured. Thus, a lazy DFA search will
3586 /// never give up due to cache clearings. If you do set this option, you
3587 /// might consider also setting [`Config::minimum_bytes_per_state`] in
3588 /// order for the lazy DFA to take efficiency into account before giving
3589 /// up.
3590 ///
3591 /// # Example
3592 ///
3593 /// This example uses a somewhat pathological configuration to demonstrate
3594 /// the _possible_ behavior of cache clearing and how it might result
3595 /// in a search that returns an error.
3596 ///
3597 /// It is important to note that the precise mechanics of how and when
3598 /// a cache gets cleared are an implementation detail.
3599 ///
3600 /// ```
3601 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3602 /// use regex_automata::{hybrid::dfa::DFA, Input, MatchError, MatchErrorKind};
3603 ///
3604 /// // This is a carefully chosen regex. The idea is to pick one
3605 /// // that requires some decent number of states (hence the bounded
3606 /// // repetition). But we specifically choose to create a class with an
3607 /// // ASCII letter and a non-ASCII letter so that we can check that no new
3608 /// // states are created once the cache is full. Namely, if we fill up the
3609 /// // cache on a haystack of 'a's, then in order to match one 'β', a new
3610 /// // state will need to be created since a 'β' is encoded with multiple
3611 /// // bytes. Since there's no room for this state, the search should quit
3612 /// // at the very first position.
3613 /// let pattern = r"[aβ]{100}";
3614 /// let dfa = DFA::builder()
3615 /// .configure(
3616 /// // Configure it so that we have the minimum cache capacity
3617 /// // possible. And that if any clearings occur, the search quits.
3618 /// DFA::config()
3619 /// .skip_cache_capacity_check(true)
3620 /// .cache_capacity(0)
3621 /// .minimum_cache_clear_count(Some(0)),
3622 /// )
3623 /// .build(pattern)?;
3624 /// let mut cache = dfa.create_cache();
3625 ///
3626 /// // Our search will give up before reaching the end!
3627 /// let haystack = "a".repeat(101).into_bytes();
3628 /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3629 /// assert!(matches!(
3630 /// *result.unwrap_err().kind(),
3631 /// MatchErrorKind::GaveUp { .. },
3632 /// ));
3633 ///
3634 /// // Now that we know the cache is full, if we search a haystack that we
3635 /// // know will require creating at least one new state, it should not
3636 /// // be able to make much progress.
3637 /// let haystack = "β".repeat(101).into_bytes();
3638 /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3639 /// assert!(matches!(
3640 /// *result.unwrap_err().kind(),
3641 /// MatchErrorKind::GaveUp { .. },
3642 /// ));
3643 ///
3644 /// // If we reset the cache, then we should be able to create more states
3645 /// // and make more progress with searching for betas.
3646 /// cache.reset(&dfa);
3647 /// let haystack = "β".repeat(101).into_bytes();
3648 /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3649 /// assert!(matches!(
3650 /// *result.unwrap_err().kind(),
3651 /// MatchErrorKind::GaveUp { .. },
3652 /// ));
3653 ///
3654 /// // ... switching back to ASCII still makes progress since it just needs
3655 /// // to set transitions on existing states!
3656 /// let haystack = "a".repeat(101).into_bytes();
3657 /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3658 /// assert!(matches!(
3659 /// *result.unwrap_err().kind(),
3660 /// MatchErrorKind::GaveUp { .. },
3661 /// ));
3662 ///
3663 /// # Ok::<(), Box<dyn std::error::Error>>(())
3664 /// ```
3665 pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config {
3666 self.minimum_cache_clear_count = Some(min);
3667 self
3668 }
3669
3670 /// Configure a lazy DFA search to quit only when its efficiency drops
3671 /// below the given minimum.
3672 ///
3673 /// The efficiency of the cache is measured as the number of bytes of
3674 /// haystack searched per DFA state compiled. For example, if the efficiency
3675 /// is 2, then it means the lazy DFA is creating a new DFA state after
3676 /// searching approximately 2 bytes in a haystack. Generally speaking, 2
3677 /// is quite bad and it's likely that even a slower regex engine like the
3678 /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) would be faster.
3679 ///
3680 /// This has no effect if [`Config::minimum_cache_clear_count`] is not set.
3681 /// Namely, this option only kicks in when the cache has been cleared more
3682 /// than the minimum number. If no minimum is set, then the cache is simply
3683 /// cleared whenever it fills up and it is impossible for the lazy DFA to
3684 /// quit due to ineffective use of the cache.
3685 ///
3686 /// In general, if one is setting [`Config::minimum_cache_clear_count`],
3687 /// then one should probably also set this knob as well. The reason is
3688 /// that the absolute number of times the cache is cleared is generally
3689 /// not a great predictor of efficiency. For example, if a new DFA state
3690 /// is created for every 1,000 bytes searched, then it wouldn't be hard
3691 /// for the cache to get cleared more than `N` times and then cause the
3692 /// lazy DFA to quit. But a new DFA state every 1,000 bytes is likely quite
3693 /// good from a performance perspective, and it's likely that the lazy
3694 /// DFA should continue searching, even if it requires clearing the cache
3695 /// occasionally.
3696 ///
3697 /// Finally, note that if you're implementing your own lazy DFA search
3698 /// routine and also want this efficiency check to work correctly, then
3699 /// you'll need to use the following routines to record search progress:
3700 ///
3701 /// * Call [`Cache::search_start`] at the beginning of every search.
3702 /// * Call [`Cache::search_update`] whenever [`DFA::next_state`] is
3703 /// called.
3704 /// * Call [`Cache::search_finish`] before completing a search. (It is
3705 /// not strictly necessary to call this when an error is returned, as
3706 /// `Cache::search_start` will automatically finish the previous search
3707 /// for you. But calling it where possible before returning helps improve
3708 /// the accuracy of how many bytes have actually been searched.)
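///
/// # Example
///
/// This is a minimal sketch of pairing this knob with
/// [`Config::minimum_cache_clear_count`]. With a well-behaved pattern and
/// haystack, the cache never becomes inefficient and the search completes
/// normally.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::builder()
///     .configure(
///         DFA::config()
///             .minimum_cache_clear_count(Some(1))
///             .minimum_bytes_per_state(Some(10)),
///     )
///     .build(r"[a-z]+")?;
/// let mut cache = dfa.create_cache();
///
/// let got = dfa.try_search_fwd(&mut cache, &Input::new("123 foobar"))?;
/// assert_eq!(Some(HalfMatch::must(0, 10)), got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```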
3709 pub fn minimum_bytes_per_state(mut self, min: Option<usize>) -> Config {
3710 self.minimum_bytes_per_state = Some(min);
3711 self
3712 }
3713
3714 /// Returns the match semantics set in this configuration.
3715 pub fn get_match_kind(&self) -> MatchKind {
3716 self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
3717 }
3718
3719 /// Returns the prefilter set in this configuration, if one at all.
3720 pub fn get_prefilter(&self) -> Option<&Prefilter> {
3721 self.pre.as_ref().unwrap_or(&None).as_ref()
3722 }
3723
3724 /// Returns whether this configuration has enabled anchored starting states
3725 /// for every pattern in the DFA.
3726 pub fn get_starts_for_each_pattern(&self) -> bool {
3727 self.starts_for_each_pattern.unwrap_or(false)
3728 }
3729
3730 /// Returns whether this configuration has enabled byte classes or not.
3731 /// This is typically a debugging oriented option, as disabling it confers
3732 /// no speed benefit.
3733 pub fn get_byte_classes(&self) -> bool {
3734 self.byte_classes.unwrap_or(true)
3735 }
3736
3737 /// Returns whether this configuration has enabled heuristic Unicode word
3738 /// boundary support. When enabled, it is possible for a search to return
3739 /// an error.
3740 pub fn get_unicode_word_boundary(&self) -> bool {
3741 self.unicode_word_boundary.unwrap_or(false)
3742 }
3743
3744 /// Returns whether this configuration will instruct the lazy DFA to enter
3745 /// a quit state whenever the given byte is seen during a search. When at
3746 /// least one byte has this enabled, it is possible for a search to return
3747 /// an error.
3748 pub fn get_quit(&self, byte: u8) -> bool {
3749 self.quitset.map_or(false, |q| q.contains(byte))
3750 }
3751
3752 /// Returns whether this configuration will instruct the lazy DFA to
3753 /// "specialize" start states. When enabled, the lazy DFA will tag start
3754 /// states so that search routines using the lazy DFA can detect when
3755 /// it's in a start state and do some kind of optimization (like run a
3756 /// prefilter).
3757 pub fn get_specialize_start_states(&self) -> bool {
3758 self.specialize_start_states.unwrap_or(false)
3759 }
3760
3761 /// Returns the cache capacity set on this configuration.
3762 pub fn get_cache_capacity(&self) -> usize {
3763 self.cache_capacity.unwrap_or(2 * (1 << 20))
3764 }
3765
3766 /// Returns whether the cache capacity check should be skipped.
3767 pub fn get_skip_cache_capacity_check(&self) -> bool {
3768 self.skip_cache_capacity_check.unwrap_or(false)
3769 }
3770
3771 /// Returns, if set, the minimum number of times the cache must be cleared
3772 /// before a lazy DFA search can give up. When no minimum is set, then a
3773 /// search will never quit and will always clear the cache whenever it
3774 /// fills up.
3775 pub fn get_minimum_cache_clear_count(&self) -> Option<usize> {
3776 self.minimum_cache_clear_count.unwrap_or(None)
3777 }
3778
3779 /// Returns, if set, the minimum number of bytes per state that need to be
3780 /// processed in order for the lazy DFA to keep going. If the minimum falls
3781 /// below this number (and the cache has been cleared a minimum number of
3782 /// times), then the lazy DFA will return a "gave up" error.
3783 pub fn get_minimum_bytes_per_state(&self) -> Option<usize> {
3784 self.minimum_bytes_per_state.unwrap_or(None)
3785 }
3786
3787 /// Returns the minimum lazy DFA cache capacity required for the given NFA.
3788 ///
3789 /// The cache capacity required for a particular NFA may change without
3790 /// notice. Callers should not rely on it being stable.
3791 ///
3792 /// This is useful for informational purposes, but it can also be used for
3793 /// other reasons. For example, one might want to check the minimum cache
3794 /// capacity directly, or set the cache capacity based on the
3795 /// minimum.
3796 ///
3797 /// This may return an error if this configuration does not support all of
3798 /// the instructions used in the given NFA. For example, if the NFA has a
3799 /// Unicode word boundary but this configuration does not enable heuristic
3800 /// support for Unicode word boundaries.
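///
/// # Example
///
/// A minimal sketch of setting the cache capacity based on the minimum
/// required for a particular NFA.
///
/// ```
/// use regex_automata::{
///     hybrid::dfa::DFA, nfa::thompson::NFA, HalfMatch, Input,
/// };
///
/// let nfa = NFA::new(r"[0-9]+")?;
/// // Give the lazy DFA several times the bare minimum it needs.
/// let min = DFA::config().get_minimum_cache_capacity(&nfa)?;
/// let dfa = DFA::builder()
///     .configure(DFA::config().cache_capacity(4 * min))
///     .build_from_nfa(nfa)?;
/// let mut cache = dfa.create_cache();
///
/// let got = dfa.try_search_fwd(&mut cache, &Input::new("abc123"))?;
/// assert_eq!(Some(HalfMatch::must(0, 6)), got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```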
3801 pub fn get_minimum_cache_capacity(
3802 &self,
3803 nfa: &thompson::NFA,
3804 ) -> Result<usize, BuildError> {
3805 let quitset = self.quit_set_from_nfa(nfa)?;
3806 let classes = self.byte_classes_from_nfa(nfa, &quitset);
3807 let starts = self.get_starts_for_each_pattern();
3808 Ok(minimum_cache_capacity(nfa, &classes, starts))
3809 }
3810
3811 /// Returns the byte class map used during search from the given NFA.
3812 ///
3813 /// If byte classes are disabled on this configuration, then a map is
3814 /// returned that puts each byte in its own equivalence class.
3815 fn byte_classes_from_nfa(
3816 &self,
3817 nfa: &thompson::NFA,
3818 quit: &ByteSet,
3819 ) -> ByteClasses {
3820 if !self.get_byte_classes() {
3821 // The lazy DFA will always use the equivalence class map, but
3822 // enabling this option is useful for debugging. Namely, this will
3823 // cause all transitions to be defined over their actual bytes
3824 // instead of an opaque equivalence class identifier. The former is
3825 // much easier to grok as a human.
3826 ByteClasses::singletons()
3827 } else {
3828 let mut set = nfa.byte_class_set().clone();
3829 // It is important to distinguish any "quit" bytes from all other
3830 // bytes. Otherwise, a non-quit byte may end up in the same class
3831 /// as a quit byte, and thus cause the DFA to stop when it shouldn't.
3832 //
3833 // Test case:
3834 //
3835 // regex-cli find match hybrid --unicode-word-boundary \
3836 // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log
3837 if !quit.is_empty() {
3838 set.add_set(&quit);
3839 }
3840 set.byte_classes()
3841 }
3842 }
3843
3844 /// Return the quit set for this configuration and the given NFA.
3845 ///
3846 /// This may return an error if the NFA is incompatible with this
3847 /// configuration's quit set. For example, if the NFA has a Unicode word
3848 /// boundary and the quit set doesn't include non-ASCII bytes.
3849 fn quit_set_from_nfa(
3850 &self,
3851 nfa: &thompson::NFA,
3852 ) -> Result<ByteSet, BuildError> {
3853 let mut quit = self.quitset.unwrap_or(ByteSet::empty());
3854 if nfa.look_set_any().contains_word_unicode() {
3855 if self.get_unicode_word_boundary() {
3856 for b in 0x80..=0xFF {
3857 quit.add(b);
3858 }
3859 } else {
3860 // If heuristic support for Unicode word boundaries wasn't
3861 // enabled, then we can still check if our quit set is correct.
3862 // If the caller set their quit bytes in a way that causes the
3863 // DFA to quit on at least all non-ASCII bytes, then that's all
3864 // we need for heuristic support to work.
3865 if !quit.contains_range(0x80, 0xFF) {
3866 return Err(
3867 BuildError::unsupported_dfa_word_boundary_unicode(),
3868 );
3869 }
3870 }
3871 }
3872 Ok(quit)
3873 }
3874
3875 /// Overwrite the default configuration such that the options in `o` are
3876 /// always used. If an option in `o` is not set, then the corresponding
3877 /// option in `self` is used. If it's not set in `self` either, then it
3878 /// remains not set.
3879 fn overwrite(&self, o: Config) -> Config {
3880 Config {
3881 match_kind: o.match_kind.or(self.match_kind),
3882 pre: o.pre.or_else(|| self.pre.clone()),
3883 starts_for_each_pattern: o
3884 .starts_for_each_pattern
3885 .or(self.starts_for_each_pattern),
3886 byte_classes: o.byte_classes.or(self.byte_classes),
3887 unicode_word_boundary: o
3888 .unicode_word_boundary
3889 .or(self.unicode_word_boundary),
3890 quitset: o.quitset.or(self.quitset),
3891 specialize_start_states: o
3892 .specialize_start_states
3893 .or(self.specialize_start_states),
3894 cache_capacity: o.cache_capacity.or(self.cache_capacity),
3895 skip_cache_capacity_check: o
3896 .skip_cache_capacity_check
3897 .or(self.skip_cache_capacity_check),
3898 minimum_cache_clear_count: o
3899 .minimum_cache_clear_count
3900 .or(self.minimum_cache_clear_count),
3901 minimum_bytes_per_state: o
3902 .minimum_bytes_per_state
3903 .or(self.minimum_bytes_per_state),
3904 }
3905 }
3906}
3907
3908/// A builder for constructing a lazy deterministic finite automaton from
3909/// regular expressions.
3910///
3911/// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. The
3912/// advantage of the former is that it often lets you avoid importing the
3913/// `Builder` type directly.
3914///
3915/// This builder provides two main things:
3916///
3917/// 1. It provides a few different `build` routines for actually constructing
3918/// a DFA from different kinds of inputs. The most convenient is
3919/// [`Builder::build`], which builds a DFA directly from a pattern string. The
3920/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
3921/// from an NFA.
3922/// 2. The builder permits configuring a number of things.
3923/// [`Builder::configure`] is used with [`Config`] to configure aspects of
3924/// the DFA and the construction process itself. [`Builder::syntax`] and
3925/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
3926/// construction, respectively. The syntax and thompson configurations only
3927/// apply when building from a pattern string.
3928///
3929/// This builder always constructs a *single* lazy DFA. As such, this builder
3930/// can only be used to construct regexes that either detect the presence
3931/// of a match or find the end location of a match. A single DFA cannot
3932/// produce both the start and end of a match. For that information, use a
3933/// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured
3934/// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason
3935/// to use a DFA directly is if the end location of a match is enough for your
3936/// use case. Namely, a `Regex` will construct two lazy DFAs instead of one,
3937/// since a second reverse DFA is needed to find the start of a match.
3938///
3939/// # Example
3940///
3941/// This example shows how to build a lazy DFA that uses a tiny cache capacity
3942/// and completely disables Unicode. That is:
3943///
3944/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
3945/// and `\b` are ASCII-only while `.` matches any byte except for `\n`
3946/// (instead of any UTF-8 encoding of a Unicode scalar value except for
3947/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
3948/// * The pattern itself is permitted to match invalid UTF-8. For example,
3949/// things like `[^a]` that match any byte except for `a` are permitted.
3950///
3951/// ```
3952/// use regex_automata::{
3953/// hybrid::dfa::DFA,
3954/// nfa::thompson,
3955/// util::syntax,
3956/// HalfMatch, Input,
3957/// };
3958///
3959/// let dfa = DFA::builder()
3960/// .configure(DFA::config().cache_capacity(5_000))
3961/// .thompson(thompson::Config::new().utf8(false))
3962/// .syntax(syntax::Config::new().unicode(false).utf8(false))
3963/// .build(r"foo[^b]ar.*")?;
3964/// let mut cache = dfa.create_cache();
3965///
3966/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
3967/// let expected = Some(HalfMatch::must(0, 10));
3968/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
3969/// assert_eq!(expected, got);
3970///
3971/// # Ok::<(), Box<dyn std::error::Error>>(())
3972/// ```
3973#[derive(Clone, Debug)]
3974pub struct Builder {
3975 config: Config,
3976 #[cfg(feature = "syntax")]
3977 thompson: thompson::Compiler,
3978}
3979
3980impl Builder {
3981 /// Create a new lazy DFA builder with the default configuration.
3982 pub fn new() -> Builder {
3983 Builder {
3984 config: Config::default(),
3985 #[cfg(feature = "syntax")]
3986 thompson: thompson::Compiler::new(),
3987 }
3988 }
3989
3990 /// Build a lazy DFA from the given pattern.
3991 ///
3992 /// If there was a problem parsing or compiling the pattern, then an error
3993 /// is returned.
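///
/// # Example
///
/// A minimal sketch of building a lazy DFA from a single pattern string:
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::builder().build(r"[0-9]{4}")?;
/// let mut cache = dfa.create_cache();
///
/// let got = dfa.try_search_fwd(&mut cache, &Input::new("year: 2010"))?;
/// assert_eq!(Some(HalfMatch::must(0, 10)), got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```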
3994 #[cfg(feature = "syntax")]
3995 pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
3996 self.build_many(&[pattern])
3997 }
3998
3999 /// Build a lazy DFA from the given patterns.
4000 ///
4001 /// When matches are returned, the pattern ID corresponds to the index of
4002 /// the pattern in the slice given.
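///
/// # Example
///
/// A minimal sketch showing that the pattern ID in a match corresponds to
/// the index of the pattern in the slice:
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::builder().build_many(&[r"[0-9]+", r"[a-z]+"])?;
/// let mut cache = dfa.create_cache();
///
/// // Only the second pattern (index 1) matches here.
/// let got = dfa.try_search_fwd(&mut cache, &Input::new("foo"))?;
/// assert_eq!(Some(HalfMatch::must(1, 3)), got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```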
4003 #[cfg(feature = "syntax")]
4004 pub fn build_many<P: AsRef<str>>(
4005 &self,
4006 patterns: &[P],
4007 ) -> Result<DFA, BuildError> {
4008 let nfa = self
4009 .thompson
4010 .clone()
4011 // We can always forcefully disable captures because DFAs do not
4012 // support them.
4013 .configure(
4014 thompson::Config::new()
4015 .which_captures(thompson::WhichCaptures::None),
4016 )
4017 .build_many(patterns)
4018 .map_err(BuildError::nfa)?;
4019 self.build_from_nfa(nfa)
4020 }
4021
4022 /// Build a DFA from the given NFA.
4023 ///
4024 /// Note that this requires owning a `thompson::NFA`. While this may force
4025 /// you to clone the NFA, such a clone is not a deep clone. Namely, NFAs
4026 /// are defined internally to support shared ownership such that cloning is
4027 /// very cheap.
4028 ///
4029 /// # Example
4030 ///
4031 /// This example shows how to build a lazy DFA if you already have an NFA
4032 /// in hand.
4033 ///
4034 /// ```
4035 /// use regex_automata::{
4036 /// hybrid::dfa::DFA,
4037 /// nfa::thompson,
4038 /// HalfMatch, Input,
4039 /// };
4040 ///
4041 /// let haystack = "foo123bar";
4042 ///
4043 /// // This shows how to set non-default options for building an NFA.
4044 /// let nfa = thompson::Compiler::new()
4045 /// .configure(thompson::Config::new().shrink(true))
4046 /// .build(r"[0-9]+")?;
4047 /// let dfa = DFA::builder().build_from_nfa(nfa)?;
4048 /// let mut cache = dfa.create_cache();
4049 /// let expected = Some(HalfMatch::must(0, 6));
4050 /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
4051 /// assert_eq!(expected, got);
4052 ///
4053 /// # Ok::<(), Box<dyn std::error::Error>>(())
4054 /// ```
4055 pub fn build_from_nfa(
4056 &self,
4057 nfa: thompson::NFA,
4058 ) -> Result<DFA, BuildError> {
4059 let quitset = self.config.quit_set_from_nfa(&nfa)?;
4060 let classes = self.config.byte_classes_from_nfa(&nfa, &quitset);
4061 // Check that we can fit at least a few states into our cache,
4062 // otherwise it's pretty senseless to use the lazy DFA. This does have
4063 // a possible failure mode though. This assumes the maximum size of a
4064 // state in powerset space (so, the total number of NFA states), which
4065 // may never actually materialize, and could be quite a bit larger
4066 // than the actual biggest state. If this turns out to be a problem,
4067 // we could expose a knob that disables this check. But if so, we have
4068 // to be careful not to panic in other areas of the code (the cache
4069 // clearing and init code) that tend to assume some minimum useful
4070 // cache capacity.
4071 let min_cache = minimum_cache_capacity(
4072 &nfa,
4073 &classes,
4074 self.config.get_starts_for_each_pattern(),
4075 );
4076 let mut cache_capacity = self.config.get_cache_capacity();
4077 if cache_capacity < min_cache {
4078 // When the caller has asked us to skip the cache capacity check,
4079 // then we simply force the cache capacity to its minimum amount
4080 // and mush on.
4081 if self.config.get_skip_cache_capacity_check() {
4082 debug!(
4083 "given capacity ({}) is too small, \
4084 since skip_cache_capacity_check is enabled, \
4085 setting cache capacity to minimum ({})",
4086 cache_capacity, min_cache,
4087 );
4088 cache_capacity = min_cache;
4089 } else {
4090 return Err(BuildError::insufficient_cache_capacity(
4091 min_cache,
4092 cache_capacity,
4093 ));
4094 }
4095 }
4096 // We also need to check that we can fit at least some small number
4097 // of states in our state ID space. This is unlikely to trigger in
4098 // >=32-bit systems, but 16-bit systems have a pretty small state ID
4099 // space since a number of bits are used up as sentinels.
4100 if let Err(err) = minimum_lazy_state_id(&classes) {
4101 return Err(BuildError::insufficient_state_id_capacity(err));
4102 }
4103 let stride2 = classes.stride2();
4104 let start_map = StartByteMap::new(nfa.look_matcher());
4105 Ok(DFA {
4106 config: self.config.clone(),
4107 nfa,
4108 stride2,
4109 start_map,
4110 classes,
4111 quitset,
4112 cache_capacity,
4113 })
4114 }
4115
4116 /// Apply the given lazy DFA configuration options to this builder.
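///
/// # Example
///
/// A brief sketch of applying a non-default option through `configure` (the
/// 10 MiB cache capacity here is an arbitrary illustrative value):
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::builder()
///     .configure(DFA::config().cache_capacity(10 * (1 << 20)))
///     .build(r"[0-9]+")?;
/// let mut cache = dfa.create_cache();
///
/// let expected = Some(HalfMatch::must(0, 6));
/// let got = dfa.try_search_fwd(&mut cache, &Input::new("foo123bar"))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```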
4117 pub fn configure(&mut self, config: Config) -> &mut Builder {
4118 self.config = self.config.overwrite(config);
4119 self
4120 }
4121
4122 /// Set the syntax configuration for this builder using
4123 /// [`syntax::Config`](crate::util::syntax::Config).
4124 ///
4125 /// This permits setting things like case insensitivity, Unicode and
4126 /// multi-line mode.
4127 ///
4128 /// These settings only apply when constructing a lazy DFA directly from a
4129 /// pattern.
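///
/// # Example
///
/// A minimal sketch of case insensitive matching (assuming the default lazy
/// DFA configuration otherwise):
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input};
///
/// let dfa = DFA::builder()
///     .syntax(syntax::Config::new().case_insensitive(true))
///     .build(r"foo")?;
/// let mut cache = dfa.create_cache();
///
/// let expected = Some(HalfMatch::must(0, 3));
/// let got = dfa.try_search_fwd(&mut cache, &Input::new("FOO"))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```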
4130 #[cfg(feature = "syntax")]
4131 pub fn syntax(
4132 &mut self,
4133 config: crate::util::syntax::Config,
4134 ) -> &mut Builder {
4135 self.thompson.syntax(config);
4136 self
4137 }
4138
4139 /// Set the Thompson NFA configuration for this builder using
4140 /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
4141 ///
4142 /// This permits setting things like whether the DFA should match the regex
4143 /// in reverse or if additional time should be spent shrinking the size of
4144 /// the NFA.
4145 ///
4146 /// These settings only apply when constructing a lazy DFA directly from a
4147 /// pattern.
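///
/// # Example
///
/// A short sketch of building a lazy DFA that matches in reverse (a reverse
/// search reports the start offset of a match; the expected offset below
/// assumes the default configuration otherwise):
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch, Input};
///
/// let dfa = DFA::builder()
///     .thompson(thompson::Config::new().reverse(true))
///     .build(r"[0-9]+")?;
/// let mut cache = dfa.create_cache();
///
/// // The digits begin at offset 3, which is what a reverse search reports.
/// let expected = Some(HalfMatch::must(0, 3));
/// let got = dfa.try_search_rev(&mut cache, &Input::new("foo12345"))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```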
4148 #[cfg(feature = "syntax")]
4149 pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
4150 self.thompson.configure(config);
4151 self
4152 }
4153}
4154
4155/// Represents the current state of an overlapping search.
4156///
4157/// This is used for overlapping searches since they need to know something
4158/// about the previous search. For example, when multiple patterns match at the
4159/// same position, this state tracks the last reported pattern so that the next
4160/// search knows whether to report another matching pattern or continue with
4161/// the search at the next position. It also tracks which state the last
4162/// search call terminated in.
4163///
4164/// This type provides few introspection capabilities. The only thing a
4165/// caller can do is construct it and pass it around to permit search routines
4166/// to use it to track state, and also ask whether a match has been found.
4167///
4168/// Callers should always provide a fresh state constructed via
4169/// [`OverlappingState::start`] when starting a new search. Reusing state from
4170/// a previous search may result in incorrect results.
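///
/// # Example
///
/// A minimal sketch of driving an overlapping search to completion with this
/// state. This uses the lazy DFA's `try_search_overlapping_fwd` routine and
/// `MatchKind::All` semantics, which is typically what you want for
/// overlapping searches:
///
/// ```
/// use regex_automata::{
///     hybrid::dfa::{DFA, OverlappingState},
///     HalfMatch, Input, MatchKind,
/// };
///
/// let dfa = DFA::builder()
///     .configure(DFA::config().match_kind(MatchKind::All))
///     .build_many(&[r"[0-9]+", r"[0-9][0-9][0-9]"])?;
/// let mut cache = dfa.create_cache();
///
/// let input = Input::new("987");
/// let mut state = OverlappingState::start();
/// let mut got = vec![];
/// loop {
///     dfa.try_search_overlapping_fwd(&mut cache, &input, &mut state)?;
///     match state.get_match() {
///         None => break,
///         Some(hm) => got.push(hm),
///     }
/// }
/// // Every (pattern, end offset) pair is reported, not just the
/// // leftmost-first match.
/// let expected = vec![
///     HalfMatch::must(0, 1),
///     HalfMatch::must(0, 2),
///     HalfMatch::must(0, 3),
///     HalfMatch::must(1, 3),
/// ];
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```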
4171#[derive(Clone, Debug, Eq, PartialEq)]
4172pub struct OverlappingState {
4173 /// The match reported by the most recent overlapping search to use this
4174 /// state.
4175 ///
4176 /// If a search does not find any matches, then it is expected to clear
4177 /// this value.
4178 pub(crate) mat: Option<HalfMatch>,
4179 /// The ID of the state the search was in when the call terminated. When
4180 /// this is a match state, `mat` must be set to a non-None value.
4182 ///
4183 /// A `None` value indicates the start state of the corresponding
4184 /// automaton. We cannot use the actual ID, since any one automaton may
4185 /// have many start states, and which one is in use depends on several
4186 /// search-time factors.
4187 pub(crate) id: Option<LazyStateID>,
4188 /// The position of the search.
4189 ///
4190 /// When `id` is None (i.e., we are starting a search), this is set to
4191 /// the beginning of the search as given by the caller regardless of its
4192 /// current value. Subsequent calls to an overlapping search pick up at
4193 /// this offset.
4194 pub(crate) at: usize,
4195 /// The index into the matching patterns of the next match to report if the
4196 /// current state is a match state. Note that this may be 1 greater than
4197 /// the total number of matches to report for the current match state. (In
4198 /// which case, no more matches should be reported at the current position
4199 /// and the search should advance to the next position.)
4200 pub(crate) next_match_index: Option<usize>,
4201 /// This is set to true when a reverse overlapping search has entered its
4202 /// EOI transitions.
4203 ///
4204 /// This isn't used in a forward search because it knows to stop once the
4205 /// position exceeds the end of the search range. In a reverse search,
4206 /// since we use unsigned offsets, we don't "know" once we've gone past
4207 /// `0`. So the only way to detect it is with this extra flag. The reverse
4208 /// overlapping search knows to terminate specifically after it has
4209 /// reported all matches after following the EOI transition.
4210 pub(crate) rev_eoi: bool,
4211}
4212
4213impl OverlappingState {
4214 /// Create a new overlapping state that begins at the start state of any
4215 /// automaton.
4216 pub fn start() -> OverlappingState {
4217 OverlappingState {
4218 mat: None,
4219 id: None,
4220 at: 0,
4221 next_match_index: None,
4222 rev_eoi: false,
4223 }
4224 }
4225
4226 /// Return the match result of the most recent search to execute with this
4227 /// state.
4228 ///
4229 /// A search will clear this result automatically, such that if no match
4230 /// is found, this will correctly report `None`.
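///
/// # Example
///
/// A tiny sketch showing that a freshly created state reports no match:
///
/// ```
/// use regex_automata::hybrid::dfa::OverlappingState;
///
/// let state = OverlappingState::start();
/// assert_eq!(None, state.get_match());
/// ```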
4231 pub fn get_match(&self) -> Option<HalfMatch> {
4232 self.mat
4233 }
4234}
4235
4236/// Runs the given overlapping `search` function (forwards or backwards) until
4237/// a match is found whose offset does not split a codepoint.
4238///
4239/// This is *not* always correct to call. It should only be called when the
4240/// underlying NFA has UTF-8 mode enabled *and* it can produce zero-width
4241/// matches. Calling this when either of those conditions doesn't hold might
4242/// result in legitimate matches getting skipped.
4243#[cold]
4244#[inline(never)]
4245fn skip_empty_utf8_splits_overlapping<F>(
4246 input: &Input<'_>,
4247 state: &mut OverlappingState,
4248 mut search: F,
4249) -> Result<(), MatchError>
4250where
4251 F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>,
4252{
4253 // Note that this routine works for forwards and reverse searches
4254 // even though there's no code here to handle those cases. That's
4255 // because overlapping searches drive themselves to completion via
4256 // `OverlappingState`. So all we have to do is push it until no matches are
4257 // found.
4258
4259 let mut hm = match state.get_match() {
4260 None => return Ok(()),
4261 Some(hm) => hm,
4262 };
4263 if input.get_anchored().is_anchored() {
4264 if !input.is_char_boundary(hm.offset()) {
4265 state.mat = None;
4266 }
4267 return Ok(());
4268 }
4269 while !input.is_char_boundary(hm.offset()) {
4270 search(input, state)?;
4271 hm = match state.get_match() {
4272 None => return Ok(()),
4273 Some(hm) => hm,
4274 };
4275 }
4276 Ok(())
4277}
4278
4279/// Based on the minimum number of states required for a useful lazy DFA cache,
4280/// this returns the minimum lazy state ID that must be representable.
4281///
4282/// This is unlikely to have any impact on 32-bit systems (or higher), but
4283/// on 16-bit systems, the lazy state ID space is quite constrained and thus
4284/// may be insufficient if our MIN_STATES value is (for some reason) too high.
4285fn minimum_lazy_state_id(
4286 classes: &ByteClasses,
4287) -> Result<LazyStateID, LazyStateIDError> {
4288 let stride = 1 << classes.stride2();
4289 let min_state_index = MIN_STATES.checked_sub(1).unwrap();
4290 LazyStateID::new(min_state_index * stride)
4291}
4292
4293/// Based on the minimum number of states required for a useful lazy DFA cache,
4294/// this returns a heuristic minimum number of bytes of heap space required.
4295///
4296/// This is a "heuristic" because the minimum it returns is likely bigger than
4297/// the true minimum. Namely, it assumes that each powerset NFA/DFA state uses
4298/// the maximum number of NFA states (all of them). This is likely bigger
4299/// than what is required in practice. Computing the true minimum effectively
4300/// requires determinization, which is probably too much work to do for a
4301/// simple check like this.
4302///
4303/// One of the issues with this approach IMO is that it requires that this
4304/// be in sync with the calculation above for computing how much heap memory
4305/// the DFA cache uses. If we get it wrong, it's possible for example for the
4306/// minimum to be smaller than the computed heap memory, and thus, it may be
4307/// the case that we can't add the required minimum number of states. That in
4308/// turn will make the lazy DFA panic because we assume that we can add at
4309/// least a minimum number of states.
4310///
4311/// Another approach would be to always allow the minimum number of states to
4312/// be added to the lazy DFA cache, even if it exceeds the configured cache
4313/// limit. This does mean that the limit isn't really a limit in all cases,
4314/// which is unfortunate. But it does at least guarantee that the lazy DFA can
4315/// always make progress, even if it is slow. (This approach is very similar to
4316/// enabling the 'skip_cache_capacity_check' config knob, except it wouldn't
4317/// rely on cache size calculation. Instead, it would just always permit a
4318/// minimum number of states to be added.)
4319fn minimum_cache_capacity(
4320 nfa: &thompson::NFA,
4321 classes: &ByteClasses,
4322 starts_for_each_pattern: bool,
4323) -> usize {
4324 const ID_SIZE: usize = size_of::<LazyStateID>();
4325 const STATE_SIZE: usize = size_of::<State>();
4326
4327 let stride = 1 << classes.stride2();
4328 let states_len = nfa.states().len();
4329 let sparses = 2 * states_len * NFAStateID::SIZE;
4330 let trans = MIN_STATES * stride * ID_SIZE;
4331
4332 let mut starts = Start::len() * ID_SIZE;
4333 if starts_for_each_pattern {
4334 starts += (Start::len() * nfa.pattern_len()) * ID_SIZE;
4335 }
4336
4337 // The min number of states HAS to be at least 5: we have 3 sentinel states,
4338 // we need space for one more when we save a state after clearing the cache,
4339 // and we need space for yet one more. Otherwise, we get stuck in a loop
4340 // where we try to add a 5th state, which gets rejected, which clears the
4341 // cache, which adds back a saved state (4th total state) which then tries
4342 // to add the 5th state again.
4343 assert!(MIN_STATES >= 5, "minimum number of states has to be at least 5");
4344 // The minimum number of non-sentinel states. We consider this separately
4345 // because sentinel states are much smaller in that they contain no NFA
4346 // states. Given our aggressive calculation here, it's worth being more
4347 // precise with the number of states we need.
4348 let non_sentinel = MIN_STATES.checked_sub(SENTINEL_STATES).unwrap();
4349
4350 // Every `State` has 5 bytes for flags, 4 bytes (max) for the number of
4351 // patterns, followed by 32-bit encodings of patterns and then delta
4352 // varint encodings of NFA state IDs. We use the worst case (which isn't
4353 // technically possible) of 5 bytes for each NFA state ID.
4354 //
4355 // HOWEVER, three of the states needed by a lazy DFA are just the sentinel
4356 // unknown, dead and quit states. Those states have a known size and it is
4357 // small.
4358 let dead_state_size = State::dead().memory_usage();
4359 let max_state_size = 5 + 4 + (nfa.pattern_len() * 4) + (states_len * 5);
4360 let states = (SENTINEL_STATES * (STATE_SIZE + dead_state_size))
4361 + (non_sentinel * (STATE_SIZE + max_state_size));
4362 // NOTE: We don't double count heap memory used by State for this map since
4363 // we use reference counting to avoid doubling memory usage. (This tends to
4364 // be where most memory is allocated in the cache.)
4365 let states_to_sid = (MIN_STATES * STATE_SIZE) + (MIN_STATES * ID_SIZE);
4366 let stack = states_len * NFAStateID::SIZE;
4367 let scratch_state_builder = max_state_size;
4368
4369 trans
4370 + starts
4371 + states
4372 + states_to_sid
4373 + sparses
4374 + stack
4375 + scratch_state_builder
4376}
4377
4378#[cfg(all(test, feature = "syntax"))]
4379mod tests {
4380 use super::*;
4381
4382 // Tests that we handle heuristic Unicode word boundary support in reverse
4383 // DFAs in the specific case of contextual searches.
4384 //
4385 // I wrote this test when I discovered a bug in how heuristic word
4386 // boundaries were handled. Namely, that the starting state selection
4387 // didn't consider the DFA's quit byte set when looking at the byte
4388 // immediately before the start of the search (or immediately after the
4389 // end of the search in the case of a reverse search). As a result, it was
4390 // possible for a pattern like '\b[0-9]+\b' to match the '123' in 'β123',
4391 // because the trailing \xB2 byte in the 'β' codepoint would be treated as
4392 // a non-word character. But of course, this search should trigger the DFA
4393 // to quit, since there is a non-ASCII byte in consideration.
4394 //
4395 // Thus, I fixed 'start_state_{forward,reverse}' to check the quit byte set
4396 // if it wasn't empty. The forward case is tested in the doc test for the
4397 // Config::unicode_word_boundary API. We test the reverse case here, which
4398 // is sufficiently niche that it doesn't really belong in a doc test.
4399 #[test]
4400 fn heuristic_unicode_reverse() {
4401 let dfa = DFA::builder()
4402 .configure(DFA::config().unicode_word_boundary(true))
4403 .thompson(thompson::Config::new().reverse(true))
4404 .build(r"\b[0-9]+\b")
4405 .unwrap();
4406 let mut cache = dfa.create_cache();
4407
4408 let input = Input::new("β123").range(2..);
4409 let expected = MatchError::quit(0xB2, 1);
4410 let got = dfa.try_search_rev(&mut cache, &input);
4411 assert_eq!(Err(expected), got);
4412
4413 let input = Input::new("123β").range(..3);
4414 let expected = MatchError::quit(0xCE, 3);
4415 let got = dfa.try_search_rev(&mut cache, &input);
4416 assert_eq!(Err(expected), got);
4417 }
4418}
4419