1/*!
2An NFA backed bounded backtracker for executing regex searches with capturing
3groups.
4
5This module provides a [`BoundedBacktracker`] that works by simulating an NFA
6using the classical backtracking algorithm with a twist: it avoids redoing
7work that it has done before and thereby avoids worst case exponential time.
8In exchange, it can only be used on "short" haystacks. Its advantage is that
9is can be faster than the [`PikeVM`](thompson::pikevm::PikeVM) in many cases
10because it does less book-keeping.
11*/
12
13use alloc::{vec, vec::Vec};
14
15use crate::{
16 nfa::thompson::{self, BuildError, State, NFA},
17 util::{
18 captures::Captures,
19 empty, iter,
20 prefilter::Prefilter,
21 primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
22 search::{Anchored, HalfMatch, Input, Match, MatchError, Span},
23 },
24};
25
26/// Returns the minimum visited capacity for the given haystack.
27///
28/// This function can be used as the argument to [`Config::visited_capacity`]
29/// in order to guarantee that a backtracking search for the given `input`
30/// won't return an error when using a [`BoundedBacktracker`] built from the
31/// given `NFA`.
32///
33/// This routine exists primarily as a way to test that the bounded backtracker
34/// works correctly when its capacity is set to the smallest possible amount.
35/// Still, it may be useful in cases where you know you want to use the bounded
36/// backtracker for a specific input, and just need to know what visited
37/// capacity to provide to make it work.
38///
39/// Be warned that this number could be quite large as it is multiplicative in
40/// the size the given NFA and haystack.
41pub fn min_visited_capacity(nfa: &NFA, input: &Input<'_>) -> usize {
42 div_ceil(nfa.states().len() * (input.get_span().len() + 1), 8)
43}
44
45/// The configuration used for building a bounded backtracker.
46///
47/// A bounded backtracker configuration is a simple data object that is
48/// typically used with [`Builder::configure`].
49#[derive(Clone, Debug, Default)]
50pub struct Config {
51 pre: Option<Option<Prefilter>>,
52 visited_capacity: Option<usize>,
53}
54
55impl Config {
56 /// Return a new default regex configuration.
57 pub fn new() -> Config {
58 Config::default()
59 }
60
61 /// Set a prefilter to be used whenever a start state is entered.
62 ///
63 /// A [`Prefilter`] in this context is meant to accelerate searches by
64 /// looking for literal prefixes that every match for the corresponding
65 /// pattern (or patterns) must start with. Once a prefilter produces a
66 /// match, the underlying search routine continues on to try and confirm
67 /// the match.
68 ///
69 /// Be warned that setting a prefilter does not guarantee that the search
70 /// will be faster. While it's usually a good bet, if the prefilter
71 /// produces a lot of false positive candidates (i.e., positions matched
72 /// by the prefilter but not by the regex), then the overall result can
73 /// be slower than if you had just executed the regex engine without any
74 /// prefilters.
75 ///
76 /// By default no prefilter is set.
77 ///
78 /// # Example
79 ///
80 /// ```
81 /// use regex_automata::{
82 /// nfa::thompson::backtrack::BoundedBacktracker,
83 /// util::prefilter::Prefilter,
84 /// Input, Match, MatchKind,
85 /// };
86 ///
87 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
88 /// let re = BoundedBacktracker::builder()
89 /// .configure(BoundedBacktracker::config().prefilter(pre))
90 /// .build(r"(foo|bar)[a-z]+")?;
91 /// let mut cache = re.create_cache();
92 /// let input = Input::new("foo1 barfox bar");
93 /// assert_eq!(
94 /// Some(Match::must(0, 5..11)),
95 /// re.try_find(&mut cache, input)?,
96 /// );
97 ///
98 /// # Ok::<(), Box<dyn std::error::Error>>(())
99 /// ```
100 ///
101 /// Be warned though that an incorrect prefilter can lead to incorrect
102 /// results!
103 ///
104 /// ```
105 /// use regex_automata::{
106 /// nfa::thompson::backtrack::BoundedBacktracker,
107 /// util::prefilter::Prefilter,
108 /// Input, HalfMatch, MatchKind,
109 /// };
110 ///
111 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
112 /// let re = BoundedBacktracker::builder()
113 /// .configure(BoundedBacktracker::config().prefilter(pre))
114 /// .build(r"(foo|bar)[a-z]+")?;
115 /// let mut cache = re.create_cache();
116 /// let input = Input::new("foo1 barfox bar");
117 /// // No match reported even though there clearly is one!
118 /// assert_eq!(None, re.try_find(&mut cache, input)?);
119 ///
120 /// # Ok::<(), Box<dyn std::error::Error>>(())
121 /// ```
122 pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
123 self.pre = Some(pre);
124 self
125 }
126
127 /// Set the visited capacity used to bound backtracking.
128 ///
129 /// The visited capacity represents the amount of heap memory (in bytes) to
130 /// allocate toward tracking which parts of the backtracking search have
131 /// been done before. The heap memory needed for any particular search is
132 /// proportional to `haystack.len() * nfa.states().len()`, which an be
133 /// quite large. Therefore, the bounded backtracker is typically only able
134 /// to run on shorter haystacks.
135 ///
136 /// For a given regex, increasing the visited capacity means that the
137 /// maximum haystack length that can be searched is increased. The
138 /// [`BoundedBacktracker::max_haystack_len`] method returns that maximum.
139 ///
140 /// The default capacity is a reasonable but empirically chosen size.
141 ///
142 /// # Example
143 ///
144 /// As with other regex engines, Unicode is what tends to make the bounded
145 /// backtracker less useful by making the maximum haystack length quite
146 /// small. If necessary, increasing the visited capacity using this routine
147 /// will increase the maximum haystack length at the cost of using more
148 /// memory.
149 ///
150 /// Note though that the specific maximum values here are not an API
151 /// guarantee. The default visited capacity is subject to change and not
152 /// covered by semver.
153 ///
154 /// ```
155 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
156 /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
157 ///
158 /// // Unicode inflates the size of the underlying NFA quite a bit, and
159 /// // thus means that the backtracker can only handle smaller haystacks,
160 /// // assuming that the visited capacity remains unchanged.
161 /// let re = BoundedBacktracker::new(r"\w+")?;
162 /// assert!(re.max_haystack_len() <= 7_000);
163 /// // But we can increase the visited capacity to handle bigger haystacks!
164 /// let re = BoundedBacktracker::builder()
165 /// .configure(BoundedBacktracker::config().visited_capacity(1<<20))
166 /// .build(r"\w+")?;
167 /// assert!(re.max_haystack_len() >= 25_000);
168 /// assert!(re.max_haystack_len() <= 28_000);
169 /// # Ok::<(), Box<dyn std::error::Error>>(())
170 /// ```
171 pub fn visited_capacity(mut self, capacity: usize) -> Config {
172 self.visited_capacity = Some(capacity);
173 self
174 }
175
176 /// Returns the prefilter set in this configuration, if one at all.
177 pub fn get_prefilter(&self) -> Option<&Prefilter> {
178 self.pre.as_ref().unwrap_or(&None).as_ref()
179 }
180
181 /// Returns the configured visited capacity.
182 ///
183 /// Note that the actual capacity used may be slightly bigger than the
184 /// configured capacity.
185 pub fn get_visited_capacity(&self) -> usize {
186 const DEFAULT: usize = 256 * (1 << 10); // 256 KB
187 self.visited_capacity.unwrap_or(DEFAULT)
188 }
189
190 /// Overwrite the default configuration such that the options in `o` are
191 /// always used. If an option in `o` is not set, then the corresponding
192 /// option in `self` is used. If it's not set in `self` either, then it
193 /// remains not set.
194 pub(crate) fn overwrite(&self, o: Config) -> Config {
195 Config {
196 pre: o.pre.or_else(|| self.pre.clone()),
197 visited_capacity: o.visited_capacity.or(self.visited_capacity),
198 }
199 }
200}
201
202/// A builder for a bounded backtracker.
203///
204/// This builder permits configuring options for the syntax of a pattern, the
205/// NFA construction and the `BoundedBacktracker` construction. This builder
206/// is different from a general purpose regex builder in that it permits fine
207/// grain configuration of the construction process. The trade off for this is
208/// complexity, and the possibility of setting a configuration that might not
209/// make sense. For example, there are two different UTF-8 modes:
210///
211/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
212/// whether the pattern itself can contain sub-expressions that match invalid
213/// UTF-8.
214/// * [`thompson::Config::utf8`] controls how the regex iterators themselves
215/// advance the starting position of the next search when a match with zero
216/// length is found.
217///
218/// Generally speaking, callers will want to either enable all of these or
219/// disable all of these.
220///
221/// # Example
222///
223/// This example shows how to disable UTF-8 mode in the syntax and the regex
224/// itself. This is generally what you want for matching on arbitrary bytes.
225///
226/// ```
227/// use regex_automata::{
228/// nfa::thompson::{self, backtrack::BoundedBacktracker},
229/// util::syntax,
230/// Match,
231/// };
232///
233/// let re = BoundedBacktracker::builder()
234/// .syntax(syntax::Config::new().utf8(false))
235/// .thompson(thompson::Config::new().utf8(false))
236/// .build(r"foo(?-u:[^b])ar.*")?;
237/// let mut cache = re.create_cache();
238///
239/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
240/// let expected = Some(Ok(Match::must(0, 1..9)));
241/// let got = re.try_find_iter(&mut cache, haystack).next();
242/// assert_eq!(expected, got);
243/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
244/// // but the subsequent `.*` does not! Disabling UTF-8
245/// // on the syntax permits this.
246/// //
247/// // N.B. This example does not show the impact of
248/// // disabling UTF-8 mode on a BoundedBacktracker Config, since that
249/// // only impacts regexes that can produce matches of
250/// // length 0.
251/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap()?.range()]);
252///
253/// # Ok::<(), Box<dyn std::error::Error>>(())
254/// ```
255#[derive(Clone, Debug)]
256pub struct Builder {
257 config: Config,
258 #[cfg(feature = "syntax")]
259 thompson: thompson::Compiler,
260}
261
262impl Builder {
263 /// Create a new BoundedBacktracker builder with its default configuration.
264 pub fn new() -> Builder {
265 Builder {
266 config: Config::default(),
267 #[cfg(feature = "syntax")]
268 thompson: thompson::Compiler::new(),
269 }
270 }
271
272 /// Build a `BoundedBacktracker` from the given pattern.
273 ///
274 /// If there was a problem parsing or compiling the pattern, then an error
275 /// is returned.
276 #[cfg(feature = "syntax")]
277 pub fn build(
278 &self,
279 pattern: &str,
280 ) -> Result<BoundedBacktracker, BuildError> {
281 self.build_many(&[pattern])
282 }
283
284 /// Build a `BoundedBacktracker` from the given patterns.
285 #[cfg(feature = "syntax")]
286 pub fn build_many<P: AsRef<str>>(
287 &self,
288 patterns: &[P],
289 ) -> Result<BoundedBacktracker, BuildError> {
290 let nfa = self.thompson.build_many(patterns)?;
291 self.build_from_nfa(nfa)
292 }
293
294 /// Build a `BoundedBacktracker` directly from its NFA.
295 ///
296 /// Note that when using this method, any configuration that applies to the
297 /// construction of the NFA itself will of course be ignored, since the NFA
298 /// given here is already built.
299 pub fn build_from_nfa(
300 &self,
301 nfa: NFA,
302 ) -> Result<BoundedBacktracker, BuildError> {
303 nfa.look_set_any().available().map_err(BuildError::word)?;
304 Ok(BoundedBacktracker { config: self.config.clone(), nfa })
305 }
306
307 /// Apply the given `BoundedBacktracker` configuration options to this
308 /// builder.
309 pub fn configure(&mut self, config: Config) -> &mut Builder {
310 self.config = self.config.overwrite(config);
311 self
312 }
313
314 /// Set the syntax configuration for this builder using
315 /// [`syntax::Config`](crate::util::syntax::Config).
316 ///
317 /// This permits setting things like case insensitivity, Unicode and multi
318 /// line mode.
319 ///
320 /// These settings only apply when constructing a `BoundedBacktracker`
321 /// directly from a pattern.
322 #[cfg(feature = "syntax")]
323 pub fn syntax(
324 &mut self,
325 config: crate::util::syntax::Config,
326 ) -> &mut Builder {
327 self.thompson.syntax(config);
328 self
329 }
330
331 /// Set the Thompson NFA configuration for this builder using
332 /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
333 ///
334 /// This permits setting things like if additional time should be spent
335 /// shrinking the size of the NFA.
336 ///
337 /// These settings only apply when constructing a `BoundedBacktracker`
338 /// directly from a pattern.
339 #[cfg(feature = "syntax")]
340 pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
341 self.thompson.configure(config);
342 self
343 }
344}
345
346/// A backtracking regex engine that bounds its execution to avoid exponential
347/// blow-up.
348///
349/// This regex engine only implements leftmost-first match semantics and
350/// only supports leftmost searches. It effectively does the same thing as a
351/// [`PikeVM`](thompson::pikevm::PikeVM), but typically does it faster because
352/// it doesn't have to worry about copying capturing group spans for most NFA
353/// states. Instead, the backtracker can maintain one set of captures (provided
354/// by the caller) and never needs to copy them. In exchange, the backtracker
355/// bounds itself to ensure it doesn't exhibit worst case exponential time.
356/// This results in the backtracker only being able to handle short haystacks
357/// given reasonable memory usage.
358///
359/// # Searches may return an error!
360///
361/// By design, this backtracking regex engine is bounded. This bound is
362/// implemented by not visiting any combination of NFA state ID and position
363/// in a haystack more than once. Thus, the total memory required to bound
364/// backtracking is proportional to `haystack.len() * nfa.states().len()`.
365/// This can obviously get quite large, since large haystacks aren't terribly
366/// uncommon. To avoid using exorbitant memory, the capacity is bounded by
367/// a fixed limit set via [`Config::visited_capacity`]. Thus, if the total
368/// capacity required for a particular regex and a haystack exceeds this
369/// capacity, then the search routine will return an error.
370///
371/// Unlike other regex engines that may return an error at search time (like
372/// the DFA or the hybrid NFA/DFA), there is no way to guarantee that a bounded
373/// backtracker will work for every haystack. Therefore, this regex engine
374/// _only_ exposes fallible search routines to avoid the footgun of panicking
375/// when running a search on a haystack that is too big.
376///
377/// If one wants to use the fallible search APIs without handling the
378/// error, the only way to guarantee an error won't occur from the
379/// haystack length is to ensure the haystack length does not exceed
380/// [`BoundedBacktracker::max_haystack_len`].
381///
382/// # Example: Unicode word boundaries
383///
384/// This example shows that the bounded backtracker implements Unicode word
385/// boundaries correctly by default.
386///
387/// ```
388/// # if cfg!(miri) { return Ok(()); } // miri takes too long
389/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
390///
391/// let re = BoundedBacktracker::new(r"\b\w+\b")?;
392/// let mut cache = re.create_cache();
393///
394/// let mut it = re.try_find_iter(&mut cache, "Шерлок Холмс");
395/// assert_eq!(Some(Ok(Match::must(0, 0..12))), it.next());
396/// assert_eq!(Some(Ok(Match::must(0, 13..23))), it.next());
397/// assert_eq!(None, it.next());
398/// # Ok::<(), Box<dyn std::error::Error>>(())
399/// ```
400///
401/// # Example: multiple regex patterns
402///
403/// The bounded backtracker supports searching for multiple patterns
404/// simultaneously, just like other regex engines. Note though that because it
405/// uses a backtracking strategy, this regex engine is unlikely to scale well
406/// as more patterns are added. But then again, as more patterns are added, the
407/// maximum haystack length allowed will also shorten (assuming the visited
408/// capacity remains invariant).
409///
410/// ```
411/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
412///
413/// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
414/// let mut cache = re.create_cache();
415///
416/// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
417/// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next());
418/// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next());
419/// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next());
420/// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next());
421/// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next());
422/// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next());
423/// assert_eq!(None, it.next());
424/// # Ok::<(), Box<dyn std::error::Error>>(())
425/// ```
426#[derive(Clone, Debug)]
427pub struct BoundedBacktracker {
428 config: Config,
429 nfa: NFA,
430}
431
432impl BoundedBacktracker {
433 /// Parse the given regular expression using the default configuration and
434 /// return the corresponding `BoundedBacktracker`.
435 ///
436 /// If you want a non-default configuration, then use the [`Builder`] to
437 /// set your own configuration.
438 ///
439 /// # Example
440 ///
441 /// ```
442 /// use regex_automata::{
443 /// nfa::thompson::backtrack::BoundedBacktracker,
444 /// Match,
445 /// };
446 ///
447 /// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
448 /// let mut cache = re.create_cache();
449 /// assert_eq!(
450 /// Some(Ok(Match::must(0, 3..14))),
451 /// re.try_find_iter(&mut cache, "zzzfoo12345barzzz").next(),
452 /// );
453 /// # Ok::<(), Box<dyn std::error::Error>>(())
454 /// ```
455 #[cfg(feature = "syntax")]
456 pub fn new(pattern: &str) -> Result<BoundedBacktracker, BuildError> {
457 BoundedBacktracker::builder().build(pattern)
458 }
459
460 /// Like `new`, but parses multiple patterns into a single "multi regex."
461 /// This similarly uses the default regex configuration.
462 ///
463 /// # Example
464 ///
465 /// ```
466 /// use regex_automata::{
467 /// nfa::thompson::backtrack::BoundedBacktracker,
468 /// Match,
469 /// };
470 ///
471 /// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
472 /// let mut cache = re.create_cache();
473 ///
474 /// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
475 /// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next());
476 /// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next());
477 /// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next());
478 /// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next());
479 /// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next());
480 /// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next());
481 /// assert_eq!(None, it.next());
482 /// # Ok::<(), Box<dyn std::error::Error>>(())
483 /// ```
484 #[cfg(feature = "syntax")]
485 pub fn new_many<P: AsRef<str>>(
486 patterns: &[P],
487 ) -> Result<BoundedBacktracker, BuildError> {
488 BoundedBacktracker::builder().build_many(patterns)
489 }
490
491 /// # Example
492 ///
493 /// This shows how to hand assemble a regular expression via its HIR,
494 /// compile an NFA from it and build a BoundedBacktracker from the NFA.
495 ///
496 /// ```
497 /// use regex_automata::{
498 /// nfa::thompson::{NFA, backtrack::BoundedBacktracker},
499 /// Match,
500 /// };
501 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
502 ///
503 /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
504 /// ClassBytesRange::new(b'0', b'9'),
505 /// ClassBytesRange::new(b'A', b'Z'),
506 /// ClassBytesRange::new(b'_', b'_'),
507 /// ClassBytesRange::new(b'a', b'z'),
508 /// ])));
509 ///
510 /// let config = NFA::config().nfa_size_limit(Some(1_000));
511 /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
512 ///
513 /// let re = BoundedBacktracker::new_from_nfa(nfa)?;
514 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
515 /// let expected = Some(Match::must(0, 3..4));
516 /// re.try_captures(&mut cache, "!@#A#@!", &mut caps)?;
517 /// assert_eq!(expected, caps.get_match());
518 ///
519 /// # Ok::<(), Box<dyn std::error::Error>>(())
520 /// ```
521 pub fn new_from_nfa(nfa: NFA) -> Result<BoundedBacktracker, BuildError> {
522 BoundedBacktracker::builder().build_from_nfa(nfa)
523 }
524
525 /// Create a new `BoundedBacktracker` that matches every input.
526 ///
527 /// # Example
528 ///
529 /// ```
530 /// use regex_automata::{
531 /// nfa::thompson::backtrack::BoundedBacktracker,
532 /// Match,
533 /// };
534 ///
535 /// let re = BoundedBacktracker::always_match()?;
536 /// let mut cache = re.create_cache();
537 ///
538 /// let expected = Some(Ok(Match::must(0, 0..0)));
539 /// assert_eq!(expected, re.try_find_iter(&mut cache, "").next());
540 /// assert_eq!(expected, re.try_find_iter(&mut cache, "foo").next());
541 /// # Ok::<(), Box<dyn std::error::Error>>(())
542 /// ```
543 pub fn always_match() -> Result<BoundedBacktracker, BuildError> {
544 let nfa = thompson::NFA::always_match();
545 BoundedBacktracker::new_from_nfa(nfa)
546 }
547
548 /// Create a new `BoundedBacktracker` that never matches any input.
549 ///
550 /// # Example
551 ///
552 /// ```
553 /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
554 ///
555 /// let re = BoundedBacktracker::never_match()?;
556 /// let mut cache = re.create_cache();
557 ///
558 /// assert_eq!(None, re.try_find_iter(&mut cache, "").next());
559 /// assert_eq!(None, re.try_find_iter(&mut cache, "foo").next());
560 /// # Ok::<(), Box<dyn std::error::Error>>(())
561 /// ```
562 pub fn never_match() -> Result<BoundedBacktracker, BuildError> {
563 let nfa = thompson::NFA::never_match();
564 BoundedBacktracker::new_from_nfa(nfa)
565 }
566
567 /// Return a default configuration for a `BoundedBacktracker`.
568 ///
569 /// This is a convenience routine to avoid needing to import the `Config`
570 /// type when customizing the construction of a `BoundedBacktracker`.
571 ///
572 /// # Example
573 ///
574 /// This example shows how to disable UTF-8 mode. When UTF-8 mode is
575 /// disabled, zero-width matches that split a codepoint are allowed.
576 /// Otherwise they are never reported.
577 ///
578 /// In the code below, notice that `""` is permitted to match positions
579 /// that split the encoding of a codepoint.
580 ///
581 /// ```
582 /// use regex_automata::{
583 /// nfa::thompson::{self, backtrack::BoundedBacktracker},
584 /// Match,
585 /// };
586 ///
587 /// let re = BoundedBacktracker::builder()
588 /// .thompson(thompson::Config::new().utf8(false))
589 /// .build(r"")?;
590 /// let mut cache = re.create_cache();
591 ///
592 /// let haystack = "a☃z";
593 /// let mut it = re.try_find_iter(&mut cache, haystack);
594 /// assert_eq!(Some(Ok(Match::must(0, 0..0))), it.next());
595 /// assert_eq!(Some(Ok(Match::must(0, 1..1))), it.next());
596 /// assert_eq!(Some(Ok(Match::must(0, 2..2))), it.next());
597 /// assert_eq!(Some(Ok(Match::must(0, 3..3))), it.next());
598 /// assert_eq!(Some(Ok(Match::must(0, 4..4))), it.next());
599 /// assert_eq!(Some(Ok(Match::must(0, 5..5))), it.next());
600 /// assert_eq!(None, it.next());
601 ///
602 /// # Ok::<(), Box<dyn std::error::Error>>(())
603 /// ```
604 pub fn config() -> Config {
605 Config::new()
606 }
607
608 /// Return a builder for configuring the construction of a
609 /// `BoundedBacktracker`.
610 ///
611 /// This is a convenience routine to avoid needing to import the
612 /// [`Builder`] type in common cases.
613 ///
614 /// # Example
615 ///
616 /// This example shows how to use the builder to disable UTF-8 mode
617 /// everywhere.
618 ///
619 /// ```
620 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
621 /// use regex_automata::{
622 /// nfa::thompson::{self, backtrack::BoundedBacktracker},
623 /// util::syntax,
624 /// Match,
625 /// };
626 ///
627 /// let re = BoundedBacktracker::builder()
628 /// .syntax(syntax::Config::new().utf8(false))
629 /// .thompson(thompson::Config::new().utf8(false))
630 /// .build(r"foo(?-u:[^b])ar.*")?;
631 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
632 ///
633 /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
634 /// let expected = Some(Match::must(0, 1..9));
635 /// re.try_captures(&mut cache, haystack, &mut caps)?;
636 /// assert_eq!(expected, caps.get_match());
637 ///
638 /// # Ok::<(), Box<dyn std::error::Error>>(())
639 /// ```
640 pub fn builder() -> Builder {
641 Builder::new()
642 }
643
644 /// Create a new cache for this regex.
645 ///
646 /// The cache returned should only be used for searches for this
647 /// regex. If you want to reuse the cache for another regex, then you
648 /// must call [`Cache::reset`] with that regex (or, equivalently,
649 /// [`BoundedBacktracker::reset_cache`]).
650 pub fn create_cache(&self) -> Cache {
651 Cache::new(self)
652 }
653
654 /// Create a new empty set of capturing groups that is guaranteed to be
655 /// valid for the search APIs on this `BoundedBacktracker`.
656 ///
657 /// A `Captures` value created for a specific `BoundedBacktracker` cannot
658 /// be used with any other `BoundedBacktracker`.
659 ///
660 /// This is a convenience function for [`Captures::all`]. See the
661 /// [`Captures`] documentation for an explanation of its alternative
662 /// constructors that permit the `BoundedBacktracker` to do less work
663 /// during a search, and thus might make it faster.
664 pub fn create_captures(&self) -> Captures {
665 Captures::all(self.get_nfa().group_info().clone())
666 }
667
668 /// Reset the given cache such that it can be used for searching with the
669 /// this `BoundedBacktracker` (and only this `BoundedBacktracker`).
670 ///
671 /// A cache reset permits reusing memory already allocated in this cache
672 /// with a different `BoundedBacktracker`.
673 ///
674 /// # Example
675 ///
676 /// This shows how to re-purpose a cache for use with a different
677 /// `BoundedBacktracker`.
678 ///
679 /// ```
680 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
681 /// use regex_automata::{
682 /// nfa::thompson::backtrack::BoundedBacktracker,
683 /// Match,
684 /// };
685 ///
686 /// let re1 = BoundedBacktracker::new(r"\w")?;
687 /// let re2 = BoundedBacktracker::new(r"\W")?;
688 ///
689 /// let mut cache = re1.create_cache();
690 /// assert_eq!(
691 /// Some(Ok(Match::must(0, 0..2))),
692 /// re1.try_find_iter(&mut cache, "Δ").next(),
693 /// );
694 ///
695 /// // Using 'cache' with re2 is not allowed. It may result in panics or
696 /// // incorrect results. In order to re-purpose the cache, we must reset
697 /// // it with the BoundedBacktracker we'd like to use it with.
698 /// //
699 /// // Similarly, after this reset, using the cache with 're1' is also not
700 /// // allowed.
701 /// cache.reset(&re2);
702 /// assert_eq!(
703 /// Some(Ok(Match::must(0, 0..3))),
704 /// re2.try_find_iter(&mut cache, "☃").next(),
705 /// );
706 ///
707 /// # Ok::<(), Box<dyn std::error::Error>>(())
708 /// ```
709 pub fn reset_cache(&self, cache: &mut Cache) {
710 cache.reset(self);
711 }
712
713 /// Returns the total number of patterns compiled into this
714 /// `BoundedBacktracker`.
715 ///
716 /// In the case of a `BoundedBacktracker` that contains no patterns, this
717 /// returns `0`.
718 ///
719 /// # Example
720 ///
721 /// This example shows the pattern length for a `BoundedBacktracker` that
722 /// never matches:
723 ///
724 /// ```
725 /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
726 ///
727 /// let re = BoundedBacktracker::never_match()?;
728 /// assert_eq!(re.pattern_len(), 0);
729 /// # Ok::<(), Box<dyn std::error::Error>>(())
730 /// ```
731 ///
732 /// And another example for a `BoundedBacktracker` that matches at every
733 /// position:
734 ///
735 /// ```
736 /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
737 ///
738 /// let re = BoundedBacktracker::always_match()?;
739 /// assert_eq!(re.pattern_len(), 1);
740 /// # Ok::<(), Box<dyn std::error::Error>>(())
741 /// ```
742 ///
743 /// And finally, a `BoundedBacktracker` that was constructed from multiple
744 /// patterns:
745 ///
746 /// ```
747 /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
748 ///
749 /// let re = BoundedBacktracker::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
750 /// assert_eq!(re.pattern_len(), 3);
751 /// # Ok::<(), Box<dyn std::error::Error>>(())
752 /// ```
753 pub fn pattern_len(&self) -> usize {
754 self.nfa.pattern_len()
755 }
756
757 /// Return the config for this `BoundedBacktracker`.
758 #[inline]
759 pub fn get_config(&self) -> &Config {
760 &self.config
761 }
762
763 /// Returns a reference to the underlying NFA.
764 #[inline]
765 pub fn get_nfa(&self) -> &NFA {
766 &self.nfa
767 }
768
769 /// Returns the maximum haystack length supported by this backtracker.
770 ///
771 /// This routine is a function of both [`Config::visited_capacity`] and the
772 /// internal size of the backtracker's NFA.
773 ///
774 /// # Example
775 ///
776 /// This example shows how the maximum haystack length can vary depending
777 /// on the size of the regex itself. Note though that the specific maximum
778 /// values here are not an API guarantee. The default visited capacity is
779 /// subject to change and not covered by semver.
780 ///
781 /// ```
782 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
783 /// use regex_automata::{
784 /// nfa::thompson::backtrack::BoundedBacktracker,
785 /// Match, MatchError,
786 /// };
787 ///
788 /// // If you're only using ASCII, you get a big budget.
789 /// let re = BoundedBacktracker::new(r"(?-u)\w+")?;
790 /// let mut cache = re.create_cache();
791 /// assert_eq!(re.max_haystack_len(), 299_592);
792 /// // Things work up to the max.
793 /// let mut haystack = "a".repeat(299_592);
794 /// let expected = Some(Ok(Match::must(0, 0..299_592)));
795 /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
796 /// // But you'll get an error if you provide a haystack that's too big.
797 /// // Notice that we use the 'try_find_iter' routine instead, which
798 /// // yields Result<Match, MatchError> instead of Match.
799 /// haystack.push('a');
800 /// let expected = Some(Err(MatchError::haystack_too_long(299_593)));
801 /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
802 ///
803 /// // Unicode inflates the size of the underlying NFA quite a bit, and
804 /// // thus means that the backtracker can only handle smaller haystacks,
805 /// // assuming that the visited capacity remains unchanged.
806 /// let re = BoundedBacktracker::new(r"\w+")?;
807 /// assert!(re.max_haystack_len() <= 7_000);
808 /// // But we can increase the visited capacity to handle bigger haystacks!
809 /// let re = BoundedBacktracker::builder()
810 /// .configure(BoundedBacktracker::config().visited_capacity(1<<20))
811 /// .build(r"\w+")?;
812 /// assert!(re.max_haystack_len() >= 25_000);
813 /// assert!(re.max_haystack_len() <= 28_000);
814 /// # Ok::<(), Box<dyn std::error::Error>>(())
815 /// ```
816 #[inline]
817 pub fn max_haystack_len(&self) -> usize {
818 // The capacity given in the config is "bytes of heap memory," but the
819 // capacity we use here is "number of bits." So convert the capacity in
820 // bytes to the capacity in bits.
821 let capacity = 8 * self.get_config().get_visited_capacity();
822 let blocks = div_ceil(capacity, Visited::BLOCK_SIZE);
823 let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE);
824 // It's possible for `real_capacity` to be smaller than the number of
825 // NFA states for particularly large regexes, so we saturate towards
826 // zero.
827 (real_capacity / self.nfa.states().len()).saturating_sub(1)
828 }
829}
830
831impl BoundedBacktracker {
832 /// Returns true if and only if this regex matches the given haystack.
833 ///
834 /// In the case of a backtracking regex engine, and unlike most other
835 /// regex engines in this crate, short circuiting isn't practical. However,
836 /// this routine may still be faster because it instructs backtracking to
837 /// not keep track of any capturing groups.
838 ///
839 /// # Errors
840 ///
841 /// This routine only errors if the search could not complete. For this
842 /// backtracking regex engine, this only occurs when the haystack length
843 /// exceeds [`BoundedBacktracker::max_haystack_len`].
844 ///
845 /// When a search cannot complete, callers cannot know whether a match
846 /// exists or not.
847 ///
848 /// # Example
849 ///
850 /// ```
851 /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
852 ///
853 /// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
854 /// let mut cache = re.create_cache();
855 ///
856 /// assert!(re.try_is_match(&mut cache, "foo12345bar")?);
857 /// assert!(!re.try_is_match(&mut cache, "foobar")?);
858 /// # Ok::<(), Box<dyn std::error::Error>>(())
859 /// ```
860 ///
861 /// # Example: consistency with search APIs
862 ///
863 /// `is_match` is guaranteed to return `true` whenever `find` returns a
864 /// match. This includes searches that are executed entirely within a
865 /// codepoint:
866 ///
867 /// ```
868 /// use regex_automata::{
869 /// nfa::thompson::backtrack::BoundedBacktracker,
870 /// Input,
871 /// };
872 ///
873 /// let re = BoundedBacktracker::new("a*")?;
874 /// let mut cache = re.create_cache();
875 ///
876 /// assert!(!re.try_is_match(&mut cache, Input::new("☃").span(1..2))?);
877 /// # Ok::<(), Box<dyn std::error::Error>>(())
878 /// ```
879 ///
880 /// Notice that when UTF-8 mode is disabled, then the above reports a
881 /// match because the restriction against zero-width matches that split a
882 /// codepoint has been lifted:
883 ///
884 /// ```
885 /// use regex_automata::{
886 /// nfa::thompson::{backtrack::BoundedBacktracker, NFA},
887 /// Input,
888 /// };
889 ///
890 /// let re = BoundedBacktracker::builder()
891 /// .thompson(NFA::config().utf8(false))
892 /// .build("a*")?;
893 /// let mut cache = re.create_cache();
894 ///
895 /// assert!(re.try_is_match(&mut cache, Input::new("☃").span(1..2))?);
896 /// # Ok::<(), Box<dyn std::error::Error>>(())
897 /// ```
898 #[inline]
899 pub fn try_is_match<'h, I: Into<Input<'h>>>(
900 &self,
901 cache: &mut Cache,
902 input: I,
903 ) -> Result<bool, MatchError> {
904 let input = input.into().earliest(true);
905 self.try_search_slots(cache, &input, &mut []).map(|pid| pid.is_some())
906 }
907
908 /// Executes a leftmost forward search and returns a `Match` if one exists.
909 ///
910 /// This routine only includes the overall match span. To get
911 /// access to the individual spans of each capturing group, use
912 /// [`BoundedBacktracker::try_captures`].
913 ///
914 /// # Errors
915 ///
916 /// This routine only errors if the search could not complete. For this
917 /// backtracking regex engine, this only occurs when the haystack length
918 /// exceeds [`BoundedBacktracker::max_haystack_len`].
919 ///
920 /// When a search cannot complete, callers cannot know whether a match
921 /// exists or not.
922 ///
923 /// # Example
924 ///
925 /// ```
926 /// use regex_automata::{
927 /// nfa::thompson::backtrack::BoundedBacktracker,
928 /// Match,
929 /// };
930 ///
931 /// let re = BoundedBacktracker::new("foo[0-9]+")?;
932 /// let mut cache = re.create_cache();
933 /// let expected = Match::must(0, 0..8);
934 /// assert_eq!(Some(expected), re.try_find(&mut cache, "foo12345")?);
935 ///
936 /// # Ok::<(), Box<dyn std::error::Error>>(())
937 /// ```
938 #[inline]
939 pub fn try_find<'h, I: Into<Input<'h>>>(
940 &self,
941 cache: &mut Cache,
942 input: I,
943 ) -> Result<Option<Match>, MatchError> {
944 let input = input.into();
945 if self.get_nfa().pattern_len() == 1 {
946 let mut slots = [None, None];
947 let pid = match self.try_search_slots(cache, &input, &mut slots)? {
948 None => return Ok(None),
949 Some(pid) => pid,
950 };
951 let start = match slots[0] {
952 None => return Ok(None),
953 Some(s) => s.get(),
954 };
955 let end = match slots[1] {
956 None => return Ok(None),
957 Some(s) => s.get(),
958 };
959 return Ok(Some(Match::new(pid, Span { start, end })));
960 }
961 let ginfo = self.get_nfa().group_info();
962 let slots_len = ginfo.implicit_slot_len();
963 let mut slots = vec![None; slots_len];
964 let pid = match self.try_search_slots(cache, &input, &mut slots)? {
965 None => return Ok(None),
966 Some(pid) => pid,
967 };
968 let start = match slots[pid.as_usize() * 2] {
969 None => return Ok(None),
970 Some(s) => s.get(),
971 };
972 let end = match slots[pid.as_usize() * 2 + 1] {
973 None => return Ok(None),
974 Some(s) => s.get(),
975 };
976 Ok(Some(Match::new(pid, Span { start, end })))
977 }
978
979 /// Executes a leftmost forward search and writes the spans of capturing
980 /// groups that participated in a match into the provided [`Captures`]
981 /// value. If no match was found, then [`Captures::is_match`] is guaranteed
982 /// to return `false`.
983 ///
984 /// # Errors
985 ///
986 /// This routine only errors if the search could not complete. For this
987 /// backtracking regex engine, this only occurs when the haystack length
988 /// exceeds [`BoundedBacktracker::max_haystack_len`].
989 ///
990 /// When a search cannot complete, callers cannot know whether a match
991 /// exists or not.
992 ///
993 /// # Example
994 ///
995 /// ```
996 /// use regex_automata::{
997 /// nfa::thompson::backtrack::BoundedBacktracker,
998 /// Span,
999 /// };
1000 ///
1001 /// let re = BoundedBacktracker::new(
1002 /// r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$",
1003 /// )?;
1004 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1005 ///
1006 /// re.try_captures(&mut cache, "2010-03-14", &mut caps)?;
1007 /// assert!(caps.is_match());
1008 /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
1009 /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
1010 /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
1011 ///
1012 /// # Ok::<(), Box<dyn std::error::Error>>(())
1013 /// ```
1014 #[inline]
1015 pub fn try_captures<'h, I: Into<Input<'h>>>(
1016 &self,
1017 cache: &mut Cache,
1018 input: I,
1019 caps: &mut Captures,
1020 ) -> Result<(), MatchError> {
1021 self.try_search(cache, &input.into(), caps)
1022 }
1023
1024 /// Returns an iterator over all non-overlapping leftmost matches in the
1025 /// given bytes. If no match exists, then the iterator yields no elements.
1026 ///
1027 /// If the regex engine returns an error at any point, then the iterator
1028 /// will yield that error.
1029 ///
1030 /// # Example
1031 ///
1032 /// ```
1033 /// use regex_automata::{
1034 /// nfa::thompson::backtrack::BoundedBacktracker,
1035 /// Match, MatchError,
1036 /// };
1037 ///
1038 /// let re = BoundedBacktracker::new("foo[0-9]+")?;
1039 /// let mut cache = re.create_cache();
1040 ///
1041 /// let text = "foo1 foo12 foo123";
1042 /// let result: Result<Vec<Match>, MatchError> = re
1043 /// .try_find_iter(&mut cache, text)
1044 /// .collect();
1045 /// let matches = result?;
1046 /// assert_eq!(matches, vec![
1047 /// Match::must(0, 0..4),
1048 /// Match::must(0, 5..10),
1049 /// Match::must(0, 11..17),
1050 /// ]);
1051 /// # Ok::<(), Box<dyn std::error::Error>>(())
1052 /// ```
1053 #[inline]
1054 pub fn try_find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
1055 &'r self,
1056 cache: &'c mut Cache,
1057 input: I,
1058 ) -> TryFindMatches<'r, 'c, 'h> {
1059 let caps = Captures::matches(self.get_nfa().group_info().clone());
1060 let it = iter::Searcher::new(input.into());
1061 TryFindMatches { re: self, cache, caps, it }
1062 }
1063
1064 /// Returns an iterator over all non-overlapping `Captures` values. If no
1065 /// match exists, then the iterator yields no elements.
1066 ///
1067 /// This yields the same matches as [`BoundedBacktracker::try_find_iter`],
1068 /// but it includes the spans of all capturing groups that participate in
1069 /// each match.
1070 ///
1071 /// If the regex engine returns an error at any point, then the iterator
1072 /// will yield that error.
1073 ///
1074 /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
1075 /// how to correctly iterate over all matches in a haystack while avoiding
1076 /// the creation of a new `Captures` value for every match. (Which you are
1077 /// forced to do with an `Iterator`.)
1078 ///
1079 /// # Example
1080 ///
1081 /// ```
1082 /// use regex_automata::{
1083 /// nfa::thompson::backtrack::BoundedBacktracker,
1084 /// Span,
1085 /// };
1086 ///
1087 /// let re = BoundedBacktracker::new("foo(?P<numbers>[0-9]+)")?;
1088 /// let mut cache = re.create_cache();
1089 ///
1090 /// let text = "foo1 foo12 foo123";
1091 /// let mut spans = vec![];
1092 /// for result in re.try_captures_iter(&mut cache, text) {
1093 /// let caps = result?;
1094 /// // The unwrap is OK since 'numbers' matches if the pattern matches.
1095 /// spans.push(caps.get_group_by_name("numbers").unwrap());
1096 /// }
1097 /// assert_eq!(spans, vec![
1098 /// Span::from(3..4),
1099 /// Span::from(8..10),
1100 /// Span::from(14..17),
1101 /// ]);
1102 /// # Ok::<(), Box<dyn std::error::Error>>(())
1103 /// ```
1104 #[inline]
1105 pub fn try_captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
1106 &'r self,
1107 cache: &'c mut Cache,
1108 input: I,
1109 ) -> TryCapturesMatches<'r, 'c, 'h> {
1110 let caps = self.create_captures();
1111 let it = iter::Searcher::new(input.into());
1112 TryCapturesMatches { re: self, cache, caps, it }
1113 }
1114}
1115
1116impl BoundedBacktracker {
1117 /// Executes a leftmost forward search and writes the spans of capturing
1118 /// groups that participated in a match into the provided [`Captures`]
1119 /// value. If no match was found, then [`Captures::is_match`] is guaranteed
1120 /// to return `false`.
1121 ///
1122 /// This is like [`BoundedBacktracker::try_captures`], but it accepts a
1123 /// concrete `&Input` instead of an `Into<Input>`.
1124 ///
1125 /// # Errors
1126 ///
1127 /// This routine only errors if the search could not complete. For this
1128 /// backtracking regex engine, this only occurs when the haystack length
1129 /// exceeds [`BoundedBacktracker::max_haystack_len`].
1130 ///
1131 /// When a search cannot complete, callers cannot know whether a match
1132 /// exists or not.
1133 ///
1134 /// # Example: specific pattern search
1135 ///
1136 /// This example shows how to build a multi bounded backtracker that
1137 /// permits searching for specific patterns.
1138 ///
1139 /// ```
1140 /// use regex_automata::{
1141 /// nfa::thompson::backtrack::BoundedBacktracker,
1142 /// Anchored, Input, Match, PatternID,
1143 /// };
1144 ///
1145 /// let re = BoundedBacktracker::new_many(&[
1146 /// "[a-z0-9]{6}",
1147 /// "[a-z][a-z0-9]{5}",
1148 /// ])?;
1149 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1150 /// let haystack = "foo123";
1151 ///
1152 /// // Since we are using the default leftmost-first match and both
1153 /// // patterns match at the same starting position, only the first pattern
1154 /// // will be returned in this case when doing a search for any of the
1155 /// // patterns.
1156 /// let expected = Some(Match::must(0, 0..6));
1157 /// re.try_search(&mut cache, &Input::new(haystack), &mut caps)?;
1158 /// assert_eq!(expected, caps.get_match());
1159 ///
1160 /// // But if we want to check whether some other pattern matches, then we
1161 /// // can provide its pattern ID.
1162 /// let expected = Some(Match::must(1, 0..6));
1163 /// let input = Input::new(haystack)
1164 /// .anchored(Anchored::Pattern(PatternID::must(1)));
1165 /// re.try_search(&mut cache, &input, &mut caps)?;
1166 /// assert_eq!(expected, caps.get_match());
1167 ///
1168 /// # Ok::<(), Box<dyn std::error::Error>>(())
1169 /// ```
1170 ///
1171 /// # Example: specifying the bounds of a search
1172 ///
1173 /// This example shows how providing the bounds of a search can produce
1174 /// different results than simply sub-slicing the haystack.
1175 ///
1176 /// ```
1177 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1178 /// use regex_automata::{
1179 /// nfa::thompson::backtrack::BoundedBacktracker,
1180 /// Match, Input,
1181 /// };
1182 ///
1183 /// let re = BoundedBacktracker::new(r"\b[0-9]{3}\b")?;
1184 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1185 /// let haystack = "foo123bar";
1186 ///
1187 /// // Since we sub-slice the haystack, the search doesn't know about
1188 /// // the larger context and assumes that `123` is surrounded by word
1189 /// // boundaries. And of course, the match position is reported relative
1190 /// // to the sub-slice as well, which means we get `0..3` instead of
1191 /// // `3..6`.
1192 /// let expected = Some(Match::must(0, 0..3));
1193 /// re.try_search(&mut cache, &Input::new(&haystack[3..6]), &mut caps)?;
1194 /// assert_eq!(expected, caps.get_match());
1195 ///
1196 /// // But if we provide the bounds of the search within the context of the
1197 /// // entire haystack, then the search can take the surrounding context
1198 /// // into account. (And if we did find a match, it would be reported
1199 /// // as a valid offset into `haystack` instead of its sub-slice.)
1200 /// let expected = None;
1201 /// re.try_search(
1202 /// &mut cache, &Input::new(haystack).range(3..6), &mut caps,
1203 /// )?;
1204 /// assert_eq!(expected, caps.get_match());
1205 ///
1206 /// # Ok::<(), Box<dyn std::error::Error>>(())
1207 /// ```
1208 #[inline]
1209 pub fn try_search(
1210 &self,
1211 cache: &mut Cache,
1212 input: &Input<'_>,
1213 caps: &mut Captures,
1214 ) -> Result<(), MatchError> {
1215 caps.set_pattern(None);
1216 let pid = self.try_search_slots(cache, input, caps.slots_mut())?;
1217 caps.set_pattern(pid);
1218 Ok(())
1219 }
1220
1221 /// Executes a leftmost forward search and writes the spans of capturing
1222 /// groups that participated in a match into the provided `slots`, and
1223 /// returns the matching pattern ID. The contents of the slots for patterns
1224 /// other than the matching pattern are unspecified. If no match was found,
1225 /// then `None` is returned and the contents of all `slots` is unspecified.
1226 ///
1227 /// This is like [`BoundedBacktracker::try_search`], but it accepts a raw
1228 /// slots slice instead of a `Captures` value. This is useful in contexts
1229 /// where you don't want or need to allocate a `Captures`.
1230 ///
1231 /// It is legal to pass _any_ number of slots to this routine. If the regex
1232 /// engine would otherwise write a slot offset that doesn't fit in the
1233 /// provided slice, then it is simply skipped. In general though, there are
1234 /// usually three slice lengths you might want to use:
1235 ///
1236 /// * An empty slice, if you only care about which pattern matched.
1237 /// * A slice with
1238 /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len)
1239 /// slots, if you only care about the overall match spans for each matching
1240 /// pattern.
1241 /// * A slice with
1242 /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
1243 /// permits recording match offsets for every capturing group in every
1244 /// pattern.
1245 ///
1246 /// # Errors
1247 ///
1248 /// This routine only errors if the search could not complete. For this
1249 /// backtracking regex engine, this only occurs when the haystack length
1250 /// exceeds [`BoundedBacktracker::max_haystack_len`].
1251 ///
1252 /// When a search cannot complete, callers cannot know whether a match
1253 /// exists or not.
1254 ///
1255 /// # Example
1256 ///
1257 /// This example shows how to find the overall match offsets in a
1258 /// multi-pattern search without allocating a `Captures` value. Indeed, we
1259 /// can put our slots right on the stack.
1260 ///
1261 /// ```
1262 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1263 /// use regex_automata::{
1264 /// nfa::thompson::backtrack::BoundedBacktracker,
1265 /// PatternID, Input,
1266 /// };
1267 ///
1268 /// let re = BoundedBacktracker::new_many(&[
1269 /// r"\pL+",
1270 /// r"\d+",
1271 /// ])?;
1272 /// let mut cache = re.create_cache();
1273 /// let input = Input::new("!@#123");
1274 ///
1275 /// // We only care about the overall match offsets here, so we just
1276 /// // allocate two slots for each pattern. Each slot records the start
1277 /// // and end of the match.
1278 /// let mut slots = [None; 4];
1279 /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?;
1280 /// assert_eq!(Some(PatternID::must(1)), pid);
1281 ///
1282 /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
1283 /// // See 'GroupInfo' for more details on the mapping between groups and
1284 /// // slot indices.
1285 /// let slot_start = pid.unwrap().as_usize() * 2;
1286 /// let slot_end = slot_start + 1;
1287 /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
1288 /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
1289 ///
1290 /// # Ok::<(), Box<dyn std::error::Error>>(())
1291 /// ```
1292 #[inline]
1293 pub fn try_search_slots(
1294 &self,
1295 cache: &mut Cache,
1296 input: &Input<'_>,
1297 slots: &mut [Option<NonMaxUsize>],
1298 ) -> Result<Option<PatternID>, MatchError> {
1299 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1300 if !utf8empty {
1301 let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1302 return Ok(maybe_hm.map(|hm| hm.pattern()));
1303 }
1304 // See PikeVM::try_search_slots for why we do this.
1305 let min = self.get_nfa().group_info().implicit_slot_len();
1306 if slots.len() >= min {
1307 let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1308 return Ok(maybe_hm.map(|hm| hm.pattern()));
1309 }
1310 if self.get_nfa().pattern_len() == 1 {
1311 let mut enough = [None, None];
1312 let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1313 // This is OK because we know `enough_slots` is strictly bigger
1314 // than `slots`, otherwise this special case isn't reached.
1315 slots.copy_from_slice(&enough[..slots.len()]);
1316 return Ok(got.map(|hm| hm.pattern()));
1317 }
1318 let mut enough = vec![None; min];
1319 let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1320 // This is OK because we know `enough_slots` is strictly bigger than
1321 // `slots`, otherwise this special case isn't reached.
1322 slots.copy_from_slice(&enough[..slots.len()]);
1323 Ok(got.map(|hm| hm.pattern()))
1324 }
1325
1326 /// This is the actual implementation of `try_search_slots_imp` that
1327 /// doesn't account for the special case when 1) the NFA has UTF-8 mode
1328 /// enabled, 2) the NFA can match the empty string and 3) the caller has
1329 /// provided an insufficient number of slots to record match offsets.
1330 #[inline(never)]
1331 fn try_search_slots_imp(
1332 &self,
1333 cache: &mut Cache,
1334 input: &Input<'_>,
1335 slots: &mut [Option<NonMaxUsize>],
1336 ) -> Result<Option<HalfMatch>, MatchError> {
1337 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1338 let hm = match self.search_imp(cache, input, slots)? {
1339 None => return Ok(None),
1340 Some(hm) if !utf8empty => return Ok(Some(hm)),
1341 Some(hm) => hm,
1342 };
1343 empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
1344 Ok(self
1345 .search_imp(cache, input, slots)?
1346 .map(|hm| (hm, hm.offset())))
1347 })
1348 }
1349
1350 /// The implementation of standard leftmost backtracking search.
1351 ///
1352 /// Capturing group spans are written to 'caps', but only if requested.
1353 /// 'caps' can be one of three things: 1) totally empty, in which case, we
1354 /// only report the pattern that matched or 2) only has slots for recording
1355 /// the overall match offsets for any pattern or 3) has all slots available
1356 /// for recording the spans of any groups participating in a match.
1357 fn search_imp(
1358 &self,
1359 cache: &mut Cache,
1360 input: &Input<'_>,
1361 slots: &mut [Option<NonMaxUsize>],
1362 ) -> Result<Option<HalfMatch>, MatchError> {
1363 // Unlike in the PikeVM, we write our capturing group spans directly
1364 // into the caller's captures groups. So we have to make sure we're
1365 // starting with a blank slate first. In the PikeVM, we avoid this
1366 // by construction: the spans that are copied to every slot in the
1367 // 'Captures' value already account for presence/absence. In this
1368 // backtracker, we write directly into the caller provided slots, where
1369 // as in the PikeVM, we write into scratch space first and only copy
1370 // them to the caller provided slots when a match is found.
1371 for slot in slots.iter_mut() {
1372 *slot = None;
1373 }
1374 cache.setup_search(&self, input)?;
1375 if input.is_done() {
1376 return Ok(None);
1377 }
1378 let (anchored, start_id) = match input.get_anchored() {
1379 // Only way we're unanchored is if both the caller asked for an
1380 // unanchored search *and* the pattern is itself not anchored.
1381 Anchored::No => (
1382 self.nfa.is_always_start_anchored(),
1383 // We always use the anchored starting state here, even if
1384 // doing an unanchored search. The "unanchored" part of it is
1385 // implemented in the loop below, by simply trying the next
1386 // byte offset if the previous backtracking exploration failed.
1387 self.nfa.start_anchored(),
1388 ),
1389 Anchored::Yes => (true, self.nfa.start_anchored()),
1390 Anchored::Pattern(pid) => match self.nfa.start_pattern(pid) {
1391 None => return Ok(None),
1392 Some(sid) => (true, sid),
1393 },
1394 };
1395 if anchored {
1396 let at = input.start();
1397 return Ok(self.backtrack(cache, input, at, start_id, slots));
1398 }
1399 let pre = self.get_config().get_prefilter();
1400 let mut at = input.start();
1401 while at <= input.end() {
1402 if let Some(ref pre) = pre {
1403 let span = Span::from(at..input.end());
1404 match pre.find(input.haystack(), span) {
1405 None => break,
1406 Some(ref span) => at = span.start,
1407 }
1408 }
1409 if let Some(hm) = self.backtrack(cache, input, at, start_id, slots)
1410 {
1411 return Ok(Some(hm));
1412 }
1413 at += 1;
1414 }
1415 Ok(None)
1416 }
1417
1418 /// Look for a match starting at `at` in `input` and write the matching
1419 /// pattern ID and group spans to `caps`. The search uses `start_id` as its
1420 /// starting state in the underlying NFA.
1421 ///
1422 /// If no match was found, then the caller should increment `at` and try
1423 /// at the next position.
1424 #[cfg_attr(feature = "perf-inline", inline(always))]
1425 fn backtrack(
1426 &self,
1427 cache: &mut Cache,
1428 input: &Input<'_>,
1429 at: usize,
1430 start_id: StateID,
1431 slots: &mut [Option<NonMaxUsize>],
1432 ) -> Option<HalfMatch> {
1433 cache.stack.push(Frame::Step { sid: start_id, at });
1434 while let Some(frame) = cache.stack.pop() {
1435 match frame {
1436 Frame::Step { sid, at } => {
1437 if let Some(hm) = self.step(cache, input, sid, at, slots) {
1438 return Some(hm);
1439 }
1440 }
1441 Frame::RestoreCapture { slot, offset } => {
1442 slots[slot] = offset;
1443 }
1444 }
1445 }
1446 None
1447 }
1448
1449 // LAMENTATION: The actual backtracking search is implemented in about
1450 // 75 lines below. Yet this file is over 2,000 lines long. What have I
1451 // done?
1452
1453 /// Execute a "step" in the backtracing algorithm.
1454 ///
1455 /// A "step" is somewhat of a misnomer, because this routine keeps going
1456 /// until it either runs out of things to try or fins a match. In the
1457 /// former case, it may have pushed some things on to the backtracking
1458 /// stack, in which case, those will be tried next as part of the
1459 /// 'backtrack' routine above.
1460 #[cfg_attr(feature = "perf-inline", inline(always))]
1461 fn step(
1462 &self,
1463 cache: &mut Cache,
1464 input: &Input<'_>,
1465 mut sid: StateID,
1466 mut at: usize,
1467 slots: &mut [Option<NonMaxUsize>],
1468 ) -> Option<HalfMatch> {
1469 loop {
1470 if !cache.visited.insert(sid, at - input.start()) {
1471 return None;
1472 }
1473 match *self.nfa.state(sid) {
1474 State::ByteRange { ref trans } => {
1475 // Why do we need this? Unlike other regex engines in this
1476 // crate, the backtracker can steam roll ahead in the
1477 // haystack outside of the main loop over the bytes in the
1478 // haystack. While 'trans.matches()' below handles the case
1479 // of 'at' being out of bounds of 'input.haystack()', we
1480 // also need to handle the case of 'at' going out of bounds
1481 // of the span the caller asked to search.
1482 //
1483 // We should perhaps make the 'trans.matches()' API accept
1484 // an '&Input' instead of a '&[u8]'. Or at least, add a new
1485 // API that does it.
1486 if at >= input.end() {
1487 return None;
1488 }
1489 if !trans.matches(input.haystack(), at) {
1490 return None;
1491 }
1492 sid = trans.next;
1493 at += 1;
1494 }
1495 State::Sparse(ref sparse) => {
1496 if at >= input.end() {
1497 return None;
1498 }
1499 sid = sparse.matches(input.haystack(), at)?;
1500 at += 1;
1501 }
1502 State::Dense(ref dense) => {
1503 if at >= input.end() {
1504 return None;
1505 }
1506 sid = dense.matches(input.haystack(), at)?;
1507 at += 1;
1508 }
1509 State::Look { look, next } => {
1510 // OK because we don't permit building a searcher with a
1511 // Unicode word boundary if the requisite Unicode data is
1512 // unavailable.
1513 if !self.nfa.look_matcher().matches_inline(
1514 look,
1515 input.haystack(),
1516 at,
1517 ) {
1518 return None;
1519 }
1520 sid = next;
1521 }
1522 State::Union { ref alternates } => {
1523 sid = match alternates.get(0) {
1524 None => return None,
1525 Some(&sid) => sid,
1526 };
1527 cache.stack.extend(
1528 alternates[1..]
1529 .iter()
1530 .copied()
1531 .rev()
1532 .map(|sid| Frame::Step { sid, at }),
1533 );
1534 }
1535 State::BinaryUnion { alt1, alt2 } => {
1536 sid = alt1;
1537 cache.stack.push(Frame::Step { sid: alt2, at });
1538 }
1539 State::Capture { next, slot, .. } => {
1540 if slot.as_usize() < slots.len() {
1541 cache.stack.push(Frame::RestoreCapture {
1542 slot,
1543 offset: slots[slot],
1544 });
1545 slots[slot] = NonMaxUsize::new(at);
1546 }
1547 sid = next;
1548 }
1549 State::Fail => return None,
1550 State::Match { pattern_id } => {
1551 return Some(HalfMatch::new(pattern_id, at));
1552 }
1553 }
1554 }
1555 }
1556}
1557
1558/// An iterator over all non-overlapping matches for a fallible search.
1559///
1560/// The iterator yields a `Result<Match, MatchError` value until no more
1561/// matches could be found.
1562///
1563/// The lifetime parameters are as follows:
1564///
1565/// * `'r` represents the lifetime of the BoundedBacktracker.
1566/// * `'c` represents the lifetime of the BoundedBacktracker's cache.
1567/// * `'h` represents the lifetime of the haystack being searched.
1568///
1569/// This iterator can be created with the [`BoundedBacktracker::try_find_iter`]
1570/// method.
1571#[derive(Debug)]
1572pub struct TryFindMatches<'r, 'c, 'h> {
1573 re: &'r BoundedBacktracker,
1574 cache: &'c mut Cache,
1575 caps: Captures,
1576 it: iter::Searcher<'h>,
1577}
1578
1579impl<'r, 'c, 'h> Iterator for TryFindMatches<'r, 'c, 'h> {
1580 type Item = Result<Match, MatchError>;
1581
1582 #[inline]
1583 fn next(&mut self) -> Option<Result<Match, MatchError>> {
1584 // Splitting 'self' apart seems necessary to appease borrowck.
1585 let TryFindMatches { re, ref mut cache, ref mut caps, ref mut it } =
1586 *self;
1587 it.try_advance(|input| {
1588 re.try_search(cache, input, caps)?;
1589 Ok(caps.get_match())
1590 })
1591 .transpose()
1592 }
1593}
1594
1595/// An iterator over all non-overlapping leftmost matches, with their capturing
1596/// groups, for a fallible search.
1597///
1598/// The iterator yields a `Result<Captures, MatchError>` value until no more
1599/// matches could be found.
1600///
1601/// The lifetime parameters are as follows:
1602///
1603/// * `'r` represents the lifetime of the BoundedBacktracker.
1604/// * `'c` represents the lifetime of the BoundedBacktracker's cache.
1605/// * `'h` represents the lifetime of the haystack being searched.
1606///
1607/// This iterator can be created with the
1608/// [`BoundedBacktracker::try_captures_iter`] method.
1609#[derive(Debug)]
1610pub struct TryCapturesMatches<'r, 'c, 'h> {
1611 re: &'r BoundedBacktracker,
1612 cache: &'c mut Cache,
1613 caps: Captures,
1614 it: iter::Searcher<'h>,
1615}
1616
1617impl<'r, 'c, 'h> Iterator for TryCapturesMatches<'r, 'c, 'h> {
1618 type Item = Result<Captures, MatchError>;
1619
1620 #[inline]
1621 fn next(&mut self) -> Option<Result<Captures, MatchError>> {
1622 // Splitting 'self' apart seems necessary to appease borrowck.
1623 let TryCapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
1624 *self;
1625 let _ = it
1626 .try_advance(|input| {
1627 re.try_search(cache, input, caps)?;
1628 Ok(caps.get_match())
1629 })
1630 .transpose()?;
1631 if caps.is_match() {
1632 Some(Ok(caps.clone()))
1633 } else {
1634 None
1635 }
1636 }
1637}
1638
1639/// A cache represents mutable state that a [`BoundedBacktracker`] requires
1640/// during a search.
1641///
1642/// For a given [`BoundedBacktracker`], its corresponding cache may be created
1643/// either via [`BoundedBacktracker::create_cache`], or via [`Cache::new`].
1644/// They are equivalent in every way, except the former does not require
1645/// explicitly importing `Cache`.
1646///
1647/// A particular `Cache` is coupled with the [`BoundedBacktracker`] from which
1648/// it was created. It may only be used with that `BoundedBacktracker`. A cache
1649/// and its allocations may be re-purposed via [`Cache::reset`], in which case,
1650/// it can only be used with the new `BoundedBacktracker` (and not the old
1651/// one).
1652#[derive(Clone, Debug)]
1653pub struct Cache {
1654 /// Stack used on the heap for doing backtracking instead of the
1655 /// traditional recursive approach. We don't want recursion because then
1656 /// we're likely to hit a stack overflow for bigger regexes.
1657 stack: Vec<Frame>,
1658 /// The set of (StateID, HaystackOffset) pairs that have been visited
1659 /// by the backtracker within a single search. If such a pair has been
1660 /// visited, then we avoid doing the work for that pair again. This is
1661 /// what "bounds" the backtracking and prevents it from having worst case
1662 /// exponential time.
1663 visited: Visited,
1664}
1665
1666impl Cache {
1667 /// Create a new [`BoundedBacktracker`] cache.
1668 ///
1669 /// A potentially more convenient routine to create a cache is
1670 /// [`BoundedBacktracker::create_cache`], as it does not require also
1671 /// importing the `Cache` type.
1672 ///
1673 /// If you want to reuse the returned `Cache` with some other
1674 /// `BoundedBacktracker`, then you must call [`Cache::reset`] with the
1675 /// desired `BoundedBacktracker`.
1676 pub fn new(re: &BoundedBacktracker) -> Cache {
1677 Cache { stack: vec![], visited: Visited::new(re) }
1678 }
1679
1680 /// Reset this cache such that it can be used for searching with different
1681 /// [`BoundedBacktracker`].
1682 ///
1683 /// A cache reset permits reusing memory already allocated in this cache
1684 /// with a different `BoundedBacktracker`.
1685 ///
1686 /// # Example
1687 ///
1688 /// This shows how to re-purpose a cache for use with a different
1689 /// `BoundedBacktracker`.
1690 ///
1691 /// ```
1692 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1693 /// use regex_automata::{
1694 /// nfa::thompson::backtrack::BoundedBacktracker,
1695 /// Match,
1696 /// };
1697 ///
1698 /// let re1 = BoundedBacktracker::new(r"\w")?;
1699 /// let re2 = BoundedBacktracker::new(r"\W")?;
1700 ///
1701 /// let mut cache = re1.create_cache();
1702 /// assert_eq!(
1703 /// Some(Ok(Match::must(0, 0..2))),
1704 /// re1.try_find_iter(&mut cache, "Δ").next(),
1705 /// );
1706 ///
1707 /// // Using 'cache' with re2 is not allowed. It may result in panics or
1708 /// // incorrect results. In order to re-purpose the cache, we must reset
1709 /// // it with the BoundedBacktracker we'd like to use it with.
1710 /// //
1711 /// // Similarly, after this reset, using the cache with 're1' is also not
1712 /// // allowed.
1713 /// cache.reset(&re2);
1714 /// assert_eq!(
1715 /// Some(Ok(Match::must(0, 0..3))),
1716 /// re2.try_find_iter(&mut cache, "☃").next(),
1717 /// );
1718 ///
1719 /// # Ok::<(), Box<dyn std::error::Error>>(())
1720 /// ```
1721 pub fn reset(&mut self, re: &BoundedBacktracker) {
1722 self.visited.reset(re);
1723 }
1724
1725 /// Returns the heap memory usage, in bytes, of this cache.
1726 ///
1727 /// This does **not** include the stack size used up by this cache. To
1728 /// compute that, use `std::mem::size_of::<Cache>()`.
1729 pub fn memory_usage(&self) -> usize {
1730 self.stack.len() * core::mem::size_of::<Frame>()
1731 + self.visited.memory_usage()
1732 }
1733
1734 /// Clears this cache. This should be called at the start of every search
1735 /// to ensure we start with a clean slate.
1736 ///
1737 /// This also sets the length of the capturing groups used in the current
1738 /// search. This permits an optimization where by 'SlotTable::for_state'
1739 /// only returns the number of slots equivalent to the number of slots
1740 /// given in the 'Captures' value. This may be less than the total number
1741 /// of possible slots, e.g., when one only wants to track overall match
1742 /// offsets. This in turn permits less copying of capturing group spans
1743 /// in the BoundedBacktracker.
1744 fn setup_search(
1745 &mut self,
1746 re: &BoundedBacktracker,
1747 input: &Input<'_>,
1748 ) -> Result<(), MatchError> {
1749 self.stack.clear();
1750 self.visited.setup_search(re, input)?;
1751 Ok(())
1752 }
1753}
1754
1755/// Represents a stack frame on the heap while doing backtracking.
1756///
1757/// Instead of using explicit recursion for backtracking, we use a stack on
1758/// the heap to keep track of things that we want to explore if the current
1759/// backtracking branch turns out to not lead to a match.
1760#[derive(Clone, Debug)]
1761enum Frame {
1762 /// Look for a match starting at `sid` and the given position in the
1763 /// haystack.
1764 Step { sid: StateID, at: usize },
1765 /// Reset the given `slot` to the given `offset` (which might be `None`).
1766 /// This effectively gives a "scope" to capturing groups, such that an
1767 /// offset for a particular group only gets returned if the match goes
1768 /// through that capturing group. If backtracking ends up going down a
1769 /// different branch that results in a different offset (or perhaps none at
1770 /// all), then this "restore capture" frame will cause the offset to get
1771 /// reset.
1772 RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
1773}
1774
1775/// A bitset that keeps track of whether a particular (StateID, offset) has
1776/// been considered during backtracking. If it has already been visited, then
1777/// backtracking skips it. This is what gives backtracking its "bound."
1778#[derive(Clone, Debug)]
1779struct Visited {
1780 /// The actual underlying bitset. Each element in the bitset corresponds
1781 /// to a particular (StateID, offset) pair. States correspond to the rows
1782 /// and the offsets correspond to the columns.
1783 ///
1784 /// If our underlying NFA has N states and the haystack we're searching
1785 /// has M bytes, then we have N*(M+1) entries in our bitset table. The
1786 /// M+1 occurs because our matches are delayed by one byte (to support
1787 /// look-around), and so we need to handle the end position itself rather
1788 /// than stopping just before the end. (If there is no end position, then
1789 /// it's treated as "end-of-input," which is matched by things like '$'.)
1790 ///
1791 /// Given BITS=N*(M+1), we wind up with div_ceil(BITS, sizeof(usize))
1792 /// blocks.
1793 ///
1794 /// We use 'usize' to represent our blocks because it makes some of the
1795 /// arithmetic in 'insert' a bit nicer. For example, if we used 'u32' for
1796 /// our block, we'd either need to cast u32s to usizes or usizes to u32s.
1797 bitset: Vec<usize>,
1798 /// The stride represents one plus length of the haystack we're searching
1799 /// (as described above). The stride must be initialized for each search.
1800 stride: usize,
1801}
1802
1803impl Visited {
1804 /// The size of each block, in bits.
1805 const BLOCK_SIZE: usize = 8 * core::mem::size_of::<usize>();
1806
1807 /// Create a new visited set for the given backtracker.
1808 ///
1809 /// The set is ready to use, but must be setup at the beginning of each
1810 /// search by calling `setup_search`.
1811 fn new(re: &BoundedBacktracker) -> Visited {
1812 let mut visited = Visited { bitset: vec![], stride: 0 };
1813 visited.reset(re);
1814 visited
1815 }
1816
1817 /// Insert the given (StateID, offset) pair into this set. If it already
1818 /// exists, then this is a no-op and it returns false. Otherwise this
1819 /// returns true.
1820 fn insert(&mut self, sid: StateID, at: usize) -> bool {
1821 let table_index = sid.as_usize() * self.stride + at;
1822 let block_index = table_index / Visited::BLOCK_SIZE;
1823 let bit = table_index % Visited::BLOCK_SIZE;
1824 let block_with_bit = 1 << bit;
1825 if self.bitset[block_index] & block_with_bit != 0 {
1826 return false;
1827 }
1828 self.bitset[block_index] |= block_with_bit;
1829 true
1830 }
1831
1832 /// Reset this visited set to work with the given bounded backtracker.
1833 fn reset(&mut self, _: &BoundedBacktracker) {
1834 self.bitset.truncate(0);
1835 }
1836
1837 /// Setup this visited set to work for a search using the given NFA
1838 /// and input configuration. The NFA must be the same NFA used by the
1839 /// BoundedBacktracker given to Visited::reset. Failing to call this might
1840 /// result in panics or silently incorrect search behavior.
1841 fn setup_search(
1842 &mut self,
1843 re: &BoundedBacktracker,
1844 input: &Input<'_>,
1845 ) -> Result<(), MatchError> {
1846 // Our haystack length is only the length of the span of the entire
1847 // haystack that we'll be searching.
1848 let haylen = input.get_span().len();
1849 let err = || MatchError::haystack_too_long(haylen);
1850 // Our stride is one more than the length of the input because our main
1851 // search loop includes the position at input.end(). (And it does this
1852 // because matches are delayed by one byte to account for look-around.)
1853 self.stride = haylen + 1;
1854 let needed_capacity =
1855 match re.get_nfa().states().len().checked_mul(self.stride) {
1856 None => return Err(err()),
1857 Some(capacity) => capacity,
1858 };
1859 let max_capacity = 8 * re.get_config().get_visited_capacity();
1860 if needed_capacity > max_capacity {
1861 return Err(err());
1862 }
1863 let needed_blocks = div_ceil(needed_capacity, Visited::BLOCK_SIZE);
1864 self.bitset.truncate(needed_blocks);
1865 for block in self.bitset.iter_mut() {
1866 *block = 0;
1867 }
1868 if needed_blocks > self.bitset.len() {
1869 self.bitset.resize(needed_blocks, 0);
1870 }
1871 Ok(())
1872 }
1873
1874 /// Return the heap memory usage, in bytes, of this visited set.
1875 fn memory_usage(&self) -> usize {
1876 self.bitset.len() * core::mem::size_of::<usize>()
1877 }
1878}
1879
1880/// Integer division, but rounds up instead of down.
1881fn div_ceil(lhs: usize, rhs: usize) -> usize {
1882 if lhs % rhs == 0 {
1883 lhs / rhs
1884 } else {
1885 (lhs / rhs) + 1
1886 }
1887}
1888
1889#[cfg(test)]
1890mod tests {
1891 use super::*;
1892
1893 // This is a regression test for the maximum haystack length computation.
1894 // Previously, it assumed that the total capacity of the backtracker's
1895 // bitset would always be greater than the number of NFA states. But there
1896 // is of course no guarantee that this is true. This regression test
1897 // ensures that not only does `max_haystack_len` not panic, but that it
1898 // should return `0`.
1899 #[cfg(feature = "syntax")]
1900 #[test]
1901 fn max_haystack_len_overflow() {
1902 let re = BoundedBacktracker::builder()
1903 .configure(BoundedBacktracker::config().visited_capacity(10))
1904 .build(r"[0-9A-Za-z]{100}")
1905 .unwrap();
1906 assert_eq!(0, re.max_haystack_len());
1907 }
1908}
1909