1/*!
2An NFA backed Pike VM for executing regex searches with capturing groups.
3
4This module provides a [`PikeVM`] that works by simulating an NFA and
5resolving all spans of capturing groups that participate in a match.
6*/
7
8#[cfg(feature = "internal-instrument-pikevm")]
9use core::cell::RefCell;
10
11use alloc::{vec, vec::Vec};
12
13use crate::{
14 nfa::thompson::{self, BuildError, State, NFA},
15 util::{
16 captures::Captures,
17 empty, iter,
18 prefilter::Prefilter,
19 primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
20 search::{
21 Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span,
22 },
23 sparse_set::SparseSet,
24 },
25};
26
27/// A simple macro for conditionally executing instrumentation logic when
28/// the 'trace' log level is enabled. This is a compile-time no-op when the
29/// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that
30/// this makes it easier to avoid doing extra work when instrumentation isn't
31/// enabled.
32///
33/// This macro accepts a closure of type `|&mut Counters|`. The closure can
34/// then increment counters (or whatever) in accordance with what one wants
35/// to track.
36macro_rules! instrument {
37 ($fun:expr) => {
38 #[cfg(feature = "internal-instrument-pikevm")]
39 {
40 let fun: &mut dyn FnMut(&mut Counters) = &mut $fun;
41 COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut()));
42 }
43 };
44}
45
46#[cfg(feature = "internal-instrument-pikevm")]
47std::thread_local! {
48 /// Effectively global state used to keep track of instrumentation
49 /// counters. The "proper" way to do this is to thread it through the
50 /// PikeVM, but it makes the code quite icky. Since this is just a
51 /// debugging feature, we're content to relegate it to thread local
52 /// state. When instrumentation is enabled, the counters are reset at the
53 /// beginning of every search and printed (with the 'trace' log level) at
54 /// the end of every search.
55 static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty());
56}
57
58/// The configuration used for building a [`PikeVM`].
59///
60/// A PikeVM configuration is a simple data object that is typically used with
61/// [`Builder::configure`]. It can be cheaply cloned.
62///
63/// A default configuration can be created either with `Config::new`, or
64/// perhaps more conveniently, with [`PikeVM::config`].
65#[derive(Clone, Debug, Default)]
66pub struct Config {
67 match_kind: Option<MatchKind>,
68 pre: Option<Option<Prefilter>>,
69}
70
71impl Config {
72 /// Return a new default PikeVM configuration.
73 pub fn new() -> Config {
74 Config::default()
75 }
76
77 /// Set the desired match semantics.
78 ///
79 /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
80 /// match semantics of Perl-like regex engines. That is, when multiple
81 /// patterns would match at the same leftmost position, the pattern that
82 /// appears first in the concrete syntax is chosen.
83 ///
84 /// Currently, the only other kind of match semantics supported is
85 /// [`MatchKind::All`]. This corresponds to "classical DFA" construction
86 /// where all possible matches are visited in the NFA by the `PikeVM`.
87 ///
88 /// Typically, `All` is used when one wants to execute an overlapping
89 /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
90 /// sense to use `All` with the various "leftmost" find routines, since the
91 /// leftmost routines depend on the `LeftmostFirst` automata construction
92 /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM`
93 /// simulating dead states as a way to terminate the search and report a
94 /// match. `LeftmostFirst` also supports non-greedy matches using this
95 /// strategy where as `All` does not.
96 pub fn match_kind(mut self, kind: MatchKind) -> Config {
97 self.match_kind = Some(kind);
98 self
99 }
100
101 /// Set a prefilter to be used whenever a start state is entered.
102 ///
103 /// A [`Prefilter`] in this context is meant to accelerate searches by
104 /// looking for literal prefixes that every match for the corresponding
105 /// pattern (or patterns) must start with. Once a prefilter produces a
106 /// match, the underlying search routine continues on to try and confirm
107 /// the match.
108 ///
109 /// Be warned that setting a prefilter does not guarantee that the search
110 /// will be faster. While it's usually a good bet, if the prefilter
111 /// produces a lot of false positive candidates (i.e., positions matched
112 /// by the prefilter but not by the regex), then the overall result can
113 /// be slower than if you had just executed the regex engine without any
114 /// prefilters.
115 ///
116 /// By default no prefilter is set.
117 ///
118 /// # Example
119 ///
120 /// ```
121 /// use regex_automata::{
122 /// nfa::thompson::pikevm::PikeVM,
123 /// util::prefilter::Prefilter,
124 /// Input, Match, MatchKind,
125 /// };
126 ///
127 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
128 /// let re = PikeVM::builder()
129 /// .configure(PikeVM::config().prefilter(pre))
130 /// .build(r"(foo|bar)[a-z]+")?;
131 /// let mut cache = re.create_cache();
132 /// let input = Input::new("foo1 barfox bar");
133 /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input));
134 ///
135 /// # Ok::<(), Box<dyn std::error::Error>>(())
136 /// ```
137 ///
138 /// Be warned though that an incorrect prefilter can lead to incorrect
139 /// results!
140 ///
141 /// ```
142 /// use regex_automata::{
143 /// nfa::thompson::pikevm::PikeVM,
144 /// util::prefilter::Prefilter,
145 /// Input, HalfMatch, MatchKind,
146 /// };
147 ///
148 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
149 /// let re = PikeVM::builder()
150 /// .configure(PikeVM::config().prefilter(pre))
151 /// .build(r"(foo|bar)[a-z]+")?;
152 /// let mut cache = re.create_cache();
153 /// let input = Input::new("foo1 barfox bar");
154 /// // No match reported even though there clearly is one!
155 /// assert_eq!(None, re.find(&mut cache, input));
156 ///
157 /// # Ok::<(), Box<dyn std::error::Error>>(())
158 /// ```
159 pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
160 self.pre = Some(pre);
161 self
162 }
163
164 /// Returns the match semantics set in this configuration.
165 pub fn get_match_kind(&self) -> MatchKind {
166 self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
167 }
168
169 /// Returns the prefilter set in this configuration, if one at all.
170 pub fn get_prefilter(&self) -> Option<&Prefilter> {
171 self.pre.as_ref().unwrap_or(&None).as_ref()
172 }
173
174 /// Overwrite the default configuration such that the options in `o` are
175 /// always used. If an option in `o` is not set, then the corresponding
176 /// option in `self` is used. If it's not set in `self` either, then it
177 /// remains not set.
178 pub(crate) fn overwrite(&self, o: Config) -> Config {
179 Config {
180 match_kind: o.match_kind.or(self.match_kind),
181 pre: o.pre.or_else(|| self.pre.clone()),
182 }
183 }
184}
185
186/// A builder for a `PikeVM`.
187///
188/// This builder permits configuring options for the syntax of a pattern,
189/// the NFA construction and the `PikeVM` construction. This builder is
190/// different from a general purpose regex builder in that it permits fine
191/// grain configuration of the construction process. The trade off for this is
192/// complexity, and the possibility of setting a configuration that might not
193/// make sense. For example, there are two different UTF-8 modes:
194///
195/// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8)
196/// controls whether the pattern itself can contain sub-expressions that match
197/// invalid UTF-8.
198/// * [`thompson::Config::utf8`] controls whether empty matches that split a
199/// Unicode codepoint are reported or not.
200///
201/// Generally speaking, callers will want to either enable all of these or
202/// disable all of these.
203///
204/// # Example
205///
206/// This example shows how to disable UTF-8 mode in the syntax and the regex
207/// itself. This is generally what you want for matching on arbitrary bytes.
208///
209/// ```
210/// use regex_automata::{
211/// nfa::thompson::{self, pikevm::PikeVM},
212/// util::syntax,
213/// Match,
214/// };
215///
216/// let re = PikeVM::builder()
217/// .syntax(syntax::Config::new().utf8(false))
218/// .thompson(thompson::Config::new().utf8(false))
219/// .build(r"foo(?-u:[^b])ar.*")?;
220/// let mut cache = re.create_cache();
221///
222/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
223/// let expected = Some(Match::must(0, 1..9));
224/// let got = re.find_iter(&mut cache, haystack).next();
225/// assert_eq!(expected, got);
226/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
227/// // but the subsequent `.*` does not! Disabling UTF-8
228/// // on the syntax permits this.
229/// //
230/// // N.B. This example does not show the impact of
231/// // disabling UTF-8 mode on a PikeVM Config, since that
232/// // only impacts regexes that can produce matches of
233/// // length 0.
234/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
235///
236/// # Ok::<(), Box<dyn std::error::Error>>(())
237/// ```
238#[derive(Clone, Debug)]
239pub struct Builder {
240 config: Config,
241 #[cfg(feature = "syntax")]
242 thompson: thompson::Compiler,
243}
244
245impl Builder {
246 /// Create a new PikeVM builder with its default configuration.
247 pub fn new() -> Builder {
248 Builder {
249 config: Config::default(),
250 #[cfg(feature = "syntax")]
251 thompson: thompson::Compiler::new(),
252 }
253 }
254
255 /// Build a `PikeVM` from the given pattern.
256 ///
257 /// If there was a problem parsing or compiling the pattern, then an error
258 /// is returned.
259 #[cfg(feature = "syntax")]
260 pub fn build(&self, pattern: &str) -> Result<PikeVM, BuildError> {
261 self.build_many(&[pattern])
262 }
263
264 /// Build a `PikeVM` from the given patterns.
265 #[cfg(feature = "syntax")]
266 pub fn build_many<P: AsRef<str>>(
267 &self,
268 patterns: &[P],
269 ) -> Result<PikeVM, BuildError> {
270 let nfa = self.thompson.build_many(patterns)?;
271 self.build_from_nfa(nfa)
272 }
273
274 /// Build a `PikeVM` directly from its NFA.
275 ///
276 /// Note that when using this method, any configuration that applies to the
277 /// construction of the NFA itself will of course be ignored, since the NFA
278 /// given here is already built.
279 pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> {
280 nfa.look_set_any().available().map_err(BuildError::word)?;
281 Ok(PikeVM { config: self.config.clone(), nfa })
282 }
283
284 /// Apply the given `PikeVM` configuration options to this builder.
285 pub fn configure(&mut self, config: Config) -> &mut Builder {
286 self.config = self.config.overwrite(config);
287 self
288 }
289
290 /// Set the syntax configuration for this builder using
291 /// [`syntax::Config`](crate::util::syntax::Config).
292 ///
293 /// This permits setting things like case insensitivity, Unicode and multi
294 /// line mode.
295 ///
296 /// These settings only apply when constructing a PikeVM directly from a
297 /// pattern.
298 #[cfg(feature = "syntax")]
299 pub fn syntax(
300 &mut self,
301 config: crate::util::syntax::Config,
302 ) -> &mut Builder {
303 self.thompson.syntax(config);
304 self
305 }
306
307 /// Set the Thompson NFA configuration for this builder using
308 /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
309 ///
310 /// This permits setting things like if additional time should be spent
311 /// shrinking the size of the NFA.
312 ///
313 /// These settings only apply when constructing a PikeVM directly from a
314 /// pattern.
315 #[cfg(feature = "syntax")]
316 pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
317 self.thompson.configure(config);
318 self
319 }
320}
321
322/// A virtual machine for executing regex searches with capturing groups.
323///
324/// # Infallible APIs
325///
326/// Unlike most other regex engines in this crate, a `PikeVM` never returns an
327/// error at search time. It supports all [`Anchored`] configurations, never
328/// quits and works on haystacks of arbitrary length.
329///
330/// There are two caveats to mention though:
331///
332/// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`],
333/// then the PikeVM will report "no match." This is consistent with all other
334/// regex engines in this crate.
335/// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`]
336/// that has insufficient capacity to store all valid pattern IDs, then if a
337/// match occurs for a `PatternID` that cannot be inserted, it is silently
338/// dropped as if it did not match.
339///
340/// # Advice
341///
342/// The `PikeVM` is generally the most "powerful" regex engine in this crate.
343/// "Powerful" in this context means that it can handle any regular expression
344/// that is parseable by `regex-syntax` and any size haystack. Regretably,
345/// the `PikeVM` is also simultaneously often the _slowest_ regex engine in
346/// practice. This results in an annoying situation where one generally tries
347/// to pick any other regex engine (or perhaps none at all) before being
348/// forced to fall back to a `PikeVM`.
349///
350/// For example, a common strategy for dealing with capturing groups is to
351/// actually look for the overall match of the regex using a faster regex
352/// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall
353/// match is found, one can then run the `PikeVM` on just the match span to
354/// find the spans of the capturing groups. In this way, the faster regex
355/// engine does the majority of the work, while the `PikeVM` only lends its
356/// power in a more limited role.
357///
358/// Unfortunately, this isn't always possible because the faster regex engines
359/// don't support all of the regex features in `regex-syntax`. This notably
360/// includes (and is currently limited to) Unicode word boundaries. So if
361/// your pattern has Unicode word boundaries, you typically can't use a
362/// DFA-based regex engine at all (unless you [enable heuristic support for
363/// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass
364/// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for
365/// anchored searches only, but in a cruel sort of joke, many Unicode features
366/// tend to result in making the regex _not_ one-pass.)
367///
368/// # Example
369///
370/// This example shows that the `PikeVM` implements Unicode word boundaries
371/// correctly by default.
372///
373/// ```
374/// # if cfg!(miri) { return Ok(()); } // miri takes too long
375/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
376///
377/// let re = PikeVM::new(r"\b\w+\b")?;
378/// let mut cache = re.create_cache();
379///
380/// let mut it = re.find_iter(&mut cache, "Шерлок Холмс");
381/// assert_eq!(Some(Match::must(0, 0..12)), it.next());
382/// assert_eq!(Some(Match::must(0, 13..23)), it.next());
383/// assert_eq!(None, it.next());
384/// # Ok::<(), Box<dyn std::error::Error>>(())
385/// ```
386#[derive(Clone, Debug)]
387pub struct PikeVM {
388 config: Config,
389 nfa: NFA,
390}
391
392impl PikeVM {
393 /// Parse the given regular expression using the default configuration and
394 /// return the corresponding `PikeVM`.
395 ///
396 /// If you want a non-default configuration, then use the [`Builder`] to
397 /// set your own configuration.
398 ///
399 /// # Example
400 ///
401 /// ```
402 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
403 ///
404 /// let re = PikeVM::new("foo[0-9]+bar")?;
405 /// let mut cache = re.create_cache();
406 /// assert_eq!(
407 /// Some(Match::must(0, 3..14)),
408 /// re.find_iter(&mut cache, "zzzfoo12345barzzz").next(),
409 /// );
410 /// # Ok::<(), Box<dyn std::error::Error>>(())
411 /// ```
412 #[cfg(feature = "syntax")]
413 pub fn new(pattern: &str) -> Result<PikeVM, BuildError> {
414 PikeVM::builder().build(pattern)
415 }
416
417 /// Like `new`, but parses multiple patterns into a single "multi regex."
418 /// This similarly uses the default regex configuration.
419 ///
420 /// # Example
421 ///
422 /// ```
423 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
424 ///
425 /// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?;
426 /// let mut cache = re.create_cache();
427 ///
428 /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");
429 /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
430 /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
431 /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
432 /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
433 /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
434 /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
435 /// assert_eq!(None, it.next());
436 /// # Ok::<(), Box<dyn std::error::Error>>(())
437 /// ```
438 #[cfg(feature = "syntax")]
439 pub fn new_many<P: AsRef<str>>(
440 patterns: &[P],
441 ) -> Result<PikeVM, BuildError> {
442 PikeVM::builder().build_many(patterns)
443 }
444
445 /// Like `new`, but builds a PikeVM directly from an NFA. This is useful
446 /// if you already have an NFA, or even if you hand-assembled the NFA.
447 ///
448 /// # Example
449 ///
450 /// This shows how to hand assemble a regular expression via its HIR,
451 /// compile an NFA from it and build a PikeVM from the NFA.
452 ///
453 /// ```
454 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
455 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
456 ///
457 /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
458 /// ClassBytesRange::new(b'0', b'9'),
459 /// ClassBytesRange::new(b'A', b'Z'),
460 /// ClassBytesRange::new(b'_', b'_'),
461 /// ClassBytesRange::new(b'a', b'z'),
462 /// ])));
463 ///
464 /// let config = NFA::config().nfa_size_limit(Some(1_000));
465 /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
466 ///
467 /// let re = PikeVM::new_from_nfa(nfa)?;
468 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
469 /// let expected = Some(Match::must(0, 3..4));
470 /// re.captures(&mut cache, "!@#A#@!", &mut caps);
471 /// assert_eq!(expected, caps.get_match());
472 ///
473 /// # Ok::<(), Box<dyn std::error::Error>>(())
474 /// ```
475 pub fn new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError> {
476 PikeVM::builder().build_from_nfa(nfa)
477 }
478
479 /// Create a new `PikeVM` that matches every input.
480 ///
481 /// # Example
482 ///
483 /// ```
484 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
485 ///
486 /// let re = PikeVM::always_match()?;
487 /// let mut cache = re.create_cache();
488 ///
489 /// let expected = Match::must(0, 0..0);
490 /// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next());
491 /// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next());
492 /// # Ok::<(), Box<dyn std::error::Error>>(())
493 /// ```
494 pub fn always_match() -> Result<PikeVM, BuildError> {
495 let nfa = thompson::NFA::always_match();
496 PikeVM::new_from_nfa(nfa)
497 }
498
499 /// Create a new `PikeVM` that never matches any input.
500 ///
501 /// # Example
502 ///
503 /// ```
504 /// use regex_automata::nfa::thompson::pikevm::PikeVM;
505 ///
506 /// let re = PikeVM::never_match()?;
507 /// let mut cache = re.create_cache();
508 ///
509 /// assert_eq!(None, re.find_iter(&mut cache, "").next());
510 /// assert_eq!(None, re.find_iter(&mut cache, "foo").next());
511 /// # Ok::<(), Box<dyn std::error::Error>>(())
512 /// ```
513 pub fn never_match() -> Result<PikeVM, BuildError> {
514 let nfa = thompson::NFA::never_match();
515 PikeVM::new_from_nfa(nfa)
516 }
517
518 /// Return a default configuration for a `PikeVM`.
519 ///
520 /// This is a convenience routine to avoid needing to import the `Config`
521 /// type when customizing the construction of a `PikeVM`.
522 ///
523 /// # Example
524 ///
525 /// This example shows how to disable UTF-8 mode. When UTF-8 mode is
526 /// disabled, zero-width matches that split a codepoint are allowed.
527 /// Otherwise they are never reported.
528 ///
529 /// In the code below, notice that `""` is permitted to match positions
530 /// that split the encoding of a codepoint.
531 ///
532 /// ```
533 /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match};
534 ///
535 /// let re = PikeVM::builder()
536 /// .thompson(thompson::Config::new().utf8(false))
537 /// .build(r"")?;
538 /// let mut cache = re.create_cache();
539 ///
540 /// let haystack = "a☃z";
541 /// let mut it = re.find_iter(&mut cache, haystack);
542 /// assert_eq!(Some(Match::must(0, 0..0)), it.next());
543 /// assert_eq!(Some(Match::must(0, 1..1)), it.next());
544 /// assert_eq!(Some(Match::must(0, 2..2)), it.next());
545 /// assert_eq!(Some(Match::must(0, 3..3)), it.next());
546 /// assert_eq!(Some(Match::must(0, 4..4)), it.next());
547 /// assert_eq!(Some(Match::must(0, 5..5)), it.next());
548 /// assert_eq!(None, it.next());
549 ///
550 /// # Ok::<(), Box<dyn std::error::Error>>(())
551 /// ```
552 pub fn config() -> Config {
553 Config::new()
554 }
555
556 /// Return a builder for configuring the construction of a `PikeVM`.
557 ///
558 /// This is a convenience routine to avoid needing to import the
559 /// [`Builder`] type in common cases.
560 ///
561 /// # Example
562 ///
563 /// This example shows how to use the builder to disable UTF-8 mode
564 /// everywhere.
565 ///
566 /// ```
567 /// use regex_automata::{
568 /// nfa::thompson::{self, pikevm::PikeVM},
569 /// util::syntax,
570 /// Match,
571 /// };
572 ///
573 /// let re = PikeVM::builder()
574 /// .syntax(syntax::Config::new().utf8(false))
575 /// .thompson(thompson::Config::new().utf8(false))
576 /// .build(r"foo(?-u:[^b])ar.*")?;
577 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
578 ///
579 /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
580 /// let expected = Some(Match::must(0, 1..9));
581 /// re.captures(&mut cache, haystack, &mut caps);
582 /// assert_eq!(expected, caps.get_match());
583 ///
584 /// # Ok::<(), Box<dyn std::error::Error>>(())
585 /// ```
586 pub fn builder() -> Builder {
587 Builder::new()
588 }
589
590 /// Create a new empty set of capturing groups that is guaranteed to be
591 /// valid for the search APIs on this `PikeVM`.
592 ///
593 /// A `Captures` value created for a specific `PikeVM` cannot be used with
594 /// any other `PikeVM`.
595 ///
596 /// This is a convenience function for [`Captures::all`]. See the
597 /// [`Captures`] documentation for an explanation of its alternative
598 /// constructors that permit the `PikeVM` to do less work during a search,
599 /// and thus might make it faster.
600 pub fn create_captures(&self) -> Captures {
601 Captures::all(self.get_nfa().group_info().clone())
602 }
603
604 /// Create a new cache for this `PikeVM`.
605 ///
606 /// The cache returned should only be used for searches for this
607 /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then
608 /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently,
609 /// [`PikeVM::reset_cache`]).
610 pub fn create_cache(&self) -> Cache {
611 Cache::new(self)
612 }
613
614 /// Reset the given cache such that it can be used for searching with the
615 /// this `PikeVM` (and only this `PikeVM`).
616 ///
617 /// A cache reset permits reusing memory already allocated in this cache
618 /// with a different `PikeVM`.
619 ///
620 /// # Example
621 ///
622 /// This shows how to re-purpose a cache for use with a different `PikeVM`.
623 ///
624 /// ```
625 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
626 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
627 ///
628 /// let re1 = PikeVM::new(r"\w")?;
629 /// let re2 = PikeVM::new(r"\W")?;
630 ///
631 /// let mut cache = re1.create_cache();
632 /// assert_eq!(
633 /// Some(Match::must(0, 0..2)),
634 /// re1.find_iter(&mut cache, "Δ").next(),
635 /// );
636 ///
637 /// // Using 'cache' with re2 is not allowed. It may result in panics or
638 /// // incorrect results. In order to re-purpose the cache, we must reset
639 /// // it with the PikeVM we'd like to use it with.
640 /// //
641 /// // Similarly, after this reset, using the cache with 're1' is also not
642 /// // allowed.
643 /// re2.reset_cache(&mut cache);
644 /// assert_eq!(
645 /// Some(Match::must(0, 0..3)),
646 /// re2.find_iter(&mut cache, "☃").next(),
647 /// );
648 ///
649 /// # Ok::<(), Box<dyn std::error::Error>>(())
650 /// ```
651 pub fn reset_cache(&self, cache: &mut Cache) {
652 cache.reset(self);
653 }
654
655 /// Returns the total number of patterns compiled into this `PikeVM`.
656 ///
657 /// In the case of a `PikeVM` that contains no patterns, this returns `0`.
658 ///
659 /// # Example
660 ///
661 /// This example shows the pattern length for a `PikeVM` that never
662 /// matches:
663 ///
664 /// ```
665 /// use regex_automata::nfa::thompson::pikevm::PikeVM;
666 ///
667 /// let re = PikeVM::never_match()?;
668 /// assert_eq!(re.pattern_len(), 0);
669 /// # Ok::<(), Box<dyn std::error::Error>>(())
670 /// ```
671 ///
672 /// And another example for a `PikeVM` that matches at every position:
673 ///
674 /// ```
675 /// use regex_automata::nfa::thompson::pikevm::PikeVM;
676 ///
677 /// let re = PikeVM::always_match()?;
678 /// assert_eq!(re.pattern_len(), 1);
679 /// # Ok::<(), Box<dyn std::error::Error>>(())
680 /// ```
681 ///
682 /// And finally, a `PikeVM` that was constructed from multiple patterns:
683 ///
684 /// ```
685 /// use regex_automata::nfa::thompson::pikevm::PikeVM;
686 ///
687 /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
688 /// assert_eq!(re.pattern_len(), 3);
689 /// # Ok::<(), Box<dyn std::error::Error>>(())
690 /// ```
691 pub fn pattern_len(&self) -> usize {
692 self.nfa.pattern_len()
693 }
694
695 /// Return the config for this `PikeVM`.
696 #[inline]
697 pub fn get_config(&self) -> &Config {
698 &self.config
699 }
700
701 /// Returns a reference to the underlying NFA.
702 #[inline]
703 pub fn get_nfa(&self) -> &NFA {
704 &self.nfa
705 }
706}
707
708impl PikeVM {
709 /// Returns true if and only if this `PikeVM` matches the given haystack.
710 ///
711 /// This routine may short circuit if it knows that scanning future
712 /// input will never lead to a different result. In particular, if the
713 /// underlying NFA enters a match state, then this routine will return
714 /// `true` immediately without inspecting any future input. (Consider how
715 /// this might make a difference given the regex `a+` on the haystack
716 /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`,
717 /// but routines like `find` need to continue searching because `+` is
718 /// greedy by default.)
719 ///
720 /// # Example
721 ///
722 /// This shows basic usage:
723 ///
724 /// ```
725 /// use regex_automata::nfa::thompson::pikevm::PikeVM;
726 ///
727 /// let re = PikeVM::new("foo[0-9]+bar")?;
728 /// let mut cache = re.create_cache();
729 ///
730 /// assert!(re.is_match(&mut cache, "foo12345bar"));
731 /// assert!(!re.is_match(&mut cache, "foobar"));
732 /// # Ok::<(), Box<dyn std::error::Error>>(())
733 /// ```
734 ///
735 /// # Example: consistency with search APIs
736 ///
737 /// `is_match` is guaranteed to return `true` whenever `find` returns a
738 /// match. This includes searches that are executed entirely within a
739 /// codepoint:
740 ///
741 /// ```
742 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input};
743 ///
744 /// let re = PikeVM::new("a*")?;
745 /// let mut cache = re.create_cache();
746 ///
747 /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2)));
748 /// # Ok::<(), Box<dyn std::error::Error>>(())
749 /// ```
750 ///
751 /// Notice that when UTF-8 mode is disabled, then the above reports a
752 /// match because the restriction against zero-width matches that split a
753 /// codepoint has been lifted:
754 ///
755 /// ```
756 /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input};
757 ///
758 /// let re = PikeVM::builder()
759 /// .thompson(NFA::config().utf8(false))
760 /// .build("a*")?;
761 /// let mut cache = re.create_cache();
762 ///
763 /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2)));
764 /// # Ok::<(), Box<dyn std::error::Error>>(())
765 /// ```
766 #[inline]
767 pub fn is_match<'h, I: Into<Input<'h>>>(
768 &self,
769 cache: &mut Cache,
770 input: I,
771 ) -> bool {
772 let input = input.into().earliest(true);
773 self.search_slots(cache, &input, &mut []).is_some()
774 }
775
776 /// Executes a leftmost forward search and returns a `Match` if one exists.
777 ///
778 /// This routine only includes the overall match span. To get access to the
779 /// individual spans of each capturing group, use [`PikeVM::captures`].
780 ///
781 /// # Example
782 ///
783 /// Leftmost first match semantics corresponds to the match with the
784 /// smallest starting offset, but where the end offset is determined by
785 /// preferring earlier branches in the original regular expression. For
786 /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
787 /// will match `Samwise` in `Samwise`.
788 ///
789 /// Generally speaking, the "leftmost first" match is how most backtracking
790 /// regular expressions tend to work. This is in contrast to POSIX-style
791 /// regular expressions that yield "leftmost longest" matches. Namely,
792 /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
793 /// leftmost longest semantics. (This crate does not currently support
794 /// leftmost longest semantics.)
795 ///
796 /// ```
797 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
798 ///
799 /// let re = PikeVM::new("foo[0-9]+")?;
800 /// let mut cache = re.create_cache();
801 /// let expected = Match::must(0, 0..8);
802 /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345"));
803 ///
804 /// // Even though a match is found after reading the first byte (`a`),
805 /// // the leftmost first match semantics demand that we find the earliest
806 /// // match that prefers earlier parts of the pattern over later parts.
807 /// let re = PikeVM::new("abc|a")?;
808 /// let mut cache = re.create_cache();
809 /// let expected = Match::must(0, 0..3);
810 /// assert_eq!(Some(expected), re.find(&mut cache, "abc"));
811 ///
812 /// # Ok::<(), Box<dyn std::error::Error>>(())
813 /// ```
814 #[inline]
815 pub fn find<'h, I: Into<Input<'h>>>(
816 &self,
817 cache: &mut Cache,
818 input: I,
819 ) -> Option<Match> {
820 let input = input.into();
821 if self.get_nfa().pattern_len() == 1 {
822 let mut slots = [None, None];
823 let pid = self.search_slots(cache, &input, &mut slots)?;
824 let start = slots[0]?.get();
825 let end = slots[1]?.get();
826 return Some(Match::new(pid, Span { start, end }));
827 }
828 let ginfo = self.get_nfa().group_info();
829 let slots_len = ginfo.implicit_slot_len();
830 let mut slots = vec![None; slots_len];
831 let pid = self.search_slots(cache, &input, &mut slots)?;
832 let start = slots[pid.as_usize() * 2]?.get();
833 let end = slots[pid.as_usize() * 2 + 1]?.get();
834 Some(Match::new(pid, Span { start, end }))
835 }
836
837 /// Executes a leftmost forward search and writes the spans of capturing
838 /// groups that participated in a match into the provided [`Captures`]
839 /// value. If no match was found, then [`Captures::is_match`] is guaranteed
840 /// to return `false`.
841 ///
842 /// # Example
843 ///
844 /// ```
845 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
846 ///
847 /// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
848 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
849 ///
850 /// re.captures(&mut cache, "2010-03-14", &mut caps);
851 /// assert!(caps.is_match());
852 /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
853 /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
854 /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
855 ///
856 /// # Ok::<(), Box<dyn std::error::Error>>(())
857 /// ```
858 #[inline]
859 pub fn captures<'h, I: Into<Input<'h>>>(
860 &self,
861 cache: &mut Cache,
862 input: I,
863 caps: &mut Captures,
864 ) {
865 self.search(cache, &input.into(), caps)
866 }
867
868 /// Returns an iterator over all non-overlapping leftmost matches in the
869 /// given bytes. If no match exists, then the iterator yields no elements.
870 ///
871 /// # Example
872 ///
873 /// ```
874 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
875 ///
876 /// let re = PikeVM::new("foo[0-9]+")?;
877 /// let mut cache = re.create_cache();
878 ///
879 /// let text = "foo1 foo12 foo123";
880 /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
881 /// assert_eq!(matches, vec![
882 /// Match::must(0, 0..4),
883 /// Match::must(0, 5..10),
884 /// Match::must(0, 11..17),
885 /// ]);
886 /// # Ok::<(), Box<dyn std::error::Error>>(())
887 /// ```
888 #[inline]
889 pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
890 &'r self,
891 cache: &'c mut Cache,
892 input: I,
893 ) -> FindMatches<'r, 'c, 'h> {
894 let caps = Captures::matches(self.get_nfa().group_info().clone());
895 let it = iter::Searcher::new(input.into());
896 FindMatches { re: self, cache, caps, it }
897 }
898
899 /// Returns an iterator over all non-overlapping `Captures` values. If no
900 /// match exists, then the iterator yields no elements.
901 ///
902 /// This yields the same matches as [`PikeVM::find_iter`], but it includes
903 /// the spans of all capturing groups that participate in each match.
904 ///
905 /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
906 /// how to correctly iterate over all matches in a haystack while avoiding
907 /// the creation of a new `Captures` value for every match. (Which you are
908 /// forced to do with an `Iterator`.)
909 ///
910 /// # Example
911 ///
912 /// ```
913 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
914 ///
915 /// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?;
916 /// let mut cache = re.create_cache();
917 ///
918 /// let text = "foo1 foo12 foo123";
919 /// let matches: Vec<Span> = re
920 /// .captures_iter(&mut cache, text)
921 /// // The unwrap is OK since 'numbers' matches if the pattern matches.
922 /// .map(|caps| caps.get_group_by_name("numbers").unwrap())
923 /// .collect();
924 /// assert_eq!(matches, vec![
925 /// Span::from(3..4),
926 /// Span::from(8..10),
927 /// Span::from(14..17),
928 /// ]);
929 /// # Ok::<(), Box<dyn std::error::Error>>(())
930 /// ```
931 #[inline]
932 pub fn captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
933 &'r self,
934 cache: &'c mut Cache,
935 input: I,
936 ) -> CapturesMatches<'r, 'c, 'h> {
937 let caps = self.create_captures();
938 let it = iter::Searcher::new(input.into());
939 CapturesMatches { re: self, cache, caps, it }
940 }
941}
942
943impl PikeVM {
944 /// Executes a leftmost forward search and writes the spans of capturing
945 /// groups that participated in a match into the provided [`Captures`]
946 /// value. If no match was found, then [`Captures::is_match`] is guaranteed
947 /// to return `false`.
948 ///
949 /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input`
950 /// instead of an `Into<Input>`.
951 ///
952 /// # Example: specific pattern search
953 ///
954 /// This example shows how to build a multi-PikeVM that permits searching
955 /// for specific patterns.
956 ///
957 /// ```
958 /// use regex_automata::{
959 /// nfa::thompson::pikevm::PikeVM,
960 /// Anchored, Match, PatternID, Input,
961 /// };
962 ///
963 /// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
964 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
965 /// let haystack = "foo123";
966 ///
967 /// // Since we are using the default leftmost-first match and both
968 /// // patterns match at the same starting position, only the first pattern
969 /// // will be returned in this case when doing a search for any of the
970 /// // patterns.
971 /// let expected = Some(Match::must(0, 0..6));
972 /// re.search(&mut cache, &Input::new(haystack), &mut caps);
973 /// assert_eq!(expected, caps.get_match());
974 ///
975 /// // But if we want to check whether some other pattern matches, then we
976 /// // can provide its pattern ID.
977 /// let expected = Some(Match::must(1, 0..6));
978 /// let input = Input::new(haystack)
979 /// .anchored(Anchored::Pattern(PatternID::must(1)));
980 /// re.search(&mut cache, &input, &mut caps);
981 /// assert_eq!(expected, caps.get_match());
982 ///
983 /// # Ok::<(), Box<dyn std::error::Error>>(())
984 /// ```
985 ///
986 /// # Example: specifying the bounds of a search
987 ///
988 /// This example shows how providing the bounds of a search can produce
989 /// different results than simply sub-slicing the haystack.
990 ///
991 /// ```
992 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
993 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
994 ///
995 /// let re = PikeVM::new(r"\b[0-9]{3}\b")?;
996 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
997 /// let haystack = "foo123bar";
998 ///
999 /// // Since we sub-slice the haystack, the search doesn't know about
1000 /// // the larger context and assumes that `123` is surrounded by word
1001 /// // boundaries. And of course, the match position is reported relative
1002 /// // to the sub-slice as well, which means we get `0..3` instead of
1003 /// // `3..6`.
1004 /// let expected = Some(Match::must(0, 0..3));
1005 /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps);
1006 /// assert_eq!(expected, caps.get_match());
1007 ///
1008 /// // But if we provide the bounds of the search within the context of the
1009 /// // entire haystack, then the search can take the surrounding context
1010 /// // into account. (And if we did find a match, it would be reported
1011 /// // as a valid offset into `haystack` instead of its sub-slice.)
1012 /// let expected = None;
1013 /// let input = Input::new(haystack).range(3..6);
1014 /// re.search(&mut cache, &input, &mut caps);
1015 /// assert_eq!(expected, caps.get_match());
1016 ///
1017 /// # Ok::<(), Box<dyn std::error::Error>>(())
1018 /// ```
1019 #[inline]
1020 pub fn search(
1021 &self,
1022 cache: &mut Cache,
1023 input: &Input<'_>,
1024 caps: &mut Captures,
1025 ) {
1026 caps.set_pattern(None);
1027 let pid = self.search_slots(cache, input, caps.slots_mut());
1028 caps.set_pattern(pid);
1029 }
1030
1031 /// Executes a leftmost forward search and writes the spans of capturing
1032 /// groups that participated in a match into the provided `slots`, and
1033 /// returns the matching pattern ID. The contents of the slots for patterns
1034 /// other than the matching pattern are unspecified. If no match was found,
1035 /// then `None` is returned and the contents of `slots` is unspecified.
1036 ///
1037 /// This is like [`PikeVM::search`], but it accepts a raw slots slice
1038 /// instead of a `Captures` value. This is useful in contexts where you
1039 /// don't want or need to allocate a `Captures`.
1040 ///
1041 /// It is legal to pass _any_ number of slots to this routine. If the regex
1042 /// engine would otherwise write a slot offset that doesn't fit in the
1043 /// provided slice, then it is simply skipped. In general though, there are
1044 /// usually three slice lengths you might want to use:
1045 ///
1046 /// * An empty slice, if you only care about which pattern matched.
1047 /// * A slice with
1048 /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len)
1049 /// slots, if you only care about the overall match spans for each matching
1050 /// pattern.
1051 /// * A slice with
1052 /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
1053 /// permits recording match offsets for every capturing group in every
1054 /// pattern.
1055 ///
1056 /// # Example
1057 ///
1058 /// This example shows how to find the overall match offsets in a
1059 /// multi-pattern search without allocating a `Captures` value. Indeed, we
1060 /// can put our slots right on the stack.
1061 ///
1062 /// ```
1063 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1064 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input};
1065 ///
1066 /// let re = PikeVM::new_many(&[
1067 /// r"\pL+",
1068 /// r"\d+",
1069 /// ])?;
1070 /// let mut cache = re.create_cache();
1071 /// let input = Input::new("!@#123");
1072 ///
1073 /// // We only care about the overall match offsets here, so we just
1074 /// // allocate two slots for each pattern. Each slot records the start
1075 /// // and end of the match.
1076 /// let mut slots = [None; 4];
1077 /// let pid = re.search_slots(&mut cache, &input, &mut slots);
1078 /// assert_eq!(Some(PatternID::must(1)), pid);
1079 ///
1080 /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
1081 /// // See 'GroupInfo' for more details on the mapping between groups and
1082 /// // slot indices.
1083 /// let slot_start = pid.unwrap().as_usize() * 2;
1084 /// let slot_end = slot_start + 1;
1085 /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
1086 /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
1087 ///
1088 /// # Ok::<(), Box<dyn std::error::Error>>(())
1089 /// ```
1090 #[inline]
1091 pub fn search_slots(
1092 &self,
1093 cache: &mut Cache,
1094 input: &Input<'_>,
1095 slots: &mut [Option<NonMaxUsize>],
1096 ) -> Option<PatternID> {
1097 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1098 if !utf8empty {
1099 let hm = self.search_slots_imp(cache, input, slots)?;
1100 return Some(hm.pattern());
1101 }
1102 // There is an unfortunate special case where if the regex can
1103 // match the empty string and UTF-8 mode is enabled, the search
1104 // implementation requires that the slots have at least as much space
1105 // to report the bounds of any match. This is so zero-width matches
1106 // that split a codepoint can be filtered out.
1107 //
1108 // Note that if utf8empty is true, we specialize the case for when
1109 // the number of patterns is 1. In that case, we can just use a stack
1110 // allocation. Otherwise we resort to a heap allocation, which we
1111 // convince ourselves we're fine with due to the pathological nature of
1112 // this case.
1113 let min = self.get_nfa().group_info().implicit_slot_len();
1114 if slots.len() >= min {
1115 let hm = self.search_slots_imp(cache, input, slots)?;
1116 return Some(hm.pattern());
1117 }
1118 if self.get_nfa().pattern_len() == 1 {
1119 let mut enough = [None, None];
1120 let got = self.search_slots_imp(cache, input, &mut enough);
1121 // This is OK because we know `enough` is strictly bigger than
1122 // `slots`, otherwise this special case isn't reached.
1123 slots.copy_from_slice(&enough[..slots.len()]);
1124 return got.map(|hm| hm.pattern());
1125 }
1126 let mut enough = vec![None; min];
1127 let got = self.search_slots_imp(cache, input, &mut enough);
1128 // This is OK because we know `enough` is strictly bigger than `slots`,
1129 // otherwise this special case isn't reached.
1130 slots.copy_from_slice(&enough[..slots.len()]);
1131 got.map(|hm| hm.pattern())
1132 }
1133
1134 /// This is the actual implementation of `search_slots_imp` that
1135 /// doesn't account for the special case when 1) the NFA has UTF-8 mode
1136 /// enabled, 2) the NFA can match the empty string and 3) the caller has
1137 /// provided an insufficient number of slots to record match offsets.
1138 #[inline(never)]
1139 fn search_slots_imp(
1140 &self,
1141 cache: &mut Cache,
1142 input: &Input<'_>,
1143 slots: &mut [Option<NonMaxUsize>],
1144 ) -> Option<HalfMatch> {
1145 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1146 let hm = match self.search_imp(cache, input, slots) {
1147 None => return None,
1148 Some(hm) if !utf8empty => return Some(hm),
1149 Some(hm) => hm,
1150 };
1151 empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
1152 Ok(self
1153 .search_imp(cache, input, slots)
1154 .map(|hm| (hm, hm.offset())))
1155 })
1156 // OK because the PikeVM never errors.
1157 .unwrap()
1158 }
1159
1160 /// Writes the set of patterns that match anywhere in the given search
1161 /// configuration to `patset`. If multiple patterns match at the same
1162 /// position and this `PikeVM` was configured with [`MatchKind::All`]
1163 /// semantics, then all matching patterns are written to the given set.
1164 ///
1165 /// Unless all of the patterns in this `PikeVM` are anchored, then
1166 /// generally speaking, this will visit every byte in the haystack.
1167 ///
1168 /// This search routine *does not* clear the pattern set. This gives some
1169 /// flexibility to the caller (e.g., running multiple searches with the
1170 /// same pattern set), but does make the API bug-prone if you're reusing
1171 /// the same pattern set for multiple searches but intended them to be
1172 /// independent.
1173 ///
1174 /// If a pattern ID matched but the given `PatternSet` does not have
1175 /// sufficient capacity to store it, then it is not inserted and silently
1176 /// dropped.
1177 ///
1178 /// # Example
1179 ///
1180 /// This example shows how to find all matching patterns in a haystack,
1181 /// even when some patterns match at the same position as other patterns.
1182 ///
1183 /// ```
1184 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1185 /// use regex_automata::{
1186 /// nfa::thompson::pikevm::PikeVM,
1187 /// Input, MatchKind, PatternSet,
1188 /// };
1189 ///
1190 /// let patterns = &[
1191 /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
1192 /// ];
1193 /// let re = PikeVM::builder()
1194 /// .configure(PikeVM::config().match_kind(MatchKind::All))
1195 /// .build_many(patterns)?;
1196 /// let mut cache = re.create_cache();
1197 ///
1198 /// let input = Input::new("foobar");
1199 /// let mut patset = PatternSet::new(re.pattern_len());
1200 /// re.which_overlapping_matches(&mut cache, &input, &mut patset);
1201 /// let expected = vec![0, 2, 3, 4, 6];
1202 /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
1203 /// assert_eq!(expected, got);
1204 ///
1205 /// # Ok::<(), Box<dyn std::error::Error>>(())
1206 /// ```
1207 #[inline]
1208 pub fn which_overlapping_matches(
1209 &self,
1210 cache: &mut Cache,
1211 input: &Input<'_>,
1212 patset: &mut PatternSet,
1213 ) {
1214 self.which_overlapping_imp(cache, input, patset)
1215 }
1216}
1217
1218impl PikeVM {
1219 /// The implementation of standard leftmost search.
1220 ///
1221 /// Capturing group spans are written to `slots`, but only if requested.
1222 /// `slots` can be any length. Any slot in the NFA that is activated but
1223 /// which is out of bounds for the given `slots` is ignored.
1224 fn search_imp(
1225 &self,
1226 cache: &mut Cache,
1227 input: &Input<'_>,
1228 slots: &mut [Option<NonMaxUsize>],
1229 ) -> Option<HalfMatch> {
1230 cache.setup_search(slots.len());
1231 if input.is_done() {
1232 return None;
1233 }
1234 // Why do we even care about this? Well, in our 'Captures'
1235 // representation, we use usize::MAX as a sentinel to indicate "no
1236 // match." This isn't problematic so long as our haystack doesn't have
1237 // a maximal length. Byte slices are guaranteed by Rust to have a
1238 // length that fits into isize, and so this assert should always pass.
1239 // But we put it here to make our assumption explicit.
1240 assert!(
1241 input.haystack().len() < core::usize::MAX,
1242 "byte slice lengths must be less than usize MAX",
1243 );
1244 instrument!(|c| c.reset(&self.nfa));
1245
1246 // Whether we want to visit all match states instead of emulating the
1247 // 'leftmost' semantics of typical backtracking regex engines.
1248 let allmatches =
1249 self.config.get_match_kind().continue_past_first_match();
1250 let (anchored, start_id) = match self.start_config(input) {
1251 None => return None,
1252 Some(config) => config,
1253 };
1254
1255 let pre =
1256 if anchored { None } else { self.get_config().get_prefilter() };
1257 let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
1258 let mut hm = None;
1259 // Yes, our search doesn't end at input.end(), but includes it. This
1260 // is necessary because matches are delayed by one byte, just like
1261 // how the DFA engines work. The delay is used to handle look-behind
1262 // assertions. In the case of the PikeVM, the delay is implemented
1263 // by not considering a match to exist until it is visited in
1264 // 'steps'. Technically, we know a match exists in the previous
1265 // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA
1266 // determinization. We don't mark a DFA state as a match state if it
1267 // contains an NFA match state, but rather, whether the DFA state was
1268 // generated by a transition from a DFA state that contains an NFA
1269 // match state.)
1270 let mut at = input.start();
1271 while at <= input.end() {
1272 // If we have no states left to visit, then there are some cases
1273 // where we know we can quit early or even skip ahead.
1274 if curr.set.is_empty() {
1275 // We have a match and we haven't been instructed to continue
1276 // on even after finding a match, so we can quit.
1277 if hm.is_some() && !allmatches {
1278 break;
1279 }
1280 // If we're running an anchored search and we've advanced
1281 // beyond the start position with no other states to try, then
1282 // we will never observe a match and thus can stop.
1283 if anchored && at > input.start() {
1284 break;
1285 }
1286 // If there no states left to explore at this position and we
1287 // know we can't terminate early, then we are effectively at
1288 // the starting state of the NFA. If we fell through here,
1289 // we'd end up adding our '(?s-u:.)*?' prefix and it would be
1290 // the only thing in 'curr'. So we might as well just skip
1291 // ahead until we find something that we know might advance us
1292 // forward.
1293 if let Some(ref pre) = pre {
1294 let span = Span::from(at..input.end());
1295 match pre.find(input.haystack(), span) {
1296 None => break,
1297 Some(ref span) => at = span.start,
1298 }
1299 }
1300 }
1301 // Instead of using the NFA's unanchored start state, we actually
1302 // always use its anchored starting state. As a result, when doing
1303 // an unanchored search, we need to simulate our own '(?s-u:.)*?'
1304 // prefix, to permit a match to appear anywhere.
1305 //
1306 // Now, we don't *have* to do things this way. We could use the
1307 // NFA's unanchored starting state and do one 'epsilon_closure'
1308 // call from that starting state before the main loop here. And
1309 // that is just as correct. However, it turns out to be slower
1310 // than our approach here because it slightly increases the cost
1311 // of processing each byte by requiring us to visit more NFA
1312 // states to deal with the additional NFA states in the unanchored
1313 // prefix. By simulating it explicitly here, we lower those costs
1314 // substantially. The cost is itself small, but it adds up for
1315 // large haystacks.
1316 //
1317 // In order to simulate the '(?s-u:.)*?' prefix---which is not
1318 // greedy---we are careful not to perform an epsilon closure on
1319 // the start state if we already have a match. Namely, if we
1320 // did otherwise, we would never reach a terminating condition
1321 // because there would always be additional states to process.
1322 // In effect, the exclusion of running 'epsilon_closure' when
1323 // we have a match corresponds to the "dead" states we have in
1324 // our DFA regex engines. Namely, in a DFA, match states merely
1325 // instruct the search execution to record the current offset as
1326 // the most recently seen match. It is the dead state that actually
1327 // indicates when to stop the search (other than EOF or quit
1328 // states).
1329 //
1330 // However, when 'allmatches' is true, the caller has asked us to
1331 // leave in every possible match state. This tends not to make a
1332 // whole lot of sense in unanchored searches, because it means the
1333 // search really cannot terminate until EOF. And often, in that
1334 // case, you wind up skipping over a bunch of matches and are left
1335 // with the "last" match. Arguably, it just doesn't make a lot of
1336 // sense to run a 'leftmost' search (which is what this routine is)
1337 // with 'allmatches' set to true. But the DFAs support it and this
1338 // matches their behavior. (Generally, 'allmatches' is useful for
1339 // overlapping searches or leftmost anchored searches to find the
1340 // longest possible match by ignoring match priority.)
1341 //
1342 // Additionally, when we're running an anchored search, this
1343 // epsilon closure should only be computed at the beginning of the
1344 // search. If we re-computed it at every position, we would be
1345 // simulating an unanchored search when we were tasked to perform
1346 // an anchored search.
1347 if (!hm.is_some() || allmatches)
1348 && (!anchored || at == input.start())
1349 {
1350 // Since we are adding to the 'curr' active states and since
1351 // this is for the start ID, we use a slots slice that is
1352 // guaranteed to have the right length but where every element
1353 // is absent. This is exactly what we want, because this
1354 // epsilon closure is responsible for simulating an unanchored
1355 // '(?s:.)*?' prefix. It is specifically outside of any
1356 // capturing groups, and thus, using slots that are always
1357 // absent is correct.
1358 //
1359 // Note though that we can't just use '&mut []' here, since
1360 // this epsilon closure may traverse through 'Captures' epsilon
1361 // transitions, and thus must be able to write offsets to the
1362 // slots given which are later copied to slot values in 'curr'.
1363 let slots = next.slot_table.all_absent();
1364 self.epsilon_closure(stack, slots, curr, input, at, start_id);
1365 }
1366 if let Some(pid) = self.nexts(stack, curr, next, input, at, slots)
1367 {
1368 hm = Some(HalfMatch::new(pid, at));
1369 }
1370 // Unless the caller asked us to return early, we need to mush on
1371 // to see if we can extend our match. (But note that 'nexts' will
1372 // quit right after seeing a match when match_kind==LeftmostFirst,
1373 // as is consistent with leftmost-first match priority.)
1374 if input.get_earliest() && hm.is_some() {
1375 break;
1376 }
1377 core::mem::swap(curr, next);
1378 next.set.clear();
1379 at += 1;
1380 }
1381 instrument!(|c| c.eprint(&self.nfa));
1382 hm
1383 }
1384
1385 /// The implementation for the 'which_overlapping_matches' API. Basically,
1386 /// we do a single scan through the entire haystack (unless our regex
1387 /// or search is anchored) and record every pattern that matched. In
1388 /// particular, when MatchKind::All is used, this supports overlapping
1389 /// matches. So if we have the regexes 'sam' and 'samwise', they will
1390 /// *both* be reported in the pattern set when searching the haystack
1391 /// 'samwise'.
1392 fn which_overlapping_imp(
1393 &self,
1394 cache: &mut Cache,
1395 input: &Input<'_>,
1396 patset: &mut PatternSet,
1397 ) {
1398 // NOTE: This is effectively a copy of 'search_imp' above, but with no
1399 // captures support and instead writes patterns that matched directly
1400 // to 'patset'. See that routine for better commentary about what's
1401 // going on in this routine. We probably could unify the routines using
1402 // generics or more helper routines, but I'm not sure it's worth it.
1403 //
1404 // NOTE: We somewhat go out of our way here to support things like
1405 // 'input.get_earliest()' and 'leftmost-first' match semantics. Neither
1406 // of those seem particularly relevant to this routine, but they are
1407 // both supported by the DFA analogs of this routine by construction
1408 // and composition, so it seems like good sense to have the PikeVM
1409 // match that behavior.
1410
1411 cache.setup_search(0);
1412 if input.is_done() {
1413 return;
1414 }
1415 assert!(
1416 input.haystack().len() < core::usize::MAX,
1417 "byte slice lengths must be less than usize MAX",
1418 );
1419 instrument!(|c| c.reset(&self.nfa));
1420
1421 let allmatches =
1422 self.config.get_match_kind().continue_past_first_match();
1423 let (anchored, start_id) = match self.start_config(input) {
1424 None => return,
1425 Some(config) => config,
1426 };
1427
1428 let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
1429 for at in input.start()..=input.end() {
1430 let any_matches = !patset.is_empty();
1431 if curr.set.is_empty() {
1432 if any_matches && !allmatches {
1433 break;
1434 }
1435 if anchored && at > input.start() {
1436 break;
1437 }
1438 }
1439 if !any_matches || allmatches {
1440 let slots = &mut [];
1441 self.epsilon_closure(stack, slots, curr, input, at, start_id);
1442 }
1443 self.nexts_overlapping(stack, curr, next, input, at, patset);
1444 // If we found a match and filled our set, then there is no more
1445 // additional info that we can provide. Thus, we can quit. We also
1446 // quit if the caller asked us to stop at the earliest point that
1447 // we know a match exists.
1448 if patset.is_full() || input.get_earliest() {
1449 break;
1450 }
1451 core::mem::swap(curr, next);
1452 next.set.clear();
1453 }
1454 instrument!(|c| c.eprint(&self.nfa));
1455 }
1456
1457 /// Process the active states in 'curr' to find the states (written to
1458 /// 'next') we should process for the next byte in the haystack.
1459 ///
1460 /// 'stack' is used to perform a depth first traversal of the NFA when
1461 /// computing an epsilon closure.
1462 ///
1463 /// When a match is found, the slots for that match state (in 'curr') are
1464 /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr'
1465 /// stops (unless the PikeVM was configured with MatchKind::All semantics).
1466 #[cfg_attr(feature = "perf-inline", inline(always))]
1467 fn nexts(
1468 &self,
1469 stack: &mut Vec<FollowEpsilon>,
1470 curr: &mut ActiveStates,
1471 next: &mut ActiveStates,
1472 input: &Input<'_>,
1473 at: usize,
1474 slots: &mut [Option<NonMaxUsize>],
1475 ) -> Option<PatternID> {
1476 instrument!(|c| c.record_state_set(&curr.set));
1477 let mut pid = None;
1478 let ActiveStates { ref set, ref mut slot_table } = *curr;
1479 for sid in set.iter() {
1480 pid = match self.next(stack, slot_table, next, input, at, sid) {
1481 None => continue,
1482 Some(pid) => Some(pid),
1483 };
1484 slots.copy_from_slice(slot_table.for_state(sid));
1485 if !self.config.get_match_kind().continue_past_first_match() {
1486 break;
1487 }
1488 }
1489 pid
1490 }
1491
1492 /// Like 'nexts', but for the overlapping case. This doesn't write any
1493 /// slots, and instead just writes which pattern matched in 'patset'.
1494 #[cfg_attr(feature = "perf-inline", inline(always))]
1495 fn nexts_overlapping(
1496 &self,
1497 stack: &mut Vec<FollowEpsilon>,
1498 curr: &mut ActiveStates,
1499 next: &mut ActiveStates,
1500 input: &Input<'_>,
1501 at: usize,
1502 patset: &mut PatternSet,
1503 ) {
1504 instrument!(|c| c.record_state_set(&curr.set));
1505 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1506 let ActiveStates { ref set, ref mut slot_table } = *curr;
1507 for sid in set.iter() {
1508 let pid = match self.next(stack, slot_table, next, input, at, sid)
1509 {
1510 None => continue,
1511 Some(pid) => pid,
1512 };
1513 // This handles the case of finding a zero-width match that splits
1514 // a codepoint. Namely, if we're in UTF-8 mode AND we know we can
1515 // match the empty string, then the only valid way of getting to
1516 // this point with an offset that splits a codepoint is when we
1517 // have an empty match. Such matches, in UTF-8 mode, must not be
1518 // reported. So we just skip them here and pretend as if we did
1519 // not see a match.
1520 if utf8empty && !input.is_char_boundary(at) {
1521 continue;
1522 }
1523 let _ = patset.try_insert(pid);
1524 if !self.config.get_match_kind().continue_past_first_match() {
1525 break;
1526 }
1527 }
1528 }
1529
1530 /// Starting from 'sid', if the position 'at' in the 'input' haystack has a
1531 /// transition defined out of 'sid', then add the state transitioned to and
1532 /// its epsilon closure to the 'next' set of states to explore.
1533 ///
1534 /// 'stack' is used by the epsilon closure computation to perform a depth
1535 /// first traversal of the NFA.
1536 ///
1537 /// 'curr_slot_table' should be the table of slots for the current set of
1538 /// states being explored. If there is a transition out of 'sid', then
1539 /// sid's row in the slot table is used to perform the epsilon closure.
1540 #[cfg_attr(feature = "perf-inline", inline(always))]
1541 fn next(
1542 &self,
1543 stack: &mut Vec<FollowEpsilon>,
1544 curr_slot_table: &mut SlotTable,
1545 next: &mut ActiveStates,
1546 input: &Input<'_>,
1547 at: usize,
1548 sid: StateID,
1549 ) -> Option<PatternID> {
1550 instrument!(|c| c.record_step(sid));
1551 match *self.nfa.state(sid) {
1552 State::Fail
1553 | State::Look { .. }
1554 | State::Union { .. }
1555 | State::BinaryUnion { .. }
1556 | State::Capture { .. } => None,
1557 State::ByteRange { ref trans } => {
1558 if trans.matches(input.haystack(), at) {
1559 let slots = curr_slot_table.for_state(sid);
1560 // OK because 'at <= haystack.len() < usize::MAX', so
1561 // adding 1 will never wrap.
1562 let at = at.wrapping_add(1);
1563 self.epsilon_closure(
1564 stack, slots, next, input, at, trans.next,
1565 );
1566 }
1567 None
1568 }
1569 State::Sparse(ref sparse) => {
1570 if let Some(next_sid) = sparse.matches(input.haystack(), at) {
1571 let slots = curr_slot_table.for_state(sid);
1572 // OK because 'at <= haystack.len() < usize::MAX', so
1573 // adding 1 will never wrap.
1574 let at = at.wrapping_add(1);
1575 self.epsilon_closure(
1576 stack, slots, next, input, at, next_sid,
1577 );
1578 }
1579 None
1580 }
1581 State::Dense(ref dense) => {
1582 if let Some(next_sid) = dense.matches(input.haystack(), at) {
1583 let slots = curr_slot_table.for_state(sid);
1584 // OK because 'at <= haystack.len() < usize::MAX', so
1585 // adding 1 will never wrap.
1586 let at = at.wrapping_add(1);
1587 self.epsilon_closure(
1588 stack, slots, next, input, at, next_sid,
1589 );
1590 }
1591 None
1592 }
1593 State::Match { pattern_id } => Some(pattern_id),
1594 }
1595 }
1596
1597 /// Compute the epsilon closure of 'sid', writing the closure into 'next'
1598 /// while copying slot values from 'curr_slots' into corresponding states
1599 /// in 'next'. 'curr_slots' should be the slot values corresponding to
1600 /// 'sid'.
1601 ///
1602 /// The given 'stack' is used to perform a depth first traversal of the
1603 /// NFA by recursively following all epsilon transitions out of 'sid'.
1604 /// Conditional epsilon transitions are followed if and only if they are
1605 /// satisfied for the position 'at' in the 'input' haystack.
1606 ///
1607 /// While this routine may write to 'curr_slots', once it returns, any
1608 /// writes are undone and the original values (even if absent) are
1609 /// restored.
1610 #[cfg_attr(feature = "perf-inline", inline(always))]
1611 fn epsilon_closure(
1612 &self,
1613 stack: &mut Vec<FollowEpsilon>,
1614 curr_slots: &mut [Option<NonMaxUsize>],
1615 next: &mut ActiveStates,
1616 input: &Input<'_>,
1617 at: usize,
1618 sid: StateID,
1619 ) {
1620 instrument!(|c| {
1621 c.record_closure(sid);
1622 c.record_stack_push(sid);
1623 });
1624 stack.push(FollowEpsilon::Explore(sid));
1625 while let Some(frame) = stack.pop() {
1626 match frame {
1627 FollowEpsilon::RestoreCapture { slot, offset: pos } => {
1628 curr_slots[slot] = pos;
1629 }
1630 FollowEpsilon::Explore(sid) => {
1631 self.epsilon_closure_explore(
1632 stack, curr_slots, next, input, at, sid,
1633 );
1634 }
1635 }
1636 }
1637 }
1638
1639 /// Explore all of the epsilon transitions out of 'sid'. This is mostly
1640 /// split out from 'epsilon_closure' in order to clearly delineate
1641 /// the actual work of computing an epsilon closure from the stack
1642 /// book-keeping.
1643 ///
1644 /// This will push any additional explorations needed on to 'stack'.
1645 ///
1646 /// 'curr_slots' should refer to the slots for the currently active NFA
1647 /// state. That is, the current state we are stepping through. These
1648 /// slots are mutated in place as new 'Captures' states are traversed
1649 /// during epsilon closure, but the slots are restored to their original
1650 /// values once the full epsilon closure is completed. The ultimate use of
1651 /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that
1652 /// the capturing group spans are forwarded from the currently active state
1653 /// to the next.
1654 ///
1655 /// 'next' refers to the next set of active states. Computing an epsilon
1656 /// closure may increase the next set of active states.
1657 ///
1658 /// 'input' refers to the caller's input configuration and 'at' refers to
1659 /// the current position in the haystack. These are used to check whether
1660 /// conditional epsilon transitions (like look-around) are satisfied at
1661 /// the current position. If they aren't, then the epsilon closure won't
1662 /// include them.
1663 #[cfg_attr(feature = "perf-inline", inline(always))]
1664 fn epsilon_closure_explore(
1665 &self,
1666 stack: &mut Vec<FollowEpsilon>,
1667 curr_slots: &mut [Option<NonMaxUsize>],
1668 next: &mut ActiveStates,
1669 input: &Input<'_>,
1670 at: usize,
1671 mut sid: StateID,
1672 ) {
1673 // We can avoid pushing some state IDs on to our stack in precisely
1674 // the cases where a 'push(x)' would be immediately followed by a 'x
1675 // = pop()'. This is achieved by this outer-loop. We simply set 'sid'
1676 // to be the next state ID we want to explore once we're done with
1677 // our initial exploration. In practice, this avoids a lot of stack
1678 // thrashing.
1679 loop {
1680 instrument!(|c| c.record_set_insert(sid));
1681 // Record this state as part of our next set of active states. If
1682 // we've already explored it, then no need to do it again.
1683 if !next.set.insert(sid) {
1684 return;
1685 }
1686 match *self.nfa.state(sid) {
1687 State::Fail
1688 | State::Match { .. }
1689 | State::ByteRange { .. }
1690 | State::Sparse { .. }
1691 | State::Dense { .. } => {
1692 next.slot_table.for_state(sid).copy_from_slice(curr_slots);
1693 return;
1694 }
1695 State::Look { look, next } => {
1696 // OK because we don't permit building a searcher with a
1697 // Unicode word boundary if the requisite Unicode data is
1698 // unavailable.
1699 if !self.nfa.look_matcher().matches_inline(
1700 look,
1701 input.haystack(),
1702 at,
1703 ) {
1704 return;
1705 }
1706 sid = next;
1707 }
1708 State::Union { ref alternates } => {
1709 sid = match alternates.get(0) {
1710 None => return,
1711 Some(&sid) => sid,
1712 };
1713 instrument!(|c| {
1714 for &alt in &alternates[1..] {
1715 c.record_stack_push(alt);
1716 }
1717 });
1718 stack.extend(
1719 alternates[1..]
1720 .iter()
1721 .copied()
1722 .rev()
1723 .map(FollowEpsilon::Explore),
1724 );
1725 }
1726 State::BinaryUnion { alt1, alt2 } => {
1727 sid = alt1;
1728 instrument!(|c| c.record_stack_push(sid));
1729 stack.push(FollowEpsilon::Explore(alt2));
1730 }
1731 State::Capture { next, slot, .. } => {
1732 // There's no need to do anything with slots that
1733 // ultimately won't be copied into the caller-provided
1734 // 'Captures' value. So we just skip dealing with them at
1735 // all.
1736 if slot.as_usize() < curr_slots.len() {
1737 instrument!(|c| c.record_stack_push(sid));
1738 stack.push(FollowEpsilon::RestoreCapture {
1739 slot,
1740 offset: curr_slots[slot],
1741 });
1742 // OK because length of a slice must fit into an isize.
1743 curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap());
1744 }
1745 sid = next;
1746 }
1747 }
1748 }
1749 }
1750
1751 /// Return the starting configuration of a PikeVM search.
1752 ///
1753 /// The "start config" is basically whether the search should be anchored
1754 /// or not and the NFA state ID at which to begin the search. The state ID
1755 /// returned always corresponds to an anchored starting state even when the
1756 /// search is unanchored. This is because the PikeVM search loop deals with
1757 /// unanchored searches with an explicit epsilon closure out of the start
1758 /// state.
1759 ///
1760 /// This routine accounts for both the caller's `Input` configuration
1761 /// and the pattern itself. For example, even if the caller asks for an
1762 /// unanchored search, if the pattern itself is anchored, then this will
1763 /// always return 'true' because implementing an unanchored search in that
1764 /// case would be incorrect.
1765 ///
1766 /// Similarly, if the caller requests an anchored search for a particular
1767 /// pattern, then the starting state ID returned will reflect that.
1768 ///
1769 /// If a pattern ID is given in the input configuration that is not in
1770 /// this regex, then `None` is returned.
1771 fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> {
1772 match input.get_anchored() {
1773 // Only way we're unanchored is if both the caller asked for an
1774 // unanchored search *and* the pattern is itself not anchored.
1775 Anchored::No => Some((
1776 self.nfa.is_always_start_anchored(),
1777 self.nfa.start_anchored(),
1778 )),
1779 Anchored::Yes => Some((true, self.nfa.start_anchored())),
1780 Anchored::Pattern(pid) => {
1781 Some((true, self.nfa.start_pattern(pid)?))
1782 }
1783 }
1784 }
1785}
1786
1787/// An iterator over all non-overlapping matches for a particular search.
1788///
1789/// The iterator yields a [`Match`] value until no more matches could be found.
1790///
1791/// The lifetime parameters are as follows:
1792///
1793/// * `'r` represents the lifetime of the PikeVM.
1794/// * `'c` represents the lifetime of the PikeVM's cache.
1795/// * `'h` represents the lifetime of the haystack being searched.
1796///
1797/// This iterator can be created with the [`PikeVM::find_iter`] method.
1798#[derive(Debug)]
1799pub struct FindMatches<'r, 'c, 'h> {
1800 re: &'r PikeVM,
1801 cache: &'c mut Cache,
1802 caps: Captures,
1803 it: iter::Searcher<'h>,
1804}
1805
1806impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
1807 type Item = Match;
1808
1809 #[inline]
1810 fn next(&mut self) -> Option<Match> {
1811 // Splitting 'self' apart seems necessary to appease borrowck.
1812 let FindMatches { re, ref mut cache, ref mut caps, ref mut it } =
1813 *self;
1814 // 'advance' converts errors into panics, which is OK here because
1815 // the PikeVM can never return an error.
1816 it.advance(|input| {
1817 re.search(cache, input, caps);
1818 Ok(caps.get_match())
1819 })
1820 }
1821}
1822
1823/// An iterator over all non-overlapping leftmost matches, with their capturing
1824/// groups, for a particular search.
1825///
1826/// The iterator yields a [`Captures`] value until no more matches could be
1827/// found.
1828///
1829/// The lifetime parameters are as follows:
1830///
1831/// * `'r` represents the lifetime of the PikeVM.
1832/// * `'c` represents the lifetime of the PikeVM's cache.
1833/// * `'h` represents the lifetime of the haystack being searched.
1834///
1835/// This iterator can be created with the [`PikeVM::captures_iter`] method.
1836#[derive(Debug)]
1837pub struct CapturesMatches<'r, 'c, 'h> {
1838 re: &'r PikeVM,
1839 cache: &'c mut Cache,
1840 caps: Captures,
1841 it: iter::Searcher<'h>,
1842}
1843
1844impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> {
1845 type Item = Captures;
1846
1847 #[inline]
1848 fn next(&mut self) -> Option<Captures> {
1849 // Splitting 'self' apart seems necessary to appease borrowck.
1850 let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
1851 *self;
1852 // 'advance' converts errors into panics, which is OK here because
1853 // the PikeVM can never return an error.
1854 it.advance(|input| {
1855 re.search(cache, input, caps);
1856 Ok(caps.get_match())
1857 });
1858 if caps.is_match() {
1859 Some(caps.clone())
1860 } else {
1861 None
1862 }
1863 }
1864}
1865
1866/// A cache represents mutable state that a [`PikeVM`] requires during a
1867/// search.
1868///
1869/// For a given [`PikeVM`], its corresponding cache may be created either via
1870/// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in
1871/// every way, except the former does not require explicitly importing `Cache`.
1872///
1873/// A particular `Cache` is coupled with the [`PikeVM`] from which it
1874/// was created. It may only be used with that `PikeVM`. A cache and its
1875/// allocations may be re-purposed via [`Cache::reset`], in which case, it can
1876/// only be used with the new `PikeVM` (and not the old one).
1877#[derive(Clone, Debug)]
1878pub struct Cache {
1879 /// Stack used while computing epsilon closure. This effectively lets us
1880 /// move what is more naturally expressed through recursion to a stack
1881 /// on the heap.
1882 stack: Vec<FollowEpsilon>,
1883 /// The current active states being explored for the current byte in the
1884 /// haystack.
1885 curr: ActiveStates,
1886 /// The next set of states we're building that will be explored for the
1887 /// next byte in the haystack.
1888 next: ActiveStates,
1889}
1890
1891impl Cache {
1892 /// Create a new [`PikeVM`] cache.
1893 ///
1894 /// A potentially more convenient routine to create a cache is
1895 /// [`PikeVM::create_cache`], as it does not require also importing the
1896 /// `Cache` type.
1897 ///
1898 /// If you want to reuse the returned `Cache` with some other `PikeVM`,
1899 /// then you must call [`Cache::reset`] with the desired `PikeVM`.
1900 pub fn new(re: &PikeVM) -> Cache {
1901 Cache {
1902 stack: vec![],
1903 curr: ActiveStates::new(re),
1904 next: ActiveStates::new(re),
1905 }
1906 }
1907
1908 /// Reset this cache such that it can be used for searching with a
1909 /// different [`PikeVM`].
1910 ///
1911 /// A cache reset permits reusing memory already allocated in this cache
1912 /// with a different `PikeVM`.
1913 ///
1914 /// # Example
1915 ///
1916 /// This shows how to re-purpose a cache for use with a different `PikeVM`.
1917 ///
1918 /// ```
1919 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1920 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
1921 ///
1922 /// let re1 = PikeVM::new(r"\w")?;
1923 /// let re2 = PikeVM::new(r"\W")?;
1924 ///
1925 /// let mut cache = re1.create_cache();
1926 /// assert_eq!(
1927 /// Some(Match::must(0, 0..2)),
1928 /// re1.find_iter(&mut cache, "Δ").next(),
1929 /// );
1930 ///
1931 /// // Using 'cache' with re2 is not allowed. It may result in panics or
1932 /// // incorrect results. In order to re-purpose the cache, we must reset
1933 /// // it with the PikeVM we'd like to use it with.
1934 /// //
1935 /// // Similarly, after this reset, using the cache with 're1' is also not
1936 /// // allowed.
1937 /// cache.reset(&re2);
1938 /// assert_eq!(
1939 /// Some(Match::must(0, 0..3)),
1940 /// re2.find_iter(&mut cache, "☃").next(),
1941 /// );
1942 ///
1943 /// # Ok::<(), Box<dyn std::error::Error>>(())
1944 /// ```
1945 pub fn reset(&mut self, re: &PikeVM) {
1946 self.curr.reset(re);
1947 self.next.reset(re);
1948 }
1949
1950 /// Returns the heap memory usage, in bytes, of this cache.
1951 ///
1952 /// This does **not** include the stack size used up by this cache. To
1953 /// compute that, use `std::mem::size_of::<Cache>()`.
1954 pub fn memory_usage(&self) -> usize {
1955 use core::mem::size_of;
1956 (self.stack.len() * size_of::<FollowEpsilon>())
1957 + self.curr.memory_usage()
1958 + self.next.memory_usage()
1959 }
1960
1961 /// Clears this cache. This should be called at the start of every search
1962 /// to ensure we start with a clean slate.
1963 ///
1964 /// This also sets the length of the capturing groups used in the current
1965 /// search. This permits an optimization where by 'SlotTable::for_state'
1966 /// only returns the number of slots equivalent to the number of slots
1967 /// given in the 'Captures' value. This may be less than the total number
1968 /// of possible slots, e.g., when one only wants to track overall match
1969 /// offsets. This in turn permits less copying of capturing group spans
1970 /// in the PikeVM.
1971 fn setup_search(&mut self, captures_slot_len: usize) {
1972 self.stack.clear();
1973 self.curr.setup_search(captures_slot_len);
1974 self.next.setup_search(captures_slot_len);
1975 }
1976}
1977
1978/// A set of active states used to "simulate" the execution of an NFA via the
1979/// PikeVM.
1980///
1981/// There are two sets of these used during NFA simulation. One set corresponds
1982/// to the "current" set of states being traversed for the current position
1983/// in a haystack. The other set corresponds to the "next" set of states being
1984/// built, which will become the new "current" set for the next position in the
1985/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
1986/// original paper regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
1987///
1988/// In addition to representing a set of NFA states, this also maintains slot
1989/// values for each state. These slot values are what turn the NFA simulation
1990/// into the "Pike VM." Namely, they track capturing group values for each
1991/// state. During the computation of epsilon closure, we copy slot values from
1992/// states in the "current" set to the "next" set. Eventually, once a match
1993/// is found, the slot values for that match state are what we write to the
1994/// caller provided 'Captures' value.
1995#[derive(Clone, Debug)]
1996struct ActiveStates {
1997 /// The set of active NFA states. This set preserves insertion order, which
1998 /// is critical for simulating the match semantics of backtracking regex
1999 /// engines.
2000 set: SparseSet,
2001 /// The slots for every NFA state, where each slot stores a (possibly
2002 /// absent) offset. Every capturing group has two slots. One for a start
2003 /// offset and one for an end offset.
2004 slot_table: SlotTable,
2005}
2006
2007impl ActiveStates {
2008 /// Create a new set of active states for the given PikeVM. The active
2009 /// states returned may only be used with the given PikeVM. (Use 'reset'
2010 /// to re-purpose the allocation for a different PikeVM.)
2011 fn new(re: &PikeVM) -> ActiveStates {
2012 let mut active = ActiveStates {
2013 set: SparseSet::new(0),
2014 slot_table: SlotTable::new(),
2015 };
2016 active.reset(re);
2017 active
2018 }
2019
2020 /// Reset this set of active states such that it can be used with the given
2021 /// PikeVM (and only that PikeVM).
2022 fn reset(&mut self, re: &PikeVM) {
2023 self.set.resize(re.get_nfa().states().len());
2024 self.slot_table.reset(re);
2025 }
2026
2027 /// Return the heap memory usage, in bytes, used by this set of active
2028 /// states.
2029 ///
2030 /// This does not include the stack size of this value.
2031 fn memory_usage(&self) -> usize {
2032 self.set.memory_usage() + self.slot_table.memory_usage()
2033 }
2034
2035 /// Setup this set of active states for a new search. The given slot
2036 /// length should be the number of slots in a caller provided 'Captures'
2037 /// (and may be zero).
2038 fn setup_search(&mut self, captures_slot_len: usize) {
2039 self.set.clear();
2040 self.slot_table.setup_search(captures_slot_len);
2041 }
2042}
2043
2044/// A table of slots, where each row represent a state in an NFA. Thus, the
2045/// table has room for storing slots for every single state in an NFA.
2046///
2047/// This table is represented with a single contiguous allocation. In general,
2048/// the notion of "capturing group" doesn't really exist at this level of
2049/// abstraction, hence the name "slot" instead. (Indeed, every capturing group
2050/// maps to a pair of slots, one for the start offset and one for the end
2051/// offset.) Slots are indexed by the 'Captures' NFA state.
2052///
2053/// N.B. Not every state actually needs a row of slots. Namely, states that
2054/// only have epsilon transitions currently never have anything written to
2055/// their rows in this table. Thus, the table is somewhat wasteful in its heap
2056/// usage. However, it is important to maintain fast random access by state
2057/// ID, which means one giant table tends to work well. RE2 takes a different
2058/// approach here and allocates each row as its own reference counted thing.
2059/// I explored such a strategy at one point here, but couldn't get it to work
2060/// well using entirely safe code. (To the ambitious reader: I encourage you to
2061/// re-litigate that experiment.) I very much wanted to stick to safe code, but
2062/// could be convinced otherwise if there was a solid argument and the safety
2063/// was encapsulated well.
2064#[derive(Clone, Debug)]
2065struct SlotTable {
2066 /// The actual table of offsets.
2067 table: Vec<Option<NonMaxUsize>>,
2068 /// The number of slots per state, i.e., the table's stride or the length
2069 /// of each row.
2070 slots_per_state: usize,
2071 /// The number of slots in the caller-provided 'Captures' value for the
2072 /// current search. Setting this to 'slots_per_state' is always correct,
2073 /// but may be wasteful.
2074 slots_for_captures: usize,
2075}
2076
2077impl SlotTable {
2078 /// Create a new slot table.
2079 ///
2080 /// One should call 'reset' with the corresponding PikeVM before use.
2081 fn new() -> SlotTable {
2082 SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 }
2083 }
2084
2085 /// Reset this slot table such that it can be used with the given PikeVM
2086 /// (and only that PikeVM).
2087 fn reset(&mut self, re: &PikeVM) {
2088 let nfa = re.get_nfa();
2089 self.slots_per_state = nfa.group_info().slot_len();
2090 // This is always correct, but may be reduced for a particular search
2091 // if a 'Captures' has fewer slots, e.g., none at all or only slots
2092 // for tracking the overall match instead of all slots for every
2093 // group.
2094 self.slots_for_captures = core::cmp::max(
2095 self.slots_per_state,
2096 nfa.pattern_len().checked_mul(2).unwrap(),
2097 );
2098 let len = nfa
2099 .states()
2100 .len()
2101 .checked_mul(self.slots_per_state)
2102 // Add space to account for scratch space used during a search.
2103 .and_then(|x| x.checked_add(self.slots_for_captures))
2104 // It seems like this could actually panic on legitimate inputs on
2105 // 32-bit targets, and very likely to panic on 16-bit. Should we
2106 // somehow convert this to an error? What about something similar
2107 // for the lazy DFA cache? If you're tripping this assert, please
2108 // file a bug.
2109 .expect("slot table length doesn't overflow");
2110 // This happens about as often as a regex is compiled, so it probably
2111 // should be at debug level, but I found it quite distracting and not
2112 // particularly useful.
2113 trace!(
2114 "resizing PikeVM active states table to {} entries \
2115 (slots_per_state={})",
2116 len,
2117 self.slots_per_state,
2118 );
2119 self.table.resize(len, None);
2120 }
2121
2122 /// Return the heap memory usage, in bytes, used by this slot table.
2123 ///
2124 /// This does not include the stack size of this value.
2125 fn memory_usage(&self) -> usize {
2126 self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>()
2127 }
2128
2129 /// Perform any per-search setup for this slot table.
2130 ///
2131 /// In particular, this sets the length of the number of slots used in the
2132 /// 'Captures' given by the caller (if any at all). This number may be
2133 /// smaller than the total number of slots available, e.g., when the caller
2134 /// is only interested in tracking the overall match and not the spans of
2135 /// every matching capturing group. Only tracking the overall match can
2136 /// save a substantial amount of time copying capturing spans during a
2137 /// search.
2138 fn setup_search(&mut self, captures_slot_len: usize) {
2139 self.slots_for_captures = captures_slot_len;
2140 }
2141
2142 /// Return a mutable slice of the slots for the given state.
2143 ///
2144 /// Note that the length of the slice returned may be less than the total
2145 /// number of slots available for this state. In particular, the length
2146 /// always matches the number of slots indicated via 'setup_search'.
2147 fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] {
2148 let i = sid.as_usize() * self.slots_per_state;
2149 &mut self.table[i..i + self.slots_for_captures]
2150 }
2151
2152 /// Return a slice of slots of appropriate length where every slot offset
2153 /// is guaranteed to be absent. This is useful in cases where you need to
2154 /// compute an epsilon closure outside of the user supplied regex, and thus
2155 /// never want it to have any capturing slots set.
2156 fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
2157 let i = self.table.len() - self.slots_for_captures;
2158 &mut self.table[i..i + self.slots_for_captures]
2159 }
2160}
2161
2162/// Represents a stack frame for use while computing an epsilon closure.
2163///
2164/// (An "epsilon closure" refers to the set of reachable NFA states from a
2165/// single state without consuming any input. That is, the set of all epsilon
2166/// transitions not only from that single state, but from every other state
2167/// reachable by an epsilon transition as well. This is why it's called a
2168/// "closure." Computing an epsilon closure is also done during DFA
2169/// determinization! Compare and contrast the epsilon closure here in this
2170/// PikeVM and the one used for determinization in crate::util::determinize.)
2171///
2172/// Computing the epsilon closure in a Thompson NFA proceeds via a depth
2173/// first traversal over all epsilon transitions from a particular state.
2174/// (A depth first traversal is important because it emulates the same priority
2175/// of matches that is typically found in backtracking regex engines.) This
2176/// depth first traversal is naturally expressed using recursion, but to avoid
2177/// a call stack size proportional to the size of a regex, we put our stack on
2178/// the heap instead.
2179///
2180/// This stack thus consists of call frames. The typical call frame is
2181/// `Explore`, which instructs epsilon closure to explore the epsilon
2182/// transitions from that state. (Subsequent epsilon transitions are then
2183/// pushed on to the stack as more `Explore` frames.) If the state ID being
2184/// explored has no epsilon transitions, then the capturing group slots are
2185/// copied from the original state that sparked the epsilon closure (from the
2186/// 'step' routine) to the state ID being explored. This way, capturing group
2187/// slots are forwarded from the previous state to the next.
2188///
2189/// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to
2190/// set the position for a particular slot back to some particular offset. This
2191/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will
2192/// set the offset of the slot indicated in `Capture` to the current offset,
2193/// and then push the old offset on to the stack as a `RestoreCapture` frame.
2194/// Thus, the new offset is only used until the epsilon closure reverts back to
2195/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon
2196/// transition its "scope" to only states that come "after" it during depth
2197/// first traversal.
2198#[derive(Clone, Debug)]
2199enum FollowEpsilon {
2200 /// Explore the epsilon transitions from a state ID.
2201 Explore(StateID),
2202 /// Reset the given `slot` to the given `offset` (which might be `None`).
2203 RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
2204}
2205
2206/// A set of counters that "instruments" a PikeVM search. To enable this, you
2207/// must enable the 'internal-instrument-pikevm' feature. Then run your Rust
2208/// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in
2209/// the environment. The metrics collected will be dumped automatically for
2210/// every search executed by the PikeVM.
2211///
2212/// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an
2213/// absolute decrease in wall-clock performance, even if the 'trace' log level
2214/// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't
2215/// enabled.) The main point of instrumentation is to get counts of various
2216/// events that occur during the PikeVM's execution.
2217///
2218/// This is a somewhat hacked together collection of metrics that are useful
2219/// to gather from a PikeVM search. In particular, it lets us scrutinize the
2220/// performance profile of a search beyond what general purpose profiling tools
2221/// give us. Namely, we orient the profiling data around the specific states of
2222/// the NFA.
2223///
2224/// In other words, this lets us see which parts of the NFA graph are most
2225/// frequently activated. This then provides direction for optimization
2226/// opportunities.
2227///
2228/// The really sad part about this is that it absolutely clutters up the PikeVM
2229/// implementation. :'( Another approach would be to just manually add this
2230/// code in whenever I want this kind of profiling data, but it's complicated
2231/// and tedious enough that I went with this approach... for now.
2232///
2233/// When instrumentation is enabled (which also turns on 'logging'), then a
2234/// `Counters` is initialized for every search and `trace`'d just before the
2235/// search returns to the caller.
2236///
2237/// Tip: When debugging performance problems with the PikeVM, it's best to try
2238/// to work with an NFA that is as small as possible. Otherwise the state graph
2239/// is likely to be too big to digest.
2240#[cfg(feature = "internal-instrument-pikevm")]
2241#[derive(Clone, Debug)]
2242struct Counters {
2243 /// The number of times the NFA is in a particular permutation of states.
2244 state_sets: alloc::collections::BTreeMap<Vec<StateID>, u64>,
2245 /// The number of times 'step' is called for a particular state ID (which
2246 /// indexes this array).
2247 steps: Vec<u64>,
2248 /// The number of times an epsilon closure was computed for a state.
2249 closures: Vec<u64>,
2250 /// The number of times a particular state ID is pushed on to a stack while
2251 /// computing an epsilon closure.
2252 stack_pushes: Vec<u64>,
2253 /// The number of times a particular state ID is inserted into a sparse set
2254 /// while computing an epsilon closure.
2255 set_inserts: Vec<u64>,
2256}
2257
2258#[cfg(feature = "internal-instrument-pikevm")]
2259impl Counters {
2260 fn empty() -> Counters {
2261 Counters {
2262 state_sets: alloc::collections::BTreeMap::new(),
2263 steps: vec![],
2264 closures: vec![],
2265 stack_pushes: vec![],
2266 set_inserts: vec![],
2267 }
2268 }
2269
2270 fn reset(&mut self, nfa: &NFA) {
2271 let len = nfa.states().len();
2272
2273 self.state_sets.clear();
2274
2275 self.steps.clear();
2276 self.steps.resize(len, 0);
2277
2278 self.closures.clear();
2279 self.closures.resize(len, 0);
2280
2281 self.stack_pushes.clear();
2282 self.stack_pushes.resize(len, 0);
2283
2284 self.set_inserts.clear();
2285 self.set_inserts.resize(len, 0);
2286 }
2287
2288 fn eprint(&self, nfa: &NFA) {
2289 trace!("===== START PikeVM Instrumentation Output =====");
2290 // We take the top-K most occurring state sets. Otherwise the output
2291 // is likely to be overwhelming. And we probably only care about the
2292 // most frequently occurring ones anyway.
2293 const LIMIT: usize = 20;
2294 let mut set_counts =
2295 self.state_sets.iter().collect::<Vec<(&Vec<StateID>, &u64)>>();
2296 set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count));
2297 trace!("## PikeVM frequency of state sets (top {})", LIMIT);
2298 for (set, count) in set_counts.iter().take(LIMIT) {
2299 trace!("{:?}: {}", set, count);
2300 }
2301 if set_counts.len() > LIMIT {
2302 trace!(
2303 "... {} sets omitted (out of {} total)",
2304 set_counts.len() - LIMIT,
2305 set_counts.len(),
2306 );
2307 }
2308
2309 trace!("");
2310 trace!("## PikeVM total frequency of events");
2311 trace!(
2312 "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}",
2313 self.steps.iter().copied().sum::<u64>(),
2314 self.closures.iter().copied().sum::<u64>(),
2315 self.stack_pushes.iter().copied().sum::<u64>(),
2316 self.set_inserts.iter().copied().sum::<u64>(),
2317 );
2318
2319 trace!("");
2320 trace!("## PikeVM frequency of events broken down by state");
2321 for sid in 0..self.steps.len() {
2322 trace!(
2323 "{:06}: steps: {}, closures: {}, \
2324 stack-pushes: {}, set-inserts: {}",
2325 sid,
2326 self.steps[sid],
2327 self.closures[sid],
2328 self.stack_pushes[sid],
2329 self.set_inserts[sid],
2330 );
2331 }
2332
2333 trace!("");
2334 trace!("## NFA debug display");
2335 trace!("{:?}", nfa);
2336 trace!("===== END PikeVM Instrumentation Output =====");
2337 }
2338
2339 fn record_state_set(&mut self, set: &SparseSet) {
2340 let set = set.iter().collect::<Vec<StateID>>();
2341 *self.state_sets.entry(set).or_insert(0) += 1;
2342 }
2343
2344 fn record_step(&mut self, sid: StateID) {
2345 self.steps[sid] += 1;
2346 }
2347
2348 fn record_closure(&mut self, sid: StateID) {
2349 self.closures[sid] += 1;
2350 }
2351
2352 fn record_stack_push(&mut self, sid: StateID) {
2353 self.stack_pushes[sid] += 1;
2354 }
2355
2356 fn record_set_insert(&mut self, sid: StateID) {
2357 self.set_inserts[sid] += 1;
2358 }
2359}
2360