| 1 | /*! |
| 2 | An NFA backed Pike VM for executing regex searches with capturing groups. |
| 3 | |
| 4 | This module provides a [`PikeVM`] that works by simulating an NFA and |
| 5 | resolving all spans of capturing groups that participate in a match. |
| 6 | */ |
| 7 | |
#[cfg(feature = "internal-instrument-pikevm")]
| 9 | use core::cell::RefCell; |
| 10 | |
| 11 | use alloc::{vec, vec::Vec}; |
| 12 | |
| 13 | use crate::{ |
| 14 | nfa::thompson::{self, BuildError, State, NFA}, |
| 15 | util::{ |
| 16 | captures::Captures, |
| 17 | empty, iter, |
| 18 | prefilter::Prefilter, |
| 19 | primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, |
| 20 | search::{ |
| 21 | Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span, |
| 22 | }, |
| 23 | sparse_set::SparseSet, |
| 24 | }, |
| 25 | }; |
| 26 | |
| 27 | /// A simple macro for conditionally executing instrumentation logic when |
| 28 | /// the 'trace' log level is enabled. This is a compile-time no-op when the |
| 29 | /// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that |
| 30 | /// this makes it easier to avoid doing extra work when instrumentation isn't |
| 31 | /// enabled. |
| 32 | /// |
| 33 | /// This macro accepts a closure of type `|&mut Counters|`. The closure can |
| 34 | /// then increment counters (or whatever) in accordance with what one wants |
| 35 | /// to track. |
| 36 | macro_rules! instrument { |
| 37 | ($fun:expr) => { |
#[cfg(feature = "internal-instrument-pikevm")]
| 39 | { |
| 40 | let fun: &mut dyn FnMut(&mut Counters) = &mut $fun; |
| 41 | COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut())); |
| 42 | } |
| 43 | }; |
| 44 | } |
| 45 | |
#[cfg(feature = "internal-instrument-pikevm")]
| 47 | std::thread_local! { |
| 48 | /// Effectively global state used to keep track of instrumentation |
| 49 | /// counters. The "proper" way to do this is to thread it through the |
| 50 | /// PikeVM, but it makes the code quite icky. Since this is just a |
| 51 | /// debugging feature, we're content to relegate it to thread local |
| 52 | /// state. When instrumentation is enabled, the counters are reset at the |
| 53 | /// beginning of every search and printed (with the 'trace' log level) at |
| 54 | /// the end of every search. |
| 55 | static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty()); |
| 56 | } |
| 57 | |
| 58 | /// The configuration used for building a [`PikeVM`]. |
| 59 | /// |
| 60 | /// A PikeVM configuration is a simple data object that is typically used with |
| 61 | /// [`Builder::configure`]. It can be cheaply cloned. |
| 62 | /// |
| 63 | /// A default configuration can be created either with `Config::new`, or |
| 64 | /// perhaps more conveniently, with [`PikeVM::config`]. |
#[derive(Clone, Debug, Default)]
| 66 | pub struct Config { |
| 67 | match_kind: Option<MatchKind>, |
| 68 | pre: Option<Option<Prefilter>>, |
| 69 | } |
| 70 | |
| 71 | impl Config { |
| 72 | /// Return a new default PikeVM configuration. |
| 73 | pub fn new() -> Config { |
| 74 | Config::default() |
| 75 | } |
| 76 | |
| 77 | /// Set the desired match semantics. |
| 78 | /// |
| 79 | /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the |
| 80 | /// match semantics of Perl-like regex engines. That is, when multiple |
| 81 | /// patterns would match at the same leftmost position, the pattern that |
| 82 | /// appears first in the concrete syntax is chosen. |
| 83 | /// |
| 84 | /// Currently, the only other kind of match semantics supported is |
| 85 | /// [`MatchKind::All`]. This corresponds to "classical DFA" construction |
| 86 | /// where all possible matches are visited in the NFA by the `PikeVM`. |
| 87 | /// |
| 88 | /// Typically, `All` is used when one wants to execute an overlapping |
| 89 | /// search and `LeftmostFirst` otherwise. In particular, it rarely makes |
| 90 | /// sense to use `All` with the various "leftmost" find routines, since the |
| 91 | /// leftmost routines depend on the `LeftmostFirst` automata construction |
| 92 | /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM` |
| 93 | /// simulating dead states as a way to terminate the search and report a |
/// match. `LeftmostFirst` also supports non-greedy matches using this
/// strategy, whereas `All` does not.
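///
/// # Example
///
/// A minimal sketch of pairing `All` with an overlapping search. (See
/// [`PikeVM::which_overlapping_matches`] for a more complete example.)
///
/// ```
/// use regex_automata::{
/// nfa::thompson::pikevm::PikeVM,
/// Input, MatchKind, PatternSet,
/// };
///
/// let re = PikeVM::builder()
/// .configure(PikeVM::config().match_kind(MatchKind::All))
/// .build_many(&[r"foo", r"foobar"])?;
/// let mut cache = re.create_cache();
///
/// let input = Input::new("foobar");
/// let mut patset = PatternSet::new(re.pattern_len());
/// re.which_overlapping_matches(&mut cache, &input, &mut patset);
/// // With 'All' semantics, both patterns are reported as matching.
/// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
/// assert_eq!(vec![0, 1], got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```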
| 96 | pub fn match_kind(mut self, kind: MatchKind) -> Config { |
| 97 | self.match_kind = Some(kind); |
| 98 | self |
| 99 | } |
| 100 | |
| 101 | /// Set a prefilter to be used whenever a start state is entered. |
| 102 | /// |
| 103 | /// A [`Prefilter`] in this context is meant to accelerate searches by |
| 104 | /// looking for literal prefixes that every match for the corresponding |
| 105 | /// pattern (or patterns) must start with. Once a prefilter produces a |
| 106 | /// match, the underlying search routine continues on to try and confirm |
| 107 | /// the match. |
| 108 | /// |
| 109 | /// Be warned that setting a prefilter does not guarantee that the search |
| 110 | /// will be faster. While it's usually a good bet, if the prefilter |
| 111 | /// produces a lot of false positive candidates (i.e., positions matched |
| 112 | /// by the prefilter but not by the regex), then the overall result can |
| 113 | /// be slower than if you had just executed the regex engine without any |
| 114 | /// prefilters. |
| 115 | /// |
| 116 | /// By default no prefilter is set. |
| 117 | /// |
| 118 | /// # Example |
| 119 | /// |
| 120 | /// ``` |
| 121 | /// use regex_automata::{ |
| 122 | /// nfa::thompson::pikevm::PikeVM, |
| 123 | /// util::prefilter::Prefilter, |
| 124 | /// Input, Match, MatchKind, |
| 125 | /// }; |
| 126 | /// |
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
/// let re = PikeVM::builder()
/// .configure(PikeVM::config().prefilter(pre))
/// .build(r"(foo|bar)[a-z]+")?;
/// let mut cache = re.create_cache();
/// let input = Input::new("foo1 barfox bar");
| 133 | /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input)); |
| 134 | /// |
| 135 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 136 | /// ``` |
| 137 | /// |
| 138 | /// Be warned though that an incorrect prefilter can lead to incorrect |
| 139 | /// results! |
| 140 | /// |
| 141 | /// ``` |
| 142 | /// use regex_automata::{ |
| 143 | /// nfa::thompson::pikevm::PikeVM, |
| 144 | /// util::prefilter::Prefilter, |
| 145 | /// Input, HalfMatch, MatchKind, |
| 146 | /// }; |
| 147 | /// |
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
/// let re = PikeVM::builder()
/// .configure(PikeVM::config().prefilter(pre))
/// .build(r"(foo|bar)[a-z]+")?;
/// let mut cache = re.create_cache();
/// let input = Input::new("foo1 barfox bar");
| 154 | /// // No match reported even though there clearly is one! |
| 155 | /// assert_eq!(None, re.find(&mut cache, input)); |
| 156 | /// |
| 157 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 158 | /// ``` |
| 159 | pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { |
| 160 | self.pre = Some(pre); |
| 161 | self |
| 162 | } |
| 163 | |
| 164 | /// Returns the match semantics set in this configuration. |
| 165 | pub fn get_match_kind(&self) -> MatchKind { |
| 166 | self.match_kind.unwrap_or(MatchKind::LeftmostFirst) |
| 167 | } |
| 168 | |
/// Returns the prefilter set in this configuration, if one is set at all.
| 170 | pub fn get_prefilter(&self) -> Option<&Prefilter> { |
| 171 | self.pre.as_ref().unwrap_or(&None).as_ref() |
| 172 | } |
| 173 | |
| 174 | /// Overwrite the default configuration such that the options in `o` are |
| 175 | /// always used. If an option in `o` is not set, then the corresponding |
| 176 | /// option in `self` is used. If it's not set in `self` either, then it |
| 177 | /// remains not set. |
| 178 | pub(crate) fn overwrite(&self, o: Config) -> Config { |
| 179 | Config { |
| 180 | match_kind: o.match_kind.or(self.match_kind), |
| 181 | pre: o.pre.or_else(|| self.pre.clone()), |
| 182 | } |
| 183 | } |
| 184 | } |
| 185 | |
| 186 | /// A builder for a `PikeVM`. |
| 187 | /// |
| 188 | /// This builder permits configuring options for the syntax of a pattern, |
| 189 | /// the NFA construction and the `PikeVM` construction. This builder is |
| 190 | /// different from a general purpose regex builder in that it permits fine |
| 191 | /// grain configuration of the construction process. The trade off for this is |
| 192 | /// complexity, and the possibility of setting a configuration that might not |
| 193 | /// make sense. For example, there are two different UTF-8 modes: |
| 194 | /// |
| 195 | /// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8) |
| 196 | /// controls whether the pattern itself can contain sub-expressions that match |
| 197 | /// invalid UTF-8. |
| 198 | /// * [`thompson::Config::utf8`] controls whether empty matches that split a |
| 199 | /// Unicode codepoint are reported or not. |
| 200 | /// |
| 201 | /// Generally speaking, callers will want to either enable all of these or |
| 202 | /// disable all of these. |
| 203 | /// |
| 204 | /// # Example |
| 205 | /// |
| 206 | /// This example shows how to disable UTF-8 mode in the syntax and the regex |
| 207 | /// itself. This is generally what you want for matching on arbitrary bytes. |
| 208 | /// |
| 209 | /// ``` |
| 210 | /// use regex_automata::{ |
| 211 | /// nfa::thompson::{self, pikevm::PikeVM}, |
| 212 | /// util::syntax, |
| 213 | /// Match, |
| 214 | /// }; |
| 215 | /// |
| 216 | /// let re = PikeVM::builder() |
| 217 | /// .syntax(syntax::Config::new().utf8(false)) |
| 218 | /// .thompson(thompson::Config::new().utf8(false)) |
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let mut cache = re.create_cache();
///
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
| 223 | /// let expected = Some(Match::must(0, 1..9)); |
| 224 | /// let got = re.find_iter(&mut cache, haystack).next(); |
| 225 | /// assert_eq!(expected, got); |
| 226 | /// // Notice that `(?-u:[^b])` matches invalid UTF-8, |
| 227 | /// // but the subsequent `.*` does not! Disabling UTF-8 |
| 228 | /// // on the syntax permits this. |
| 229 | /// // |
| 230 | /// // N.B. This example does not show the impact of |
| 231 | /// // disabling UTF-8 mode on a PikeVM Config, since that |
| 232 | /// // only impacts regexes that can produce matches of |
| 233 | /// // length 0. |
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
| 235 | /// |
| 236 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 237 | /// ``` |
#[derive(Clone, Debug)]
| 239 | pub struct Builder { |
| 240 | config: Config, |
#[cfg(feature = "syntax")]
| 242 | thompson: thompson::Compiler, |
| 243 | } |
| 244 | |
| 245 | impl Builder { |
| 246 | /// Create a new PikeVM builder with its default configuration. |
| 247 | pub fn new() -> Builder { |
| 248 | Builder { |
| 249 | config: Config::default(), |
#[cfg(feature = "syntax")]
| 251 | thompson: thompson::Compiler::new(), |
| 252 | } |
| 253 | } |
| 254 | |
| 255 | /// Build a `PikeVM` from the given pattern. |
| 256 | /// |
| 257 | /// If there was a problem parsing or compiling the pattern, then an error |
| 258 | /// is returned. |
#[cfg(feature = "syntax")]
| 260 | pub fn build(&self, pattern: &str) -> Result<PikeVM, BuildError> { |
| 261 | self.build_many(&[pattern]) |
| 262 | } |
| 263 | |
| 264 | /// Build a `PikeVM` from the given patterns. |
#[cfg(feature = "syntax")]
| 266 | pub fn build_many<P: AsRef<str>>( |
| 267 | &self, |
| 268 | patterns: &[P], |
| 269 | ) -> Result<PikeVM, BuildError> { |
| 270 | let nfa = self.thompson.build_many(patterns)?; |
| 271 | self.build_from_nfa(nfa) |
| 272 | } |
| 273 | |
| 274 | /// Build a `PikeVM` directly from its NFA. |
| 275 | /// |
| 276 | /// Note that when using this method, any configuration that applies to the |
| 277 | /// construction of the NFA itself will of course be ignored, since the NFA |
| 278 | /// given here is already built. |
| 279 | pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> { |
| 280 | nfa.look_set_any().available().map_err(BuildError::word)?; |
| 281 | Ok(PikeVM { config: self.config.clone(), nfa }) |
| 282 | } |
| 283 | |
| 284 | /// Apply the given `PikeVM` configuration options to this builder. |
| 285 | pub fn configure(&mut self, config: Config) -> &mut Builder { |
| 286 | self.config = self.config.overwrite(config); |
| 287 | self |
| 288 | } |
| 289 | |
| 290 | /// Set the syntax configuration for this builder using |
| 291 | /// [`syntax::Config`](crate::util::syntax::Config). |
| 292 | /// |
| 293 | /// This permits setting things like case insensitivity, Unicode and multi |
| 294 | /// line mode. |
| 295 | /// |
| 296 | /// These settings only apply when constructing a PikeVM directly from a |
| 297 | /// pattern. |
#[cfg(feature = "syntax")]
| 299 | pub fn syntax( |
| 300 | &mut self, |
| 301 | config: crate::util::syntax::Config, |
| 302 | ) -> &mut Builder { |
| 303 | self.thompson.syntax(config); |
| 304 | self |
| 305 | } |
| 306 | |
| 307 | /// Set the Thompson NFA configuration for this builder using |
| 308 | /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). |
| 309 | /// |
| 310 | /// This permits setting things like if additional time should be spent |
| 311 | /// shrinking the size of the NFA. |
| 312 | /// |
| 313 | /// These settings only apply when constructing a PikeVM directly from a |
| 314 | /// pattern. |
#[cfg(feature = "syntax")]
| 316 | pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { |
| 317 | self.thompson.configure(config); |
| 318 | self |
| 319 | } |
| 320 | } |
| 321 | |
| 322 | /// A virtual machine for executing regex searches with capturing groups. |
| 323 | /// |
| 324 | /// # Infallible APIs |
| 325 | /// |
| 326 | /// Unlike most other regex engines in this crate, a `PikeVM` never returns an |
| 327 | /// error at search time. It supports all [`Anchored`] configurations, never |
| 328 | /// quits and works on haystacks of arbitrary length. |
| 329 | /// |
| 330 | /// There are two caveats to mention though: |
| 331 | /// |
| 332 | /// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`], |
| 333 | /// then the PikeVM will report "no match." This is consistent with all other |
/// regex engines in this crate. (See the sketch after this list.)
| 335 | /// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`] |
| 336 | /// that has insufficient capacity to store all valid pattern IDs, then if a |
| 337 | /// match occurs for a `PatternID` that cannot be inserted, it is silently |
| 338 | /// dropped as if it did not match. |
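///
/// A minimal sketch of the first caveat, where an invalid pattern ID simply
/// reports no match:
///
/// ```
/// use regex_automata::{
/// nfa::thompson::pikevm::PikeVM,
/// Anchored, Input, PatternID,
/// };
///
/// let re = PikeVM::new("a")?;
/// let mut cache = re.create_cache();
/// // This PikeVM has only one pattern (ID 0), so pattern ID 5 is invalid.
/// let input = Input::new("a").anchored(Anchored::Pattern(PatternID::must(5)));
/// assert_eq!(None, re.find(&mut cache, input));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```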
| 339 | /// |
| 340 | /// # Advice |
| 341 | /// |
| 342 | /// The `PikeVM` is generally the most "powerful" regex engine in this crate. |
| 343 | /// "Powerful" in this context means that it can handle any regular expression |
/// that is parseable by `regex-syntax` and any size haystack. Regrettably,
| 345 | /// the `PikeVM` is also simultaneously often the _slowest_ regex engine in |
| 346 | /// practice. This results in an annoying situation where one generally tries |
| 347 | /// to pick any other regex engine (or perhaps none at all) before being |
| 348 | /// forced to fall back to a `PikeVM`. |
| 349 | /// |
| 350 | /// For example, a common strategy for dealing with capturing groups is to |
| 351 | /// actually look for the overall match of the regex using a faster regex |
| 352 | /// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall |
| 353 | /// match is found, one can then run the `PikeVM` on just the match span to |
| 354 | /// find the spans of the capturing groups. In this way, the faster regex |
| 355 | /// engine does the majority of the work, while the `PikeVM` only lends its |
| 356 | /// power in a more limited role. |
| 357 | /// |
| 358 | /// Unfortunately, this isn't always possible because the faster regex engines |
| 359 | /// don't support all of the regex features in `regex-syntax`. This notably |
| 360 | /// includes (and is currently limited to) Unicode word boundaries. So if |
| 361 | /// your pattern has Unicode word boundaries, you typically can't use a |
| 362 | /// DFA-based regex engine at all (unless you [enable heuristic support for |
| 363 | /// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass |
| 364 | /// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for |
| 365 | /// anchored searches only, but in a cruel sort of joke, many Unicode features |
| 366 | /// tend to result in making the regex _not_ one-pass.) |
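///
/// As a rough sketch of that two-pass strategy (using the `PikeVM` itself
/// as a stand-in for the faster engine that would find the overall match):
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input, Span};
///
/// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
/// let haystack = "the date is 2010-03-14, actually";
///
/// // Pass 1: find the overall match span. (In practice, this is where a
/// // faster engine such as the lazy DFA would be used.)
/// let m = re.find(&mut cache, haystack).unwrap();
/// // Pass 2: resolve capturing groups only within that span.
/// let input = Input::new(haystack).range(m.range());
/// re.search(&mut cache, &input, &mut caps);
/// assert_eq!(Some(Span::from(12..16)), caps.get_group(1));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```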
| 367 | /// |
| 368 | /// # Example |
| 369 | /// |
| 370 | /// This example shows that the `PikeVM` implements Unicode word boundaries |
| 371 | /// correctly by default. |
| 372 | /// |
| 373 | /// ``` |
| 374 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 375 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 376 | /// |
/// let re = PikeVM::new(r"\b\w+\b")?;
/// let mut cache = re.create_cache();
///
/// let mut it = re.find_iter(&mut cache, "Шерлок Холмс");
| 381 | /// assert_eq!(Some(Match::must(0, 0..12)), it.next()); |
| 382 | /// assert_eq!(Some(Match::must(0, 13..23)), it.next()); |
| 383 | /// assert_eq!(None, it.next()); |
| 384 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 385 | /// ``` |
#[derive(Clone, Debug)]
| 387 | pub struct PikeVM { |
| 388 | config: Config, |
| 389 | nfa: NFA, |
| 390 | } |
| 391 | |
| 392 | impl PikeVM { |
| 393 | /// Parse the given regular expression using the default configuration and |
| 394 | /// return the corresponding `PikeVM`. |
| 395 | /// |
| 396 | /// If you want a non-default configuration, then use the [`Builder`] to |
| 397 | /// set your own configuration. |
| 398 | /// |
| 399 | /// # Example |
| 400 | /// |
| 401 | /// ``` |
| 402 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 403 | /// |
/// let re = PikeVM::new("foo[0-9]+bar")?;
/// let mut cache = re.create_cache();
/// assert_eq!(
/// Some(Match::must(0, 3..14)),
/// re.find_iter(&mut cache, "zzzfoo12345barzzz").next(),
| 409 | /// ); |
| 410 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 411 | /// ``` |
#[cfg(feature = "syntax")]
| 413 | pub fn new(pattern: &str) -> Result<PikeVM, BuildError> { |
| 414 | PikeVM::builder().build(pattern) |
| 415 | } |
| 416 | |
| 417 | /// Like `new`, but parses multiple patterns into a single "multi regex." |
| 418 | /// This similarly uses the default regex configuration. |
| 419 | /// |
| 420 | /// # Example |
| 421 | /// |
| 422 | /// ``` |
| 423 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 424 | /// |
/// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?;
/// let mut cache = re.create_cache();
///
/// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");
| 429 | /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); |
| 430 | /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); |
| 431 | /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); |
| 432 | /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); |
| 433 | /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); |
| 434 | /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); |
| 435 | /// assert_eq!(None, it.next()); |
| 436 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 437 | /// ``` |
#[cfg(feature = "syntax")]
| 439 | pub fn new_many<P: AsRef<str>>( |
| 440 | patterns: &[P], |
| 441 | ) -> Result<PikeVM, BuildError> { |
| 442 | PikeVM::builder().build_many(patterns) |
| 443 | } |
| 444 | |
| 445 | /// Like `new`, but builds a PikeVM directly from an NFA. This is useful |
| 446 | /// if you already have an NFA, or even if you hand-assembled the NFA. |
| 447 | /// |
| 448 | /// # Example |
| 449 | /// |
| 450 | /// This shows how to hand assemble a regular expression via its HIR, |
| 451 | /// compile an NFA from it and build a PikeVM from the NFA. |
| 452 | /// |
| 453 | /// ``` |
| 454 | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
| 455 | /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; |
| 456 | /// |
| 457 | /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ |
/// ClassBytesRange::new(b'0', b'9'),
/// ClassBytesRange::new(b'A', b'Z'),
/// ClassBytesRange::new(b'_', b'_'),
/// ClassBytesRange::new(b'a', b'z'),
| 462 | /// ]))); |
| 463 | /// |
| 464 | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
| 465 | /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; |
| 466 | /// |
| 467 | /// let re = PikeVM::new_from_nfa(nfa)?; |
| 468 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
| 469 | /// let expected = Some(Match::must(0, 3..4)); |
/// re.captures(&mut cache, "!@#A#@!", &mut caps);
| 471 | /// assert_eq!(expected, caps.get_match()); |
| 472 | /// |
| 473 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 474 | /// ``` |
| 475 | pub fn new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError> { |
| 476 | PikeVM::builder().build_from_nfa(nfa) |
| 477 | } |
| 478 | |
| 479 | /// Create a new `PikeVM` that matches every input. |
| 480 | /// |
| 481 | /// # Example |
| 482 | /// |
| 483 | /// ``` |
| 484 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 485 | /// |
| 486 | /// let re = PikeVM::always_match()?; |
| 487 | /// let mut cache = re.create_cache(); |
| 488 | /// |
| 489 | /// let expected = Match::must(0, 0..0); |
/// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next());
/// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next());
| 492 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 493 | /// ``` |
| 494 | pub fn always_match() -> Result<PikeVM, BuildError> { |
| 495 | let nfa = thompson::NFA::always_match(); |
| 496 | PikeVM::new_from_nfa(nfa) |
| 497 | } |
| 498 | |
| 499 | /// Create a new `PikeVM` that never matches any input. |
| 500 | /// |
| 501 | /// # Example |
| 502 | /// |
| 503 | /// ``` |
| 504 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| 505 | /// |
| 506 | /// let re = PikeVM::never_match()?; |
| 507 | /// let mut cache = re.create_cache(); |
| 508 | /// |
/// assert_eq!(None, re.find_iter(&mut cache, "").next());
/// assert_eq!(None, re.find_iter(&mut cache, "foo").next());
| 511 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 512 | /// ``` |
| 513 | pub fn never_match() -> Result<PikeVM, BuildError> { |
| 514 | let nfa = thompson::NFA::never_match(); |
| 515 | PikeVM::new_from_nfa(nfa) |
| 516 | } |
| 517 | |
| 518 | /// Return a default configuration for a `PikeVM`. |
| 519 | /// |
| 520 | /// This is a convenience routine to avoid needing to import the `Config` |
| 521 | /// type when customizing the construction of a `PikeVM`. |
| 522 | /// |
| 523 | /// # Example |
| 524 | /// |
| 525 | /// This example shows how to disable UTF-8 mode. When UTF-8 mode is |
| 526 | /// disabled, zero-width matches that split a codepoint are allowed. |
| 527 | /// Otherwise they are never reported. |
| 528 | /// |
| 529 | /// In the code below, notice that `""` is permitted to match positions |
| 530 | /// that split the encoding of a codepoint. |
| 531 | /// |
| 532 | /// ``` |
| 533 | /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match}; |
| 534 | /// |
| 535 | /// let re = PikeVM::builder() |
| 536 | /// .thompson(thompson::Config::new().utf8(false)) |
/// .build(r"")?;
/// let mut cache = re.create_cache();
///
/// let haystack = "a☃z";
| 541 | /// let mut it = re.find_iter(&mut cache, haystack); |
| 542 | /// assert_eq!(Some(Match::must(0, 0..0)), it.next()); |
| 543 | /// assert_eq!(Some(Match::must(0, 1..1)), it.next()); |
| 544 | /// assert_eq!(Some(Match::must(0, 2..2)), it.next()); |
| 545 | /// assert_eq!(Some(Match::must(0, 3..3)), it.next()); |
| 546 | /// assert_eq!(Some(Match::must(0, 4..4)), it.next()); |
| 547 | /// assert_eq!(Some(Match::must(0, 5..5)), it.next()); |
| 548 | /// assert_eq!(None, it.next()); |
| 549 | /// |
| 550 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 551 | /// ``` |
| 552 | pub fn config() -> Config { |
| 553 | Config::new() |
| 554 | } |
| 555 | |
| 556 | /// Return a builder for configuring the construction of a `PikeVM`. |
| 557 | /// |
| 558 | /// This is a convenience routine to avoid needing to import the |
| 559 | /// [`Builder`] type in common cases. |
| 560 | /// |
| 561 | /// # Example |
| 562 | /// |
| 563 | /// This example shows how to use the builder to disable UTF-8 mode |
| 564 | /// everywhere. |
| 565 | /// |
| 566 | /// ``` |
| 567 | /// use regex_automata::{ |
| 568 | /// nfa::thompson::{self, pikevm::PikeVM}, |
| 569 | /// util::syntax, |
| 570 | /// Match, |
| 571 | /// }; |
| 572 | /// |
| 573 | /// let re = PikeVM::builder() |
| 574 | /// .syntax(syntax::Config::new().utf8(false)) |
| 575 | /// .thompson(thompson::Config::new().utf8(false)) |
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
| 580 | /// let expected = Some(Match::must(0, 1..9)); |
| 581 | /// re.captures(&mut cache, haystack, &mut caps); |
| 582 | /// assert_eq!(expected, caps.get_match()); |
| 583 | /// |
| 584 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 585 | /// ``` |
| 586 | pub fn builder() -> Builder { |
| 587 | Builder::new() |
| 588 | } |
| 589 | |
| 590 | /// Create a new empty set of capturing groups that is guaranteed to be |
| 591 | /// valid for the search APIs on this `PikeVM`. |
| 592 | /// |
| 593 | /// A `Captures` value created for a specific `PikeVM` cannot be used with |
| 594 | /// any other `PikeVM`. |
| 595 | /// |
| 596 | /// This is a convenience function for [`Captures::all`]. See the |
| 597 | /// [`Captures`] documentation for an explanation of its alternative |
| 598 | /// constructors that permit the `PikeVM` to do less work during a search, |
| 599 | /// and thus might make it faster. |
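///
/// # Example
///
/// A minimal sketch of one such alternative, [`Captures::matches`], which
/// only tracks the overall match span (and which pattern matched) rather
/// than every capturing group:
///
/// ```
/// use regex_automata::{
/// nfa::thompson::pikevm::PikeVM,
/// util::captures::Captures,
/// Match,
/// };
///
/// let re = PikeVM::new(r"foo([0-9]+)")?;
/// let mut cache = re.create_cache();
/// let mut caps = Captures::matches(re.get_nfa().group_info().clone());
/// re.captures(&mut cache, "foo123", &mut caps);
/// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```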
| 600 | pub fn create_captures(&self) -> Captures { |
| 601 | Captures::all(self.get_nfa().group_info().clone()) |
| 602 | } |
| 603 | |
| 604 | /// Create a new cache for this `PikeVM`. |
| 605 | /// |
| 606 | /// The cache returned should only be used for searches for this |
| 607 | /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then |
| 608 | /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently, |
| 609 | /// [`PikeVM::reset_cache`]). |
| 610 | pub fn create_cache(&self) -> Cache { |
| 611 | Cache::new(self) |
| 612 | } |
| 613 | |
/// Reset the given cache such that it can be used for searching with this
/// `PikeVM` (and only this `PikeVM`).
| 616 | /// |
| 617 | /// A cache reset permits reusing memory already allocated in this cache |
| 618 | /// with a different `PikeVM`. |
| 619 | /// |
| 620 | /// # Example |
| 621 | /// |
| 622 | /// This shows how to re-purpose a cache for use with a different `PikeVM`. |
| 623 | /// |
| 624 | /// ``` |
| 625 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 626 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 627 | /// |
/// let re1 = PikeVM::new(r"\w")?;
/// let re2 = PikeVM::new(r"\W")?;
///
/// let mut cache = re1.create_cache();
/// assert_eq!(
/// Some(Match::must(0, 0..2)),
/// re1.find_iter(&mut cache, "Δ").next(),
| 635 | /// ); |
| 636 | /// |
| 637 | /// // Using 'cache' with re2 is not allowed. It may result in panics or |
| 638 | /// // incorrect results. In order to re-purpose the cache, we must reset |
| 639 | /// // it with the PikeVM we'd like to use it with. |
| 640 | /// // |
| 641 | /// // Similarly, after this reset, using the cache with 're1' is also not |
| 642 | /// // allowed. |
| 643 | /// re2.reset_cache(&mut cache); |
| 644 | /// assert_eq!( |
| 645 | /// Some(Match::must(0, 0..3)), |
/// re2.find_iter(&mut cache, "☃").next(),
| 647 | /// ); |
| 648 | /// |
| 649 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 650 | /// ``` |
| 651 | pub fn reset_cache(&self, cache: &mut Cache) { |
| 652 | cache.reset(self); |
| 653 | } |
| 654 | |
| 655 | /// Returns the total number of patterns compiled into this `PikeVM`. |
| 656 | /// |
| 657 | /// In the case of a `PikeVM` that contains no patterns, this returns `0`. |
| 658 | /// |
| 659 | /// # Example |
| 660 | /// |
| 661 | /// This example shows the pattern length for a `PikeVM` that never |
| 662 | /// matches: |
| 663 | /// |
| 664 | /// ``` |
| 665 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| 666 | /// |
| 667 | /// let re = PikeVM::never_match()?; |
| 668 | /// assert_eq!(re.pattern_len(), 0); |
| 669 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 670 | /// ``` |
| 671 | /// |
| 672 | /// And another example for a `PikeVM` that matches at every position: |
| 673 | /// |
| 674 | /// ``` |
| 675 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| 676 | /// |
| 677 | /// let re = PikeVM::always_match()?; |
| 678 | /// assert_eq!(re.pattern_len(), 1); |
| 679 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 680 | /// ``` |
| 681 | /// |
| 682 | /// And finally, a `PikeVM` that was constructed from multiple patterns: |
| 683 | /// |
| 684 | /// ``` |
| 685 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| 686 | /// |
/// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
| 688 | /// assert_eq!(re.pattern_len(), 3); |
| 689 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 690 | /// ``` |
| 691 | pub fn pattern_len(&self) -> usize { |
| 692 | self.nfa.pattern_len() |
| 693 | } |
| 694 | |
| 695 | /// Return the config for this `PikeVM`. |
#[inline]
| 697 | pub fn get_config(&self) -> &Config { |
| 698 | &self.config |
| 699 | } |
| 700 | |
| 701 | /// Returns a reference to the underlying NFA. |
#[inline]
| 703 | pub fn get_nfa(&self) -> &NFA { |
| 704 | &self.nfa |
| 705 | } |
| 706 | } |
| 707 | |
| 708 | impl PikeVM { |
| 709 | /// Returns true if and only if this `PikeVM` matches the given haystack. |
| 710 | /// |
| 711 | /// This routine may short circuit if it knows that scanning future |
| 712 | /// input will never lead to a different result. In particular, if the |
| 713 | /// underlying NFA enters a match state, then this routine will return |
| 714 | /// `true` immediately without inspecting any future input. (Consider how |
| 715 | /// this might make a difference given the regex `a+` on the haystack |
| 716 | /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`, |
| 717 | /// but routines like `find` need to continue searching because `+` is |
| 718 | /// greedy by default.) |
| 719 | /// |
| 720 | /// # Example |
| 721 | /// |
| 722 | /// This shows basic usage: |
| 723 | /// |
| 724 | /// ``` |
| 725 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| 726 | /// |
/// let re = PikeVM::new("foo[0-9]+bar")?;
/// let mut cache = re.create_cache();
///
/// assert!(re.is_match(&mut cache, "foo12345bar"));
/// assert!(!re.is_match(&mut cache, "foobar"));
| 732 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 733 | /// ``` |
| 734 | /// |
| 735 | /// # Example: consistency with search APIs |
| 736 | /// |
| 737 | /// `is_match` is guaranteed to return `true` whenever `find` returns a |
| 738 | /// match. This includes searches that are executed entirely within a |
| 739 | /// codepoint: |
| 740 | /// |
| 741 | /// ``` |
| 742 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input}; |
| 743 | /// |
/// let re = PikeVM::new("a*")?;
/// let mut cache = re.create_cache();
///
/// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2)));
| 748 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 749 | /// ``` |
| 750 | /// |
| 751 | /// Notice that when UTF-8 mode is disabled, then the above reports a |
| 752 | /// match because the restriction against zero-width matches that split a |
| 753 | /// codepoint has been lifted: |
| 754 | /// |
| 755 | /// ``` |
| 756 | /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input}; |
| 757 | /// |
| 758 | /// let re = PikeVM::builder() |
| 759 | /// .thompson(NFA::config().utf8(false)) |
/// .build("a*")?;
/// let mut cache = re.create_cache();
///
/// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2)));
| 764 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 765 | /// ``` |
#[inline]
| 767 | pub fn is_match<'h, I: Into<Input<'h>>>( |
| 768 | &self, |
| 769 | cache: &mut Cache, |
| 770 | input: I, |
| 771 | ) -> bool { |
| 772 | let input = input.into().earliest(true); |
| 773 | self.search_slots(cache, &input, &mut []).is_some() |
| 774 | } |
| 775 | |
| 776 | /// Executes a leftmost forward search and returns a `Match` if one exists. |
| 777 | /// |
| 778 | /// This routine only includes the overall match span. To get access to the |
| 779 | /// individual spans of each capturing group, use [`PikeVM::captures`]. |
| 780 | /// |
| 781 | /// # Example |
| 782 | /// |
| 783 | /// Leftmost first match semantics corresponds to the match with the |
| 784 | /// smallest starting offset, but where the end offset is determined by |
| 785 | /// preferring earlier branches in the original regular expression. For |
| 786 | /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` |
| 787 | /// will match `Samwise` in `Samwise`. |
| 788 | /// |
| 789 | /// Generally speaking, the "leftmost first" match is how most backtracking |
| 790 | /// regular expressions tend to work. This is in contrast to POSIX-style |
| 791 | /// regular expressions that yield "leftmost longest" matches. Namely, |
| 792 | /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using |
| 793 | /// leftmost longest semantics. (This crate does not currently support |
| 794 | /// leftmost longest semantics.) |
| 795 | /// |
| 796 | /// ``` |
| 797 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 798 | /// |
/// let re = PikeVM::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
/// let expected = Match::must(0, 0..8);
/// assert_eq!(Some(expected), re.find(&mut cache, "foo12345"));
///
/// // Even though a match is found after reading the first byte (`a`),
/// // the leftmost first match semantics demand that we find the earliest
/// // match that prefers earlier parts of the pattern over later parts.
/// let re = PikeVM::new("abc|a")?;
/// let mut cache = re.create_cache();
/// let expected = Match::must(0, 0..3);
/// assert_eq!(Some(expected), re.find(&mut cache, "abc"));
| 811 | /// |
| 812 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 813 | /// ``` |
#[inline]
| 815 | pub fn find<'h, I: Into<Input<'h>>>( |
| 816 | &self, |
| 817 | cache: &mut Cache, |
| 818 | input: I, |
| 819 | ) -> Option<Match> { |
| 820 | let input = input.into(); |
| 821 | if self.get_nfa().pattern_len() == 1 { |
| 822 | let mut slots = [None, None]; |
| 823 | let pid = self.search_slots(cache, &input, &mut slots)?; |
| 824 | let start = slots[0]?.get(); |
| 825 | let end = slots[1]?.get(); |
| 826 | return Some(Match::new(pid, Span { start, end })); |
| 827 | } |
| 828 | let ginfo = self.get_nfa().group_info(); |
| 829 | let slots_len = ginfo.implicit_slot_len(); |
| 830 | let mut slots = vec![None; slots_len]; |
| 831 | let pid = self.search_slots(cache, &input, &mut slots)?; |
| 832 | let start = slots[pid.as_usize() * 2]?.get(); |
| 833 | let end = slots[pid.as_usize() * 2 + 1]?.get(); |
| 834 | Some(Match::new(pid, Span { start, end })) |
| 835 | } |
| 836 | |
| 837 | /// Executes a leftmost forward search and writes the spans of capturing |
| 838 | /// groups that participated in a match into the provided [`Captures`] |
| 839 | /// value. If no match was found, then [`Captures::is_match`] is guaranteed |
| 840 | /// to return `false`. |
| 841 | /// |
| 842 | /// # Example |
| 843 | /// |
| 844 | /// ``` |
| 845 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; |
| 846 | /// |
/// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// re.captures(&mut cache, "2010-03-14", &mut caps);
| 851 | /// assert!(caps.is_match()); |
| 852 | /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); |
| 853 | /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); |
| 854 | /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); |
| 855 | /// |
| 856 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 857 | /// ``` |
#[inline]
| 859 | pub fn captures<'h, I: Into<Input<'h>>>( |
| 860 | &self, |
| 861 | cache: &mut Cache, |
| 862 | input: I, |
| 863 | caps: &mut Captures, |
| 864 | ) { |
| 865 | self.search(cache, &input.into(), caps) |
| 866 | } |
| 867 | |
| 868 | /// Returns an iterator over all non-overlapping leftmost matches in the |
| 869 | /// given bytes. If no match exists, then the iterator yields no elements. |
| 870 | /// |
| 871 | /// # Example |
| 872 | /// |
| 873 | /// ``` |
| 874 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 875 | /// |
/// let re = PikeVM::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
///
/// let text = "foo1 foo12 foo123";
| 880 | /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect(); |
| 881 | /// assert_eq!(matches, vec![ |
| 882 | /// Match::must(0, 0..4), |
| 883 | /// Match::must(0, 5..10), |
| 884 | /// Match::must(0, 11..17), |
| 885 | /// ]); |
| 886 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 887 | /// ``` |
#[inline]
| 889 | pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( |
| 890 | &'r self, |
| 891 | cache: &'c mut Cache, |
| 892 | input: I, |
| 893 | ) -> FindMatches<'r, 'c, 'h> { |
| 894 | let caps = Captures::matches(self.get_nfa().group_info().clone()); |
| 895 | let it = iter::Searcher::new(input.into()); |
| 896 | FindMatches { re: self, cache, caps, it } |
| 897 | } |
| 898 | |
| 899 | /// Returns an iterator over all non-overlapping `Captures` values. If no |
| 900 | /// match exists, then the iterator yields no elements. |
| 901 | /// |
| 902 | /// This yields the same matches as [`PikeVM::find_iter`], but it includes |
| 903 | /// the spans of all capturing groups that participate in each match. |
| 904 | /// |
| 905 | /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for |
| 906 | /// how to correctly iterate over all matches in a haystack while avoiding |
| 907 | /// the creation of a new `Captures` value for every match. (Which you are |
| 908 | /// forced to do with an `Iterator`.) |
| 909 | /// |
| 910 | /// # Example |
| 911 | /// |
| 912 | /// ``` |
| 913 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; |
| 914 | /// |
/// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?;
/// let mut cache = re.create_cache();
///
/// let text = "foo1 foo12 foo123";
/// let matches: Vec<Span> = re
/// .captures_iter(&mut cache, text)
/// // The unwrap is OK since 'numbers' matches if the pattern matches.
/// .map(|caps| caps.get_group_by_name("numbers").unwrap())
| 923 | /// .collect(); |
| 924 | /// assert_eq!(matches, vec![ |
| 925 | /// Span::from(3..4), |
| 926 | /// Span::from(8..10), |
| 927 | /// Span::from(14..17), |
| 928 | /// ]); |
| 929 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 930 | /// ``` |
#[inline]
| 932 | pub fn captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( |
| 933 | &'r self, |
| 934 | cache: &'c mut Cache, |
| 935 | input: I, |
| 936 | ) -> CapturesMatches<'r, 'c, 'h> { |
| 937 | let caps = self.create_captures(); |
| 938 | let it = iter::Searcher::new(input.into()); |
| 939 | CapturesMatches { re: self, cache, caps, it } |
| 940 | } |
| 941 | } |
| 942 | |
| 943 | impl PikeVM { |
| 944 | /// Executes a leftmost forward search and writes the spans of capturing |
| 945 | /// groups that participated in a match into the provided [`Captures`] |
| 946 | /// value. If no match was found, then [`Captures::is_match`] is guaranteed |
| 947 | /// to return `false`. |
| 948 | /// |
| 949 | /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input` |
| 950 | /// instead of an `Into<Input>`. |
| 951 | /// |
| 952 | /// # Example: specific pattern search |
| 953 | /// |
| 954 | /// This example shows how to build a multi-PikeVM that permits searching |
| 955 | /// for specific patterns. |
| 956 | /// |
| 957 | /// ``` |
| 958 | /// use regex_automata::{ |
| 959 | /// nfa::thompson::pikevm::PikeVM, |
| 960 | /// Anchored, Match, PatternID, Input, |
| 961 | /// }; |
| 962 | /// |
/// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
/// let haystack = "foo123";
| 966 | /// |
| 967 | /// // Since we are using the default leftmost-first match and both |
| 968 | /// // patterns match at the same starting position, only the first pattern |
| 969 | /// // will be returned in this case when doing a search for any of the |
| 970 | /// // patterns. |
| 971 | /// let expected = Some(Match::must(0, 0..6)); |
| 972 | /// re.search(&mut cache, &Input::new(haystack), &mut caps); |
| 973 | /// assert_eq!(expected, caps.get_match()); |
| 974 | /// |
| 975 | /// // But if we want to check whether some other pattern matches, then we |
| 976 | /// // can provide its pattern ID. |
| 977 | /// let expected = Some(Match::must(1, 0..6)); |
| 978 | /// let input = Input::new(haystack) |
| 979 | /// .anchored(Anchored::Pattern(PatternID::must(1))); |
| 980 | /// re.search(&mut cache, &input, &mut caps); |
| 981 | /// assert_eq!(expected, caps.get_match()); |
| 982 | /// |
| 983 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 984 | /// ``` |
| 985 | /// |
| 986 | /// # Example: specifying the bounds of a search |
| 987 | /// |
| 988 | /// This example shows how providing the bounds of a search can produce |
| 989 | /// different results than simply sub-slicing the haystack. |
| 990 | /// |
| 991 | /// ``` |
| 992 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 993 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; |
| 994 | /// |
/// let re = PikeVM::new(r"\b[0-9]{3}\b")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
/// let haystack = "foo123bar";
| 998 | /// |
| 999 | /// // Since we sub-slice the haystack, the search doesn't know about |
| 1000 | /// // the larger context and assumes that `123` is surrounded by word |
| 1001 | /// // boundaries. And of course, the match position is reported relative |
| 1002 | /// // to the sub-slice as well, which means we get `0..3` instead of |
| 1003 | /// // `3..6`. |
| 1004 | /// let expected = Some(Match::must(0, 0..3)); |
| 1005 | /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps); |
| 1006 | /// assert_eq!(expected, caps.get_match()); |
| 1007 | /// |
| 1008 | /// // But if we provide the bounds of the search within the context of the |
| 1009 | /// // entire haystack, then the search can take the surrounding context |
| 1010 | /// // into account. (And if we did find a match, it would be reported |
| 1011 | /// // as a valid offset into `haystack` instead of its sub-slice.) |
| 1012 | /// let expected = None; |
| 1013 | /// let input = Input::new(haystack).range(3..6); |
| 1014 | /// re.search(&mut cache, &input, &mut caps); |
| 1015 | /// assert_eq!(expected, caps.get_match()); |
| 1016 | /// |
| 1017 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1018 | /// ``` |
#[inline]
| 1020 | pub fn search( |
| 1021 | &self, |
| 1022 | cache: &mut Cache, |
| 1023 | input: &Input<'_>, |
| 1024 | caps: &mut Captures, |
| 1025 | ) { |
| 1026 | caps.set_pattern(None); |
| 1027 | let pid = self.search_slots(cache, input, caps.slots_mut()); |
| 1028 | caps.set_pattern(pid); |
| 1029 | } |
| 1030 | |
| 1031 | /// Executes a leftmost forward search and writes the spans of capturing |
| 1032 | /// groups that participated in a match into the provided `slots`, and |
| 1033 | /// returns the matching pattern ID. The contents of the slots for patterns |
| 1034 | /// other than the matching pattern are unspecified. If no match was found, |
/// then `None` is returned and the contents of `slots` are unspecified.
| 1036 | /// |
| 1037 | /// This is like [`PikeVM::search`], but it accepts a raw slots slice |
| 1038 | /// instead of a `Captures` value. This is useful in contexts where you |
| 1039 | /// don't want or need to allocate a `Captures`. |
| 1040 | /// |
| 1041 | /// It is legal to pass _any_ number of slots to this routine. If the regex |
| 1042 | /// engine would otherwise write a slot offset that doesn't fit in the |
| 1043 | /// provided slice, then it is simply skipped. In general though, there are |
| 1044 | /// usually three slice lengths you might want to use: |
| 1045 | /// |
| 1046 | /// * An empty slice, if you only care about which pattern matched. |
| 1047 | /// * A slice with |
| 1048 | /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len) |
| 1049 | /// slots, if you only care about the overall match spans for each matching |
| 1050 | /// pattern. |
| 1051 | /// * A slice with |
| 1052 | /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which |
| 1053 | /// permits recording match offsets for every capturing group in every |
| 1054 | /// pattern. |
| 1055 | /// |
| 1056 | /// # Example |
| 1057 | /// |
| 1058 | /// This example shows how to find the overall match offsets in a |
| 1059 | /// multi-pattern search without allocating a `Captures` value. Indeed, we |
| 1060 | /// can put our slots right on the stack. |
| 1061 | /// |
| 1062 | /// ``` |
| 1063 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 1064 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input}; |
| 1065 | /// |
| 1066 | /// let re = PikeVM::new_many(&[ |
/// r"\pL+",
/// r"\d+",
/// ])?;
/// let mut cache = re.create_cache();
/// let input = Input::new("!@#123");
| 1072 | /// |
| 1073 | /// // We only care about the overall match offsets here, so we just |
| 1074 | /// // allocate two slots for each pattern. Each slot records the start |
| 1075 | /// // and end of the match. |
| 1076 | /// let mut slots = [None; 4]; |
| 1077 | /// let pid = re.search_slots(&mut cache, &input, &mut slots); |
| 1078 | /// assert_eq!(Some(PatternID::must(1)), pid); |
| 1079 | /// |
| 1080 | /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. |
| 1081 | /// // See 'GroupInfo' for more details on the mapping between groups and |
| 1082 | /// // slot indices. |
| 1083 | /// let slot_start = pid.unwrap().as_usize() * 2; |
| 1084 | /// let slot_end = slot_start + 1; |
| 1085 | /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); |
| 1086 | /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); |
| 1087 | /// |
| 1088 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1089 | /// ``` |
#[inline]
| 1091 | pub fn search_slots( |
| 1092 | &self, |
| 1093 | cache: &mut Cache, |
| 1094 | input: &Input<'_>, |
| 1095 | slots: &mut [Option<NonMaxUsize>], |
| 1096 | ) -> Option<PatternID> { |
| 1097 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
| 1098 | if !utf8empty { |
| 1099 | let hm = self.search_slots_imp(cache, input, slots)?; |
| 1100 | return Some(hm.pattern()); |
| 1101 | } |
| 1102 | // There is an unfortunate special case where if the regex can |
| 1103 | // match the empty string and UTF-8 mode is enabled, the search |
// implementation requires that the slots have at least enough space
| 1105 | // to report the bounds of any match. This is so zero-width matches |
| 1106 | // that split a codepoint can be filtered out. |
| 1107 | // |
| 1108 | // Note that if utf8empty is true, we specialize the case for when |
| 1109 | // the number of patterns is 1. In that case, we can just use a stack |
| 1110 | // allocation. Otherwise we resort to a heap allocation, which we |
| 1111 | // convince ourselves we're fine with due to the pathological nature of |
| 1112 | // this case. |
| 1113 | let min = self.get_nfa().group_info().implicit_slot_len(); |
| 1114 | if slots.len() >= min { |
| 1115 | let hm = self.search_slots_imp(cache, input, slots)?; |
| 1116 | return Some(hm.pattern()); |
| 1117 | } |
| 1118 | if self.get_nfa().pattern_len() == 1 { |
| 1119 | let mut enough = [None, None]; |
| 1120 | let got = self.search_slots_imp(cache, input, &mut enough); |
| 1121 | // This is OK because we know `enough` is strictly bigger than |
| 1122 | // `slots`, otherwise this special case isn't reached. |
| 1123 | slots.copy_from_slice(&enough[..slots.len()]); |
| 1124 | return got.map(|hm| hm.pattern()); |
| 1125 | } |
| 1126 | let mut enough = vec![None; min]; |
| 1127 | let got = self.search_slots_imp(cache, input, &mut enough); |
| 1128 | // This is OK because we know `enough` is strictly bigger than `slots`, |
| 1129 | // otherwise this special case isn't reached. |
| 1130 | slots.copy_from_slice(&enough[..slots.len()]); |
| 1131 | got.map(|hm| hm.pattern()) |
| 1132 | } |
| 1133 | |
/// This is the actual implementation of `search_slots` that
| 1135 | /// doesn't account for the special case when 1) the NFA has UTF-8 mode |
| 1136 | /// enabled, 2) the NFA can match the empty string and 3) the caller has |
| 1137 | /// provided an insufficient number of slots to record match offsets. |
#[inline(never)]
| 1139 | fn search_slots_imp( |
| 1140 | &self, |
| 1141 | cache: &mut Cache, |
| 1142 | input: &Input<'_>, |
| 1143 | slots: &mut [Option<NonMaxUsize>], |
| 1144 | ) -> Option<HalfMatch> { |
| 1145 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
| 1146 | let hm = match self.search_imp(cache, input, slots) { |
| 1147 | None => return None, |
| 1148 | Some(hm) if !utf8empty => return Some(hm), |
| 1149 | Some(hm) => hm, |
| 1150 | }; |
| 1151 | empty::skip_splits_fwd(input, hm, hm.offset(), |input| { |
| 1152 | Ok(self |
| 1153 | .search_imp(cache, input, slots) |
| 1154 | .map(|hm| (hm, hm.offset()))) |
| 1155 | }) |
| 1156 | // OK because the PikeVM never errors. |
| 1157 | .unwrap() |
| 1158 | } |
| 1159 | |
| 1160 | /// Writes the set of patterns that match anywhere in the given search |
| 1161 | /// configuration to `patset`. If multiple patterns match at the same |
| 1162 | /// position and this `PikeVM` was configured with [`MatchKind::All`] |
| 1163 | /// semantics, then all matching patterns are written to the given set. |
| 1164 | /// |
| 1165 | /// Unless all of the patterns in this `PikeVM` are anchored, then |
| 1166 | /// generally speaking, this will visit every byte in the haystack. |
| 1167 | /// |
| 1168 | /// This search routine *does not* clear the pattern set. This gives some |
| 1169 | /// flexibility to the caller (e.g., running multiple searches with the |
| 1170 | /// same pattern set), but does make the API bug-prone if you're reusing |
| 1171 | /// the same pattern set for multiple searches but intended them to be |
| 1172 | /// independent. |
| 1173 | /// |
| 1174 | /// If a pattern ID matched but the given `PatternSet` does not have |
| 1175 | /// sufficient capacity to store it, then it is not inserted and silently |
| 1176 | /// dropped. |
| 1177 | /// |
| 1178 | /// # Example |
| 1179 | /// |
| 1180 | /// This example shows how to find all matching patterns in a haystack, |
| 1181 | /// even when some patterns match at the same position as other patterns. |
| 1182 | /// |
| 1183 | /// ``` |
| 1184 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 1185 | /// use regex_automata::{ |
| 1186 | /// nfa::thompson::pikevm::PikeVM, |
| 1187 | /// Input, MatchKind, PatternSet, |
| 1188 | /// }; |
| 1189 | /// |
| 1190 | /// let patterns = &[ |
/// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
| 1192 | /// ]; |
| 1193 | /// let re = PikeVM::builder() |
| 1194 | /// .configure(PikeVM::config().match_kind(MatchKind::All)) |
| 1195 | /// .build_many(patterns)?; |
| 1196 | /// let mut cache = re.create_cache(); |
| 1197 | /// |
/// let input = Input::new("foobar");
| 1199 | /// let mut patset = PatternSet::new(re.pattern_len()); |
| 1200 | /// re.which_overlapping_matches(&mut cache, &input, &mut patset); |
| 1201 | /// let expected = vec![0, 2, 3, 4, 6]; |
| 1202 | /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); |
| 1203 | /// assert_eq!(expected, got); |
| 1204 | /// |
| 1205 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1206 | /// ``` |
#[inline]
| 1208 | pub fn which_overlapping_matches( |
| 1209 | &self, |
| 1210 | cache: &mut Cache, |
| 1211 | input: &Input<'_>, |
| 1212 | patset: &mut PatternSet, |
| 1213 | ) { |
| 1214 | self.which_overlapping_imp(cache, input, patset) |
| 1215 | } |
| 1216 | } |
| 1217 | |
| 1218 | impl PikeVM { |
| 1219 | /// The implementation of standard leftmost search. |
| 1220 | /// |
| 1221 | /// Capturing group spans are written to `slots`, but only if requested. |
| 1222 | /// `slots` can be any length. Any slot in the NFA that is activated but |
| 1223 | /// which is out of bounds for the given `slots` is ignored. |
| 1224 | fn search_imp( |
| 1225 | &self, |
| 1226 | cache: &mut Cache, |
| 1227 | input: &Input<'_>, |
| 1228 | slots: &mut [Option<NonMaxUsize>], |
| 1229 | ) -> Option<HalfMatch> { |
| 1230 | cache.setup_search(slots.len()); |
| 1231 | if input.is_done() { |
| 1232 | return None; |
| 1233 | } |
| 1234 | // Why do we even care about this? Well, in our 'Captures' |
| 1235 | // representation, we use usize::MAX as a sentinel to indicate "no |
| 1236 | // match." This isn't problematic so long as our haystack doesn't have |
| 1237 | // a maximal length. Byte slices are guaranteed by Rust to have a |
| 1238 | // length that fits into isize, and so this assert should always pass. |
| 1239 | // But we put it here to make our assumption explicit. |
assert!(
input.haystack().len() < core::usize::MAX,
"byte slice lengths must be less than usize MAX",
);
| 1244 | instrument!(|c| c.reset(&self.nfa)); |
| 1245 | |
| 1246 | // Whether we want to visit all match states instead of emulating the |
| 1247 | // 'leftmost' semantics of typical backtracking regex engines. |
| 1248 | let allmatches = |
| 1249 | self.config.get_match_kind().continue_past_first_match(); |
| 1250 | let (anchored, start_id) = match self.start_config(input) { |
| 1251 | None => return None, |
| 1252 | Some(config) => config, |
| 1253 | }; |
| 1254 | |
| 1255 | let pre = |
| 1256 | if anchored { None } else { self.get_config().get_prefilter() }; |
| 1257 | let Cache { ref mut stack, ref mut curr, ref mut next } = cache; |
| 1258 | let mut hm = None; |
| 1259 | // Yes, our search doesn't end at input.end(), but includes it. This |
| 1260 | // is necessary because matches are delayed by one byte, just like |
| 1261 | // how the DFA engines work. The delay is used to handle look-behind |
| 1262 | // assertions. In the case of the PikeVM, the delay is implemented |
| 1263 | // by not considering a match to exist until it is visited in |
| 1264 | // 'steps'. Technically, we know a match exists in the previous |
| 1265 | // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA |
| 1266 | // determinization. We don't mark a DFA state as a match state if it |
| 1267 | // contains an NFA match state, but rather, whether the DFA state was |
| 1268 | // generated by a transition from a DFA state that contains an NFA |
| 1269 | // match state.) |
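// For example, when searching the haystack "ab" for the regex 'a', the
// NFA's match state is added to 'next' while processing the byte at
// offset 0, but the corresponding HalfMatch (with end offset 1) is only
// recorded on the following iteration, once 'at == 1' is visited.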
| 1270 | let mut at = input.start(); |
| 1271 | while at <= input.end() { |
| 1272 | // If we have no states left to visit, then there are some cases |
| 1273 | // where we know we can quit early or even skip ahead. |
| 1274 | if curr.set.is_empty() { |
| 1275 | // We have a match and we haven't been instructed to continue |
| 1276 | // on even after finding a match, so we can quit. |
| 1277 | if hm.is_some() && !allmatches { |
| 1278 | break; |
| 1279 | } |
| 1280 | // If we're running an anchored search and we've advanced |
| 1281 | // beyond the start position with no other states to try, then |
| 1282 | // we will never observe a match and thus can stop. |
| 1283 | if anchored && at > input.start() { |
| 1284 | break; |
| 1285 | } |
// If there are no states left to explore at this position and we
| 1287 | // know we can't terminate early, then we are effectively at |
| 1288 | // the starting state of the NFA. If we fell through here, |
| 1289 | // we'd end up adding our '(?s-u:.)*?' prefix and it would be |
| 1290 | // the only thing in 'curr'. So we might as well just skip |
| 1291 | // ahead until we find something that we know might advance us |
| 1292 | // forward. |
| 1293 | if let Some(ref pre) = pre { |
| 1294 | let span = Span::from(at..input.end()); |
| 1295 | match pre.find(input.haystack(), span) { |
| 1296 | None => break, |
| 1297 | Some(ref span) => at = span.start, |
| 1298 | } |
| 1299 | } |
| 1300 | } |
| 1301 | // Instead of using the NFA's unanchored start state, we actually |
| 1302 | // always use its anchored starting state. As a result, when doing |
| 1303 | // an unanchored search, we need to simulate our own '(?s-u:.)*?' |
| 1304 | // prefix, to permit a match to appear anywhere. |
| 1305 | // |
| 1306 | // Now, we don't *have* to do things this way. We could use the |
| 1307 | // NFA's unanchored starting state and do one 'epsilon_closure' |
| 1308 | // call from that starting state before the main loop here. And |
| 1309 | // that is just as correct. However, it turns out to be slower |
| 1310 | // than our approach here because it slightly increases the cost |
| 1311 | // of processing each byte by requiring us to visit more NFA |
| 1312 | // states to deal with the additional NFA states in the unanchored |
| 1313 | // prefix. By simulating it explicitly here, we lower those costs |
| 1314 | // substantially. The cost is itself small, but it adds up for |
| 1315 | // large haystacks. |
| 1316 | // |
| 1317 | // In order to simulate the '(?s-u:.)*?' prefix---which is not |
| 1318 | // greedy---we are careful not to perform an epsilon closure on |
| 1319 | // the start state if we already have a match. Namely, if we |
| 1320 | // did otherwise, we would never reach a terminating condition |
| 1321 | // because there would always be additional states to process. |
| 1322 | // In effect, the exclusion of running 'epsilon_closure' when |
| 1323 | // we have a match corresponds to the "dead" states we have in |
| 1324 | // our DFA regex engines. Namely, in a DFA, match states merely |
| 1325 | // instruct the search execution to record the current offset as |
| 1326 | // the most recently seen match. It is the dead state that actually |
| 1327 | // indicates when to stop the search (other than EOF or quit |
| 1328 | // states). |
| 1329 | // |
| 1330 | // However, when 'allmatches' is true, the caller has asked us to |
| 1331 | // leave in every possible match state. This tends not to make a |
| 1332 | // whole lot of sense in unanchored searches, because it means the |
| 1333 | // search really cannot terminate until EOF. And often, in that |
| 1334 | // case, you wind up skipping over a bunch of matches and are left |
| 1335 | // with the "last" match. Arguably, it just doesn't make a lot of |
| 1336 | // sense to run a 'leftmost' search (which is what this routine is) |
| 1337 | // with 'allmatches' set to true. But the DFAs support it and this |
| 1338 | // matches their behavior. (Generally, 'allmatches' is useful for |
| 1339 | // overlapping searches or leftmost anchored searches to find the |
| 1340 | // longest possible match by ignoring match priority.) |
| 1341 | // |
| 1342 | // Additionally, when we're running an anchored search, this |
| 1343 | // epsilon closure should only be computed at the beginning of the |
| 1344 | // search. If we re-computed it at every position, we would be |
| 1345 | // simulating an unanchored search when we were tasked to perform |
| 1346 | // an anchored search. |
| 1347 | if (!hm.is_some() || allmatches) |
| 1348 | && (!anchored || at == input.start()) |
| 1349 | { |
| 1350 | // Since we are adding to the 'curr' active states and since |
| 1351 | // this is for the start ID, we use a slots slice that is |
| 1352 | // guaranteed to have the right length but where every element |
| 1353 | // is absent. This is exactly what we want, because this |
| 1354 | // epsilon closure is responsible for simulating an unanchored |
// '(?s-u:.)*?' prefix. It is specifically outside of any
| 1356 | // capturing groups, and thus, using slots that are always |
| 1357 | // absent is correct. |
| 1358 | // |
| 1359 | // Note though that we can't just use '&mut []' here, since |
| 1360 | // this epsilon closure may traverse through 'Captures' epsilon |
| 1361 | // transitions, and thus must be able to write offsets to the |
| 1362 | // slots given which are later copied to slot values in 'curr'. |
| 1363 | let slots = next.slot_table.all_absent(); |
| 1364 | self.epsilon_closure(stack, slots, curr, input, at, start_id); |
| 1365 | } |
| 1366 | if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) |
| 1367 | { |
| 1368 | hm = Some(HalfMatch::new(pid, at)); |
| 1369 | } |
| 1370 | // Unless the caller asked us to return early, we need to mush on |
| 1371 | // to see if we can extend our match. (But note that 'nexts' will |
| 1372 | // quit right after seeing a match when match_kind==LeftmostFirst, |
| 1373 | // as is consistent with leftmost-first match priority.) |
| 1374 | if input.get_earliest() && hm.is_some() { |
| 1375 | break; |
| 1376 | } |
| 1377 | core::mem::swap(curr, next); |
| 1378 | next.set.clear(); |
| 1379 | at += 1; |
| 1380 | } |
| 1381 | instrument!(|c| c.eprint(&self.nfa)); |
| 1382 | hm |
| 1383 | } |
| 1384 | |
| 1385 | /// The implementation for the 'which_overlapping_matches' API. Basically, |
| 1386 | /// we do a single scan through the entire haystack (unless our regex |
| 1387 | /// or search is anchored) and record every pattern that matched. In |
| 1388 | /// particular, when MatchKind::All is used, this supports overlapping |
| 1389 | /// matches. So if we have the regexes 'sam' and 'samwise', they will |
| 1390 | /// *both* be reported in the pattern set when searching the haystack |
| 1391 | /// 'samwise'. |
| 1392 | fn which_overlapping_imp( |
| 1393 | &self, |
| 1394 | cache: &mut Cache, |
| 1395 | input: &Input<'_>, |
| 1396 | patset: &mut PatternSet, |
| 1397 | ) { |
| 1398 | // NOTE: This is effectively a copy of 'search_imp' above, but with no |
| 1399 | // captures support and instead writes patterns that matched directly |
| 1400 | // to 'patset'. See that routine for better commentary about what's |
| 1401 | // going on in this routine. We probably could unify the routines using |
| 1402 | // generics or more helper routines, but I'm not sure it's worth it. |
| 1403 | // |
| 1404 | // NOTE: We somewhat go out of our way here to support things like |
| 1405 | // 'input.get_earliest()' and 'leftmost-first' match semantics. Neither |
| 1406 | // of those seem particularly relevant to this routine, but they are |
| 1407 | // both supported by the DFA analogs of this routine by construction |
| 1408 | // and composition, so it seems like good sense to have the PikeVM |
| 1409 | // match that behavior. |
| 1410 | |
| 1411 | cache.setup_search(0); |
| 1412 | if input.is_done() { |
| 1413 | return; |
| 1414 | } |
| 1415 | assert!( |
| 1416 | input.haystack().len() < core::usize::MAX, |
| 1417 | "byte slice lengths must be less than usize MAX" , |
| 1418 | ); |
| 1419 | instrument!(|c| c.reset(&self.nfa)); |
| 1420 | |
| 1421 | let allmatches = |
| 1422 | self.config.get_match_kind().continue_past_first_match(); |
| 1423 | let (anchored, start_id) = match self.start_config(input) { |
| 1424 | None => return, |
| 1425 | Some(config) => config, |
| 1426 | }; |
| 1427 | |
| 1428 | let Cache { ref mut stack, ref mut curr, ref mut next } = cache; |
| 1429 | for at in input.start()..=input.end() { |
| 1430 | let any_matches = !patset.is_empty(); |
| 1431 | if curr.set.is_empty() { |
| 1432 | if any_matches && !allmatches { |
| 1433 | break; |
| 1434 | } |
| 1435 | if anchored && at > input.start() { |
| 1436 | break; |
| 1437 | } |
| 1438 | } |
| 1439 | if !any_matches || allmatches { |
| 1440 | let slots = &mut []; |
| 1441 | self.epsilon_closure(stack, slots, curr, input, at, start_id); |
| 1442 | } |
| 1443 | self.nexts_overlapping(stack, curr, next, input, at, patset); |
| 1444 | // If we found a match and filled our set, then there is no more |
| 1445 | // additional info that we can provide. Thus, we can quit. We also |
| 1446 | // quit if the caller asked us to stop at the earliest point that |
| 1447 | // we know a match exists. |
| 1448 | if patset.is_full() || input.get_earliest() { |
| 1449 | break; |
| 1450 | } |
| 1451 | core::mem::swap(curr, next); |
| 1452 | next.set.clear(); |
| 1453 | } |
| 1454 | instrument!(|c| c.eprint(&self.nfa)); |
| 1455 | } |
| 1456 | |
| 1457 | /// Process the active states in 'curr' to find the states (written to |
| 1458 | /// 'next') we should process for the next byte in the haystack. |
| 1459 | /// |
| 1460 | /// 'stack' is used to perform a depth first traversal of the NFA when |
| 1461 | /// computing an epsilon closure. |
| 1462 | /// |
/// When a match is found, the slots for that match state (in 'curr') are
/// copied to the caller-provided 'slots'. Moreover, once a match is seen,
/// processing for 'curr' stops (unless the PikeVM was configured with
/// MatchKind::All semantics).
| 1466 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1467 | fn nexts( |
| 1468 | &self, |
| 1469 | stack: &mut Vec<FollowEpsilon>, |
| 1470 | curr: &mut ActiveStates, |
| 1471 | next: &mut ActiveStates, |
| 1472 | input: &Input<'_>, |
| 1473 | at: usize, |
| 1474 | slots: &mut [Option<NonMaxUsize>], |
| 1475 | ) -> Option<PatternID> { |
| 1476 | instrument!(|c| c.record_state_set(&curr.set)); |
| 1477 | let mut pid = None; |
| 1478 | let ActiveStates { ref set, ref mut slot_table } = *curr; |
| 1479 | for sid in set.iter() { |
| 1480 | pid = match self.next(stack, slot_table, next, input, at, sid) { |
| 1481 | None => continue, |
| 1482 | Some(pid) => Some(pid), |
| 1483 | }; |
| 1484 | slots.copy_from_slice(slot_table.for_state(sid)); |
| 1485 | if !self.config.get_match_kind().continue_past_first_match() { |
| 1486 | break; |
| 1487 | } |
| 1488 | } |
| 1489 | pid |
| 1490 | } |
| 1491 | |
| 1492 | /// Like 'nexts', but for the overlapping case. This doesn't write any |
| 1493 | /// slots, and instead just writes which pattern matched in 'patset'. |
| 1494 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1495 | fn nexts_overlapping( |
| 1496 | &self, |
| 1497 | stack: &mut Vec<FollowEpsilon>, |
| 1498 | curr: &mut ActiveStates, |
| 1499 | next: &mut ActiveStates, |
| 1500 | input: &Input<'_>, |
| 1501 | at: usize, |
| 1502 | patset: &mut PatternSet, |
| 1503 | ) { |
| 1504 | instrument!(|c| c.record_state_set(&curr.set)); |
| 1505 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
| 1506 | let ActiveStates { ref set, ref mut slot_table } = *curr; |
| 1507 | for sid in set.iter() { |
| 1508 | let pid = match self.next(stack, slot_table, next, input, at, sid) |
| 1509 | { |
| 1510 | None => continue, |
| 1511 | Some(pid) => pid, |
| 1512 | }; |
| 1513 | // This handles the case of finding a zero-width match that splits |
| 1514 | // a codepoint. Namely, if we're in UTF-8 mode AND we know we can |
| 1515 | // match the empty string, then the only valid way of getting to |
| 1516 | // this point with an offset that splits a codepoint is when we |
| 1517 | // have an empty match. Such matches, in UTF-8 mode, must not be |
| 1518 | // reported. So we just skip them here and pretend as if we did |
| 1519 | // not see a match. |
| 1520 | if utf8empty && !input.is_char_boundary(at) { |
| 1521 | continue; |
| 1522 | } |
| 1523 | let _ = patset.try_insert(pid); |
| 1524 | if !self.config.get_match_kind().continue_past_first_match() { |
| 1525 | break; |
| 1526 | } |
| 1527 | } |
| 1528 | } |
| 1529 | |
| 1530 | /// Starting from 'sid', if the position 'at' in the 'input' haystack has a |
| 1531 | /// transition defined out of 'sid', then add the state transitioned to and |
| 1532 | /// its epsilon closure to the 'next' set of states to explore. |
| 1533 | /// |
| 1534 | /// 'stack' is used by the epsilon closure computation to perform a depth |
| 1535 | /// first traversal of the NFA. |
| 1536 | /// |
| 1537 | /// 'curr_slot_table' should be the table of slots for the current set of |
| 1538 | /// states being explored. If there is a transition out of 'sid', then |
| 1539 | /// sid's row in the slot table is used to perform the epsilon closure. |
| 1540 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1541 | fn next( |
| 1542 | &self, |
| 1543 | stack: &mut Vec<FollowEpsilon>, |
| 1544 | curr_slot_table: &mut SlotTable, |
| 1545 | next: &mut ActiveStates, |
| 1546 | input: &Input<'_>, |
| 1547 | at: usize, |
| 1548 | sid: StateID, |
| 1549 | ) -> Option<PatternID> { |
| 1550 | instrument!(|c| c.record_step(sid)); |
| 1551 | match *self.nfa.state(sid) { |
| 1552 | State::Fail |
| 1553 | | State::Look { .. } |
| 1554 | | State::Union { .. } |
| 1555 | | State::BinaryUnion { .. } |
| 1556 | | State::Capture { .. } => None, |
| 1557 | State::ByteRange { ref trans } => { |
| 1558 | if trans.matches(input.haystack(), at) { |
| 1559 | let slots = curr_slot_table.for_state(sid); |
| 1560 | // OK because 'at <= haystack.len() < usize::MAX', so |
| 1561 | // adding 1 will never wrap. |
| 1562 | let at = at.wrapping_add(1); |
| 1563 | self.epsilon_closure( |
| 1564 | stack, slots, next, input, at, trans.next, |
| 1565 | ); |
| 1566 | } |
| 1567 | None |
| 1568 | } |
| 1569 | State::Sparse(ref sparse) => { |
| 1570 | if let Some(next_sid) = sparse.matches(input.haystack(), at) { |
| 1571 | let slots = curr_slot_table.for_state(sid); |
| 1572 | // OK because 'at <= haystack.len() < usize::MAX', so |
| 1573 | // adding 1 will never wrap. |
| 1574 | let at = at.wrapping_add(1); |
| 1575 | self.epsilon_closure( |
| 1576 | stack, slots, next, input, at, next_sid, |
| 1577 | ); |
| 1578 | } |
| 1579 | None |
| 1580 | } |
| 1581 | State::Dense(ref dense) => { |
| 1582 | if let Some(next_sid) = dense.matches(input.haystack(), at) { |
| 1583 | let slots = curr_slot_table.for_state(sid); |
| 1584 | // OK because 'at <= haystack.len() < usize::MAX', so |
| 1585 | // adding 1 will never wrap. |
| 1586 | let at = at.wrapping_add(1); |
| 1587 | self.epsilon_closure( |
| 1588 | stack, slots, next, input, at, next_sid, |
| 1589 | ); |
| 1590 | } |
| 1591 | None |
| 1592 | } |
| 1593 | State::Match { pattern_id } => Some(pattern_id), |
| 1594 | } |
| 1595 | } |
| 1596 | |
| 1597 | /// Compute the epsilon closure of 'sid', writing the closure into 'next' |
| 1598 | /// while copying slot values from 'curr_slots' into corresponding states |
| 1599 | /// in 'next'. 'curr_slots' should be the slot values corresponding to |
| 1600 | /// 'sid'. |
| 1601 | /// |
| 1602 | /// The given 'stack' is used to perform a depth first traversal of the |
| 1603 | /// NFA by recursively following all epsilon transitions out of 'sid'. |
| 1604 | /// Conditional epsilon transitions are followed if and only if they are |
| 1605 | /// satisfied for the position 'at' in the 'input' haystack. |
| 1606 | /// |
| 1607 | /// While this routine may write to 'curr_slots', once it returns, any |
| 1608 | /// writes are undone and the original values (even if absent) are |
| 1609 | /// restored. |
| 1610 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1611 | fn epsilon_closure( |
| 1612 | &self, |
| 1613 | stack: &mut Vec<FollowEpsilon>, |
| 1614 | curr_slots: &mut [Option<NonMaxUsize>], |
| 1615 | next: &mut ActiveStates, |
| 1616 | input: &Input<'_>, |
| 1617 | at: usize, |
| 1618 | sid: StateID, |
| 1619 | ) { |
| 1620 | instrument!(|c| { |
| 1621 | c.record_closure(sid); |
| 1622 | c.record_stack_push(sid); |
| 1623 | }); |
| 1624 | stack.push(FollowEpsilon::Explore(sid)); |
| 1625 | while let Some(frame) = stack.pop() { |
| 1626 | match frame { |
| 1627 | FollowEpsilon::RestoreCapture { slot, offset: pos } => { |
| 1628 | curr_slots[slot] = pos; |
| 1629 | } |
| 1630 | FollowEpsilon::Explore(sid) => { |
| 1631 | self.epsilon_closure_explore( |
| 1632 | stack, curr_slots, next, input, at, sid, |
| 1633 | ); |
| 1634 | } |
| 1635 | } |
| 1636 | } |
| 1637 | } |
| 1638 | |
| 1639 | /// Explore all of the epsilon transitions out of 'sid'. This is mostly |
| 1640 | /// split out from 'epsilon_closure' in order to clearly delineate |
| 1641 | /// the actual work of computing an epsilon closure from the stack |
| 1642 | /// book-keeping. |
| 1643 | /// |
| 1644 | /// This will push any additional explorations needed on to 'stack'. |
| 1645 | /// |
| 1646 | /// 'curr_slots' should refer to the slots for the currently active NFA |
| 1647 | /// state. That is, the current state we are stepping through. These |
| 1648 | /// slots are mutated in place as new 'Captures' states are traversed |
| 1649 | /// during epsilon closure, but the slots are restored to their original |
| 1650 | /// values once the full epsilon closure is completed. The ultimate use of |
| 1651 | /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that |
| 1652 | /// the capturing group spans are forwarded from the currently active state |
| 1653 | /// to the next. |
| 1654 | /// |
| 1655 | /// 'next' refers to the next set of active states. Computing an epsilon |
| 1656 | /// closure may increase the next set of active states. |
| 1657 | /// |
| 1658 | /// 'input' refers to the caller's input configuration and 'at' refers to |
| 1659 | /// the current position in the haystack. These are used to check whether |
| 1660 | /// conditional epsilon transitions (like look-around) are satisfied at |
| 1661 | /// the current position. If they aren't, then the epsilon closure won't |
| 1662 | /// include them. |
| 1663 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1664 | fn epsilon_closure_explore( |
| 1665 | &self, |
| 1666 | stack: &mut Vec<FollowEpsilon>, |
| 1667 | curr_slots: &mut [Option<NonMaxUsize>], |
| 1668 | next: &mut ActiveStates, |
| 1669 | input: &Input<'_>, |
| 1670 | at: usize, |
| 1671 | mut sid: StateID, |
| 1672 | ) { |
| 1673 | // We can avoid pushing some state IDs on to our stack in precisely |
| 1674 | // the cases where a 'push(x)' would be immediately followed by a 'x |
| 1675 | // = pop()'. This is achieved by this outer-loop. We simply set 'sid' |
| 1676 | // to be the next state ID we want to explore once we're done with |
| 1677 | // our initial exploration. In practice, this avoids a lot of stack |
| 1678 | // thrashing. |
| 1679 | loop { |
| 1680 | instrument!(|c| c.record_set_insert(sid)); |
| 1681 | // Record this state as part of our next set of active states. If |
| 1682 | // we've already explored it, then no need to do it again. |
| 1683 | if !next.set.insert(sid) { |
| 1684 | return; |
| 1685 | } |
| 1686 | match *self.nfa.state(sid) { |
| 1687 | State::Fail |
| 1688 | | State::Match { .. } |
| 1689 | | State::ByteRange { .. } |
| 1690 | | State::Sparse { .. } |
| 1691 | | State::Dense { .. } => { |
| 1692 | next.slot_table.for_state(sid).copy_from_slice(curr_slots); |
| 1693 | return; |
| 1694 | } |
| 1695 | State::Look { look, next } => { |
| 1696 | // OK because we don't permit building a searcher with a |
| 1697 | // Unicode word boundary if the requisite Unicode data is |
| 1698 | // unavailable. |
| 1699 | if !self.nfa.look_matcher().matches_inline( |
| 1700 | look, |
| 1701 | input.haystack(), |
| 1702 | at, |
| 1703 | ) { |
| 1704 | return; |
| 1705 | } |
| 1706 | sid = next; |
| 1707 | } |
| 1708 | State::Union { ref alternates } => { |
| 1709 | sid = match alternates.get(0) { |
| 1710 | None => return, |
| 1711 | Some(&sid) => sid, |
| 1712 | }; |
| 1713 | instrument!(|c| { |
| 1714 | for &alt in &alternates[1..] { |
| 1715 | c.record_stack_push(alt); |
| 1716 | } |
| 1717 | }); |
| 1718 | stack.extend( |
| 1719 | alternates[1..] |
| 1720 | .iter() |
| 1721 | .copied() |
| 1722 | .rev() |
| 1723 | .map(FollowEpsilon::Explore), |
| 1724 | ); |
| 1725 | } |
| 1726 | State::BinaryUnion { alt1, alt2 } => { |
| 1727 | sid = alt1; |
| 1728 | instrument!(|c| c.record_stack_push(sid)); |
| 1729 | stack.push(FollowEpsilon::Explore(alt2)); |
| 1730 | } |
| 1731 | State::Capture { next, slot, .. } => { |
| 1732 | // There's no need to do anything with slots that |
| 1733 | // ultimately won't be copied into the caller-provided |
| 1734 | // 'Captures' value. So we just skip dealing with them at |
| 1735 | // all. |
| 1736 | if slot.as_usize() < curr_slots.len() { |
| 1737 | instrument!(|c| c.record_stack_push(sid)); |
| 1738 | stack.push(FollowEpsilon::RestoreCapture { |
| 1739 | slot, |
| 1740 | offset: curr_slots[slot], |
| 1741 | }); |
| 1742 | // OK because length of a slice must fit into an isize. |
| 1743 | curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap()); |
| 1744 | } |
| 1745 | sid = next; |
| 1746 | } |
| 1747 | } |
| 1748 | } |
| 1749 | } |
| 1750 | |
| 1751 | /// Return the starting configuration of a PikeVM search. |
| 1752 | /// |
| 1753 | /// The "start config" is basically whether the search should be anchored |
| 1754 | /// or not and the NFA state ID at which to begin the search. The state ID |
| 1755 | /// returned always corresponds to an anchored starting state even when the |
| 1756 | /// search is unanchored. This is because the PikeVM search loop deals with |
| 1757 | /// unanchored searches with an explicit epsilon closure out of the start |
| 1758 | /// state. |
| 1759 | /// |
| 1760 | /// This routine accounts for both the caller's `Input` configuration |
| 1761 | /// and the pattern itself. For example, even if the caller asks for an |
/// unanchored search, if the pattern itself is anchored, then the anchored
/// flag returned will always be 'true', because implementing an unanchored
/// search in that case would be incorrect.
| 1765 | /// |
| 1766 | /// Similarly, if the caller requests an anchored search for a particular |
| 1767 | /// pattern, then the starting state ID returned will reflect that. |
| 1768 | /// |
| 1769 | /// If a pattern ID is given in the input configuration that is not in |
| 1770 | /// this regex, then `None` is returned. |
| 1771 | fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> { |
| 1772 | match input.get_anchored() { |
| 1773 | // Only way we're unanchored is if both the caller asked for an |
| 1774 | // unanchored search *and* the pattern is itself not anchored. |
| 1775 | Anchored::No => Some(( |
| 1776 | self.nfa.is_always_start_anchored(), |
| 1777 | self.nfa.start_anchored(), |
| 1778 | )), |
| 1779 | Anchored::Yes => Some((true, self.nfa.start_anchored())), |
| 1780 | Anchored::Pattern(pid) => { |
| 1781 | Some((true, self.nfa.start_pattern(pid)?)) |
| 1782 | } |
| 1783 | } |
| 1784 | } |
| 1785 | } |
| 1786 | |
| 1787 | /// An iterator over all non-overlapping matches for a particular search. |
| 1788 | /// |
| 1789 | /// The iterator yields a [`Match`] value until no more matches could be found. |
| 1790 | /// |
| 1791 | /// The lifetime parameters are as follows: |
| 1792 | /// |
| 1793 | /// * `'r` represents the lifetime of the PikeVM. |
| 1794 | /// * `'c` represents the lifetime of the PikeVM's cache. |
| 1795 | /// * `'h` represents the lifetime of the haystack being searched. |
| 1796 | /// |
| 1797 | /// This iterator can be created with the [`PikeVM::find_iter`] method. |
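///
/// # Example
///
/// A brief example of this iterator in use, with an arbitrary pattern and
/// haystack chosen for illustration (offsets assume the default `PikeVM`
/// configuration):
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
///
/// let re = PikeVM::new(r"[0-9]+")?;
/// let mut cache = re.create_cache();
/// let mut it = re.find_iter(&mut cache, "a1 b22 c333");
/// assert_eq!(Some(Match::must(0, 1..2)), it.next());
/// assert_eq!(Some(Match::must(0, 4..6)), it.next());
/// assert_eq!(Some(Match::must(0, 8..11)), it.next());
/// assert_eq!(None, it.next());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```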
| 1798 | #[derive (Debug)] |
| 1799 | pub struct FindMatches<'r, 'c, 'h> { |
| 1800 | re: &'r PikeVM, |
| 1801 | cache: &'c mut Cache, |
| 1802 | caps: Captures, |
| 1803 | it: iter::Searcher<'h>, |
| 1804 | } |
| 1805 | |
| 1806 | impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { |
| 1807 | type Item = Match; |
| 1808 | |
| 1809 | #[inline ] |
| 1810 | fn next(&mut self) -> Option<Match> { |
| 1811 | // Splitting 'self' apart seems necessary to appease borrowck. |
let FindMatches { re, ref mut cache, ref mut caps, ref mut it } = *self;
| 1814 | // 'advance' converts errors into panics, which is OK here because |
| 1815 | // the PikeVM can never return an error. |
| 1816 | it.advance(|input: &Input<'_>| { |
| 1817 | re.search(cache, input, caps); |
| 1818 | Ok(caps.get_match()) |
| 1819 | }) |
| 1820 | } |
| 1821 | } |
| 1822 | |
| 1823 | /// An iterator over all non-overlapping leftmost matches, with their capturing |
| 1824 | /// groups, for a particular search. |
| 1825 | /// |
| 1826 | /// The iterator yields a [`Captures`] value until no more matches could be |
| 1827 | /// found. |
| 1828 | /// |
| 1829 | /// The lifetime parameters are as follows: |
| 1830 | /// |
| 1831 | /// * `'r` represents the lifetime of the PikeVM. |
| 1832 | /// * `'c` represents the lifetime of the PikeVM's cache. |
| 1833 | /// * `'h` represents the lifetime of the haystack being searched. |
| 1834 | /// |
| 1835 | /// This iterator can be created with the [`PikeVM::captures_iter`] method. |
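///
/// # Example
///
/// A brief example of iterating over matches along with their capturing
/// group spans, using an arbitrary pattern and haystack for illustration
/// (spans assume the default `PikeVM` configuration):
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
///
/// let re = PikeVM::new(r"([a-z])([0-9])")?;
/// let mut cache = re.create_cache();
/// let mut it = re.captures_iter(&mut cache, "a1 b2");
///
/// let caps = it.next().unwrap();
/// assert_eq!(Some(Span::from(0..2)), caps.get_group(0));
/// assert_eq!(Some(Span::from(0..1)), caps.get_group(1));
/// assert_eq!(Some(Span::from(1..2)), caps.get_group(2));
///
/// let caps = it.next().unwrap();
/// assert_eq!(Some(Span::from(3..5)), caps.get_group(0));
/// assert!(it.next().is_none());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```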
| 1836 | #[derive (Debug)] |
| 1837 | pub struct CapturesMatches<'r, 'c, 'h> { |
| 1838 | re: &'r PikeVM, |
| 1839 | cache: &'c mut Cache, |
| 1840 | caps: Captures, |
| 1841 | it: iter::Searcher<'h>, |
| 1842 | } |
| 1843 | |
| 1844 | impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { |
| 1845 | type Item = Captures; |
| 1846 | |
| 1847 | #[inline ] |
| 1848 | fn next(&mut self) -> Option<Captures> { |
| 1849 | // Splitting 'self' apart seems necessary to appease borrowck. |
let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = *self;
| 1852 | // 'advance' converts errors into panics, which is OK here because |
| 1853 | // the PikeVM can never return an error. |
| 1854 | it.advance(|input: &Input<'_>| { |
| 1855 | re.search(cache, input, caps); |
| 1856 | Ok(caps.get_match()) |
| 1857 | }); |
| 1858 | if caps.is_match() { |
| 1859 | Some(caps.clone()) |
| 1860 | } else { |
| 1861 | None |
| 1862 | } |
| 1863 | } |
| 1864 | } |
| 1865 | |
| 1866 | /// A cache represents mutable state that a [`PikeVM`] requires during a |
| 1867 | /// search. |
| 1868 | /// |
| 1869 | /// For a given [`PikeVM`], its corresponding cache may be created either via |
| 1870 | /// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in |
| 1871 | /// every way, except the former does not require explicitly importing `Cache`. |
| 1872 | /// |
| 1873 | /// A particular `Cache` is coupled with the [`PikeVM`] from which it |
| 1874 | /// was created. It may only be used with that `PikeVM`. A cache and its |
| 1875 | /// allocations may be re-purposed via [`Cache::reset`], in which case, it can |
| 1876 | /// only be used with the new `PikeVM` (and not the old one). |
| 1877 | #[derive (Clone, Debug)] |
| 1878 | pub struct Cache { |
| 1879 | /// Stack used while computing epsilon closure. This effectively lets us |
| 1880 | /// move what is more naturally expressed through recursion to a stack |
| 1881 | /// on the heap. |
| 1882 | stack: Vec<FollowEpsilon>, |
| 1883 | /// The current active states being explored for the current byte in the |
| 1884 | /// haystack. |
| 1885 | curr: ActiveStates, |
| 1886 | /// The next set of states we're building that will be explored for the |
| 1887 | /// next byte in the haystack. |
| 1888 | next: ActiveStates, |
| 1889 | } |
| 1890 | |
| 1891 | impl Cache { |
| 1892 | /// Create a new [`PikeVM`] cache. |
| 1893 | /// |
| 1894 | /// A potentially more convenient routine to create a cache is |
| 1895 | /// [`PikeVM::create_cache`], as it does not require also importing the |
| 1896 | /// `Cache` type. |
| 1897 | /// |
| 1898 | /// If you want to reuse the returned `Cache` with some other `PikeVM`, |
| 1899 | /// then you must call [`Cache::reset`] with the desired `PikeVM`. |
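///
/// # Example
///
/// A brief example of building a `Cache` directly, which is equivalent to
/// [`PikeVM::create_cache`] (the pattern and haystack are arbitrary and
/// only for illustration):
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::{Cache, PikeVM}, Match};
///
/// let re = PikeVM::new(r"\w+")?;
/// let mut cache = Cache::new(&re);
/// let m = re.find_iter(&mut cache, "hello").next();
/// assert_eq!(Some(Match::must(0, 0..5)), m);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```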
| 1900 | pub fn new(re: &PikeVM) -> Cache { |
| 1901 | Cache { |
| 1902 | stack: vec![], |
| 1903 | curr: ActiveStates::new(re), |
| 1904 | next: ActiveStates::new(re), |
| 1905 | } |
| 1906 | } |
| 1907 | |
| 1908 | /// Reset this cache such that it can be used for searching with a |
| 1909 | /// different [`PikeVM`]. |
| 1910 | /// |
| 1911 | /// A cache reset permits reusing memory already allocated in this cache |
| 1912 | /// with a different `PikeVM`. |
| 1913 | /// |
| 1914 | /// # Example |
| 1915 | /// |
| 1916 | /// This shows how to re-purpose a cache for use with a different `PikeVM`. |
| 1917 | /// |
| 1918 | /// ``` |
| 1919 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 1920 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 1921 | /// |
| 1922 | /// let re1 = PikeVM::new(r"\w" )?; |
| 1923 | /// let re2 = PikeVM::new(r"\W" )?; |
| 1924 | /// |
| 1925 | /// let mut cache = re1.create_cache(); |
| 1926 | /// assert_eq!( |
| 1927 | /// Some(Match::must(0, 0..2)), |
| 1928 | /// re1.find_iter(&mut cache, "Δ" ).next(), |
| 1929 | /// ); |
| 1930 | /// |
| 1931 | /// // Using 'cache' with re2 is not allowed. It may result in panics or |
| 1932 | /// // incorrect results. In order to re-purpose the cache, we must reset |
| 1933 | /// // it with the PikeVM we'd like to use it with. |
| 1934 | /// // |
| 1935 | /// // Similarly, after this reset, using the cache with 're1' is also not |
| 1936 | /// // allowed. |
| 1937 | /// cache.reset(&re2); |
| 1938 | /// assert_eq!( |
| 1939 | /// Some(Match::must(0, 0..3)), |
| 1940 | /// re2.find_iter(&mut cache, "☃" ).next(), |
| 1941 | /// ); |
| 1942 | /// |
| 1943 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1944 | /// ``` |
| 1945 | pub fn reset(&mut self, re: &PikeVM) { |
| 1946 | self.curr.reset(re); |
| 1947 | self.next.reset(re); |
| 1948 | } |
| 1949 | |
| 1950 | /// Returns the heap memory usage, in bytes, of this cache. |
| 1951 | /// |
| 1952 | /// This does **not** include the stack size used up by this cache. To |
| 1953 | /// compute that, use `std::mem::size_of::<Cache>()`. |
| 1954 | pub fn memory_usage(&self) -> usize { |
| 1955 | use core::mem::size_of; |
| 1956 | (self.stack.len() * size_of::<FollowEpsilon>()) |
| 1957 | + self.curr.memory_usage() |
| 1958 | + self.next.memory_usage() |
| 1959 | } |
| 1960 | |
| 1961 | /// Clears this cache. This should be called at the start of every search |
| 1962 | /// to ensure we start with a clean slate. |
| 1963 | /// |
| 1964 | /// This also sets the length of the capturing groups used in the current |
| 1965 | /// search. This permits an optimization where by 'SlotTable::for_state' |
| 1966 | /// only returns the number of slots equivalent to the number of slots |
| 1967 | /// given in the 'Captures' value. This may be less than the total number |
| 1968 | /// of possible slots, e.g., when one only wants to track overall match |
| 1969 | /// offsets. This in turn permits less copying of capturing group spans |
| 1970 | /// in the PikeVM. |
| 1971 | fn setup_search(&mut self, captures_slot_len: usize) { |
| 1972 | self.stack.clear(); |
| 1973 | self.curr.setup_search(captures_slot_len); |
| 1974 | self.next.setup_search(captures_slot_len); |
| 1975 | } |
| 1976 | } |
| 1977 | |
| 1978 | /// A set of active states used to "simulate" the execution of an NFA via the |
| 1979 | /// PikeVM. |
| 1980 | /// |
| 1981 | /// There are two sets of these used during NFA simulation. One set corresponds |
| 1982 | /// to the "current" set of states being traversed for the current position |
| 1983 | /// in a haystack. The other set corresponds to the "next" set of states being |
| 1984 | /// built, which will become the new "current" set for the next position in the |
/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
/// original paper on regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
| 1987 | /// |
| 1988 | /// In addition to representing a set of NFA states, this also maintains slot |
| 1989 | /// values for each state. These slot values are what turn the NFA simulation |
| 1990 | /// into the "Pike VM." Namely, they track capturing group values for each |
| 1991 | /// state. During the computation of epsilon closure, we copy slot values from |
| 1992 | /// states in the "current" set to the "next" set. Eventually, once a match |
| 1993 | /// is found, the slot values for that match state are what we write to the |
| 1994 | /// caller provided 'Captures' value. |
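///
/// As a rough sketch (illustrative only), one step of the simulation uses
/// the two sets like so:
///
/// ```text
/// for each state 'sid' in 'curr' (in insertion order):
///     if the byte 'haystack[at]' has a transition out of 'sid', then
///     compute the epsilon closure of the target and add it to 'next',
///     forwarding 'sid's slot values to the states added along the way
/// swap('curr', 'next'), clear 'next' and advance 'at' by one
/// ```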
| 1995 | #[derive (Clone, Debug)] |
| 1996 | struct ActiveStates { |
| 1997 | /// The set of active NFA states. This set preserves insertion order, which |
| 1998 | /// is critical for simulating the match semantics of backtracking regex |
| 1999 | /// engines. |
| 2000 | set: SparseSet, |
| 2001 | /// The slots for every NFA state, where each slot stores a (possibly |
| 2002 | /// absent) offset. Every capturing group has two slots. One for a start |
| 2003 | /// offset and one for an end offset. |
| 2004 | slot_table: SlotTable, |
| 2005 | } |
| 2006 | |
| 2007 | impl ActiveStates { |
| 2008 | /// Create a new set of active states for the given PikeVM. The active |
| 2009 | /// states returned may only be used with the given PikeVM. (Use 'reset' |
| 2010 | /// to re-purpose the allocation for a different PikeVM.) |
| 2011 | fn new(re: &PikeVM) -> ActiveStates { |
| 2012 | let mut active = ActiveStates { |
| 2013 | set: SparseSet::new(0), |
| 2014 | slot_table: SlotTable::new(), |
| 2015 | }; |
| 2016 | active.reset(re); |
| 2017 | active |
| 2018 | } |
| 2019 | |
| 2020 | /// Reset this set of active states such that it can be used with the given |
| 2021 | /// PikeVM (and only that PikeVM). |
| 2022 | fn reset(&mut self, re: &PikeVM) { |
| 2023 | self.set.resize(re.get_nfa().states().len()); |
| 2024 | self.slot_table.reset(re); |
| 2025 | } |
| 2026 | |
| 2027 | /// Return the heap memory usage, in bytes, used by this set of active |
| 2028 | /// states. |
| 2029 | /// |
| 2030 | /// This does not include the stack size of this value. |
| 2031 | fn memory_usage(&self) -> usize { |
| 2032 | self.set.memory_usage() + self.slot_table.memory_usage() |
| 2033 | } |
| 2034 | |
| 2035 | /// Setup this set of active states for a new search. The given slot |
| 2036 | /// length should be the number of slots in a caller provided 'Captures' |
| 2037 | /// (and may be zero). |
| 2038 | fn setup_search(&mut self, captures_slot_len: usize) { |
| 2039 | self.set.clear(); |
| 2040 | self.slot_table.setup_search(captures_slot_len); |
| 2041 | } |
| 2042 | } |
| 2043 | |
/// A table of slots, where each row represents a state in an NFA. Thus, the
| 2045 | /// table has room for storing slots for every single state in an NFA. |
| 2046 | /// |
| 2047 | /// This table is represented with a single contiguous allocation. In general, |
| 2048 | /// the notion of "capturing group" doesn't really exist at this level of |
| 2049 | /// abstraction, hence the name "slot" instead. (Indeed, every capturing group |
| 2050 | /// maps to a pair of slots, one for the start offset and one for the end |
| 2051 | /// offset.) Slots are indexed by the 'Captures' NFA state. |
| 2052 | /// |
| 2053 | /// N.B. Not every state actually needs a row of slots. Namely, states that |
| 2054 | /// only have epsilon transitions currently never have anything written to |
| 2055 | /// their rows in this table. Thus, the table is somewhat wasteful in its heap |
| 2056 | /// usage. However, it is important to maintain fast random access by state |
| 2057 | /// ID, which means one giant table tends to work well. RE2 takes a different |
| 2058 | /// approach here and allocates each row as its own reference counted thing. |
| 2059 | /// I explored such a strategy at one point here, but couldn't get it to work |
| 2060 | /// well using entirely safe code. (To the ambitious reader: I encourage you to |
| 2061 | /// re-litigate that experiment.) I very much wanted to stick to safe code, but |
| 2062 | /// could be convinced otherwise if there was a solid argument and the safety |
| 2063 | /// was encapsulated well. |
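///
/// For example, for a hypothetical NFA with 3 states compiled from a single
/// pattern with only the implicit capturing group, 'slots_per_state' is 2
/// and the table is laid out roughly like this:
///
/// ```text
/// row for state 0: [start slot, end slot]
/// row for state 1: [start slot, end slot]
/// row for state 2: [start slot, end slot]
/// scratch row:     [start slot, end slot]   <- returned by 'all_absent'
/// ```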
| 2064 | #[derive (Clone, Debug)] |
| 2065 | struct SlotTable { |
| 2066 | /// The actual table of offsets. |
| 2067 | table: Vec<Option<NonMaxUsize>>, |
| 2068 | /// The number of slots per state, i.e., the table's stride or the length |
| 2069 | /// of each row. |
| 2070 | slots_per_state: usize, |
| 2071 | /// The number of slots in the caller-provided 'Captures' value for the |
| 2072 | /// current search. Setting this to 'slots_per_state' is always correct, |
| 2073 | /// but may be wasteful. |
| 2074 | slots_for_captures: usize, |
| 2075 | } |
| 2076 | |
| 2077 | impl SlotTable { |
| 2078 | /// Create a new slot table. |
| 2079 | /// |
| 2080 | /// One should call 'reset' with the corresponding PikeVM before use. |
| 2081 | fn new() -> SlotTable { |
| 2082 | SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } |
| 2083 | } |
| 2084 | |
| 2085 | /// Reset this slot table such that it can be used with the given PikeVM |
| 2086 | /// (and only that PikeVM). |
| 2087 | fn reset(&mut self, re: &PikeVM) { |
| 2088 | let nfa = re.get_nfa(); |
| 2089 | self.slots_per_state = nfa.group_info().slot_len(); |
| 2090 | // This is always correct, but may be reduced for a particular search |
| 2091 | // if a 'Captures' has fewer slots, e.g., none at all or only slots |
| 2092 | // for tracking the overall match instead of all slots for every |
| 2093 | // group. |
| 2094 | self.slots_for_captures = core::cmp::max( |
| 2095 | self.slots_per_state, |
| 2096 | nfa.pattern_len().checked_mul(2).unwrap(), |
| 2097 | ); |
| 2098 | let len = nfa |
| 2099 | .states() |
| 2100 | .len() |
| 2101 | .checked_mul(self.slots_per_state) |
| 2102 | // Add space to account for scratch space used during a search. |
| 2103 | .and_then(|x| x.checked_add(self.slots_for_captures)) |
| 2104 | // It seems like this could actually panic on legitimate inputs on |
| 2105 | // 32-bit targets, and very likely to panic on 16-bit. Should we |
| 2106 | // somehow convert this to an error? What about something similar |
| 2107 | // for the lazy DFA cache? If you're tripping this assert, please |
| 2108 | // file a bug. |
| 2109 | .expect("slot table length doesn't overflow" ); |
| 2110 | // This happens about as often as a regex is compiled, so it probably |
| 2111 | // should be at debug level, but I found it quite distracting and not |
| 2112 | // particularly useful. |
| 2113 | trace!( |
| 2114 | "resizing PikeVM active states table to {} entries \ |
| 2115 | (slots_per_state={})" , |
| 2116 | len, |
| 2117 | self.slots_per_state, |
| 2118 | ); |
| 2119 | self.table.resize(len, None); |
| 2120 | } |
| 2121 | |
| 2122 | /// Return the heap memory usage, in bytes, used by this slot table. |
| 2123 | /// |
| 2124 | /// This does not include the stack size of this value. |
| 2125 | fn memory_usage(&self) -> usize { |
| 2126 | self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>() |
| 2127 | } |
| 2128 | |
| 2129 | /// Perform any per-search setup for this slot table. |
| 2130 | /// |
| 2131 | /// In particular, this sets the length of the number of slots used in the |
| 2132 | /// 'Captures' given by the caller (if any at all). This number may be |
| 2133 | /// smaller than the total number of slots available, e.g., when the caller |
| 2134 | /// is only interested in tracking the overall match and not the spans of |
| 2135 | /// every matching capturing group. Only tracking the overall match can |
| 2136 | /// save a substantial amount of time copying capturing spans during a |
| 2137 | /// search. |
| 2138 | fn setup_search(&mut self, captures_slot_len: usize) { |
| 2139 | self.slots_for_captures = captures_slot_len; |
| 2140 | } |
| 2141 | |
| 2142 | /// Return a mutable slice of the slots for the given state. |
| 2143 | /// |
| 2144 | /// Note that the length of the slice returned may be less than the total |
| 2145 | /// number of slots available for this state. In particular, the length |
| 2146 | /// always matches the number of slots indicated via 'setup_search'. |
| 2147 | fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] { |
| 2148 | let i = sid.as_usize() * self.slots_per_state; |
| 2149 | &mut self.table[i..i + self.slots_for_captures] |
| 2150 | } |
| 2151 | |
| 2152 | /// Return a slice of slots of appropriate length where every slot offset |
| 2153 | /// is guaranteed to be absent. This is useful in cases where you need to |
| 2154 | /// compute an epsilon closure outside of the user supplied regex, and thus |
| 2155 | /// never want it to have any capturing slots set. |
| 2156 | fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] { |
| 2157 | let i = self.table.len() - self.slots_for_captures; |
| 2158 | &mut self.table[i..i + self.slots_for_captures] |
| 2159 | } |
| 2160 | } |
| 2161 | |
| 2162 | /// Represents a stack frame for use while computing an epsilon closure. |
| 2163 | /// |
| 2164 | /// (An "epsilon closure" refers to the set of reachable NFA states from a |
| 2165 | /// single state without consuming any input. That is, the set of all epsilon |
| 2166 | /// transitions not only from that single state, but from every other state |
| 2167 | /// reachable by an epsilon transition as well. This is why it's called a |
| 2168 | /// "closure." Computing an epsilon closure is also done during DFA |
| 2169 | /// determinization! Compare and contrast the epsilon closure here in this |
| 2170 | /// PikeVM and the one used for determinization in crate::util::determinize.) |
| 2171 | /// |
| 2172 | /// Computing the epsilon closure in a Thompson NFA proceeds via a depth |
| 2173 | /// first traversal over all epsilon transitions from a particular state. |
| 2174 | /// (A depth first traversal is important because it emulates the same priority |
| 2175 | /// of matches that is typically found in backtracking regex engines.) This |
| 2176 | /// depth first traversal is naturally expressed using recursion, but to avoid |
| 2177 | /// a call stack size proportional to the size of a regex, we put our stack on |
| 2178 | /// the heap instead. |
| 2179 | /// |
| 2180 | /// This stack thus consists of call frames. The typical call frame is |
| 2181 | /// `Explore`, which instructs epsilon closure to explore the epsilon |
| 2182 | /// transitions from that state. (Subsequent epsilon transitions are then |
| 2183 | /// pushed on to the stack as more `Explore` frames.) If the state ID being |
| 2184 | /// explored has no epsilon transitions, then the capturing group slots are |
| 2185 | /// copied from the original state that sparked the epsilon closure (from the |
| 2186 | /// 'step' routine) to the state ID being explored. This way, capturing group |
| 2187 | /// slots are forwarded from the previous state to the next. |
| 2188 | /// |
| 2189 | /// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to |
| 2190 | /// set the position for a particular slot back to some particular offset. This |
| 2191 | /// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will |
| 2192 | /// set the offset of the slot indicated in `Capture` to the current offset, |
| 2193 | /// and then push the old offset on to the stack as a `RestoreCapture` frame. |
| 2194 | /// Thus, the new offset is only used until the epsilon closure reverts back to |
| 2195 | /// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon |
| 2196 | /// transition its "scope" to only states that come "after" it during depth |
| 2197 | /// first traversal. |
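///
/// For example, as an illustrative trace (not tied to any particular regex),
/// when `Explore` encounters a `Capture` state for slot 2 at haystack
/// position 5 while slot 2 is currently unset, the closure roughly does:
///
/// ```text
/// push RestoreCapture { slot: 2, offset: None }   // remember the old value
/// curr_slots[2] = Some(5)                         // record the new offset
/// ... keep exploring states reachable after the capture ...
/// pop RestoreCapture { slot: 2, offset: None }    // undo the write once
///                                                 // that sub-tree is done
/// ```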
| 2198 | #[derive (Clone, Debug)] |
| 2199 | enum FollowEpsilon { |
| 2200 | /// Explore the epsilon transitions from a state ID. |
| 2201 | Explore(StateID), |
| 2202 | /// Reset the given `slot` to the given `offset` (which might be `None`). |
| 2203 | RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> }, |
| 2204 | } |
| 2205 | |
| 2206 | /// A set of counters that "instruments" a PikeVM search. To enable this, you |
| 2207 | /// must enable the 'internal-instrument-pikevm' feature. Then run your Rust |
| 2208 | /// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in |
| 2209 | /// the environment. The metrics collected will be dumped automatically for |
| 2210 | /// every search executed by the PikeVM. |
| 2211 | /// |
| 2212 | /// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an |
| 2213 | /// absolute decrease in wall-clock performance, even if the 'trace' log level |
| 2214 | /// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't |
| 2215 | /// enabled.) The main point of instrumentation is to get counts of various |
| 2216 | /// events that occur during the PikeVM's execution. |
| 2217 | /// |
| 2218 | /// This is a somewhat hacked together collection of metrics that are useful |
| 2219 | /// to gather from a PikeVM search. In particular, it lets us scrutinize the |
| 2220 | /// performance profile of a search beyond what general purpose profiling tools |
| 2221 | /// give us. Namely, we orient the profiling data around the specific states of |
| 2222 | /// the NFA. |
| 2223 | /// |
| 2224 | /// In other words, this lets us see which parts of the NFA graph are most |
| 2225 | /// frequently activated. This then provides direction for optimization |
| 2226 | /// opportunities. |
| 2227 | /// |
| 2228 | /// The really sad part about this is that it absolutely clutters up the PikeVM |
| 2229 | /// implementation. :'( Another approach would be to just manually add this |
| 2230 | /// code in whenever I want this kind of profiling data, but it's complicated |
| 2231 | /// and tedious enough that I went with this approach... for now. |
| 2232 | /// |
| 2233 | /// When instrumentation is enabled (which also turns on 'logging'), then a |
| 2234 | /// `Counters` is initialized for every search and `trace`'d just before the |
| 2235 | /// search returns to the caller. |
| 2236 | /// |
| 2237 | /// Tip: When debugging performance problems with the PikeVM, it's best to try |
| 2238 | /// to work with an NFA that is as small as possible. Otherwise the state graph |
| 2239 | /// is likely to be too big to digest. |
| 2240 | #[cfg (feature = "internal-instrument-pikevm" )] |
| 2241 | #[derive (Clone, Debug)] |
| 2242 | struct Counters { |
| 2243 | /// The number of times the NFA is in a particular permutation of states. |
| 2244 | state_sets: alloc::collections::BTreeMap<Vec<StateID>, u64>, |
| 2245 | /// The number of times 'step' is called for a particular state ID (which |
| 2246 | /// indexes this array). |
| 2247 | steps: Vec<u64>, |
| 2248 | /// The number of times an epsilon closure was computed for a state. |
| 2249 | closures: Vec<u64>, |
| 2250 | /// The number of times a particular state ID is pushed on to a stack while |
| 2251 | /// computing an epsilon closure. |
| 2252 | stack_pushes: Vec<u64>, |
| 2253 | /// The number of times a particular state ID is inserted into a sparse set |
| 2254 | /// while computing an epsilon closure. |
| 2255 | set_inserts: Vec<u64>, |
| 2256 | } |
| 2257 | |
| 2258 | #[cfg (feature = "internal-instrument-pikevm" )] |
| 2259 | impl Counters { |
| 2260 | fn empty() -> Counters { |
| 2261 | Counters { |
| 2262 | state_sets: alloc::collections::BTreeMap::new(), |
| 2263 | steps: vec![], |
| 2264 | closures: vec![], |
| 2265 | stack_pushes: vec![], |
| 2266 | set_inserts: vec![], |
| 2267 | } |
| 2268 | } |
| 2269 | |
| 2270 | fn reset(&mut self, nfa: &NFA) { |
| 2271 | let len = nfa.states().len(); |
| 2272 | |
| 2273 | self.state_sets.clear(); |
| 2274 | |
| 2275 | self.steps.clear(); |
| 2276 | self.steps.resize(len, 0); |
| 2277 | |
| 2278 | self.closures.clear(); |
| 2279 | self.closures.resize(len, 0); |
| 2280 | |
| 2281 | self.stack_pushes.clear(); |
| 2282 | self.stack_pushes.resize(len, 0); |
| 2283 | |
| 2284 | self.set_inserts.clear(); |
| 2285 | self.set_inserts.resize(len, 0); |
| 2286 | } |
| 2287 | |
| 2288 | fn eprint(&self, nfa: &NFA) { |
| 2289 | trace!("===== START PikeVM Instrumentation Output =====" ); |
| 2290 | // We take the top-K most occurring state sets. Otherwise the output |
| 2291 | // is likely to be overwhelming. And we probably only care about the |
| 2292 | // most frequently occurring ones anyway. |
| 2293 | const LIMIT: usize = 20; |
| 2294 | let mut set_counts = |
| 2295 | self.state_sets.iter().collect::<Vec<(&Vec<StateID>, &u64)>>(); |
| 2296 | set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count)); |
| 2297 | trace!("## PikeVM frequency of state sets (top {})" , LIMIT); |
| 2298 | for (set, count) in set_counts.iter().take(LIMIT) { |
| 2299 | trace!("{:?}: {}" , set, count); |
| 2300 | } |
| 2301 | if set_counts.len() > LIMIT { |
| 2302 | trace!( |
| 2303 | "... {} sets omitted (out of {} total)" , |
| 2304 | set_counts.len() - LIMIT, |
| 2305 | set_counts.len(), |
| 2306 | ); |
| 2307 | } |
| 2308 | |
| 2309 | trace!("" ); |
| 2310 | trace!("## PikeVM total frequency of events" ); |
| 2311 | trace!( |
| 2312 | "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}" , |
| 2313 | self.steps.iter().copied().sum::<u64>(), |
| 2314 | self.closures.iter().copied().sum::<u64>(), |
| 2315 | self.stack_pushes.iter().copied().sum::<u64>(), |
| 2316 | self.set_inserts.iter().copied().sum::<u64>(), |
| 2317 | ); |
| 2318 | |
| 2319 | trace!("" ); |
| 2320 | trace!("## PikeVM frequency of events broken down by state" ); |
| 2321 | for sid in 0..self.steps.len() { |
| 2322 | trace!( |
| 2323 | "{:06}: steps: {}, closures: {}, \ |
| 2324 | stack-pushes: {}, set-inserts: {}" , |
| 2325 | sid, |
| 2326 | self.steps[sid], |
| 2327 | self.closures[sid], |
| 2328 | self.stack_pushes[sid], |
| 2329 | self.set_inserts[sid], |
| 2330 | ); |
| 2331 | } |
| 2332 | |
| 2333 | trace!("" ); |
| 2334 | trace!("## NFA debug display" ); |
| 2335 | trace!("{:?}" , nfa); |
| 2336 | trace!("===== END PikeVM Instrumentation Output =====" ); |
| 2337 | } |
| 2338 | |
| 2339 | fn record_state_set(&mut self, set: &SparseSet) { |
| 2340 | let set = set.iter().collect::<Vec<StateID>>(); |
| 2341 | *self.state_sets.entry(set).or_insert(0) += 1; |
| 2342 | } |
| 2343 | |
| 2344 | fn record_step(&mut self, sid: StateID) { |
| 2345 | self.steps[sid] += 1; |
| 2346 | } |
| 2347 | |
| 2348 | fn record_closure(&mut self, sid: StateID) { |
| 2349 | self.closures[sid] += 1; |
| 2350 | } |
| 2351 | |
| 2352 | fn record_stack_push(&mut self, sid: StateID) { |
| 2353 | self.stack_pushes[sid] += 1; |
| 2354 | } |
| 2355 | |
| 2356 | fn record_set_insert(&mut self, sid: StateID) { |
| 2357 | self.set_inserts[sid] += 1; |
| 2358 | } |
| 2359 | } |
| 2360 | |