onepass.rs - Codebrowser

1	/!*
2	A DFA that can return spans for matching capturing groups.
3
4	This module is the home of a [one-pass DFA](DFA).
5
6	This module also contains a [`Builder`] and a [`Config`] for building and
7	configuring a one-pass DFA.
8	*/
9
10	// A note on naming and credit:
11	//
12	// As far as I know, Russ Cox came up with the practical vision and
13	// implementation of a "one-pass regex engine." He mentions and describes it
14	// briefly in the third article of his regexp article series:
15	// https://swtch.com/~rsc/regexp/regexp3.html
16	//
17	// Cox's implementation is in RE2, and the implementation below is most
18	// heavily inspired by RE2's. The key thing they have in common is that
19	// their transitions are defined over an alphabet of bytes. In contrast,
20	// Go's regex engine also has a one-pass engine, but its transitions are
21	// more firmly rooted on Unicode codepoints. The ideas are the same, but the
22	// implementations are different.
23	//
24	// RE2 tends to call this a "one-pass NFA." Here, we call it a "one-pass DFA."
25	// They're both true in their own ways:
26	//
27	// The "one-pass" criterion is generally a property of the NFA itself. In*
28	// particular, it is said that an NFA is one-pass if, after each byte of input
29	// during a search, there is at most one "VM thread" remaining to take for the
30	// next byte of input. That is, there is never any ambiguity as to the path to
31	// take through the NFA during a search.
32	//
33	// On the other hand, once a one-pass NFA has its representation converted*
34	// to something where a constant number of instructions is used for each byte
35	// of input, the implementation looks a lot more like a DFA. It's technically
36	// more powerful than a DFA since it has side effects (storing offsets inside
37	// of slots activated by a transition), but it is far closer to a DFA than an
38	// NFA simulation.
39	//
40	// Thus, in this crate, we call it a one-pass DFA.
41
42	use alloc::{vec, vec::Vec};
43
44	use crate::{
45	dfa::{remapper::Remapper, DEAD},
46	nfa::thompson::{self, NFA},
47	util::{
48	alphabet::ByteClasses,
49	captures::Captures,
50	escape::DebugByte,
51	int::{Usize, U32, U64, U8},
52	look::{Look, LookSet, UnicodeWordBoundaryError},
53	primitives::{NonMaxUsize, PatternID, StateID},
54	search::{Anchored, Input, Match, MatchError, MatchKind, Span},
55	sparse_set::SparseSet,
56	},
57	};
58
59	/// The configuration used for building a [one-pass DFA](DFA).
60	///
61	/// A one-pass DFA configuration is a simple data object that is typically used
62	/// with [`Builder::configure`]. It can be cheaply cloned.
63	///
64	/// A default configuration can be created either with `Config::new`, or
65	/// perhaps more conveniently, with [`DFA::config`].
66	#[derive(Clone, Debug, Default)]
67	pub struct Config {
68	match_kind: Option<MatchKind>,
69	starts_for_each_pattern: Option<bool>,
70	byte_classes: Option<bool>,
71	size_limit: Option<Option<usize>>,
72	}
73
74	impl Config {
75	/// Return a new default one-pass DFA configuration.
76	pub fn new() -> Config {
77	Config::default()
78	}
79
80	/// Set the desired match semantics.
81	///
82	/// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
83	/// match semantics of Perl-like regex engines. That is, when multiple
84	/// patterns would match at the same leftmost position, the pattern that
85	/// appears first in the concrete syntax is chosen.
86	///
87	/// Currently, the only other kind of match semantics supported is
88	/// [`MatchKind::All`]. This corresponds to "classical DFA" construction
89	/// where all possible matches are visited.
90	///
91	/// When it comes to the one-pass DFA, it is rarer for preference order and
92	/// "longest match" to actually disagree. Since if they did disagree, then
93	/// the regex typically isn't one-pass. For example, searching `Samwise`
94	/// for `Sam\|Samwise` will report `Sam` for leftmost-first matching and
95	/// `Samwise` for "longest match" or "all" matching. However, this regex is
96	/// not one-pass if taken literally. The equivalent regex, `Sam(?:\|wise)`
97	/// is one-pass and `Sam\|Samwise` may be optimized to it.
98	///
99	/// The other main difference is that "all" match semantics don't support
100	/// non-greedy matches. "All" match semantics always try to match as much
101	/// as possible.
102	pub fn match_kind(mut self, kind: MatchKind) -> Config {
103	self.match_kind = Some(kind);
104	self
105	}
106
107	/// Whether to compile a separate start state for each pattern in the
108	/// one-pass DFA.
109	///
110	/// When enabled, a separate anchored* start state is added for each*
111	/// pattern in the DFA. When this start state is used, then the DFA will
112	/// only search for matches for the pattern specified, even if there are
113	/// other patterns in the DFA.
114	///
115	/// The main downside of this option is that it can potentially increase
116	/// the size of the DFA and/or increase the time it takes to build the DFA.
117	///
118	/// You might want to enable this option when you want to both search for
119	/// anchored matches of any pattern or to search for anchored matches of
120	/// one particular pattern while using the same DFA. (Otherwise, you would
121	/// need to compile a new DFA for each pattern.)
122	///
123	/// By default this is disabled.
124	///
125	/// # Example
126	///
127	/// This example shows how to build a multi-regex and then search for
128	/// matches for a any of the patterns or matches for a specific pattern.
129	///
130	/// ```
131	/// use regex_automata::{
132	/// dfa::onepass::DFA, Anchored, Input, Match, PatternID,
133	/// };
134	///
135	/// let re = DFA::builder()
136	/// .configure(DFA::config().starts_for_each_pattern(`true`))
137	/// .build_many(&["[a-z]+", "[0-9]+"])?;
138	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
139	/// let haystack = "123abc";
140	/// let input = Input::new(haystack).anchored(Anchored::Yes);
141	///
142	/// // A normal multi-pattern search will show pattern 1 matches.
143	/// re.try_search(&mut cache, &input, &mut caps)?;
144	/// assert_eq!(Some(Match::must(`1`, `0`..`3`)), caps.get_match());
145	///
146	/// // If we only want to report pattern 0 matches, then we'll get no
147	/// // match here.
148	/// let input = input.anchored(Anchored::Pattern(PatternID::must(`0`)));
149	/// re.try_search(&mut cache, &input, &mut caps)?;
150	/// assert_eq!(None, caps.get_match());
151	///
152	/// # Ok::<(), Box<dyn std::error::Error>>(())
153	/// ```
154	pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
155	self.starts_for_each_pattern = Some(yes);
156	self
157	}
158
159	/// Whether to attempt to shrink the size of the DFA's alphabet or not.
160	///
161	/// This option is enabled by default and should never be disabled unless
162	/// one is debugging a one-pass DFA.
163	///
164	/// When enabled, the DFA will use a map from all possible bytes to their
165	/// corresponding equivalence class. Each equivalence class represents a
166	/// set of bytes that does not discriminate between a match and a non-match
167	/// in the DFA. For example, the pattern `[ab]+` has at least two
168	/// equivalence classes: a set containing `a` and `b` and a set containing
169	/// every byte except for `a` and `b`. `a` and `b` are in the same
170	/// equivalence class because they never discriminate between a match and a
171	/// non-match.
172	///
173	/// The advantage of this map is that the size of the transition table
174	/// can be reduced drastically from (approximately) `#states * 256 *
175	/// sizeof(StateID)` to `#states k * sizeof(StateID)` where `k` is the*
176	/// number of equivalence classes (rounded up to the nearest power of 2).
177	/// As a result, total space usage can decrease substantially. Moreover,
178	/// since a smaller alphabet is used, DFA compilation becomes faster as
179	/// well.
180	///
181	/// WARNING:* This is only useful for debugging DFAs. Disabling this*
182	/// does not yield any speed advantages. Namely, even when this is
183	/// disabled, a byte class map is still used while searching. The only
184	/// difference is that every byte will be forced into its own distinct
185	/// equivalence class. This is useful for debugging the actual generated
186	/// transitions because it lets one see the transitions defined on actual
187	/// bytes instead of the equivalence classes.
188	pub fn byte_classes(mut self, yes: bool) -> Config {
189	self.byte_classes = Some(yes);
190	self
191	}
192
193	/// Set a size limit on the total heap used by a one-pass DFA.
194	///
195	/// This size limit is expressed in bytes and is applied during
196	/// construction of a one-pass DFA. If the DFA's heap usage exceeds
197	/// this configured limit, then construction is stopped and an error is
198	/// returned.
199	///
200	/// The default is no limit.
201	///
202	/// # Example
203	///
204	/// This example shows a one-pass DFA that fails to build because of
205	/// a configured size limit. This particular example also serves as a
206	/// cautionary tale demonstrating just how big DFAs with large Unicode
207	/// character classes can get.
208	///
209	/// ```
210	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
211	/// use regex_automata::{dfa::onepass::DFA, Match};
212	///
213	/// // 6MB isn't enough!
214	/// DFA::builder()
215	/// .configure(DFA::config().size_limit(Some(`6_000_000`)))
216	/// .build(r"\w{20}")
217	/// .unwrap_err();
218	///
219	/// // ... but 7MB probably is!
220	/// // (Note that DFA sizes aren't necessarily stable between releases.)
221	/// let re = DFA::builder()
222	/// .configure(DFA::config().size_limit(Some(`7_000_000`)))
223	/// .build(r"\w{20}")?;
224	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
225	/// let haystack = "A".repeat(`20`);
226	/// re.captures(&mut cache, &haystack, &mut caps);
227	/// assert_eq!(Some(Match::must(`0`, `0`..`20`)), caps.get_match());
228	///
229	/// # Ok::<(), Box<dyn std::error::Error>>(())
230	/// ```
231	///
232	/// While one needs a little more than 3MB to represent `\w{20}`, it
233	/// turns out that you only need a little more than 4KB to represent
234	/// `(?-u:\w{20})`. So only use Unicode if you need it!
235	pub fn size_limit(mut self, limit: Option<usize>) -> Config {
236	self.size_limit = Some(limit);
237	self
238	}
239
240	/// Returns the match semantics set in this configuration.
241	pub fn get_match_kind(&self) -> MatchKind {
242	self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
243	}
244
245	/// Returns whether this configuration has enabled anchored starting states
246	/// for every pattern in the DFA.
247	pub fn get_starts_for_each_pattern(&self) -> bool {
248	self.starts_for_each_pattern.unwrap_or(`false`)
249	}
250
251	/// Returns whether this configuration has enabled byte classes or not.
252	/// This is typically a debugging oriented option, as disabling it confers
253	/// no speed benefit.
254	pub fn get_byte_classes(&self) -> bool {
255	self.byte_classes.unwrap_or(`true`)
256	}
257
258	/// Returns the DFA size limit of this configuration if one was set.
259	/// The size limit is total number of bytes on the heap that a DFA is
260	/// permitted to use. If the DFA exceeds this limit during construction,
261	/// then construction is stopped and an error is returned.
262	pub fn get_size_limit(&self) -> Option<usize> {
263	self.size_limit.unwrap_or(None)
264	}
265
266	/// Overwrite the default configuration such that the options in `o` are
267	/// always used. If an option in `o` is not set, then the corresponding
268	/// option in `self` is used. If it's not set in `self` either, then it
269	/// remains not set.
270	pub(crate) fn overwrite(&self, o: Config) -> Config {
271	Config {
272	match_kind: o.match_kind.or(self.match_kind),
273	starts_for_each_pattern: o
274	.starts_for_each_pattern
275	.or(self.starts_for_each_pattern),
276	byte_classes: o.byte_classes.or(self.byte_classes),
277	size_limit: o.size_limit.or(self.size_limit),
278	}
279	}
280	}
281
282	/// A builder for a [one-pass DFA](DFA).
283	///
284	/// This builder permits configuring options for the syntax of a pattern, the
285	/// NFA construction and the DFA construction. This builder is different from a
286	/// general purpose regex builder in that it permits fine grain configuration
287	/// of the construction process. The trade off for this is complexity, and
288	/// the possibility of setting a configuration that might not make sense. For
289	/// example, there are two different UTF-8 modes:
290	///
291	/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
292	/// whether the pattern itself can contain sub-expressions that match invalid
293	/// UTF-8.
294	/// * [`thompson::Config::utf8`] controls whether empty matches that split a
295	/// Unicode codepoint are reported or not.
296	///
297	/// Generally speaking, callers will want to either enable all of these or
298	/// disable all of these.
299	///
300	/// # Example
301	///
302	/// This example shows how to disable UTF-8 mode in the syntax and the NFA.
303	/// This is generally what you want for matching on arbitrary bytes.
304	///
305	/// ```
306	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
307	/// use regex_automata::{
308	/// dfa::onepass::DFA,
309	/// nfa::thompson,
310	/// util::syntax,
311	/// Match,
312	/// };
313	///
314	/// let re = DFA::builder()
315	/// .syntax(syntax::Config::new().utf8(`false`))
316	/// .thompson(thompson::Config::new().utf8(`false`))
317	/// .build(r"foo(?-u:[^b])ar.*")?;
318	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
319	///
320	/// let haystack = b"foo`\xFF`arzz`\xE2\x98\xFF\n`";
321	/// re.captures(&mut cache, haystack, &mut caps);
322	/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
323	/// // but the subsequent `.` does not! Disabling UTF-8*
324	/// // on the syntax permits this.
325	/// //
326	/// // N.B. This example does not show the impact of
327	/// // disabling UTF-8 mode on a one-pass DFA Config,
328	/// // since that only impacts regexes that can
329	/// // produce matches of length 0.
330	/// assert_eq!(Some(Match::must(`0`, `0`..`8`)), caps.get_match());
331	///
332	/// # Ok::<(), Box<dyn std::error::Error>>(())
333	/// ```
334	#[derive(Clone, Debug)]
335	pub struct Builder {
336	config: Config,
337	#[cfg(feature = "syntax")]
338	thompson: thompson::Compiler,
339	}
340
341	impl Builder {
342	/// Create a new one-pass DFA builder with the default configuration.
343	pub fn new() -> Builder {
344	Builder {
345	config: Config::default(),
346	#[cfg(feature = "syntax")]
347	thompson: thompson::Compiler::new(),
348	}
349	}
350
351	/// Build a one-pass DFA from the given pattern.
352	///
353	/// If there was a problem parsing or compiling the pattern, then an error
354	/// is returned.
355	#[cfg(feature = "syntax")]
356	pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
357	self.build_many(&[pattern])
358	}
359
360	/// Build a one-pass DFA from the given patterns.
361	///
362	/// When matches are returned, the pattern ID corresponds to the index of
363	/// the pattern in the slice given.
364	#[cfg(feature = "syntax")]
365	pub fn build_many<P: AsRef<str>>(
366	&self,
367	patterns: &[P],
368	) -> Result<DFA, BuildError> {
369	let nfa =
370	self.thompson.build_many(patterns).map_err(BuildError::nfa)?;
371	self.build_from_nfa(nfa)
372	}
373
374	/// Build a DFA from the given NFA.
375	///
376	/// # Example
377	///
378	/// This example shows how to build a DFA if you already have an NFA in
379	/// hand.
380	///
381	/// ```
382	/// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Match};
383	///
384	/// // This shows how to set non-default options for building an NFA.
385	/// let nfa = NFA::compiler()
386	/// .configure(NFA::config().shrink(`true`))
387	/// .build(r"[a-z0-9]+")?;
388	/// let re = DFA::builder().build_from_nfa(nfa)?;
389	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
390	/// re.captures(&mut cache, "foo123bar", &mut caps);
391	/// assert_eq!(Some(Match::must(`0`, `0`..`9`)), caps.get_match());
392	///
393	/// # Ok::<(), Box<dyn std::error::Error>>(())
394	/// ```
395	pub fn build_from_nfa(&self, nfa: NFA) -> Result<DFA, BuildError> {
396	// Why take ownership if we're just going to pass a reference to the
397	// NFA to our internal builder? Well, the first thing to note is that
398	// an NFA uses reference counting internally, so either choice is going
399	// to be cheap. So there isn't much cost either way.
400	//
401	// The real reason is that a one-pass DFA, semantically, shares
402	// ownership of an NFA. This is unlike other DFAs that don't share
403	// ownership of an NFA at all, primarily because they want to be
404	// self-contained in order to support cheap (de)serialization.
405	//
406	// But then why pass a '&nfa' below if we want to share ownership?
407	// Well, it turns out that using a '&NFA' in our internal builder
408	// separates its lifetime from the DFA we're building, and this turns
409	// out to make code a bit more composable. e.g., We can iterate over
410	// things inside the NFA while borrowing the builder as mutable because
411	// we know the NFA cannot be mutated. So TL;DR --- this weirdness is
412	// "because borrow checker."
413	InternalBuilder::new(self.config.clone(), &nfa).build()
414	}
415
416	/// Apply the given one-pass DFA configuration options to this builder.
417	pub fn configure(&mut self, config: Config) -> &mut Builder {
418	self.config = self.config.overwrite(config);
419	self
420	}
421
422	/// Set the syntax configuration for this builder using
423	/// [`syntax::Config`](crate::util::syntax::Config).
424	///
425	/// This permits setting things like case insensitivity, Unicode and multi
426	/// line mode.
427	///
428	/// These settings only apply when constructing a one-pass DFA directly
429	/// from a pattern.
430	#[cfg(feature = "syntax")]
431	pub fn syntax(
432	&mut self,
433	config: crate::util::syntax::Config,
434	) -> &mut Builder {
435	self.thompson.syntax(config);
436	self
437	}
438
439	/// Set the Thompson NFA configuration for this builder using
440	/// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
441	///
442	/// This permits setting things like whether additional time should be
443	/// spent shrinking the size of the NFA.
444	///
445	/// These settings only apply when constructing a DFA directly from a
446	/// pattern.
447	#[cfg(feature = "syntax")]
448	pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
449	self.thompson.configure(config);
450	self
451	}
452	}
453
454	/// An internal builder for encapsulating the state necessary to build a
455	/// one-pass DFA. Typical use is just `InternalBuilder::new(..).build()`.
456	///
457	/// There is no separate pass for determining whether the NFA is one-pass or
458	/// not. We just try to build the DFA. If during construction we discover that
459	/// it is not one-pass, we bail out. This is likely to lead to some undesirable
460	/// expense in some cases, so it might make sense to try an identify common
461	/// patterns in the NFA that make it definitively not one-pass. That way, we
462	/// can avoid ever trying to build a one-pass DFA in the first place. For
463	/// example, '\w\s' is not one-pass, and since '\w' is Unicode-aware by*
464	/// default, it's probably not a trivial cost to try and build a one-pass DFA
465	/// for it and then fail.
466	///
467	/// Note that some (immutable) fields are duplicated here. For example, the
468	/// 'nfa' and 'classes' fields are both in the 'DFA'. They are the same thing,
469	/// but we duplicate them because it makes composition easier below. Otherwise,
470	/// since the borrow checker can't see through method calls, the mutable borrow
471	/// we use to mutate the DFA winds up preventing borrowing from any other part
472	/// of the DFA, even though we aren't mutating those parts. We only do this
473	/// because the duplication is cheap.
474	#[derive(Debug)]
475	struct InternalBuilder<'a> {
476	/// The DFA we're building.
477	dfa: DFA,
478	/// An unordered collection of NFA state IDs that we haven't yet tried to
479	/// build into a DFA state yet.
480	///
481	/// This collection does not ultimately wind up including every NFA state
482	/// ID. Instead, each ID represents a "start" state for a sub-graph of the
483	/// NFA. The set of NFA states we then use to build a DFA state consists
484	/// of that "start" state and all states reachable from it via epsilon
485	/// transitions.
486	uncompiled_nfa_ids: Vec<StateID>,
487	/// A map from NFA state ID to DFA state ID. This is useful for easily
488	/// determining whether an NFA state has been used as a "starting" point
489	/// to build a DFA state yet. If it hasn't, then it is mapped to DEAD,
490	/// and since DEAD is specially added and never corresponds to any NFA
491	/// state, it follows that a mapping to DEAD implies the NFA state has
492	/// no corresponding DFA state yet.
493	nfa_to_dfa_id: Vec<StateID>,
494	/// A stack used to traverse the NFA states that make up a single DFA
495	/// state. Traversal occurs until the stack is empty, and we only push to
496	/// the stack when the state ID isn't in 'seen'. Actually, even more than
497	/// that, if we try to push something on to this stack that is already in
498	/// 'seen', then we bail out on construction completely, since it implies
499	/// that the NFA is not one-pass.
500	stack: Vec<(StateID, Epsilons)>,
501	/// The set of NFA states that we've visited via 'stack'.
502	seen: SparseSet,
503	/// Whether a match NFA state has been observed while constructing a
504	/// one-pass DFA state. Once a match state is seen, assuming we are using
505	/// leftmost-first match semantics, then we don't add any more transitions
506	/// to the DFA state we're building.
507	matched: bool,
508	/// The config passed to the builder.
509	///
510	/// This is duplicated in dfa.config.
511	config: Config,
512	/// The NFA we're building a one-pass DFA from.
513	///
514	/// This is duplicated in dfa.nfa.
515	nfa: &'a NFA,
516	/// The equivalence classes that make up the alphabet for this DFA>
517	///
518	/// This is duplicated in dfa.classes.
519	classes: ByteClasses,
520	}
521
522	impl<'a> InternalBuilder<'a> {
523	/// Create a new builder with an initial empty DFA.
524	fn new(config: Config, nfa: &'a NFA) -> InternalBuilder {
525	let classes = if !config.get_byte_classes() {
526	// A one-pass DFA will always use the equivalence class map, but
527	// enabling this option is useful for debugging. Namely, this will
528	// cause all transitions to be defined over their actual bytes
529	// instead of an opaque equivalence class identifier. The former is
530	// much easier to grok as a human.
531	ByteClasses::singletons()
532	} else {
533	nfa.byte_classes().clone()
534	};
535	// Normally a DFA alphabet includes the EOI symbol, but we don't need
536	// that in the one-pass DFA since we handle look-around explicitly
537	// without encoding it into the DFA. Thus, we don't need to delay
538	// matches by 1 byte. However, we reuse the space that would* be used*
539	// by the EOI transition by putting match information there (like which
540	// pattern matches and which look-around assertions need to hold). So
541	// this means our real alphabet length is 1 fewer than what the byte
542	// classes report, since we don't use EOI.
543	let alphabet_len = classes.alphabet_len().checked_sub(`1`).unwrap();
544	let stride2 = classes.stride2();
545	let dfa = DFA {
546	config: config.clone(),
547	nfa: nfa.clone(),
548	table: vec![],
549	starts: vec![],
550	// Since one-pass DFAs have a smaller state ID max than
551	// StateID::MAX, it follows that StateID::MAX is a valid initial
552	// value for min_match_id since no state ID can ever be greater
553	// than it. In the case of a one-pass DFA with no match states, the
554	// min_match_id will keep this sentinel value.
555	min_match_id: StateID::MAX,
556	classes: classes.clone(),
557	alphabet_len,
558	stride2,
559	pateps_offset: alphabet_len,
560	// OK because PatternID::MAX2 is guaranteed not to overflow.*
561	explicit_slot_start: nfa.pattern_len().checked_mul(`2`).unwrap(),
562	};
563	InternalBuilder {
564	dfa,
565	uncompiled_nfa_ids: vec![],
566	nfa_to_dfa_id: vec![DEAD; nfa.states().len()],
567	stack: vec![],
568	seen: SparseSet::new(nfa.states().len()),
569	matched: `false`,
570	config,
571	nfa,
572	classes,
573	}
574	}
575
576	/// Build the DFA from the NFA given to this builder. If the NFA is not
577	/// one-pass, then return an error. An error may also be returned if a
578	/// particular limit is exceeded. (Some limits, like the total heap memory
579	/// used, are configurable. Others, like the total patterns or slots, are
580	/// hard-coded based on representational limitations.)
581	fn build(mut self) -> Result<DFA, BuildError> {
582	self.nfa.look_set_any().available().map_err(BuildError::word)?;
583	for look in self.nfa.look_set_any().iter() {
584	// This is a future incompatibility check where if we add any
585	// more look-around assertions, then the one-pass DFA either
586	// needs to reject them (what we do here) or it needs to have its
587	// Transition representation modified to be capable of storing the
588	// new assertions.
589	if look.as_repr() > Look::WordUnicodeNegate.as_repr() {
590	return Err(BuildError::unsupported_look(look));
591	}
592	}
593	if self.nfa.pattern_len().as_u64() > PatternEpsilons::PATTERN_ID_LIMIT
594	{
595	return Err(BuildError::too_many_patterns(
596	PatternEpsilons::PATTERN_ID_LIMIT,
597	));
598	}
599	if self.nfa.group_info().explicit_slot_len() > Slots::LIMIT {
600	return Err(BuildError::not_one_pass(
601	"too many explicit capturing groups (max is 16)",
602	));
603	}
604	assert_eq!(DEAD, self.add_empty_state()?);
605
606	// This is where the explicit slots start. We care about this because
607	// we only need to track explicit slots. The implicit slots---two for
608	// each pattern---are tracked as part of the search routine itself.
609	let explicit_slot_start = self.nfa.pattern_len() * `2`;
610	self.add_start_state(None, self.nfa.start_anchored())?;
611	if self.config.get_starts_for_each_pattern() {
612	for pid in self.nfa.patterns() {
613	self.add_start_state(
614	Some(pid),
615	self.nfa.start_pattern(pid).unwrap(),
616	)?;
617	}
618	}
619	// NOTE: One wonders what the effects of treating 'uncompiled_nfa_ids'
620	// as a stack are. It is really an unordered set* of NFA state IDs.*
621	// If it, for example, in practice led to discovering whether a regex
622	// was or wasn't one-pass later than if we processed NFA state IDs in
623	// ascending order, then that would make this routine more costly in
624	// the somewhat common case of a regex that isn't one-pass.
625	while let Some(nfa_id) = self.uncompiled_nfa_ids.pop() {
626	let dfa_id = self.nfa_to_dfa_id[nfa_id];
627	// Once we see a match, we keep going, but don't add any new
628	// transitions. Normally we'd just stop, but we have to keep
629	// going in order to verify that our regex is actually one-pass.
630	self.matched = `false`;
631	// The NFA states we've already explored for this DFA state.
632	self.seen.clear();
633	// The NFA states to explore via epsilon transitions. If we ever
634	// try to push an NFA state that we've already seen, then the NFA
635	// is not one-pass because it implies there are multiple epsilon
636	// transition paths that lead to the same NFA state. In other
637	// words, there is ambiguity.
638	self.stack_push(nfa_id, Epsilons::empty())?;
639	while let Some((id, epsilons)) = self.stack.pop() {
640	match *self.nfa.state(id) {
641	thompson::State::ByteRange { ref trans } => {
642	self.compile_transition(dfa_id, trans, epsilons)?;
643	}
644	thompson::State::Sparse(ref sparse) => {
645	for trans in sparse.transitions.iter() {
646	self.compile_transition(dfa_id, trans, epsilons)?;
647	}
648	}
649	thompson::State::Dense(ref dense) => {
650	for trans in dense.iter() {
651	self.compile_transition(dfa_id, &trans, epsilons)?;
652	}
653	}
654	thompson::State::Look { look, next } => {
655	let looks = epsilons.looks().insert(look);
656	self.stack_push(next, epsilons.set_looks(looks))?;
657	}
658	thompson::State::Union { ref alternates } => {
659	for &sid in alternates.iter().rev() {
660	self.stack_push(sid, epsilons)?;
661	}
662	}
663	thompson::State::BinaryUnion { alt1, alt2 } => {
664	self.stack_push(alt2, epsilons)?;
665	self.stack_push(alt1, epsilons)?;
666	}
667	thompson::State::Capture { next, slot, .. } => {
668	let slot = slot.as_usize();
669	let epsilons = if slot < explicit_slot_start {
670	// If this is an implicit slot, we don't care
671	// about it, since we handle implicit slots in
672	// the search routine. We can get away with that
673	// because there are 2 implicit slots for every
674	// pattern.
675	epsilons
676	} else {
677	// Offset our explicit slots so that they start
678	// at index 0.
679	let offset = slot - explicit_slot_start;
680	epsilons.set_slots(epsilons.slots().insert(offset))
681	};
682	self.stack_push(next, epsilons)?;
683	}
684	thompson::State::Fail => {
685	continue;
686	}
687	thompson::State::Match { pattern_id } => {
688	// If we found two different paths to a match state
689	// for the same DFA state, then we have ambiguity.
690	// Thus, it's not one-pass.
691	if self.matched {
692	return Err(BuildError::not_one_pass(
693	"multiple epsilon transitions to match state",
694	));
695	}
696	self.matched = `true`;
697	// Shove the matching pattern ID and the 'epsilons'
698	// into the current DFA state's pattern epsilons. The
699	// 'epsilons' includes the slots we need to capture
700	// before reporting the match and also the conditional
701	// epsilon transitions we need to check before we can
702	// report a match.
703	self.dfa.set_pattern_epsilons(
704	dfa_id,
705	PatternEpsilons::empty()
706	.set_pattern_id(pattern_id)
707	.set_epsilons(epsilons),
708	);
709	// N.B. It is tempting to just bail out here when
710	// compiling a leftmost-first DFA, since we will never
711	// compile any more transitions in that case. But we
712	// actually need to keep going in order to verify that
713	// we actually have a one-pass regex. e.g., We might
714	// see more Match states (e.g., for other patterns)
715	// that imply that we don't have a one-pass regex.
716	// So instead, we mark that we've found a match and
717	// continue on. When we go to compile a new DFA state,
718	// we just skip that part. But otherwise check that the
719	// one-pass property is upheld.
720	}
721	}
722	}
723	}
724	self.shuffle_states();
725	Ok(self.dfa)
726	}
727
728	/// Shuffle all match states to the end of the transition table and set
729	/// 'min_match_id' to the ID of the first such match state.
730	///
731	/// The point of this is to make it extremely cheap to determine whether
732	/// a state is a match state or not. We need to check on this on every
733	/// transition during a search, so it being cheap is important. This
734	/// permits us to check it by simply comparing two state identifiers, as
735	/// opposed to looking for the pattern ID in the state's `PatternEpsilons`.
736	/// (Which requires a memory load and some light arithmetic.)
737	fn shuffle_states(&mut self) {
738	let mut remapper = Remapper::new(&self.dfa);
739	let mut next_dest = self.dfa.last_state_id();
740	for i in (`0`..self.dfa.state_len()).rev() {
741	let id = StateID::must(i);
742	let is_match =
743	self.dfa.pattern_epsilons(id).pattern_id().is_some();
744	if !is_match {
745	continue;
746	}
747	remapper.swap(&mut self.dfa, next_dest, id);
748	self.dfa.min_match_id = next_dest;
749	next_dest = self.dfa.prev_state_id(next_dest).expect(
750	"match states should be a proper subset of all states",
751	);
752	}
753	remapper.remap(&mut self.dfa);
754	}
755
756	/// Compile the given NFA transition into the DFA state given.
757	///
758	/// 'Epsilons' corresponds to any conditional epsilon transitions that need
759	/// to be satisfied to follow this transition, and any slots that need to
760	/// be saved if the transition is followed.
761	///
762	/// If this transition indicates that the NFA is not one-pass, then
763	/// this returns an error. (This occurs, for example, if the DFA state
764	/// already has a transition defined for the same input symbols as the
765	/// given transition, and* the result of the old and new transitions is*
766	/// different.)
767	fn compile_transition(
768	&mut self,
769	dfa_id: StateID,
770	trans: &thompson::Transition,
771	epsilons: Epsilons,
772	) -> Result<(), BuildError> {
773	let next_dfa_id = self.add_dfa_state_for_nfa_state(trans.next)?;
774	for byte in self
775	.classes
776	.representatives(trans.start..=trans.end)
777	.filter_map(\|r\| r.as_u8())
778	{
779	let oldtrans = self.dfa.transition(dfa_id, byte);
780	let newtrans =
781	Transition::new(self.matched, next_dfa_id, epsilons);
782	// If the old transition points to the DEAD state, then we know
783	// 'byte' has not been mapped to any transition for this DFA state
784	// yet. So set it unconditionally. Otherwise, we require that the
785	// old and new transitions are equivalent. Otherwise, there is
786	// ambiguity and thus the regex is not one-pass.
787	if oldtrans.state_id() == DEAD {
788	self.dfa.set_transition(dfa_id, byte, newtrans);
789	} else if oldtrans != newtrans {
790	return Err(BuildError::not_one_pass(
791	"conflicting transition",
792	));
793	}
794	}
795	Ok(())
796	}
797
798	/// Add a start state to the DFA corresponding to the given NFA starting
799	/// state ID.
800	///
801	/// If adding a state would blow any limits (configured or hard-coded),
802	/// then an error is returned.
803	///
804	/// If the starting state is an anchored state for a particular pattern,
805	/// then callers must provide the pattern ID for that starting state.
806	/// Callers must also ensure that the first starting state added is the
807	/// start state for all patterns, and then each anchored starting state for
808	/// each pattern (if necessary) added in order. Otherwise, this panics.
809	fn add_start_state(
810	&mut self,
811	pid: Option<PatternID>,
812	nfa_id: StateID,
813	) -> Result<StateID, BuildError> {
814	match pid {
815	// With no pid, this should be the start state for all patterns
816	// and thus be the first one.
817	None => assert!(self.dfa.starts.is_empty()),
818	// With a pid, we want it to be at self.dfa.starts[pid+1].
819	Some(pid) => assert!(self.dfa.starts.len() == pid.one_more()),
820	}
821	let dfa_id = self.add_dfa_state_for_nfa_state(nfa_id)?;
822	self.dfa.starts.push(dfa_id);
823	Ok(dfa_id)
824	}
825
826	/// Add a new DFA state corresponding to the given NFA state. If adding a
827	/// state would blow any limits (configured or hard-coded), then an error
828	/// is returned. If a DFA state already exists for the given NFA state,
829	/// then that DFA state's ID is returned and no new states are added.
830	///
831	/// It is not expected that this routine is called for every NFA state.
832	/// Instead, an NFA state ID will usually correspond to the "start" state
833	/// for a sub-graph of the NFA, where all states in the sub-graph are
834	/// reachable via epsilon transitions (conditional or unconditional). That
835	/// sub-graph of NFA states is ultimately what produces a single DFA state.
836	fn add_dfa_state_for_nfa_state(
837	&mut self,
838	nfa_id: StateID,
839	) -> Result<StateID, BuildError> {
840	// If we've already built a DFA state for the given NFA state, then
841	// just return that. We definitely do not want to have more than one
842	// DFA state in existence for the same NFA state, since all but one of
843	// them will likely become unreachable. And at least some of them are
844	// likely to wind up being incomplete.
845	let existing_dfa_id = self.nfa_to_dfa_id[nfa_id];
846	if existing_dfa_id != DEAD {
847	return Ok(existing_dfa_id);
848	}
849	// If we don't have any DFA state yet, add it and then add the given
850	// NFA state to the list of states to explore.
851	let dfa_id = self.add_empty_state()?;
852	self.nfa_to_dfa_id[nfa_id] = dfa_id;
853	self.uncompiled_nfa_ids.push(nfa_id);
854	Ok(dfa_id)
855	}
856
857	/// Unconditionally add a new empty DFA state. If adding it would exceed
858	/// any limits (configured or hard-coded), then an error is returned. The
859	/// ID of the new state is returned on success.
860	///
861	/// The added state is not* a match state.*
862	fn add_empty_state(&mut self) -> Result<StateID, BuildError> {
863	let state_limit = Transition::STATE_ID_LIMIT;
864	// Note that unlike dense and lazy DFAs, we specifically do NOT
865	// premultiply our state IDs here. The reason is that we want to pack
866	// our state IDs into 64-bit transitions with other info, so the fewer
867	// the bits we use for state IDs the better. If we premultiply, then
868	// our state ID space shrinks. We justify this by the assumption that
869	// a one-pass DFA is just already doing a fair bit more work than a
870	// normal DFA anyway, so an extra multiplication to compute a state
871	// transition doesn't seem like a huge deal.
872	let next_id = self.dfa.table.len() >> self.dfa.stride2();
873	let id = StateID::new(next_id)
874	.map_err(\|_\| BuildError::too_many_states(state_limit))?;
875	if id.as_u64() > Transition::STATE_ID_LIMIT {
876	return Err(BuildError::too_many_states(state_limit));
877	}
878	self.dfa
879	.table
880	.extend(core::iter::repeat(Transition(`0`)).take(self.dfa.stride()));
881	// The default empty value for 'PatternEpsilons' is sadly not all
882	// zeroes. Instead, a special sentinel is used to indicate that there
883	// is no pattern. So we need to explicitly set the pattern epsilons to
884	// the correct "empty" PatternEpsilons.
885	self.dfa.set_pattern_epsilons(id, PatternEpsilons::empty());
886	if let Some(size_limit) = self.config.get_size_limit() {
887	if self.dfa.memory_usage() > size_limit {
888	return Err(BuildError::exceeded_size_limit(size_limit));
889	}
890	}
891	Ok(id)
892	}
893
894	/// Push the given NFA state ID and its corresponding epsilons (slots and
895	/// conditional epsilon transitions) on to a stack for use in a depth first
896	/// traversal of a sub-graph of the NFA.
897	///
898	/// If the given NFA state ID has already been pushed on to the stack, then
899	/// it indicates the regex is not one-pass and this correspondingly returns
900	/// an error.
901	fn stack_push(
902	&mut self,
903	nfa_id: StateID,
904	epsilons: Epsilons,
905	) -> Result<(), BuildError> {
906	// If we already have seen a match and we are compiling a leftmost
907	// first DFA, then we shouldn't add any more states to look at. This is
908	// effectively how preference order and non-greediness is implemented.
909	// if !self.config.get_match_kind().continue_past_first_match()
910	// && self.matched
911	// {
912	// return Ok(());
913	// }
914	if !self.seen.insert(nfa_id) {
915	return Err(BuildError::not_one_pass(
916	"multiple epsilon transitions to same state",
917	));
918	}
919	self.stack.push((nfa_id, epsilons));
920	Ok(())
921	}
922	}
923
924	/// A one-pass DFA for executing a subset of anchored regex searches while
925	/// resolving capturing groups.
926	///
927	/// A one-pass DFA can be built from an NFA that is one-pass. An NFA is
928	/// one-pass when there is never any ambiguity about how to continue a search.
929	/// For example, `aa` is not one-pass becuase during a search, it's not*
930	/// possible to know whether to continue matching the `a` or to move on to*
931	/// the single `a`. However, `ab` is one-pass, because for every byte in the*
932	/// input, it's always clear when to move on from `a` to `b`.*
933	///
934	/// # Only anchored searches are supported
935	///
936	/// In this crate, especially for DFAs, unanchored searches are implemented by
937	/// treating the pattern as if it had a `(?s-u:.)?` prefix. While the prefix*
938	/// is one-pass on its own, adding anything after it, e.g., `(?s-u:.)?a` will*
939	/// make the overall pattern not one-pass. Why? Because the `(?s-u:.)` matches
940	/// any byte, and there is therefore ambiguity as to when the prefix should
941	/// stop matching and something else should start matching.
942	///
943	/// Therefore, one-pass DFAs do not support unanchored searches. In addition
944	/// to many regexes simply not being one-pass, it implies that one-pass DFAs
945	/// have limited utility. With that said, when a one-pass DFA can be used, it
946	/// can potentially provide a dramatic speed up over alternatives like the
947	/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)
948	/// and the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). In particular,
949	/// a one-pass DFA is the only DFA capable of reporting the spans of matching
950	/// capturing groups.
951	///
952	/// To clarify, when we say that unanchored searches are not supported, what
953	/// that actually means is:
954	///
955	/// The high level routines, [`DFA::is_match`] and [`DFA::captures`], always*
956	/// do anchored searches.
957	/// Since iterators are most useful in the context of unanchored searches,*
958	/// there is no `DFA::captures_iter` method.
959	/// For lower level routines like [`DFA::try_search`], an error will be*
960	/// returned if the given [`Input`] is configured to do an unanchored search or
961	/// search for an invalid pattern ID. (Note that an [`Input`] is configured to
962	/// do an unanchored search by default, so just giving a `Input::new` is
963	/// guaranteed to return an error.)
964	///
965	/// # Other limitations
966	///
967	/// In addition to the [configurable heap limit](Config::size_limit) and
968	/// the requirement that a regex pattern be one-pass, there are some other
969	/// limitations:
970	///
971	/// There is an internal limit on the total number of explicit capturing*
972	/// groups that appear across all patterns. It is somewhat small and there is
973	/// no way to configure it. If your pattern(s) exceed this limit, then building
974	/// a one-pass DFA will fail.
975	/// If the number of patterns exceeds an internal unconfigurable limit, then*
976	/// building a one-pass DFA will fail. This limit is quite large and you're
977	/// unlikely to hit it.
978	/// If the total number of states exceeds an internal unconfigurable limit,*
979	/// then building a one-pass DFA will fail. This limit is quite large and
980	/// you're unlikely to hit it.
981	///
982	/// # Other examples of regexes that aren't one-pass
983	///
984	/// One particularly unfortunate example is that enabling Unicode can cause
985	/// regexes that were one-pass to no longer be one-pass. Consider the regex
986	/// `(?-u)\w\s` for example. It is one-pass because there is exactly no*
987	/// overlap between the ASCII definitions of `\w` and `\s`. But `\w\s`*
988	/// (i.e., with Unicode enabled) is not* one-pass because `\w` and `\s` get*
989	/// translated to UTF-8 automatons. And while the codepoints* in `\w` and `\s`*
990	/// do not overlap, the underlying UTF-8 encodings do. Indeed, because of the
991	/// overlap between UTF-8 automata, the use of Unicode character classes will
992	/// tend to vastly increase the likelihood of a regex not being one-pass.
993	///
994	/// # How does one know if a regex is one-pass or not?
995	///
996	/// At the time of writing, the only way to know is to try and build a one-pass
997	/// DFA. The one-pass property is checked while constructing the DFA.
998	///
999	/// This does mean that you might potentially waste some CPU cycles and memory
1000	/// by optimistically trying to build a one-pass DFA. But this is currently the
1001	/// only way. In the future, building a one-pass DFA might be able to use some
1002	/// heuristics to detect common violations of the one-pass property and bail
1003	/// more quickly.
1004	///
1005	/// # Resource usage
1006	///
1007	/// Unlike a general DFA, a one-pass DFA has stricter bounds on its resource
1008	/// usage. Namely, construction of a one-pass DFA has a time and space
1009	/// complexity of `O(n)`, where `n ~ nfa.states().len()`. (A general DFA's time
1010	/// and space complexity is `O(2^n)`.) This smaller time bound is achieved
1011	/// because there is at most one DFA state created for each NFA state. If
1012	/// additional DFA states would be required, then the pattern is not one-pass
1013	/// and construction will fail.
1014	///
1015	/// Note though that currently, this DFA uses a fully dense representation.
1016	/// This means that while its space complexity is no worse than an NFA, it may
1017	/// in practice use more memory because of higher constant factors. The reason
1018	/// for this trade off is two-fold. Firstly, a dense representation makes the
1019	/// search faster. Secondly, the bigger an NFA, the more unlikely it is to be
1020	/// one-pass. Therefore, most one-pass DFAs are usually pretty small.
1021	///
1022	/// # Example
1023	///
1024	/// This example shows that the one-pass DFA implements Unicode word boundaries
1025	/// correctly while simultaneously reporting spans for capturing groups that
1026	/// participate in a match. (This is the only DFA that implements full support
1027	/// for Unicode word boundaries.)
1028	///
1029	/// ```
1030	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1031	/// use regex_automata::{dfa::onepass::DFA, Match, Span};
1032	///
1033	/// let re = DFA::new(r"\b(?P<first>\w+)[[:space:]]+(?P<last>\w+)\b")?;
1034	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1035	///
1036	/// re.captures(&mut cache, "Шерлок Холмс", &mut caps);
1037	/// assert_eq!(Some(Match::must(`0`, `0`..`23`)), caps.get_match());
1038	/// assert_eq!(Some(Span::from(`0`..`12`)), caps.get_group_by_name("first"));
1039	/// assert_eq!(Some(Span::from(`13`..`23`)), caps.get_group_by_name("last"));
1040	/// # Ok::<(), Box<dyn std::error::Error>>(())
1041	/// ```
1042	///
1043	/// # Example: iteration
1044	///
1045	/// Unlike other regex engines in this crate, this one does not provide
1046	/// iterator search functions. This is because a one-pass DFA only supports
1047	/// anchored searches, and so iterator functions are generally not applicable.
1048	///
1049	/// However, if you know that all of your matches are
1050	/// directly adjacent, then an iterator can be used. The
1051	/// [`util::iter::Searcher`](crate::util::iter::Searcher) type can be used for
1052	/// this purpose:
1053	///
1054	/// ```
1055	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1056	/// use regex_automata::{
1057	/// dfa::onepass::DFA,
1058	/// util::iter::Searcher,
1059	/// Anchored, Input, Span,
1060	/// };
1061	///
1062	/// let re = DFA::new(r"\w(\d)\w")?;
1063	/// let (mut cache, caps) = (re.create_cache(), re.create_captures());
1064	/// let input = Input::new("a1zb2yc3x").anchored(Anchored::Yes);
1065	///
1066	/// let mut it = Searcher::new(input).into_captures_iter(caps, \|input, caps\| {
1067	/// Ok(re.try_search(&mut cache, input, caps)?)
1068	/// }).infallible();
1069	/// let caps0 = it.next().unwrap();
1070	/// assert_eq!(Some(Span::from(`1`..`2`)), caps0.get_group(`1`));
1071	///
1072	/// # Ok::<(), Box<dyn std::error::Error>>(())
1073	/// ```
1074	#[derive(Clone)]
1075	pub struct DFA {
1076	/// The configuration provided by the caller.
1077	config: Config,
1078	/// The NFA used to build this DFA.
1079	///
1080	/// NOTE: We probably don't need to store the NFA here, but we use enough
1081	/// bits from it that it's convenient to do so. And there really isn't much
1082	/// cost to doing so either, since an NFA is reference counted internally.
1083	nfa: NFA,
1084	/// The transition table. Given a state ID 's' and a byte of haystack 'b',
1085	/// the next state is `table[sid + classes[byte]]`.
1086	///
1087	/// The stride of this table (i.e., the number of columns) is always
1088	/// a power of 2, even if the alphabet length is smaller. This makes
1089	/// converting between state IDs and state indices very cheap.
1090	///
1091	/// Note that the stride always includes room for one extra "transition"
1092	/// that isn't actually a transition. It is a 'PatternEpsilons' that is
1093	/// used for match states only. Because of this, the maximum number of
1094	/// active columns in the transition table is 257, which means the maximum
1095	/// stride is 512 (the next power of 2 greater than or equal to 257).
1096	table: Vec<Transition>,
1097	/// The DFA state IDs of the starting states.
1098	///
1099	/// `starts[0]` is always present and corresponds to the starting state
1100	/// when searching for matches of any pattern in the DFA.
1101	///
1102	/// `starts[i]` where i>0 corresponds to the starting state for the pattern
1103	/// ID 'i-1'. These starting states are optional.
1104	starts: Vec<StateID>,
1105	/// Every state ID >= this value corresponds to a match state.
1106	///
1107	/// This is what a search uses to detect whether a state is a match state
1108	/// or not. It requires only a simple comparison instead of bit-unpacking
1109	/// the PatternEpsilons from every state.
1110	min_match_id: StateID,
1111	/// The alphabet of this DFA, split into equivalence classes. Bytes in the
1112	/// same equivalence class can never discriminate between a match and a
1113	/// non-match.
1114	classes: ByteClasses,
1115	/// The number of elements in each state in the transition table. This may
1116	/// be less than the stride, since the stride is always a power of 2 and
1117	/// the alphabet length can be anything up to and including 256.
1118	alphabet_len: usize,
1119	/// The number of columns in the transition table, expressed as a power of
1120	/// 2.
1121	stride2: usize,
1122	/// The offset at which the PatternEpsilons for a match state is stored in
1123	/// the transition table.
1124	///
1125	/// PERF: One wonders whether it would be better to put this in a separate
1126	/// allocation, since only match states have a non-empty PatternEpsilons
1127	/// and the number of match states tends be dwarfed by the number of
1128	/// non-match states. So this would save '8len(non_match_states)' for each*
1129	/// DFA. The question is whether moving this to a different allocation will
1130	/// lead to a perf hit during searches. You might think dealing with match
1131	/// states is rare, but some regexes spend a lot of time in match states
1132	/// gobbling up input. But... match state handling is already somewhat
1133	/// expensive, so maybe this wouldn't do much? Either way, it's worth
1134	/// experimenting.
1135	pateps_offset: usize,
1136	/// The first explicit slot index. This refers to the first slot appearing
1137	/// immediately after the last implicit slot. It is always 'patterns.len()
1138	/// 2'.*
1139	///
1140	/// We record this because we only store the explicit slots in our DFA
1141	/// transition table that need to be saved. Implicit slots are handled
1142	/// automatically as part of the search.
1143	explicit_slot_start: usize,
1144	}
1145
1146	impl DFA {
1147	/// Parse the given regular expression using the default configuration and
1148	/// return the corresponding one-pass DFA.
1149	///
1150	/// If you want a non-default configuration, then use the [`Builder`] to
1151	/// set your own configuration.
1152	///
1153	/// # Example
1154	///
1155	/// ```
1156	/// use regex_automata::{dfa::onepass::DFA, Match};
1157	///
1158	/// let re = DFA::new("foo[0-9]+bar")?;
1159	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1160	///
1161	/// re.captures(&mut cache, "foo12345barzzz", &mut caps);
1162	/// assert_eq!(Some(Match::must(`0`, `0`..`11`)), caps.get_match());
1163	/// # Ok::<(), Box<dyn std::error::Error>>(())
1164	/// ```
1165	#[cfg(feature = "syntax")]
1166	#[inline]
1167	pub fn new(pattern: &str) -> Result<DFA, BuildError> {
1168	DFA::builder().build(pattern)
1169	}
1170
1171	/// Like `new`, but parses multiple patterns into a single "multi regex."
1172	/// This similarly uses the default regex configuration.
1173	///
1174	/// # Example
1175	///
1176	/// ```
1177	/// use regex_automata::{dfa::onepass::DFA, Match};
1178	///
1179	/// let re = DFA::new_many(&["[a-z]+", "[0-9]+"])?;
1180	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1181	///
1182	/// re.captures(&mut cache, "abc123", &mut caps);
1183	/// assert_eq!(Some(Match::must(`0`, `0`..`3`)), caps.get_match());
1184	///
1185	/// re.captures(&mut cache, "123abc", &mut caps);
1186	/// assert_eq!(Some(Match::must(`1`, `0`..`3`)), caps.get_match());
1187	///
1188	/// # Ok::<(), Box<dyn std::error::Error>>(())
1189	/// ```
1190	#[cfg(feature = "syntax")]
1191	#[inline]
1192	pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
1193	DFA::builder().build_many(patterns)
1194	}
1195
1196	/// Like `new`, but builds a one-pass DFA directly from an NFA. This is
1197	/// useful if you already have an NFA, or even if you hand-assembled the
1198	/// NFA.
1199	///
1200	/// # Example
1201	///
1202	/// This shows how to hand assemble a regular expression via its HIR,
1203	/// compile an NFA from it and build a one-pass DFA from the NFA.
1204	///
1205	/// ```
1206	/// use regex_automata::{
1207	/// dfa::onepass::DFA,
1208	/// nfa::thompson::NFA,
1209	/// Match,
1210	/// };
1211	/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
1212	///
1213	/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
1214	/// ClassBytesRange::new(b'0', b'9'),
1215	/// ClassBytesRange::new(b'A', b'Z'),
1216	/// ClassBytesRange::new(b'_', b'_'),
1217	/// ClassBytesRange::new(b'a', b'z'),
1218	/// ])));
1219	///
1220	/// let config = NFA::config().nfa_size_limit(Some(`1_000`));
1221	/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
1222	///
1223	/// let re = DFA::new_from_nfa(nfa)?;
1224	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1225	/// let expected = Some(Match::must(`0`, `0`..`1`));
1226	/// re.captures(&mut cache, "A", &mut caps);
1227	/// assert_eq!(expected, caps.get_match());
1228	///
1229	/// # Ok::<(), Box<dyn std::error::Error>>(())
1230	/// ```
1231	pub fn new_from_nfa(nfa: NFA) -> Result<DFA, BuildError> {
1232	DFA::builder().build_from_nfa(nfa)
1233	}
1234
1235	/// Create a new one-pass DFA that matches every input.
1236	///
1237	/// # Example
1238	///
1239	/// ```
1240	/// use regex_automata::{dfa::onepass::DFA, Match};
1241	///
1242	/// let dfa = DFA::always_match()?;
1243	/// let mut cache = dfa.create_cache();
1244	/// let mut caps = dfa.create_captures();
1245	///
1246	/// let expected = Match::must(`0`, `0`..`0`);
1247	/// dfa.captures(&mut cache, "", &mut caps);
1248	/// assert_eq!(Some(expected), caps.get_match());
1249	/// dfa.captures(&mut cache, "foo", &mut caps);
1250	/// assert_eq!(Some(expected), caps.get_match());
1251	/// # Ok::<(), Box<dyn std::error::Error>>(())
1252	/// ```
1253	pub fn always_match() -> Result<DFA, BuildError> {
1254	let nfa = thompson::NFA::always_match();
1255	Builder::new().build_from_nfa(nfa)
1256	}
1257
1258	/// Create a new one-pass DFA that never matches any input.
1259	///
1260	/// # Example
1261	///
1262	/// ```
1263	/// use regex_automata::dfa::onepass::DFA;
1264	///
1265	/// let dfa = DFA::never_match()?;
1266	/// let mut cache = dfa.create_cache();
1267	/// let mut caps = dfa.create_captures();
1268	///
1269	/// dfa.captures(&mut cache, "", &mut caps);
1270	/// assert_eq!(None, caps.get_match());
1271	/// dfa.captures(&mut cache, "foo", &mut caps);
1272	/// assert_eq!(None, caps.get_match());
1273	/// # Ok::<(), Box<dyn std::error::Error>>(())
1274	/// ```
1275	pub fn never_match() -> Result<DFA, BuildError> {
1276	let nfa = thompson::NFA::never_match();
1277	Builder::new().build_from_nfa(nfa)
1278	}
1279
1280	/// Return a default configuration for a DFA.
1281	///
1282	/// This is a convenience routine to avoid needing to import the `Config`
1283	/// type when customizing the construction of a DFA.
1284	///
1285	/// # Example
1286	///
1287	/// This example shows how to change the match semantics of this DFA from
1288	/// its default "leftmost first" to "all." When using "all," non-greediness
1289	/// doesn't apply and neither does preference order matching. Instead, the
1290	/// longest match possible is always returned. (Although, by construction,
1291	/// it's impossible for a one-pass DFA to have a different answer for
1292	/// "preference order" vs "longest match.")
1293	///
1294	/// ```
1295	/// use regex_automata::{dfa::onepass::DFA, Match, MatchKind};
1296	///
1297	/// let re = DFA::builder()
1298	/// .configure(DFA::config().match_kind(MatchKind::All))
1299	/// .build(r"(abc)+?")?;
1300	/// let mut cache = re.create_cache();
1301	/// let mut caps = re.create_captures();
1302	///
1303	/// re.captures(&mut cache, "abcabc", &mut caps);
1304	/// // Normally, the non-greedy repetition would give us a 0..3 match.
1305	/// assert_eq!(Some(Match::must(`0`, `0`..`6`)), caps.get_match());
1306	/// # Ok::<(), Box<dyn std::error::Error>>(())
1307	/// ```
1308	#[inline]
1309	pub fn config() -> Config {
1310	Config::new()
1311	}
1312
1313	/// Return a builder for configuring the construction of a DFA.
1314	///
1315	/// This is a convenience routine to avoid needing to import the
1316	/// [`Builder`] type in common cases.
1317	///
1318	/// # Example
1319	///
1320	/// This example shows how to use the builder to disable UTF-8 mode.
1321	///
1322	/// ```
1323	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1324	/// use regex_automata::{
1325	/// dfa::onepass::DFA,
1326	/// nfa::thompson,
1327	/// util::syntax,
1328	/// Match,
1329	/// };
1330	///
1331	/// let re = DFA::builder()
1332	/// .syntax(syntax::Config::new().utf8(`false`))
1333	/// .thompson(thompson::Config::new().utf8(`false`))
1334	/// .build(r"foo(?-u:[^b])ar.*")?;
1335	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1336	///
1337	/// let haystack = b"foo`\xFF`arzz`\xE2\x98\xFF\n`";
1338	/// let expected = Some(Match::must(`0`, `0`..`8`));
1339	/// re.captures(&mut cache, haystack, &mut caps);
1340	/// assert_eq!(expected, caps.get_match());
1341	///
1342	/// # Ok::<(), Box<dyn std::error::Error>>(())
1343	/// ```
1344	#[inline]
1345	pub fn builder() -> Builder {
1346	Builder::new()
1347	}
1348
1349	/// Create a new empty set of capturing groups that is guaranteed to be
1350	/// valid for the search APIs on this DFA.
1351	///
1352	/// A `Captures` value created for a specific DFA cannot be used with any
1353	/// other DFA.
1354	///
1355	/// This is a convenience function for [`Captures::all`]. See the
1356	/// [`Captures`] documentation for an explanation of its alternative
1357	/// constructors that permit the DFA to do less work during a search, and
1358	/// thus might make it faster.
1359	#[inline]
1360	pub fn create_captures(&self) -> Captures {
1361	Captures::all(self.nfa.group_info().clone())
1362	}
1363
1364	/// Create a new cache for this DFA.
1365	///
1366	/// The cache returned should only be used for searches for this
1367	/// DFA. If you want to reuse the cache for another DFA, then you
1368	/// must call [`Cache::reset`] with that DFA (or, equivalently,
1369	/// [`DFA::reset_cache`]).
1370	#[inline]
1371	pub fn create_cache(&self) -> Cache {
1372	Cache::new(self)
1373	}
1374
1375	/// Reset the given cache such that it can be used for searching with the
1376	/// this DFA (and only this DFA).
1377	///
1378	/// A cache reset permits reusing memory already allocated in this cache
1379	/// with a different DFA.
1380	///
1381	/// # Example
1382	///
1383	/// This shows how to re-purpose a cache for use with a different DFA.
1384	///
1385	/// ```
1386	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1387	/// use regex_automata::{dfa::onepass::DFA, Match};
1388	///
1389	/// let re1 = DFA::new(r"\w")?;
1390	/// let re2 = DFA::new(r"\W")?;
1391	/// let mut caps1 = re1.create_captures();
1392	/// let mut caps2 = re2.create_captures();
1393	///
1394	/// let mut cache = re1.create_cache();
1395	/// assert_eq!(
1396	/// Some(Match::must(`0`, `0`..`2`)),
1397	/// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() },
1398	/// );
1399	///
1400	/// // Using 'cache' with re2 is not allowed. It may result in panics or
1401	/// // incorrect results. In order to re-purpose the cache, we must reset
1402	/// // it with the one-pass DFA we'd like to use it with.
1403	/// //
1404	/// // Similarly, after this reset, using the cache with 're1' is also not
1405	/// // allowed.
1406	/// re2.reset_cache(&mut cache);
1407	/// assert_eq!(
1408	/// Some(Match::must(`0`, `0`..`3`)),
1409	/// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() },
1410	/// );
1411	///
1412	/// # Ok::<(), Box<dyn std::error::Error>>(())
1413	/// ```
1414	#[inline]
1415	pub fn reset_cache(&self, cache: &mut Cache) {
1416	cache.reset(self);
1417	}
1418
1419	/// Return the config for this one-pass DFA.
1420	#[inline]
1421	pub fn get_config(&self) -> &Config {
1422	&self.config
1423	}
1424
1425	/// Returns a reference to the underlying NFA.
1426	#[inline]
1427	pub fn get_nfa(&self) -> &NFA {
1428	&self.nfa
1429	}
1430
1431	/// Returns the total number of patterns compiled into this DFA.
1432	///
1433	/// In the case of a DFA that contains no patterns, this returns `0`.
1434	#[inline]
1435	pub fn pattern_len(&self) -> usize {
1436	self.get_nfa().pattern_len()
1437	}
1438
1439	/// Returns the total number of states in this one-pass DFA.
1440	///
1441	/// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
1442	/// a low level DFA API. Therefore, this routine has little use other than
1443	/// being informational.
1444	#[inline]
1445	pub fn state_len(&self) -> usize {
1446	self.table.len() >> self.stride2()
1447	}
1448
1449	/// Returns the total number of elements in the alphabet for this DFA.
1450	///
1451	/// That is, this returns the total number of transitions that each
1452	/// state in this DFA must have. The maximum alphabet size is 256, which
1453	/// corresponds to each possible byte value.
1454	///
1455	/// The alphabet size may be less than 256 though, and unless
1456	/// [`Config::byte_classes`] is disabled, it is typically must less than
1457	/// 256. Namely, bytes are grouped into equivalence classes such that no
1458	/// two bytes in the same class can distinguish a match from a non-match.
1459	/// For example, in the regex `^[a-z]+$`, the ASCII bytes `a-z` could
1460	/// all be in the same equivalence class. This leads to a massive space
1461	/// savings.
1462	///
1463	/// Note though that the alphabet length does _not_ necessarily equal the
1464	/// total stride space taken up by a single DFA state in the transition
1465	/// table. Namely, for performance reasons, the stride is always the
1466	/// smallest power of two that is greater than or equal to the alphabet
1467	/// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are
1468	/// often more useful. The alphabet length is typically useful only for
1469	/// informational purposes.
1470	///
1471	/// Note also that unlike dense or sparse DFAs, a one-pass DFA does
1472	/// not have a special end-of-input (EOI) transition. This is because
1473	/// a one-pass DFA handles look-around assertions explicitly (like the
1474	/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)) and does not build
1475	/// them into the transitions of the DFA.
1476	#[inline]
1477	pub fn alphabet_len(&self) -> usize {
1478	self.alphabet_len
1479	}
1480
1481	/// Returns the total stride for every state in this DFA, expressed as the
1482	/// exponent of a power of 2. The stride is the amount of space each state
1483	/// takes up in the transition table, expressed as a number of transitions.
1484	/// (Unused transitions map to dead states.)
1485	///
1486	/// The stride of a DFA is always equivalent to the smallest power of
1487	/// 2 that is greater than or equal to the DFA's alphabet length. This
1488	/// definition uses extra space, but possibly permits faster translation
1489	/// between state identifiers and their corresponding offsets in this DFA's
1490	/// transition table.
1491	///
1492	/// For example, if the DFA's stride is 16 transitions, then its `stride2`
1493	/// is `4` since `2^4 = 16`.
1494	///
1495	/// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
1496	/// while the maximum `stride2` value is `9` (corresponding to a stride
1497	/// of `512`). The maximum in theory should be `8`, but because of some
1498	/// implementation quirks that may be relaxed in the future, it is one more
1499	/// than `8`. (Do note that a maximal stride is incredibly rare, as it
1500	/// would imply that there is almost no redundant in the regex pattern.)
1501	///
1502	/// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
1503	/// a low level DFA API. Therefore, this routine has little use other than
1504	/// being informational.
1505	#[inline]
1506	pub fn stride2(&self) -> usize {
1507	self.stride2
1508	}
1509
1510	/// Returns the total stride for every state in this DFA. This corresponds
1511	/// to the total number of transitions used by each state in this DFA's
1512	/// transition table.
1513	///
1514	/// Please see [`DFA::stride2`] for more information. In particular, this
1515	/// returns the stride as the number of transitions, where as `stride2`
1516	/// returns it as the exponent of a power of 2.
1517	///
1518	/// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
1519	/// a low level DFA API. Therefore, this routine has little use other than
1520	/// being informational.
1521	#[inline]
1522	pub fn stride(&self) -> usize {
1523	`1` << self.stride2()
1524	}
1525
1526	/// Returns the memory usage, in bytes, of this DFA.
1527	///
1528	/// The memory usage is computed based on the number of bytes used to
1529	/// represent this DFA.
1530	///
1531	/// This does not* include the stack size used up by this DFA. To*
1532	/// compute that, use `std::mem::size_of::<onepass::DFA>()`.
1533	#[inline]
1534	pub fn memory_usage(&self) -> usize {
1535	use core::mem::size_of;
1536
1537	self.table.len() * size_of::<Transition>()
1538	+ self.starts.len() * size_of::<StateID>()
1539	}
1540	}
1541
1542	impl DFA {
1543	/// Executes an anchored leftmost forward search, and returns true if and
1544	/// only if this one-pass DFA matches the given haystack.
1545	///
1546	/// This routine may short circuit if it knows that scanning future
1547	/// input will never lead to a different result. In particular, if the
1548	/// underlying DFA enters a match state, then this routine will return
1549	/// `true` immediately without inspecting any future input. (Consider how
1550	/// this might make a difference given the regex `a+` on the haystack
1551	/// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`,
1552	/// but routines like `find` need to continue searching because `+` is
1553	/// greedy by default.)
1554	///
1555	/// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
1556	/// given configuration was [`Anchored::No`] (which is the default).
1557	///
1558	/// # Panics
1559	///
1560	/// This routine panics if the search could not complete. This can occur
1561	/// in the following circumstances:
1562	///
1563	/// When the provided `Input` configuration is not supported. For*
1564	/// example, by providing an unsupported anchor mode. Concretely,
1565	/// this occurs when using [`Anchored::Pattern`] without enabling
1566	/// [`Config::starts_for_each_pattern`].
1567	///
1568	/// When a search panics, callers cannot know whether a match exists or
1569	/// not.
1570	///
1571	/// Use [`DFA::try_search`] if you want to handle these panics as error
1572	/// values instead.
1573	///
1574	/// # Example
1575	///
1576	/// This shows basic usage:
1577	///
1578	/// ```
1579	/// use regex_automata::dfa::onepass::DFA;
1580	///
1581	/// let re = DFA::new("foo[0-9]+bar")?;
1582	/// let mut cache = re.create_cache();
1583	///
1584	/// assert!(re.is_match(&mut cache, "foo12345bar"));
1585	/// assert!(!re.is_match(&mut cache, "foobar"));
1586	/// # Ok::<(), Box<dyn std::error::Error>>(())
1587	/// ```
1588	///
1589	/// # Example: consistency with search APIs
1590	///
1591	/// `is_match` is guaranteed to return `true` whenever `captures` returns
1592	/// a match. This includes searches that are executed entirely within a
1593	/// codepoint:
1594	///
1595	/// ```
1596	/// use regex_automata::{dfa::onepass::DFA, Input};
1597	///
1598	/// let re = DFA::new("a*")?;
1599	/// let mut cache = re.create_cache();
1600	///
1601	/// assert!(!re.is_match(&mut cache, Input::new("☃").span(`1`..`2`)));
1602	/// # Ok::<(), Box<dyn std::error::Error>>(())
1603	/// ```
1604	///
1605	/// Notice that when UTF-8 mode is disabled, then the above reports a
1606	/// match because the restriction against zero-width matches that split a
1607	/// codepoint has been lifted:
1608	///
1609	/// ```
1610	/// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Input};
1611	///
1612	/// let re = DFA::builder()
1613	/// .thompson(NFA::config().utf8(`false`))
1614	/// .build("a*")?;
1615	/// let mut cache = re.create_cache();
1616	///
1617	/// assert!(re.is_match(&mut cache, Input::new("☃").span(`1`..`2`)));
1618	/// # Ok::<(), Box<dyn std::error::Error>>(())
1619	/// ```
1620	#[inline]
1621	pub fn is_match<'h, I: Into<Input<'h>>>(
1622	&self,
1623	cache: &mut Cache,
1624	input: I,
1625	) -> bool {
1626	let mut input = input.into().earliest(`true`);
1627	if matches!(input.get_anchored(), Anchored::No) {
1628	input.set_anchored(Anchored::Yes);
1629	}
1630	self.try_search_slots(cache, &input, &mut []).unwrap().is_some()
1631	}
1632
1633	/// Executes an anchored leftmost forward search, and returns a `Match` if
1634	/// and only if this one-pass DFA matches the given haystack.
1635	///
1636	/// This routine only includes the overall match span. To get access to the
1637	/// individual spans of each capturing group, use [`DFA::captures`].
1638	///
1639	/// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
1640	/// given configuration was [`Anchored::No`] (which is the default).
1641	///
1642	/// # Panics
1643	///
1644	/// This routine panics if the search could not complete. This can occur
1645	/// in the following circumstances:
1646	///
1647	/// When the provided `Input` configuration is not supported. For*
1648	/// example, by providing an unsupported anchor mode. Concretely,
1649	/// this occurs when using [`Anchored::Pattern`] without enabling
1650	/// [`Config::starts_for_each_pattern`].
1651	///
1652	/// When a search panics, callers cannot know whether a match exists or
1653	/// not.
1654	///
1655	/// Use [`DFA::try_search`] if you want to handle these panics as error
1656	/// values instead.
1657	///
1658	/// # Example
1659	///
1660	/// Leftmost first match semantics corresponds to the match with the
1661	/// smallest starting offset, but where the end offset is determined by
1662	/// preferring earlier branches in the original regular expression. For
1663	/// example, `Sam\|Samwise` will match `Sam` in `Samwise`, but `Samwise\|Sam`
1664	/// will match `Samwise` in `Samwise`.
1665	///
1666	/// Generally speaking, the "leftmost first" match is how most backtracking
1667	/// regular expressions tend to work. This is in contrast to POSIX-style
1668	/// regular expressions that yield "leftmost longest" matches. Namely,
1669	/// both `Sam\|Samwise` and `Samwise\|Sam` match `Samwise` when using
1670	/// leftmost longest semantics. (This crate does not currently support
1671	/// leftmost longest semantics.)
1672	///
1673	/// ```
1674	/// use regex_automata::{dfa::onepass::DFA, Match};
1675	///
1676	/// let re = DFA::new("foo[0-9]+")?;
1677	/// let mut cache = re.create_cache();
1678	/// let expected = Match::must(`0`, `0`..`8`);
1679	/// assert_eq!(Some(expected), re.find(&mut cache, "foo12345"));
1680	///
1681	/// // Even though a match is found after reading the first byte (`a`),
1682	/// // the leftmost first match semantics demand that we find the earliest
1683	/// // match that prefers earlier parts of the pattern over later parts.
1684	/// let re = DFA::new("abc\|a")?;
1685	/// let mut cache = re.create_cache();
1686	/// let expected = Match::must(`0`, `0`..`3`);
1687	/// assert_eq!(Some(expected), re.find(&mut cache, "abc"));
1688	///
1689	/// # Ok::<(), Box<dyn std::error::Error>>(())
1690	/// ```
1691	#[inline]
1692	pub fn find<'h, I: Into<Input<'h>>>(
1693	&self,
1694	cache: &mut Cache,
1695	input: I,
1696	) -> Option<Match> {
1697	let mut input = input.into();
1698	if matches!(input.get_anchored(), Anchored::No) {
1699	input.set_anchored(Anchored::Yes);
1700	}
1701	if self.get_nfa().pattern_len() == `1` {
1702	let mut slots = [None, None];
1703	let pid =
1704	self.try_search_slots(cache, &input, &mut slots).unwrap()?;
1705	let start = slots[`0`].unwrap().get();
1706	let end = slots[`1`].unwrap().get();
1707	return Some(Match::new(pid, Span { start, end }));
1708	}
1709	let ginfo = self.get_nfa().group_info();
1710	let slots_len = ginfo.implicit_slot_len();
1711	let mut slots = vec![None; slots_len];
1712	let pid = self.try_search_slots(cache, &input, &mut slots).unwrap()?;
1713	let start = slots[pid.as_usize() * `2`].unwrap().get();
1714	let end = slots[pid.as_usize() * `2` + `1`].unwrap().get();
1715	Some(Match::new(pid, Span { start, end }))
1716	}
1717
1718	/// Executes an anchored leftmost forward search and writes the spans
1719	/// of capturing groups that participated in a match into the provided
1720	/// [`Captures`] value. If no match was found, then [`Captures::is_match`]
1721	/// is guaranteed to return `false`.
1722	///
1723	/// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
1724	/// given configuration was [`Anchored::No`] (which is the default).
1725	///
1726	/// # Panics
1727	///
1728	/// This routine panics if the search could not complete. This can occur
1729	/// in the following circumstances:
1730	///
1731	/// When the provided `Input` configuration is not supported. For*
1732	/// example, by providing an unsupported anchor mode. Concretely,
1733	/// this occurs when using [`Anchored::Pattern`] without enabling
1734	/// [`Config::starts_for_each_pattern`].
1735	///
1736	/// When a search panics, callers cannot know whether a match exists or
1737	/// not.
1738	///
1739	/// Use [`DFA::try_search`] if you want to handle these panics as error
1740	/// values instead.
1741	///
1742	/// # Example
1743	///
1744	/// This shows a simple example of a one-pass regex that extracts
1745	/// capturing group spans.
1746	///
1747	/// ```
1748	/// use regex_automata::{dfa::onepass::DFA, Match, Span};
1749	///
1750	/// let re = DFA::new(
1751	/// // Notice that we use ASCII here. The corresponding Unicode regex
1752	/// // is sadly not one-pass.
1753	/// "(?P<first>[[:alpha:]]+)[[:space:]]+(?P<last>[[:alpha:]]+)",
1754	/// )?;
1755	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1756	///
1757	/// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
1758	/// assert_eq!(Some(Match::must(`0`, `0`..`17`)), caps.get_match());
1759	/// assert_eq!(Some(Span::from(`0`..`5`)), caps.get_group(`1`));
1760	/// assert_eq!(Some(Span::from(`6`..`17`)), caps.get_group_by_name("last"));
1761	///
1762	/// # Ok::<(), Box<dyn std::error::Error>>(())
1763	/// ```
1764	#[inline]
1765	pub fn captures<'h, I: Into<Input<'h>>>(
1766	&self,
1767	cache: &mut Cache,
1768	input: I,
1769	caps: &mut Captures,
1770	) {
1771	let mut input = input.into();
1772	if matches!(input.get_anchored(), Anchored::No) {
1773	input.set_anchored(Anchored::Yes);
1774	}
1775	self.try_search(cache, &input, caps).unwrap();
1776	}
1777
1778	/// Executes an anchored leftmost forward search and writes the spans
1779	/// of capturing groups that participated in a match into the provided
1780	/// [`Captures`] value. If no match was found, then [`Captures::is_match`]
1781	/// is guaranteed to return `false`.
1782	///
1783	/// The differences with [`DFA::captures`] are:
1784	///
1785	/// 1. This returns an error instead of panicking if the search fails.
1786	/// 2. Accepts an `&Input` instead of a `Into<Input>`. This permits reusing
1787	/// the same input for multiple searches, which _may_ be important for
1788	/// latency.
1789	/// 3. This does not automatically change the [`Anchored`] mode from `No`
1790	/// to `Yes`. Instead, if [`Input::anchored`] is `Anchored::No`, then an
1791	/// error is returned.
1792	///
1793	/// # Errors
1794	///
1795	/// This routine errors if the search could not complete. This can occur
1796	/// in the following circumstances:
1797	///
1798	/// When the provided `Input` configuration is not supported. For*
1799	/// example, by providing an unsupported anchor mode. Concretely,
1800	/// this occurs when using [`Anchored::Pattern`] without enabling
1801	/// [`Config::starts_for_each_pattern`].
1802	///
1803	/// When a search returns an error, callers cannot know whether a match
1804	/// exists or not.
1805	///
1806	/// # Example: specific pattern search
1807	///
1808	/// This example shows how to build a multi-regex that permits searching
1809	/// for specific patterns. Note that this is somewhat less useful than
1810	/// in other regex engines, since a one-pass DFA by definition has no
1811	/// ambiguity about which pattern can match at a position. That is, if it
1812	/// were possible for two different patterns to match at the same starting
1813	/// position, then the multi-regex would not be one-pass and construction
1814	/// would have failed.
1815	///
1816	/// Nevertheless, this can still be useful if you only care about matches
1817	/// for a specific pattern, and want the DFA to report "no match" even if
1818	/// some other pattern would have matched.
1819	///
1820	/// Note that in order to make use of this functionality,
1821	/// [`Config::starts_for_each_pattern`] must be enabled. It is disabled
1822	/// by default since it may result in higher memory usage.
1823	///
1824	/// ```
1825	/// use regex_automata::{
1826	/// dfa::onepass::DFA, Anchored, Input, Match, PatternID,
1827	/// };
1828	///
1829	/// let re = DFA::builder()
1830	/// .configure(DFA::config().starts_for_each_pattern(`true`))
1831	/// .build_many(&["[a-z]+", "[0-9]+"])?;
1832	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1833	/// let haystack = "123abc";
1834	/// let input = Input::new(haystack).anchored(Anchored::Yes);
1835	///
1836	/// // A normal multi-pattern search will show pattern 1 matches.
1837	/// re.try_search(&mut cache, &input, &mut caps)?;
1838	/// assert_eq!(Some(Match::must(`1`, `0`..`3`)), caps.get_match());
1839	///
1840	/// // If we only want to report pattern 0 matches, then we'll get no
1841	/// // match here.
1842	/// let input = input.anchored(Anchored::Pattern(PatternID::must(`0`)));
1843	/// re.try_search(&mut cache, &input, &mut caps)?;
1844	/// assert_eq!(None, caps.get_match());
1845	///
1846	/// # Ok::<(), Box<dyn std::error::Error>>(())
1847	/// ```
1848	///
1849	/// # Example: specifying the bounds of a search
1850	///
1851	/// This example shows how providing the bounds of a search can produce
1852	/// different results than simply sub-slicing the haystack.
1853	///
1854	/// ```
1855	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1856	/// use regex_automata::{dfa::onepass::DFA, Anchored, Input, Match};
1857	///
1858	/// // one-pass DFAs fully support Unicode word boundaries!
1859	/// // A sad joke is that a Unicode aware regex like \w+\s is not one-pass.
1860	/// // :-(
1861	/// let re = DFA::new(r"\b[0-9]{3}\b")?;
1862	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1863	/// let haystack = "foo123bar";
1864	///
1865	/// // Since we sub-slice the haystack, the search doesn't know about
1866	/// // the larger context and assumes that `123` is surrounded by word
1867	/// // boundaries. And of course, the match position is reported relative
1868	/// // to the sub-slice as well, which means we get `0..3` instead of
1869	/// // `3..6`.
1870	/// let expected = Some(Match::must(`0`, `0`..`3`));
1871	/// let input = Input::new(&haystack[`3`..`6`]).anchored(Anchored::Yes);
1872	/// re.try_search(&mut cache, &input, &mut caps)?;
1873	/// assert_eq!(expected, caps.get_match());
1874	///
1875	/// // But if we provide the bounds of the search within the context of the
1876	/// // entire haystack, then the search can take the surrounding context
1877	/// // into account. (And if we did find a match, it would be reported
1878	/// // as a valid offset into `haystack` instead of its sub-slice.)
1879	/// let expected = None;
1880	/// let input = Input::new(haystack).range(`3`..`6`).anchored(Anchored::Yes);
1881	/// re.try_search(&mut cache, &input, &mut caps)?;
1882	/// assert_eq!(expected, caps.get_match());
1883	///
1884	/// # Ok::<(), Box<dyn std::error::Error>>(())
1885	/// ```
1886	#[inline]
1887	pub fn try_search(
1888	&self,
1889	cache: &mut Cache,
1890	input: &Input<'_>,
1891	caps: &mut Captures,
1892	) -> Result<(), MatchError> {
1893	let pid = self.try_search_slots(cache, input, caps.slots_mut())?;
1894	caps.set_pattern(pid);
1895	Ok(())
1896	}
1897
1898	/// Executes an anchored leftmost forward search and writes the spans
1899	/// of capturing groups that participated in a match into the provided
1900	/// `slots`, and returns the matching pattern ID. The contents of the
1901	/// slots for patterns other than the matching pattern are unspecified. If
1902	/// no match was found, then `None` is returned and the contents of all
1903	/// `slots` is unspecified.
1904	///
1905	/// This is like [`DFA::try_search`], but it accepts a raw slots slice
1906	/// instead of a `Captures` value. This is useful in contexts where you
1907	/// don't want or need to allocate a `Captures`.
1908	///
1909	/// It is legal to pass _any_ number of slots to this routine. If the regex
1910	/// engine would otherwise write a slot offset that doesn't fit in the
1911	/// provided slice, then it is simply skipped. In general though, there are
1912	/// usually three slice lengths you might want to use:
1913	///
1914	/// An empty slice, if you only care about which pattern matched.*
1915	/// A slice with*
1916	/// [`pattern_len() 2`](crate::dfa::onepass::DFA::pattern_len)*
1917	/// slots, if you only care about the overall match spans for each matching
1918	/// pattern.
1919	/// A slice with*
1920	/// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
1921	/// permits recording match offsets for every capturing group in every
1922	/// pattern.
1923	///
1924	/// # Errors
1925	///
1926	/// This routine errors if the search could not complete. This can occur
1927	/// in the following circumstances:
1928	///
1929	/// When the provided `Input` configuration is not supported. For*
1930	/// example, by providing an unsupported anchor mode. Concretely,
1931	/// this occurs when using [`Anchored::Pattern`] without enabling
1932	/// [`Config::starts_for_each_pattern`].
1933	///
1934	/// When a search returns an error, callers cannot know whether a match
1935	/// exists or not.
1936	///
1937	/// # Example
1938	///
1939	/// This example shows how to find the overall match offsets in a
1940	/// multi-pattern search without allocating a `Captures` value. Indeed, we
1941	/// can put our slots right on the stack.
1942	///
1943	/// ```
1944	/// use regex_automata::{dfa::onepass::DFA, Anchored, Input, PatternID};
1945	///
1946	/// let re = DFA::new_many(&[
1947	/// r"[a-zA-Z]+",
1948	/// r"[0-9]+",
1949	/// ])?;
1950	/// let mut cache = re.create_cache();
1951	/// let input = Input::new("123").anchored(Anchored::Yes);
1952	///
1953	/// // We only care about the overall match offsets here, so we just
1954	/// // allocate two slots for each pattern. Each slot records the start
1955	/// // and end of the match.
1956	/// let mut slots = [None; `4`];
1957	/// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?;
1958	/// assert_eq!(Some(PatternID::must(`1`)), pid);
1959	///
1960	/// // The overall match offsets are always at 'pid 2' and 'pid * 2 + 1'.*
1961	/// // See 'GroupInfo' for more details on the mapping between groups and
1962	/// // slot indices.
1963	/// let slot_start = pid.unwrap().as_usize() * `2`;
1964	/// let slot_end = slot_start + `1`;
1965	/// assert_eq!(Some(`0`), slots[slot_start].map(\|s\| s.get()));
1966	/// assert_eq!(Some(`3`), slots[slot_end].map(\|s\| s.get()));
1967	///
1968	/// # Ok::<(), Box<dyn std::error::Error>>(())
1969	/// ```
1970	#[inline]
1971	pub fn try_search_slots(
1972	&self,
1973	cache: &mut Cache,
1974	input: &Input<'_>,
1975	slots: &mut [Option<NonMaxUsize>],
1976	) -> Result<Option<PatternID>, MatchError> {
1977	let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1978	if !utf8empty {
1979	return self.try_search_slots_imp(cache, input, slots);
1980	}
1981	// See PikeVM::try_search_slots for why we do this.
1982	let min = self.get_nfa().group_info().implicit_slot_len();
1983	if slots.len() >= min {
1984	return self.try_search_slots_imp(cache, input, slots);
1985	}
1986	if self.get_nfa().pattern_len() == `1` {
1987	let mut enough = [None, None];
1988	let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1989	// This is OK because we know `enough_slots` is strictly bigger
1990	// than `slots`, otherwise this special case isn't reached.
1991	slots.copy_from_slice(&enough[..slots.len()]);
1992	return Ok(got);
1993	}
1994	let mut enough = vec![None; min];
1995	let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1996	// This is OK because we know `enough_slots` is strictly bigger than
1997	// `slots`, otherwise this special case isn't reached.
1998	slots.copy_from_slice(&enough[..slots.len()]);
1999	Ok(got)
2000	}
2001
2002	#[inline(never)]
2003	fn try_search_slots_imp(
2004	&self,
2005	cache: &mut Cache,
2006	input: &Input<'_>,
2007	slots: &mut [Option<NonMaxUsize>],
2008	) -> Result<Option<PatternID>, MatchError> {
2009	let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
2010	match self.search_imp(cache, input, slots)? {
2011	None => return Ok(None),
2012	Some(pid) if !utf8empty => return Ok(Some(pid)),
2013	Some(pid) => {
2014	// These slot indices are always correct because we know our
2015	// 'pid' is valid and thus we know that the slot indices for it
2016	// are valid.
2017	let slot_start = pid.as_usize().wrapping_mul(`2`);
2018	let slot_end = slot_start.wrapping_add(`1`);
2019	// OK because we know we have a match and we know our caller
2020	// provided slots are big enough (which we make true above if
2021	// the caller didn't). Namely, we're only here when 'utf8empty'
2022	// is true, and when that's true, we require slots for every
2023	// pattern.
2024	let start = slots[slot_start].unwrap().get();
2025	let end = slots[slot_end].unwrap().get();
2026	// If our match splits a codepoint, then we cannot report is
2027	// as a match. And since one-pass DFAs only support anchored
2028	// searches, we don't try to skip ahead to find the next match.
2029	// We can just quit with nothing.
2030	if start == end && !input.is_char_boundary(start) {
2031	return Ok(None);
2032	}
2033	Ok(Some(pid))
2034	}
2035	}
2036	}
2037	}
2038
2039	impl DFA {
2040	fn search_imp(
2041	&self,
2042	cache: &mut Cache,
2043	input: &Input<'_>,
2044	slots: &mut [Option<NonMaxUsize>],
2045	) -> Result<Option<PatternID>, MatchError> {
2046	// PERF: Some ideas. I ran out of steam after my initial impl to try
2047	// many of these.
2048	//
2049	// 1) Try doing more state shuffling. Right now, all we do is push
2050	// match states to the end of the transition table so that we can do
2051	// 'if sid >= self.min_match_id' to know whether we're in a match
2052	// state or not. But what about doing something like dense DFAs and
2053	// pushing dead, match and states with captures/looks all toward the
2054	// beginning of the transition table. Then we could do 'if sid <=
2055	// self.max_special_id', in which case, we need to do some special
2056	// handling of some sort. Otherwise, we get the happy path, just
2057	// like in a DFA search. The main argument against this is that the
2058	// one-pass DFA is likely to be used most often with capturing groups
2059	// and if capturing groups are common, then this might wind up being a
2060	// pessimization.
2061	//
2062	// 2) Consider moving 'PatternEpsilons' out of the transition table.
2063	// It is only needed for match states and usually a small minority of
2064	// states are match states. Therefore, we're using an extra 'u64' for
2065	// most states.
2066	//
2067	// 3) I played around with the match state handling and it seems like
2068	// there is probably a lot left on the table for improvement. The
2069	// key tension is that the 'find_match' routine is a giant mess, but
2070	// splitting it out into a non-inlineable function is a non-starter
2071	// because the match state might consume input, so 'find_match' COULD
2072	// be called quite a lot, and a function call at that point would trash
2073	// perf. In theory, we could detect whether a match state consumes
2074	// input and then specialize our search routine based on that. In that
2075	// case, maybe an extra function call is OK, but even then, it might be
2076	// too much of a latency hit. Another idea is to just try and figure
2077	// out how to reduce the code size of 'find_match'. RE2 has a trick
2078	// here where the match handling isn't done if we know the next byte of
2079	// input yields a match too. Maybe we adopt that?
2080	//
2081	// This just might be a tricky DFA to optimize.
2082
2083	if input.is_done() {
2084	return Ok(None);
2085	}
2086	// We unfortunately have a bit of book-keeping to do to set things
2087	// up. We do have to setup our cache and clear all of our slots. In
2088	// particular, clearing the slots is necessary for the case where we
2089	// report a match, but one of the capturing groups didn't participate
2090	// in the match but had a span set from a previous search. That would
2091	// be bad. In theory, we could avoid all this slot clearing if we knew
2092	// that every slot was always activated for every match. Then we would
2093	// know they would always be overwritten when a match is found.
2094	let explicit_slots_len = core::cmp::min(
2095	Slots::LIMIT,
2096	slots.len().saturating_sub(self.explicit_slot_start),
2097	);
2098	cache.setup_search(explicit_slots_len);
2099	for slot in cache.explicit_slots() {
2100	*slot = None;
2101	}
2102	for slot in slots.iter_mut() {
2103	*slot = None;
2104	}
2105	// We set the starting slots for every pattern up front. This does
2106	// increase our latency somewhat, but it avoids having to do it every
2107	// time we see a match state (which could be many times in a single
2108	// search if the match state consumes input).
2109	for pid in self.nfa.patterns() {
2110	let i = pid.as_usize() * `2`;
2111	if i >= slots.len() {
2112	break;
2113	}
2114	slots[i] = NonMaxUsize::new(input.start());
2115	}
2116	let mut pid = None;
2117	let mut next_sid = match input.get_anchored() {
2118	Anchored::Yes => self.start(),
2119	Anchored::Pattern(pid) => self.start_pattern(pid)?,
2120	Anchored::No => {
2121	// If the regex is itself always anchored, then we're fine,
2122	// even if the search is configured to be unanchored.
2123	if !self.nfa.is_always_start_anchored() {
2124	return Err(MatchError::unsupported_anchored(
2125	Anchored::No,
2126	));
2127	}
2128	self.start()
2129	}
2130	};
2131	let leftmost_first =
2132	matches!(self.config.get_match_kind(), MatchKind::LeftmostFirst);
2133	for at in input.start()..input.end() {
2134	let sid = next_sid;
2135	let trans = self.transition(sid, input.haystack()[at]);
2136	next_sid = trans.state_id();
2137	let epsilons = trans.epsilons();
2138	if sid >= self.min_match_id {
2139	if self.find_match(cache, input, at, sid, slots, &mut pid) {
2140	if input.get_earliest()
2141	\|\| (leftmost_first && trans.match_wins())
2142	{
2143	return Ok(pid);
2144	}
2145	}
2146	}
2147	if sid == DEAD
2148	\|\| (!epsilons.looks().is_empty()
2149	&& !self.nfa.look_matcher().matches_set_inline(
2150	epsilons.looks(),
2151	input.haystack(),
2152	at,
2153	))
2154	{
2155	return Ok(pid);
2156	}
2157	epsilons.slots().apply(at, cache.explicit_slots());
2158	}
2159	if next_sid >= self.min_match_id {
2160	self.find_match(
2161	cache,
2162	input,
2163	input.end(),
2164	next_sid,
2165	slots,
2166	&mut pid,
2167	);
2168	}
2169	Ok(pid)
2170	}
2171
2172	/// Assumes 'sid' is a match state and looks for whether a match can
2173	/// be reported. If so, appropriate offsets are written to 'slots' and
2174	/// 'matched_pid' is set to the matching pattern ID.
2175	///
2176	/// Even when 'sid' is a match state, it's possible that a match won't
2177	/// be reported. For example, when the conditional epsilon transitions
2178	/// leading to the match state aren't satisfied at the given position in
2179	/// the haystack.
2180	#[cfg_attr(feature = "perf-inline", inline(always))]
2181	fn find_match(
2182	&self,
2183	cache: &mut Cache,
2184	input: &Input<'_>,
2185	at: usize,
2186	sid: StateID,
2187	slots: &mut [Option<NonMaxUsize>],
2188	matched_pid: &mut Option<PatternID>,
2189	) -> bool {
2190	debug_assert!(sid >= self.min_match_id);
2191	let pateps = self.pattern_epsilons(sid);
2192	let epsilons = pateps.epsilons();
2193	if !epsilons.looks().is_empty()
2194	&& !self.nfa.look_matcher().matches_set_inline(
2195	epsilons.looks(),
2196	input.haystack(),
2197	at,
2198	)
2199	{
2200	return `false`;
2201	}
2202	let pid = pateps.pattern_id_unchecked();
2203	// This calculation is always correct because we know our 'pid' is
2204	// valid and thus we know that the slot indices for it are valid.
2205	let slot_end = pid.as_usize().wrapping_mul(`2`).wrapping_add(`1`);
2206	// Set the implicit 'end' slot for the matching pattern. (The 'start'
2207	// slot was set at the beginning of the search.)
2208	if slot_end < slots.len() {
2209	slots[slot_end] = NonMaxUsize::new(at);
2210	}
2211	// If the caller provided enough room, copy the previously recorded
2212	// explicit slots from our scratch space to the caller provided slots.
2213	// We also* need to set any explicit slots that are active as part of*
2214	// the path to the match state.
2215	if self.explicit_slot_start < slots.len() {
2216	// NOTE: The 'cache.explicit_slots()' slice is setup at the
2217	// beginning of every search such that it is guaranteed to return a
2218	// slice of length equivalent to 'slots[explicit_slot_start..]'.
2219	slots[self.explicit_slot_start..]
2220	.copy_from_slice(cache.explicit_slots());
2221	epsilons.slots().apply(at, &mut slots[self.explicit_slot_start..]);
2222	}
2223	*matched_pid = Some(pid);
2224	`true`
2225	}
2226	}
2227
2228	impl DFA {
2229	/// Returns the anchored start state for matching any pattern in this DFA.
2230	fn start(&self) -> StateID {
2231	self.starts[`0`]
2232	}
2233
2234	/// Returns the anchored start state for matching the given pattern. If
2235	/// 'starts_for_each_pattern'
2236	/// was not enabled, then this returns an error. If the given pattern is
2237	/// not in this DFA, then `Ok(None)` is returned.
2238	fn start_pattern(&self, pid: PatternID) -> Result<StateID, MatchError> {
2239	if !self.config.get_starts_for_each_pattern() {
2240	return Err(MatchError::unsupported_anchored(Anchored::Pattern(
2241	pid,
2242	)));
2243	}
2244	// 'starts' always has non-zero length. The first entry is always the
2245	// anchored starting state for all patterns, and the following entries
2246	// are optional and correspond to the anchored starting states for
2247	// patterns at pid+1. Thus, starts.len()-1 corresponds to the total
2248	// number of patterns that one can explicitly search for. (And it may
2249	// be zero.)
2250	Ok(self.starts.get(pid.one_more()).copied().unwrap_or(DEAD))
2251	}
2252
2253	/// Returns the transition from the given state ID and byte of input. The
2254	/// transition includes the next state ID, the slots that should be saved
2255	/// and any conditional epsilon transitions that must be satisfied in order
2256	/// to take this transition.
2257	fn transition(&self, sid: StateID, byte: u8) -> Transition {
2258	let offset = sid.as_usize() << self.stride2();
2259	let class = self.classes.get(byte).as_usize();
2260	self.table[offset + class]
2261	}
2262
2263	/// Set the transition from the given state ID and byte of input to the
2264	/// transition given.
2265	fn set_transition(&mut self, sid: StateID, byte: u8, to: Transition) {
2266	let offset = sid.as_usize() << self.stride2();
2267	let class = self.classes.get(byte).as_usize();
2268	self.table[offset + class] = to;
2269	}
2270
2271	/// Return an iterator of "sparse" transitions for the given state ID.
2272	/// "sparse" in this context means that consecutive transitions that are
2273	/// equivalent are returned as one group, and transitions to the DEAD state
2274	/// are ignored.
2275	///
2276	/// This winds up being useful for debug printing, since it's much terser
2277	/// to display runs of equivalent transitions than the transition for every
2278	/// possible byte value. Indeed, in practice, it's very common for runs
2279	/// of equivalent transitions to appear.
2280	fn sparse_transitions(&self, sid: StateID) -> SparseTransitionIter<'_> {
2281	let start = sid.as_usize() << self.stride2();
2282	let end = start + self.alphabet_len();
2283	SparseTransitionIter {
2284	it: self.table[start..end].iter().enumerate(),
2285	cur: None,
2286	}
2287	}
2288
2289	/// Return the pattern epsilons for the given state ID.
2290	///
2291	/// If the given state ID does not correspond to a match state ID, then the
2292	/// pattern epsilons returned is empty.
2293	fn pattern_epsilons(&self, sid: StateID) -> PatternEpsilons {
2294	let offset = sid.as_usize() << self.stride2();
2295	PatternEpsilons(self.table[offset + self.pateps_offset].0)
2296	}
2297
2298	/// Set the pattern epsilons for the given state ID.
2299	fn set_pattern_epsilons(&mut self, sid: StateID, pateps: PatternEpsilons) {
2300	let offset = sid.as_usize() << self.stride2();
2301	self.table[offset + self.pateps_offset] = Transition(pateps.0);
2302	}
2303
2304	/// Returns the state ID prior to the one given. This returns None if the
2305	/// given ID is the first DFA state.
2306	fn prev_state_id(&self, id: StateID) -> Option<StateID> {
2307	if id == DEAD {
2308	None
2309	} else {
2310	// CORRECTNESS: Since 'id' is not the first state, subtracting 1
2311	// is always valid.
2312	Some(StateID::new_unchecked(id.as_usize().checked_sub(`1`).unwrap()))
2313	}
2314	}
2315
2316	/// Returns the state ID of the last state in this DFA's transition table.
2317	/// "last" in this context means the last state to appear in memory, i.e.,
2318	/// the one with the greatest ID.
2319	fn last_state_id(&self) -> StateID {
2320	// CORRECTNESS: A DFA table is always non-empty since it always at
2321	// least contains a DEAD state. Since every state has the same stride,
2322	// we can just compute what the "next" state ID would have been and
2323	// then subtract 1 from it.
2324	StateID::new_unchecked(
2325	(self.table.len() >> self.stride2()).checked_sub(`1`).unwrap(),
2326	)
2327	}
2328
2329	/// Move the transitions from 'id1' to 'id2' and vice versa.
2330	///
2331	/// WARNING: This does not update the rest of the transition table to have
2332	/// transitions to 'id1' changed to 'id2' and vice versa. This merely moves
2333	/// the states in memory.
2334	pub(super) fn swap_states(&mut self, id1: StateID, id2: StateID) {
2335	let o1 = id1.as_usize() << self.stride2();
2336	let o2 = id2.as_usize() << self.stride2();
2337	for b in `0`..self.stride() {
2338	self.table.swap(o1 + b, o2 + b);
2339	}
2340	}
2341
2342	/// Map all state IDs in this DFA (transition table + start states)
2343	/// according to the closure given.
2344	pub(super) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
2345	for i in `0`..self.state_len() {
2346	let offset = i << self.stride2();
2347	for b in `0`..self.alphabet_len() {
2348	let next = self.table[offset + b].state_id();
2349	self.table[offset + b].set_state_id(map(next));
2350	}
2351	}
2352	for i in `0`..self.starts.len() {
2353	self.starts[i] = map(self.starts[i]);
2354	}
2355	}
2356	}
2357
2358	impl core::fmt::Debug for DFA {
2359	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2360	fn debug_state_transitions(
2361	f: &mut core::fmt::Formatter,
2362	dfa: &DFA,
2363	sid: StateID,
2364	) -> core::fmt::Result {
2365	for (i, (start, end, trans)) in
2366	dfa.sparse_transitions(sid).enumerate()
2367	{
2368	let next = trans.state_id();
2369	if i > `0` {
2370	write!(f, ", ")?;
2371	}
2372	if start == end {
2373	write!(
2374	f,
2375	"{:?} => {:?}",
2376	DebugByte(start),
2377	next.as_usize(),
2378	)?;
2379	} else {
2380	write!(
2381	f,
2382	"{:?}-{:?} => {:?}",
2383	DebugByte(start),
2384	DebugByte(end),
2385	next.as_usize(),
2386	)?;
2387	}
2388	if trans.match_wins() {
2389	write!(f, " (MW)")?;
2390	}
2391	if !trans.epsilons().is_empty() {
2392	write!(f, " ({:?})", trans.epsilons())?;
2393	}
2394	}
2395	Ok(())
2396	}
2397
2398	writeln!(f, "onepass::DFA(")?;
2399	for index in `0`..self.state_len() {
2400	let sid = StateID::must(index);
2401	let pateps = self.pattern_epsilons(sid);
2402	if sid == DEAD {
2403	write!(f, "D ")?;
2404	} else if pateps.pattern_id().is_some() {
2405	write!(f, "* ")?;
2406	} else {
2407	write!(f, " ")?;
2408	}
2409	write!(f, "{:06?}", sid.as_usize())?;
2410	if !pateps.is_empty() {
2411	write!(f, " ({:?})", pateps)?;
2412	}
2413	write!(f, ": ")?;
2414	debug_state_transitions(f, self, sid)?;
2415	write!(f, "`\n`")?;
2416	}
2417	writeln!(f, "")?;
2418	for (i, &sid) in self.starts.iter().enumerate() {
2419	if i == `0` {
2420	writeln!(f, "START(ALL): {:?}", sid.as_usize())?;
2421	} else {
2422	writeln!(
2423	f,
2424	"START(pattern: {:?}): {:?}",
2425	i - `1`,
2426	sid.as_usize(),
2427	)?;
2428	}
2429	}
2430	writeln!(f, "state length: {:?}", self.state_len())?;
2431	writeln!(f, "pattern length: {:?}", self.pattern_len())?;
2432	writeln!(f, ")")?;
2433	Ok(())
2434	}
2435	}
2436
2437	/// An iterator over groups of consecutive equivalent transitions in a single
2438	/// state.
2439	#[derive(Debug)]
2440	struct SparseTransitionIter<'a> {
2441	it: core::iter::Enumerate<core::slice::Iter<'a, Transition>>,
2442	cur: Option<(u8, u8, Transition)>,
2443	}
2444
2445	impl<'a> Iterator for SparseTransitionIter<'a> {
2446	type Item = (u8, u8, Transition);
2447
2448	fn next(&mut self) -> Option<(u8, u8, Transition)> {
2449	while let Some((b, &trans)) = self.it.next() {
2450	// Fine because we'll never have more than u8::MAX transitions in
2451	// one state.
2452	let b = b.as_u8();
2453	let (prev_start, prev_end, prev_trans) = match self.cur {
2454	Some(t) => t,
2455	None => {
2456	self.cur = Some((b, b, trans));
2457	continue;
2458	}
2459	};
2460	if prev_trans == trans {
2461	self.cur = Some((prev_start, b, prev_trans));
2462	} else {
2463	self.cur = Some((b, b, trans));
2464	if prev_trans.state_id() != DEAD {
2465	return Some((prev_start, prev_end, prev_trans));
2466	}
2467	}
2468	}
2469	if let Some((start, end, trans)) = self.cur.take() {
2470	if trans.state_id() != DEAD {
2471	return Some((start, end, trans));
2472	}
2473	}
2474	None
2475	}
2476	}
2477
2478	/// A cache represents mutable state that a one-pass [`DFA`] requires during a
2479	/// search.
2480	///
2481	/// For a given one-pass DFA, its corresponding cache may be created either via
2482	/// [`DFA::create_cache`], or via [`Cache::new`]. They are equivalent in every
2483	/// way, except the former does not require explicitly importing `Cache`.
2484	///
2485	/// A particular `Cache` is coupled with the one-pass DFA from which it was
2486	/// created. It may only be used with that one-pass DFA. A cache and its
2487	/// allocations may be re-purposed via [`Cache::reset`], in which case, it can
2488	/// only be used with the new one-pass DFA (and not the old one).
2489	#[derive(Clone, Debug)]
2490	pub struct Cache {
2491	/// Scratch space used to store slots during a search. Basically, we use
2492	/// the caller provided slots to store slots known when a match occurs.
2493	/// But after a match occurs, we might continue a search but ultimately
2494	/// fail to extend the match. When continuing the search, we need some
2495	/// place to store candidate capture offsets without overwriting the slot
2496	/// offsets recorded for the most recently seen match.
2497	explicit_slots: Vec<Option<NonMaxUsize>>,
2498	/// The number of slots in the caller-provided 'Captures' value for the
2499	/// current search. This is always at most 'explicit_slots.len()', but
2500	/// might be less than it, if the caller provided fewer slots to fill.
2501	explicit_slot_len: usize,
2502	}
2503
2504	impl Cache {
2505	/// Create a new [`onepass::DFA`](DFA) cache.
2506	///
2507	/// A potentially more convenient routine to create a cache is
2508	/// [`DFA::create_cache`], as it does not require also importing the
2509	/// `Cache` type.
2510	///
2511	/// If you want to reuse the returned `Cache` with some other one-pass DFA,
2512	/// then you must call [`Cache::reset`] with the desired one-pass DFA.
2513	pub fn new(re: &DFA) -> Cache {
2514	let mut cache = Cache { explicit_slots: vec![], explicit_slot_len: `0` };
2515	cache.reset(re);
2516	cache
2517	}
2518
2519	/// Reset this cache such that it can be used for searching with a
2520	/// different [`onepass::DFA`](DFA).
2521	///
2522	/// A cache reset permits reusing memory already allocated in this cache
2523	/// with a different one-pass DFA.
2524	///
2525	/// # Example
2526	///
2527	/// This shows how to re-purpose a cache for use with a different one-pass
2528	/// DFA.
2529	///
2530	/// ```
2531	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
2532	/// use regex_automata::{dfa::onepass::DFA, Match};
2533	///
2534	/// let re1 = DFA::new(r"\w")?;
2535	/// let re2 = DFA::new(r"\W")?;
2536	/// let mut caps1 = re1.create_captures();
2537	/// let mut caps2 = re2.create_captures();
2538	///
2539	/// let mut cache = re1.create_cache();
2540	/// assert_eq!(
2541	/// Some(Match::must(`0`, `0`..`2`)),
2542	/// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() },
2543	/// );
2544	///
2545	/// // Using 'cache' with re2 is not allowed. It may result in panics or
2546	/// // incorrect results. In order to re-purpose the cache, we must reset
2547	/// // it with the one-pass DFA we'd like to use it with.
2548	/// //
2549	/// // Similarly, after this reset, using the cache with 're1' is also not
2550	/// // allowed.
2551	/// re2.reset_cache(&mut cache);
2552	/// assert_eq!(
2553	/// Some(Match::must(`0`, `0`..`3`)),
2554	/// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() },
2555	/// );
2556	///
2557	/// # Ok::<(), Box<dyn std::error::Error>>(())
2558	/// ```
2559	pub fn reset(&mut self, re: &DFA) {
2560	let explicit_slot_len = re.get_nfa().group_info().explicit_slot_len();
2561	self.explicit_slots.resize(explicit_slot_len, None);
2562	self.explicit_slot_len = explicit_slot_len;
2563	}
2564
2565	/// Returns the heap memory usage, in bytes, of this cache.
2566	///
2567	/// This does not* include the stack size used up by this cache. To*
2568	/// compute that, use `std::mem::size_of::<Cache>()`.
2569	pub fn memory_usage(&self) -> usize {
2570	self.explicit_slots.len() * core::mem::size_of::<Option<NonMaxUsize>>()
2571	}
2572
2573	fn explicit_slots(&mut self) -> &mut [Option<NonMaxUsize>] {
2574	&mut self.explicit_slots[..self.explicit_slot_len]
2575	}
2576
2577	fn setup_search(&mut self, explicit_slot_len: usize) {
2578	self.explicit_slot_len = explicit_slot_len;
2579	}
2580	}
2581
2582	/// Represents a single transition in a one-pass DFA.
2583	///
2584	/// The high 21 bits corresponds to the state ID. The bit following corresponds
2585	/// to the special "match wins" flag. The remaining low 42 bits corresponds to
2586	/// the transition epsilons, which contains the slots that should be saved when
2587	/// this transition is followed and the conditional epsilon transitions that
2588	/// must be satisfied in order to follow this transition.
2589	#[derive(Clone, Copy, Eq, PartialEq)]
2590	struct Transition(u64);
2591
2592	impl Transition {
2593	const STATE_ID_BITS: u64 = `21`;
2594	const STATE_ID_SHIFT: u64 = `64` - Transition::STATE_ID_BITS;
2595	const STATE_ID_LIMIT: u64 = `1` << Transition::STATE_ID_BITS;
2596	const MATCH_WINS_SHIFT: u64 = `64` - (Transition::STATE_ID_BITS + `1`);
2597	const INFO_MASK: u64 = `0x000003FF_FFFFFFFF`;
2598
2599	/// Return a new transition to the given state ID with the given epsilons.
2600	fn new(match_wins: bool, sid: StateID, epsilons: Epsilons) -> Transition {
2601	let match_wins =
2602	if match_wins { `1` << Transition::MATCH_WINS_SHIFT } else { `0` };
2603	let sid = sid.as_u64() << Transition::STATE_ID_SHIFT;
2604	Transition(sid \| match_wins \| epsilons.0)
2605	}
2606
2607	/// Returns true if and only if this transition points to the DEAD state.
2608	fn is_dead(self) -> bool {
2609	self.state_id() == DEAD
2610	}
2611
2612	/// Return whether this transition has a "match wins" property.
2613	///
2614	/// When a transition has this property, it means that if a match has been
2615	/// found and the search uses leftmost-first semantics, then that match
2616	/// should be returned immediately instead of continuing on.
2617	///
2618	/// The "match wins" name comes from RE2, which uses a pretty much
2619	/// identical mechanism for implementing leftmost-first semantics.
2620	fn match_wins(&self) -> bool {
2621	(self.0 >> Transition::MATCH_WINS_SHIFT & `1`) == `1`
2622	}
2623
2624	/// Return the "next" state ID that this transition points to.
2625	fn state_id(&self) -> StateID {
2626	// OK because a Transition has a valid StateID in its upper bits by
2627	// construction. The cast to usize is also correct, even on 16-bit
2628	// targets because, again, we know the upper bits is a valid StateID,
2629	// which can never overflow usize on any supported target.
2630	StateID::new_unchecked(
2631	(self.0 >> Transition::STATE_ID_SHIFT).as_usize(),
2632	)
2633	}
2634
2635	/// Set the "next" state ID in this transition.
2636	fn set_state_id(&mut self, sid: StateID) {
2637	*self = Transition::new(self.match_wins(), sid, self.epsilons());
2638	}
2639
2640	/// Return the epsilons embedded in this transition.
2641	fn epsilons(&self) -> Epsilons {
2642	Epsilons(self.0 & Transition::INFO_MASK)
2643	}
2644	}
2645
2646	impl core::fmt::Debug for Transition {
2647	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2648	if self.is_dead() {
2649	return write!(f, "0");
2650	}
2651	write!(f, "{}", self.state_id().as_usize())?;
2652	if self.match_wins() {
2653	write!(f, "-MW")?;
2654	}
2655	if !self.epsilons().is_empty() {
2656	write!(f, "-{:?}", self.epsilons())?;
2657	}
2658	Ok(())
2659	}
2660	}
2661
2662	/// A representation of a match state's pattern ID along with the epsilons for
2663	/// when a match occurs.
2664	///
2665	/// A match state in a one-pass DFA, unlike in a more general DFA, has exactly
2666	/// one pattern ID. If it had more, then the original NFA would not have been
2667	/// one-pass.
2668	///
2669	/// The "epsilons" part of this corresponds to what was found in the epsilon
2670	/// transitions between the transition taken in the last byte of input and the
2671	/// ultimate match state. This might include saving slots and/or conditional
2672	/// epsilon transitions that must be satisfied before one can report the match.
2673	///
2674	/// Technically, every state has room for a 'PatternEpsilons', but it is only
2675	/// ever non-empty for match states.
2676	#[derive(Clone, Copy)]
2677	struct PatternEpsilons(u64);
2678
2679	impl PatternEpsilons {
2680	const PATTERN_ID_BITS: u64 = `22`;
2681	const PATTERN_ID_SHIFT: u64 = `64` - PatternEpsilons::PATTERN_ID_BITS;
2682	// A sentinel value indicating that this is not a match state. We don't
2683	// use 0 since 0 is a valid pattern ID.
2684	const PATTERN_ID_NONE: u64 = `0x00000000_003FFFFF`;
2685	const PATTERN_ID_LIMIT: u64 = PatternEpsilons::PATTERN_ID_NONE;
2686	const PATTERN_ID_MASK: u64 = `0xFFFFFC00_00000000`;
2687	const EPSILONS_MASK: u64 = `0x000003FF_FFFFFFFF`;
2688
2689	/// Return a new empty pattern epsilons that has no pattern ID and has no
2690	/// epsilons. This is suitable for non-match states.
2691	fn empty() -> PatternEpsilons {
2692	PatternEpsilons(
2693	PatternEpsilons::PATTERN_ID_NONE
2694	<< PatternEpsilons::PATTERN_ID_SHIFT,
2695	)
2696	}
2697
2698	/// Whether this pattern epsilons is empty or not. It's empty when it has
2699	/// no pattern ID and an empty epsilons.
2700	fn is_empty(self) -> bool {
2701	self.pattern_id().is_none() && self.epsilons().is_empty()
2702	}
2703
2704	/// Return the pattern ID in this pattern epsilons if one exists.
2705	fn pattern_id(self) -> Option<PatternID> {
2706	let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT;
2707	if pid == PatternEpsilons::PATTERN_ID_LIMIT {
2708	None
2709	} else {
2710	Some(PatternID::new_unchecked(pid.as_usize()))
2711	}
2712	}
2713
2714	/// Returns the pattern ID without checking whether it's valid. If this is
2715	/// called and there is no pattern ID in this `PatternEpsilons`, then this
2716	/// will likely produce an incorrect result or possibly even a panic or
2717	/// an overflow. But safety will not be violated.
2718	///
2719	/// This is useful when you know a particular state is a match state. If
2720	/// it's a match state, then it must have a pattern ID.
2721	fn pattern_id_unchecked(self) -> PatternID {
2722	let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT;
2723	PatternID::new_unchecked(pid.as_usize())
2724	}
2725
2726	/// Return a new pattern epsilons with the given pattern ID, but the same
2727	/// epsilons.
2728	fn set_pattern_id(self, pid: PatternID) -> PatternEpsilons {
2729	PatternEpsilons(
2730	(pid.as_u64() << PatternEpsilons::PATTERN_ID_SHIFT)
2731	\| (self.0 & PatternEpsilons::EPSILONS_MASK),
2732	)
2733	}
2734
2735	/// Return the epsilons part of this pattern epsilons.
2736	fn epsilons(self) -> Epsilons {
2737	Epsilons(self.0 & PatternEpsilons::EPSILONS_MASK)
2738	}
2739
2740	/// Return a new pattern epsilons with the given epsilons, but the same
2741	/// pattern ID.
2742	fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons {
2743	PatternEpsilons(
2744	(self.0 & PatternEpsilons::PATTERN_ID_MASK)
2745	\| (u64::from(epsilons.0) & PatternEpsilons::EPSILONS_MASK),
2746	)
2747	}
2748	}
2749
2750	impl core::fmt::Debug for PatternEpsilons {
2751	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2752	if self.is_empty() {
2753	return write!(f, "N/A");
2754	}
2755	if let Some(pid) = self.pattern_id() {
2756	write!(f, "{}", pid.as_usize())?;
2757	}
2758	if !self.epsilons().is_empty() {
2759	if self.pattern_id().is_some() {
2760	write!(f, "/")?;
2761	}
2762	write!(f, "{:?}", self.epsilons())?;
2763	}
2764	Ok(())
2765	}
2766	}
2767
2768	/// Epsilons represents all of the NFA epsilons transitions that went into a
2769	/// single transition in a single DFA state. In this case, it only represents
2770	/// the epsilon transitions that have some kind of non-consuming side effect:
2771	/// either the transition requires storing the current position of the search
2772	/// into a slot, or the transition is conditional and requires the current
2773	/// position in the input to satisfy an assertion before the transition may be
2774	/// taken.
2775	///
2776	/// This folds the cumulative effect of a group of NFA states (all connected
2777	/// by epsilon transitions) down into a single set of bits. While these bits
2778	/// can represent all possible conditional epsilon transitions, it only permits
2779	/// storing up to a somewhat small number of slots.
2780	///
2781	/// Epsilons is represented as a 42-bit integer. For example, it is packed into
2782	/// the lower 42 bits of a `Transition`. (Where the high 22 bits contains a
2783	/// `StateID` and a special "match wins" property.)
2784	#[derive(Clone, Copy)]
2785	struct Epsilons(u64);
2786
2787	impl Epsilons {
2788	const SLOT_MASK: u64 = `0x000003FF_FFFFFC00`;
2789	const SLOT_SHIFT: u64 = `10`;
2790	const LOOK_MASK: u64 = `0x00000000_000003FF`;
2791
2792	/// Create a new empty epsilons. It has no slots and no assertions that
2793	/// need to be satisfied.
2794	fn empty() -> Epsilons {
2795	Epsilons(`0`)
2796	}
2797
2798	/// Returns true if this epsilons contains no slots and no assertions.
2799	fn is_empty(self) -> bool {
2800	self.0 == `0`
2801	}
2802
2803	/// Returns the slot epsilon transitions.
2804	fn slots(self) -> Slots {
2805	Slots((self.0 >> Epsilons::SLOT_SHIFT).low_u32())
2806	}
2807
2808	/// Set the slot epsilon transitions.
2809	fn set_slots(self, slots: Slots) -> Epsilons {
2810	Epsilons(
2811	(u64::from(slots.0) << Epsilons::SLOT_SHIFT)
2812	\| (self.0 & Epsilons::LOOK_MASK),
2813	)
2814	}
2815
2816	/// Return the set of look-around assertions in these epsilon transitions.
2817	fn looks(self) -> LookSet {
2818	LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u32() }
2819	}
2820
2821	/// Set the look-around assertions on these epsilon transitions.
2822	fn set_looks(self, look_set: LookSet) -> Epsilons {
2823	Epsilons(
2824	(self.0 & Epsilons::SLOT_MASK)
2825	\| (u64::from(look_set.bits) & Epsilons::LOOK_MASK),
2826	)
2827	}
2828	}
2829
2830	impl core::fmt::Debug for Epsilons {
2831	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2832	let mut wrote = `false`;
2833	if !self.slots().is_empty() {
2834	write!(f, "{:?}", self.slots())?;
2835	wrote = `true`;
2836	}
2837	if !self.looks().is_empty() {
2838	if wrote {
2839	write!(f, "/")?;
2840	}
2841	write!(f, "{:?}", self.looks())?;
2842	wrote = `true`;
2843	}
2844	if !wrote {
2845	write!(f, "N/A")?;
2846	}
2847	Ok(())
2848	}
2849	}
2850
2851	/// The set of epsilon transitions indicating that the current position in a
2852	/// search should be saved to a slot.
2853	///
2854	/// This only* represents explicit slots. So for example, the pattern*
2855	/// `[a-z]+([0-9]+)([a-z]+)` has:
2856	///
2857	/// 3 capturing groups, thus 6 slots.*
2858	/// 1 implicit capturing group, thus 2 implicit slots.*
2859	/// 2 explicit capturing groups, thus 4 explicit slots.*
2860	///
2861	/// While implicit slots are represented by epsilon transitions in an NFA, we
2862	/// do not explicitly represent them here. Instead, implicit slots are assumed
2863	/// to be present and handled automatically in the search code. Therefore,
2864	/// that means we only need to represent explicit slots in our epsilon
2865	/// transitions.
2866	///
2867	/// Its representation is a bit set. The bit 'i' is set if and only if there
2868	/// exists an explicit slot at index 'c', where 'c = (#patterns 2) + i'. That*
2869	/// is, the bit 'i' corresponds to the first explicit slot and the first
2870	/// explicit slot appears immediately following the last implicit slot. (If
2871	/// this is confusing, see `GroupInfo` for more details on how slots works.)
2872	///
2873	/// A single `Slots` represents all the active slots in a sub-graph of an NFA,
2874	/// where all the states are connected by epsilon transitions. In effect, when
2875	/// traversing the one-pass DFA during a search, all slots set in a particular
2876	/// transition must be captured by recording the current search position.
2877	///
2878	/// The API of `Slots` requires the caller to handle the explicit slot offset.
2879	/// That is, a `Slots` doesn't know where the explicit slots start for a
2880	/// particular NFA. Thus, if the callers see's the bit 'i' is set, then they
2881	/// need to do the arithmetic above to find 'c', which is the real actual slot
2882	/// index in the corresponding NFA.
2883	#[derive(Clone, Copy)]
2884	struct Slots(u32);
2885
2886	impl Slots {
2887	const LIMIT: usize = `32`;
2888
2889	/// Insert the slot at the given bit index.
2890	fn insert(self, slot: usize) -> Slots {
2891	debug_assert!(slot < Slots::LIMIT);
2892	Slots(self.0 \| (`1` << slot.as_u32()))
2893	}
2894
2895	/// Remove the slot at the given bit index.
2896	fn remove(self, slot: usize) -> Slots {
2897	debug_assert!(slot < Slots::LIMIT);
2898	Slots(self.0 & !(`1` << slot.as_u32()))
2899	}
2900
2901	/// Returns true if and only if this set contains no slots.
2902	fn is_empty(self) -> bool {
2903	self.0 == `0`
2904	}
2905
2906	/// Returns an iterator over all of the set bits in this set.
2907	fn iter(self) -> SlotsIter {
2908	SlotsIter { slots: self }
2909	}
2910
2911	/// For the position `at` in the current haystack, copy it to
2912	/// `caller_explicit_slots` for all slots that are in this set.
2913	///
2914	/// Callers may pass a slice of any length. Slots in this set bigger than
2915	/// the length of the given explicit slots are simply skipped.
2916	///
2917	/// The slice must* correspond only to the explicit slots and the first*
2918	/// element of the slice must always correspond to the first explicit slot
2919	/// in the corresponding NFA.
2920	fn apply(
2921	self,
2922	at: usize,
2923	caller_explicit_slots: &mut [Option<NonMaxUsize>],
2924	) {
2925	if self.is_empty() {
2926	return;
2927	}
2928	let at = NonMaxUsize::new(at);
2929	for slot in self.iter() {
2930	if slot >= caller_explicit_slots.len() {
2931	break;
2932	}
2933	caller_explicit_slots[slot] = at;
2934	}
2935	}
2936	}
2937
2938	impl core::fmt::Debug for Slots {
2939	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2940	write!(f, "S")?;
2941	for slot in self.iter() {
2942	write!(f, "-{:?}", slot)?;
2943	}
2944	Ok(())
2945	}
2946	}
2947
2948	/// An iterator over all of the bits set in a slot set.
2949	///
2950	/// This returns the bit index that is set, so callers may need to offset it
2951	/// to get the actual NFA slot index.
2952	#[derive(Debug)]
2953	struct SlotsIter {
2954	slots: Slots,
2955	}
2956
2957	impl Iterator for SlotsIter {
2958	type Item = usize;
2959
2960	fn next(&mut self) -> Option<usize> {
2961	// Number of zeroes here is always <= u8::MAX, and so fits in a usize.
2962	let slot = self.slots.0.trailing_zeros().as_usize();
2963	if slot >= Slots::LIMIT {
2964	return None;
2965	}
2966	self.slots = self.slots.remove(slot);
2967	Some(slot)
2968	}
2969	}
2970
2971	/// An error that occurred during the construction of a one-pass DFA.
2972	///
2973	/// This error does not provide many introspection capabilities. There are
2974	/// generally only two things you can do with it:
2975	///
2976	/// Obtain a human readable message via its `std::fmt::Display` impl.*
2977	/// Access an underlying* [`thompson::BuildError`] type from its `source`
2978	/// method via the `std::error::Error` trait. This error only occurs when using
2979	/// convenience routines for building a one-pass DFA directly from a pattern
2980	/// string.
2981	///
2982	/// When the `std` feature is enabled, this implements the `std::error::Error`
2983	/// trait.
2984	#[derive(Clone, Debug)]
2985	pub struct BuildError {
2986	kind: BuildErrorKind,
2987	}
2988
2989	/// The kind of error that occurred during the construction of a one-pass DFA.
2990	#[derive(Clone, Debug)]
2991	enum BuildErrorKind {
2992	NFA(crate::nfa::thompson::BuildError),
2993	Word(UnicodeWordBoundaryError),
2994	TooManyStates { limit: u64 },
2995	TooManyPatterns { limit: u64 },
2996	UnsupportedLook { look: Look },
2997	ExceededSizeLimit { limit: usize },
2998	NotOnePass { msg: &'static str },
2999	}
3000
3001	impl BuildError {
3002	fn nfa(err: crate::nfa::thompson::BuildError) -> BuildError {
3003	BuildError { kind: BuildErrorKind::NFA(err) }
3004	}
3005
3006	fn word(err: UnicodeWordBoundaryError) -> BuildError {
3007	BuildError { kind: BuildErrorKind::Word(err) }
3008	}
3009
3010	fn too_many_states(limit: u64) -> BuildError {
3011	BuildError { kind: BuildErrorKind::TooManyStates { limit } }
3012	}
3013
3014	fn too_many_patterns(limit: u64) -> BuildError {
3015	BuildError { kind: BuildErrorKind::TooManyPatterns { limit } }
3016	}
3017
3018	fn unsupported_look(look: Look) -> BuildError {
3019	BuildError { kind: BuildErrorKind::UnsupportedLook { look } }
3020	}
3021
3022	fn exceeded_size_limit(limit: usize) -> BuildError {
3023	BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } }
3024	}
3025
3026	fn not_one_pass(msg: &'static str) -> BuildError {
3027	BuildError { kind: BuildErrorKind::NotOnePass { msg } }
3028	}
3029	}
3030
3031	#[cfg(feature = "std")]
3032	impl std::error::Error for BuildError {
3033	fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
3034	use self::BuildErrorKind::*;
3035
3036	match self.kind {
3037	NFA(ref err) => Some(err),
3038	Word(ref err) => Some(err),
3039	_ => None,
3040	}
3041	}
3042	}
3043
3044	impl core::fmt::Display for BuildError {
3045	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
3046	use self::BuildErrorKind::*;
3047
3048	match self.kind {
3049	NFA(_) => write!(f, "error building NFA"),
3050	Word(_) => write!(f, "NFA contains Unicode word boundary"),
3051	TooManyStates { limit } => write!(
3052	f,
3053	"one-pass DFA exceeded a limit of {:?} for number of states",
3054	limit,
3055	),
3056	TooManyPatterns { limit } => write!(
3057	f,
3058	"one-pass DFA exceeded a limit of {:?} for number of patterns",
3059	limit,
3060	),
3061	UnsupportedLook { look } => write!(
3062	f,
3063	"one-pass DFA does not support the {:?} assertion",
3064	look,
3065	),
3066	ExceededSizeLimit { limit } => write!(
3067	f,
3068	"one-pass DFA exceeded size limit of {:?} during building",
3069	limit,
3070	),
3071	NotOnePass { msg } => write!(
3072	f,
3073	"one-pass DFA could not be built because \
3074	pattern is not one-pass: {}",
3075	msg,
3076	),
3077	}
3078	}
3079	}
3080
3081	#[cfg(all(test, feature = "syntax"))]
3082	mod tests {
3083	use alloc::string::ToString;
3084
3085	use super::*;
3086
3087	#[test]
3088	fn fail_conflicting_transition() {
3089	let predicate = \|err: &str\| err.contains("conflicting transition");
3090
3091	let err = DFA::new(r"a*[ab]").unwrap_err().to_string();
3092	assert!(predicate(&err), "{}", err);
3093	}
3094
3095	#[test]
3096	fn fail_multiple_epsilon() {
3097	let predicate = \|err: &str\| {
3098	err.contains("multiple epsilon transitions to same state")
3099	};
3100
3101	let err = DFA::new(r"(^\|$)a").unwrap_err().to_string();
3102	assert!(predicate(&err), "{}", err);
3103	}
3104
3105	#[test]
3106	fn fail_multiple_match() {
3107	let predicate = \|err: &str\| {
3108	err.contains("multiple epsilon transitions to match state")
3109	};
3110
3111	let err = DFA::new_many(&[r"^", r"$"]).unwrap_err().to_string();
3112	assert!(predicate(&err), "{}", err);
3113	}
3114
3115	// This test is meant to build a one-pass regex with the maximum number of
3116	// possible slots.
3117	//
3118	// NOTE: Remember that the slot limit only applies to explicit capturing
3119	// groups. Any number of implicit capturing groups is supported (up to the
3120	// maximum number of supported patterns), since implicit groups are handled
3121	// by the search loop itself.
3122	#[test]
3123	fn max_slots() {
3124	// One too many...
3125	let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)(q)";
3126	assert!(DFA::new(pat).is_err());
3127	// Just right.
3128	let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)";
3129	assert!(DFA::new(pat).is_ok());
3130	}
3131
3132	// This test ensures that the one-pass DFA works with all look-around
3133	// assertions that we expect it to work with.
3134	//
3135	// The utility of this test is that each one-pass transition has a small
3136	// amount of space to store look-around assertions. Currently, there is
3137	// logic in the one-pass constructor to ensure there aren't more than ten
3138	// possible assertions. And indeed, there are only ten possible assertions
3139	// (at time of writing), so this is okay. But conceivably, more assertions
3140	// could be added. So we check that things at least work with what we
3141	// expect them to work with.
3142	#[test]
3143	fn assertions() {
3144	// haystack anchors
3145	assert!(DFA::new(r"^").is_ok());
3146	assert!(DFA::new(r"$").is_ok());
3147
3148	// line anchors
3149	assert!(DFA::new(r"(?m)^").is_ok());
3150	assert!(DFA::new(r"(?m)$").is_ok());
3151	assert!(DFA::new(r"(?Rm)^").is_ok());
3152	assert!(DFA::new(r"(?Rm)$").is_ok());
3153
3154	// word boundaries
3155	if cfg!(feature = "unicode-word-boundary") {
3156	assert!(DFA::new(r"\b").is_ok());
3157	assert!(DFA::new(r"\B").is_ok());
3158	}
3159	assert!(DFA::new(r"(?-u)\b").is_ok());
3160	assert!(DFA::new(r"(?-u)\B").is_ok());
3161	}
3162
3163	#[cfg(not(miri))] // takes too long on miri
3164	#[test]
3165	fn is_one_pass() {
3166	use crate::util::syntax;
3167
3168	assert!(DFA::new(r"a*b").is_ok());
3169	if cfg!(feature = "unicode-perl") {
3170	assert!(DFA::new(r"\w").is_ok());
3171	}
3172	assert!(DFA::new(r"(?-u)\w*\s").is_ok());
3173	assert!(DFA::new(r"(?s:.)*?").is_ok());
3174	assert!(DFA::builder()
3175	.syntax(syntax::Config::new().utf8(`false`))
3176	.build(r"(?s-u:.)*?")
3177	.is_ok());
3178	}
3179
3180	#[test]
3181	fn is_not_one_pass() {
3182	assert!(DFA::new(r"a*a").is_err());
3183	assert!(DFA::new(r"(?s-u:.)*?").is_err());
3184	assert!(DFA::new(r"(?s:.)*?a").is_err());
3185	}
3186
3187	#[cfg(not(miri))]
3188	#[test]
3189	fn is_not_one_pass_bigger() {
3190	assert!(DFA::new(r"\w*\s").is_err());
3191	}
3192	}
3193

Provided by KDAB

Definitions