nfa.rs - Codebrowser

1	use core::{fmt, mem};
2
3	use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec};
4
5	#[cfg(feature = "syntax")]
6	use crate::nfa::thompson::{
7	compiler::{Compiler, Config},
8	error::BuildError,
9	};
10	use crate::{
11	nfa::thompson::builder::Builder,
12	util::{
13	alphabet::{self, ByteClassSet, ByteClasses},
14	captures::{GroupInfo, GroupInfoError},
15	look::{Look, LookMatcher, LookSet},
16	primitives::{
17	IteratorIndexExt, PatternID, PatternIDIter, SmallIndex, StateID,
18	},
19	sparse_set::SparseSet,
20	},
21	};
22
23	/// A byte oriented Thompson non-deterministic finite automaton (NFA).
24	///
25	/// A Thompson NFA is a finite state machine that permits unconditional epsilon
26	/// transitions, but guarantees that there exists at most one non-epsilon
27	/// transition for each element in the alphabet for each state.
28	///
29	/// An NFA may be used directly for searching, for analysis or to build
30	/// a deterministic finite automaton (DFA).
31	///
32	/// # Cheap clones
33	///
34	/// Since an NFA is a core data type in this crate that many other regex
35	/// engines are based on top of, it is convenient to give ownership of an NFA
36	/// to said regex engines. Because of this, an NFA uses reference counting
37	/// internally. Therefore, it is cheap to clone and it is encouraged to do so.
38	///
39	/// # Capabilities
40	///
41	/// Using an NFA for searching via the
42	/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) provides the most amount
43	/// of "power" of any regex engine in this crate. Namely, it supports the
44	/// following in all cases:
45	///
46	/// 1. Detection of a match.
47	/// 2. Location of a match, including both the start and end offset, in a
48	/// single pass of the haystack.
49	/// 3. Location of matching capturing groups.
50	/// 4. Handles multiple patterns, including (1)-(3) when multiple patterns are
51	/// present.
52	///
53	/// # Capturing Groups
54	///
55	/// Groups refer to parenthesized expressions inside a regex pattern. They look
56	/// like this, where `exp` is an arbitrary regex:
57	///
58	/// `(exp)` - An unnamed capturing group.*
59	/// `(?P<name>exp)` or `(?<name>exp)` - A named capturing group.*
60	/// `(?:exp)` - A non-capturing group.*
61	/// `(?i:exp)` - A non-capturing group that sets flags.*
62	///
63	/// Only the first two forms are said to be _capturing_. Capturing
64	/// means that the last position at which they match is reportable. The
65	/// [`Captures`](crate::util::captures::Captures) type provides convenient
66	/// access to the match positions of capturing groups, which includes looking
67	/// up capturing groups by their name.
68	///
69	/// # Byte oriented
70	///
71	/// This NFA is byte oriented, which means that all of its transitions are
72	/// defined on bytes. In other words, the alphabet of an NFA consists of the
73	/// 256 different byte values.
74	///
75	/// While DFAs nearly demand that they be byte oriented for performance
76	/// reasons, an NFA could conceivably be Unicode codepoint* oriented. Indeed,*
77	/// a previous version of this NFA supported both byte and codepoint oriented
78	/// modes. A codepoint oriented mode can work because an NFA fundamentally uses
79	/// a sparse representation of transitions, which works well with the large
80	/// sparse space of Unicode codepoints.
81	///
82	/// Nevertheless, this NFA is only byte oriented. This choice is primarily
83	/// driven by implementation simplicity, and also in part memory usage. In
84	/// practice, performance between the two is roughly comparable. However,
85	/// building a DFA (including a hybrid DFA) really wants a byte oriented NFA.
86	/// So if we do have a codepoint oriented NFA, then we also need to generate
87	/// byte oriented NFA in order to build an hybrid NFA/DFA. Thus, by only
88	/// generating byte oriented NFAs, we can produce one less NFA. In other words,
89	/// if we made our NFA codepoint oriented, we'd need to also* make it support*
90	/// a byte oriented mode, which is more complicated. But a byte oriented mode
91	/// can support everything.
92	///
93	/// # Differences with DFAs
94	///
95	/// At the theoretical level, the precise difference between an NFA and a DFA
96	/// is that, in a DFA, for every state, an input symbol unambiguously refers
97	/// to a single transition _and_ that an input symbol is required for each
98	/// transition. At a practical level, this permits DFA implementations to be
99	/// implemented at their core with a small constant number of CPU instructions
100	/// for each byte of input searched. In practice, this makes them quite a bit
101	/// faster than NFAs _in general_. Namely, in order to execute a search for any
102	/// Thompson NFA, one needs to keep track of a _set_ of states, and execute
103	/// the possible transitions on all of those states for each input symbol.
104	/// Overall, this results in much more overhead. To a first approximation, one
105	/// can expect DFA searches to be about an order of magnitude faster.
106	///
107	/// So why use an NFA at all? The main advantage of an NFA is that it takes
108	/// linear time (in the size of the pattern string after repetitions have been
109	/// expanded) to build and linear memory usage. A DFA, on the other hand, may
110	/// take exponential time and/or space to build. Even in non-pathological
111	/// cases, DFAs often take quite a bit more memory than their NFA counterparts,
112	/// _especially_ if large Unicode character classes are involved. Of course,
113	/// an NFA also provides additional capabilities. For example, it can match
114	/// Unicode word boundaries on non-ASCII text and resolve the positions of
115	/// capturing groups.
116	///
117	/// Note that a [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) strikes a
118	/// good balance between an NFA and a DFA. It avoids the exponential build time
119	/// of a DFA while maintaining its fast search time. The downside of a hybrid
120	/// NFA/DFA is that in some cases it can be slower at search time than the NFA.
121	/// (It also has less functionality than a pure NFA. It cannot handle Unicode
122	/// word boundaries on non-ASCII text and cannot resolve capturing groups.)
123	///
124	/// # Example
125	///
126	/// This shows how to build an NFA with the default configuration and execute a
127	/// search using the Pike VM.
128	///
129	/// ```
130	/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
131	///
132	/// let re = PikeVM::new(r"foo[0-9]+")?;
133	/// let mut cache = re.create_cache();
134	/// let mut caps = re.create_captures();
135	///
136	/// let expected = Some(Match::must(`0`, `0`..`8`));
137	/// re.captures(&mut cache, b"foo12345", &mut caps);
138	/// assert_eq!(expected, caps.get_match());
139	///
140	/// # Ok::<(), Box<dyn std::error::Error>>(())
141	/// ```
142	///
143	/// # Example: resolving capturing groups
144	///
145	/// This example shows how to parse some simple dates and extract the
146	/// components of each date via capturing groups.
147	///
148	/// ```
149	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
150	/// use regex_automata::{
151	/// nfa::thompson::pikevm::PikeVM,
152	/// util::captures::Captures,
153	/// };
154	///
155	/// let vm = PikeVM::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})")?;
156	/// let mut cache = vm.create_cache();
157	///
158	/// let haystack = "2012-03-14, 2013-01-01 and 2014-07-05";
159	/// let all: Vec<Captures> = vm.captures_iter(
160	/// &mut cache, haystack.as_bytes()
161	/// ).collect();
162	/// // There should be a total of 3 matches.
163	/// assert_eq!(`3`, all.len());
164	/// // The year from the second match is '2013'.
165	/// let span = all[`1`].get_group_by_name("y").unwrap();
166	/// assert_eq!("2013", &haystack[span]);
167	///
168	/// # Ok::<(), Box<dyn std::error::Error>>(())
169	/// ```
170	///
171	/// This example shows that only the last match of a capturing group is
172	/// reported, even if it had to match multiple times for an overall match
173	/// to occur.
174	///
175	/// ```
176	/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
177	///
178	/// let re = PikeVM::new(r"([a-z]){4}")?;
179	/// let mut cache = re.create_cache();
180	/// let mut caps = re.create_captures();
181	///
182	/// let haystack = b"quux";
183	/// re.captures(&mut cache, haystack, &mut caps);
184	/// assert!(caps.is_match());
185	/// assert_eq!(Some(Span::from(`3`..`4`)), caps.get_group(`1`));
186	///
187	/// # Ok::<(), Box<dyn std::error::Error>>(())
188	/// ```
189	#[derive(Clone)]
190	pub struct NFA(
191	// We make NFAs reference counted primarily for two reasons. First is that
192	// the NFA type itself is quite large (at least 0.5KB), and so it makes
193	// sense to put it on the heap by default anyway. Second is that, for Arc
194	// specifically, this enables cheap clones. This tends to be useful because
195	// several structures (the backtracker, the Pike VM, the hybrid NFA/DFA)
196	// all want to hang on to an NFA for use during search time. We could
197	// provide the NFA at search time via a function argument, but this makes
198	// for an unnecessarily annoying API. Instead, we just let each structure
199	// share ownership of the NFA. Using a deep clone would not be smart, since
200	// the NFA can use quite a bit of heap space.
201	Arc<Inner>,
202	);
203
204	impl NFA {
205	/// Parse the given regular expression using a default configuration and
206	/// build an NFA from it.
207	///
208	/// If you want a non-default configuration, then use the NFA
209	/// [`Compiler`] with a [`Config`].
210	///
211	/// # Example
212	///
213	/// ```
214	/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
215	///
216	/// let re = PikeVM::new(r"foo[0-9]+")?;
217	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
218	///
219	/// let expected = Some(Match::must(`0`, `0`..`8`));
220	/// re.captures(&mut cache, b"foo12345", &mut caps);
221	/// assert_eq!(expected, caps.get_match());
222	///
223	/// # Ok::<(), Box<dyn std::error::Error>>(())
224	/// ```
225	#[cfg(feature = "syntax")]
226	pub fn new(pattern: &str) -> Result<NFA, BuildError> {
227	NFA::compiler().build(pattern)
228	}
229
230	/// Parse the given regular expressions using a default configuration and
231	/// build a multi-NFA from them.
232	///
233	/// If you want a non-default configuration, then use the NFA
234	/// [`Compiler`] with a [`Config`].
235	///
236	/// # Example
237	///
238	/// ```
239	/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
240	///
241	/// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+"])?;
242	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
243	///
244	/// let expected = Some(Match::must(`1`, `0`..`3`));
245	/// re.captures(&mut cache, b"foo12345bar", &mut caps);
246	/// assert_eq!(expected, caps.get_match());
247	///
248	/// # Ok::<(), Box<dyn std::error::Error>>(())
249	/// ```
250	#[cfg(feature = "syntax")]
251	pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<NFA, BuildError> {
252	NFA::compiler().build_many(patterns)
253	}
254
255	/// Returns an NFA with a single regex pattern that always matches at every
256	/// position.
257	///
258	/// # Example
259	///
260	/// ```
261	/// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
262	///
263	/// let re = PikeVM::new_from_nfa(NFA::always_match())?;
264	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
265	///
266	/// let expected = Some(Match::must(`0`, `0`..`0`));
267	/// re.captures(&mut cache, b"", &mut caps);
268	/// assert_eq!(expected, caps.get_match());
269	/// re.captures(&mut cache, b"foo", &mut caps);
270	/// assert_eq!(expected, caps.get_match());
271	///
272	/// # Ok::<(), Box<dyn std::error::Error>>(())
273	/// ```
274	pub fn always_match() -> NFA {
275	// We could use NFA::new("") here and we'd get the same semantics, but
276	// hand-assembling the NFA (as below) does the same thing with a fewer
277	// number of states. It also avoids needing the 'syntax' feature
278	// enabled.
279	//
280	// Technically all we need is the "match" state, but we add the
281	// "capture" states so that the PikeVM can use this NFA.
282	//
283	// The unwraps below are OK because we add so few states that they will
284	// never exhaust any default limits in any environment.
285	let mut builder = Builder::new();
286	let pid = builder.start_pattern().unwrap();
287	assert_eq!(pid.as_usize(), `0`);
288	let start_id =
289	builder.add_capture_start(StateID::ZERO, `0`, None).unwrap();
290	let end_id = builder.add_capture_end(StateID::ZERO, `0`).unwrap();
291	let match_id = builder.add_match().unwrap();
292	builder.patch(start_id, end_id).unwrap();
293	builder.patch(end_id, match_id).unwrap();
294	let pid = builder.finish_pattern(start_id).unwrap();
295	assert_eq!(pid.as_usize(), `0`);
296	builder.build(start_id, start_id).unwrap()
297	}
298
299	/// Returns an NFA that never matches at any position.
300	///
301	/// This is a convenience routine for creating an NFA with zero patterns.
302	///
303	/// # Example
304	///
305	/// ```
306	/// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM};
307	///
308	/// let re = PikeVM::new_from_nfa(NFA::never_match())?;
309	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
310	///
311	/// re.captures(&mut cache, b"", &mut caps);
312	/// assert!(!caps.is_match());
313	/// re.captures(&mut cache, b"foo", &mut caps);
314	/// assert!(!caps.is_match());
315	///
316	/// # Ok::<(), Box<dyn std::error::Error>>(())
317	/// ```
318	pub fn never_match() -> NFA {
319	// This always succeeds because it only requires one NFA state, which
320	// will never exhaust any (default) limits.
321	let mut builder = Builder::new();
322	let sid = builder.add_fail().unwrap();
323	builder.build(sid, sid).unwrap()
324	}
325
326	/// Return a default configuration for an `NFA`.
327	///
328	/// This is a convenience routine to avoid needing to import the `Config`
329	/// type when customizing the construction of an NFA.
330	///
331	/// # Example
332	///
333	/// This example shows how to build an NFA with a small size limit that
334	/// results in a compilation error for any regex that tries to use more
335	/// heap memory than the configured limit.
336	///
337	/// ```
338	/// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM};
339	///
340	/// let result = PikeVM::builder()
341	/// .thompson(NFA::config().nfa_size_limit(Some(`1_000`)))
342	/// // Remember, \w is Unicode-aware by default and thus huge.
343	/// .build(r"\w+");
344	/// assert!(result.is_err());
345	/// ```
346	#[cfg(feature = "syntax")]
347	pub fn config() -> Config {
348	Config::new()
349	}
350
351	/// Return a compiler for configuring the construction of an `NFA`.
352	///
353	/// This is a convenience routine to avoid needing to import the
354	/// [`Compiler`] type in common cases.
355	///
356	/// # Example
357	///
358	/// This example shows how to build an NFA that is permitted match invalid
359	/// UTF-8. Without the additional syntax configuration here, compilation of
360	/// `(?-u:.)` would fail because it is permitted to match invalid UTF-8.
361	///
362	/// ```
363	/// use regex_automata::{
364	/// nfa::thompson::pikevm::PikeVM,
365	/// util::syntax,
366	/// Match,
367	/// };
368	///
369	/// let re = PikeVM::builder()
370	/// .syntax(syntax::Config::new().utf8(`false`))
371	/// .build(r"[a-z]+(?-u:.)")?;
372	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
373	///
374	/// let expected = Some(Match::must(`0`, `1`..`5`));
375	/// re.captures(&mut cache, b"`\xFF`abc`\xFF`", &mut caps);
376	/// assert_eq!(expected, caps.get_match());
377	///
378	/// # Ok::<(), Box<dyn std::error::Error>>(())
379	/// ```
380	#[cfg(feature = "syntax")]
381	pub fn compiler() -> Compiler {
382	Compiler::new()
383	}
384
385	/// Returns an iterator over all pattern identifiers in this NFA.
386	///
387	/// Pattern IDs are allocated in sequential order starting from zero,
388	/// where the order corresponds to the order of patterns provided to the
389	/// [`NFA::new_many`] constructor.
390	///
391	/// # Example
392	///
393	/// ```
394	/// use regex_automata::{nfa::thompson::NFA, PatternID};
395	///
396	/// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
397	/// let pids: Vec<PatternID> = nfa.patterns().collect();
398	/// assert_eq!(pids, vec![
399	/// PatternID::must(`0`),
400	/// PatternID::must(`1`),
401	/// PatternID::must(`2`),
402	/// ]);
403	///
404	/// # Ok::<(), Box<dyn std::error::Error>>(())
405	/// ```
406	pub fn patterns(&self) -> PatternIter<'_> {
407	PatternIter {
408	it: PatternID::iter(self.pattern_len()),
409	_marker: core::marker::PhantomData,
410	}
411	}
412
413	/// Returns the total number of regex patterns in this NFA.
414	///
415	/// This may return zero if the NFA was constructed with no patterns. In
416	/// this case, the NFA can never produce a match for any input.
417	///
418	/// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because
419	/// NFA construction will fail if too many patterns are added.
420	///
421	/// It is always true that `nfa.patterns().count() == nfa.pattern_len()`.
422	///
423	/// # Example
424	///
425	/// ```
426	/// use regex_automata::nfa::thompson::NFA;
427	///
428	/// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
429	/// assert_eq!(`3`, nfa.pattern_len());
430	///
431	/// let nfa = NFA::never_match();
432	/// assert_eq!(`0`, nfa.pattern_len());
433	///
434	/// let nfa = NFA::always_match();
435	/// assert_eq!(`1`, nfa.pattern_len());
436	///
437	/// # Ok::<(), Box<dyn std::error::Error>>(())
438	/// ```
439	#[inline]
440	pub fn pattern_len(&self) -> usize {
441	self.0.start_pattern.len()
442	}
443
444	/// Return the state identifier of the initial anchored state of this NFA.
445	///
446	/// The returned identifier is guaranteed to be a valid index into the
447	/// slice returned by [`NFA::states`], and is also a valid argument to
448	/// [`NFA::state`].
449	///
450	/// # Example
451	///
452	/// This example shows a somewhat contrived example where we can easily
453	/// predict the anchored starting state.
454	///
455	/// ```
456	/// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures};
457	///
458	/// let nfa = NFA::compiler()
459	/// .configure(NFA::config().which_captures(WhichCaptures::None))
460	/// .build("a")?;
461	/// let state = nfa.state(nfa.start_anchored());
462	/// match *state {
463	/// State::ByteRange { trans } => {
464	/// assert_eq!(b'a', trans.start);
465	/// assert_eq!(b'a', trans.end);
466	/// }
467	/// _ => unreachable!("unexpected state"),
468	/// }
469	///
470	/// # Ok::<(), Box<dyn std::error::Error>>(())
471	/// ```
472	#[inline]
473	pub fn start_anchored(&self) -> StateID {
474	self.0.start_anchored
475	}
476
477	/// Return the state identifier of the initial unanchored state of this
478	/// NFA.
479	///
480	/// This is equivalent to the identifier returned by
481	/// [`NFA::start_anchored`] when the NFA has no unanchored starting state.
482	///
483	/// The returned identifier is guaranteed to be a valid index into the
484	/// slice returned by [`NFA::states`], and is also a valid argument to
485	/// [`NFA::state`].
486	///
487	/// # Example
488	///
489	/// This example shows that the anchored and unanchored starting states
490	/// are equivalent when an anchored NFA is built.
491	///
492	/// ```
493	/// use regex_automata::nfa::thompson::NFA;
494	///
495	/// let nfa = NFA::new("^a")?;
496	/// assert_eq!(nfa.start_anchored(), nfa.start_unanchored());
497	///
498	/// # Ok::<(), Box<dyn std::error::Error>>(())
499	/// ```
500	#[inline]
501	pub fn start_unanchored(&self) -> StateID {
502	self.0.start_unanchored
503	}
504
505	/// Return the state identifier of the initial anchored state for the given
506	/// pattern, or `None` if there is no pattern corresponding to the given
507	/// identifier.
508	///
509	/// If one uses the starting state for a particular pattern, then the only
510	/// match that can be returned is for the corresponding pattern.
511	///
512	/// The returned identifier is guaranteed to be a valid index into the
513	/// slice returned by [`NFA::states`], and is also a valid argument to
514	/// [`NFA::state`].
515	///
516	/// # Errors
517	///
518	/// If the pattern doesn't exist in this NFA, then this returns an error.
519	/// This occurs when `pid.as_usize() >= nfa.pattern_len()`.
520	///
521	/// # Example
522	///
523	/// This example shows that the anchored and unanchored starting states
524	/// are equivalent when an anchored NFA is built.
525	///
526	/// ```
527	/// use regex_automata::{nfa::thompson::NFA, PatternID};
528	///
529	/// let nfa = NFA::new_many(&["^a", "^b"])?;
530	/// // The anchored and unanchored states for the entire NFA are the same,
531	/// // since all of the patterns are anchored.
532	/// assert_eq!(nfa.start_anchored(), nfa.start_unanchored());
533	/// // But the anchored starting states for each pattern are distinct,
534	/// // because these starting states can only lead to matches for the
535	/// // corresponding pattern.
536	/// let anchored = Some(nfa.start_anchored());
537	/// assert_ne!(anchored, nfa.start_pattern(PatternID::must(`0`)));
538	/// assert_ne!(anchored, nfa.start_pattern(PatternID::must(`1`)));
539	/// // Requesting a pattern not in the NFA will result in None:
540	/// assert_eq!(None, nfa.start_pattern(PatternID::must(`2`)));
541	///
542	/// # Ok::<(), Box<dyn std::error::Error>>(())
543	/// ```
544	#[inline]
545	pub fn start_pattern(&self, pid: PatternID) -> Option<StateID> {
546	self.0.start_pattern.get(pid.as_usize()).copied()
547	}
548
549	/// Get the byte class set for this NFA.
550	///
551	/// A byte class set is a partitioning of this NFA's alphabet into
552	/// equivalence classes. Any two bytes in the same equivalence class are
553	/// guaranteed to never discriminate between a match or a non-match. (The
554	/// partitioning may not be minimal.)
555	///
556	/// Byte classes are used internally by this crate when building DFAs.
557	/// Namely, among other optimizations, they enable a space optimization
558	/// where the DFA's internal alphabet is defined over the equivalence
559	/// classes of bytes instead of all possible byte values. The former is
560	/// often quite a bit smaller than the latter, which permits the DFA to use
561	/// less space for its transition table.
562	#[inline]
563	pub(crate) fn byte_class_set(&self) -> &ByteClassSet {
564	&self.0.byte_class_set
565	}
566
567	/// Get the byte classes for this NFA.
568	///
569	/// Byte classes represent a partitioning of this NFA's alphabet into
570	/// equivalence classes. Any two bytes in the same equivalence class are
571	/// guaranteed to never discriminate between a match or a non-match. (The
572	/// partitioning may not be minimal.)
573	///
574	/// Byte classes are used internally by this crate when building DFAs.
575	/// Namely, among other optimizations, they enable a space optimization
576	/// where the DFA's internal alphabet is defined over the equivalence
577	/// classes of bytes instead of all possible byte values. The former is
578	/// often quite a bit smaller than the latter, which permits the DFA to use
579	/// less space for its transition table.
580	///
581	/// # Example
582	///
583	/// This example shows how to query the class of various bytes.
584	///
585	/// ```
586	/// use regex_automata::nfa::thompson::NFA;
587	///
588	/// let nfa = NFA::new("[a-z]+")?;
589	/// let classes = nfa.byte_classes();
590	/// // 'a' and 'z' are in the same class for this regex.
591	/// assert_eq!(classes.get(b'a'), classes.get(b'z'));
592	/// // But 'a' and 'A' are not.
593	/// assert_ne!(classes.get(b'a'), classes.get(b'A'));
594	///
595	/// # Ok::<(), Box<dyn std::error::Error>>(())
596	/// ```
597	#[inline]
598	pub fn byte_classes(&self) -> &ByteClasses {
599	&self.0.byte_classes
600	}
601
602	/// Return a reference to the NFA state corresponding to the given ID.
603	///
604	/// This is a convenience routine for `nfa.states()[id]`.
605	///
606	/// # Panics
607	///
608	/// This panics when the given identifier does not reference a valid state.
609	/// That is, when `id.as_usize() >= nfa.states().len()`.
610	///
611	/// # Example
612	///
613	/// The anchored state for a pattern will typically correspond to a
614	/// capturing state for that pattern. (Although, this is not an API
615	/// guarantee!)
616	///
617	/// ```
618	/// use regex_automata::{nfa::thompson::{NFA, State}, PatternID};
619	///
620	/// let nfa = NFA::new("a")?;
621	/// let state = nfa.state(nfa.start_pattern(PatternID::ZERO).unwrap());
622	/// match *state {
623	/// State::Capture { slot, .. } => {
624	/// assert_eq!(`0`, slot.as_usize());
625	/// }
626	/// _ => unreachable!("unexpected state"),
627	/// }
628	///
629	/// # Ok::<(), Box<dyn std::error::Error>>(())
630	/// ```
631	#[inline]
632	pub fn state(&self, id: StateID) -> &State {
633	&self.states()[id]
634	}
635
636	/// Returns a slice of all states in this NFA.
637	///
638	/// The slice returned is indexed by `StateID`. This provides a convenient
639	/// way to access states while following transitions among those states.
640	///
641	/// # Example
642	///
643	/// This demonstrates that disabling UTF-8 mode can shrink the size of the
644	/// NFA considerably in some cases, especially when using Unicode character
645	/// classes.
646	///
647	/// ```
648	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
649	/// use regex_automata::nfa::thompson::NFA;
650	///
651	/// let nfa_unicode = NFA::new(r"\w")?;
652	/// let nfa_ascii = NFA::new(r"(?-u)\w")?;
653	/// // Yes, a factor of 45 difference. No lie.
654	/// assert!(`40` * nfa_ascii.states().len() < nfa_unicode.states().len());
655	///
656	/// # Ok::<(), Box<dyn std::error::Error>>(())
657	/// ```
658	#[inline]
659	pub fn states(&self) -> &[State] {
660	&self.0.states
661	}
662
663	/// Returns the capturing group info for this NFA.
664	///
665	/// The [`GroupInfo`] provides a way to map to and from capture index
666	/// and capture name for each pattern. It also provides a mapping from
667	/// each of the capturing groups in every pattern to their corresponding
668	/// slot offsets encoded in [`State::Capture`] states.
669	///
670	/// Note that `GroupInfo` uses reference counting internally, such that
671	/// cloning a `GroupInfo` is very cheap.
672	///
673	/// # Example
674	///
675	/// This example shows how to get a list of all capture group names for
676	/// a particular pattern.
677	///
678	/// ```
679	/// use regex_automata::{nfa::thompson::NFA, PatternID};
680	///
681	/// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?;
682	/// // The first is the implicit group that is always unnammed. The next
683	/// // 5 groups are the explicit groups found in the concrete syntax above.
684	/// let expected = vec![None, None, Some("foo"), None, None, Some("bar")];
685	/// let got: Vec<Option<&str>> =
686	/// nfa.group_info().pattern_names(PatternID::ZERO).collect();
687	/// assert_eq!(expected, got);
688	///
689	/// // Using an invalid pattern ID will result in nothing yielded.
690	/// let got = nfa.group_info().pattern_names(PatternID::must(`999`)).count();
691	/// assert_eq!(`0`, got);
692	///
693	/// # Ok::<(), Box<dyn std::error::Error>>(())
694	/// ```
695	#[inline]
696	pub fn group_info(&self) -> &GroupInfo {
697	&self.0.group_info()
698	}
699
700	/// Returns true if and only if this NFA has at least one
701	/// [`Capture`](State::Capture) in its sequence of states.
702	///
703	/// This is useful as a way to perform a quick test before attempting
704	/// something that does or does not require capture states. For example,
705	/// some regex engines (like the PikeVM) require capture states in order to
706	/// work at all.
707	///
708	/// # Example
709	///
710	/// This example shows a few different NFAs and whether they have captures
711	/// or not.
712	///
713	/// ```
714	/// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
715	///
716	/// // Obviously has capture states.
717	/// let nfa = NFA::new("(a)")?;
718	/// assert!(nfa.has_capture());
719	///
720	/// // Less obviously has capture states, because every pattern has at
721	/// // least one anonymous capture group corresponding to the match for the
722	/// // entire pattern.
723	/// let nfa = NFA::new("a")?;
724	/// assert!(nfa.has_capture());
725	///
726	/// // Other than hand building your own NFA, this is the only way to build
727	/// // an NFA without capturing groups. In general, you should only do this
728	/// // if you don't intend to use any of the NFA-oriented regex engines.
729	/// // Overall, capturing groups don't have many downsides. Although they
730	/// // can add a bit of noise to simple NFAs, so it can be nice to disable
731	/// // them for debugging purposes.
732	/// //
733	/// // Notice that 'has_capture' is false here even when we have an
734	/// // explicit capture group in the pattern.
735	/// let nfa = NFA::compiler()
736	/// .configure(NFA::config().which_captures(WhichCaptures::None))
737	/// .build("(a)")?;
738	/// assert!(!nfa.has_capture());
739	///
740	/// # Ok::<(), Box<dyn std::error::Error>>(())
741	/// ```
742	#[inline]
743	pub fn has_capture(&self) -> bool {
744	self.0.has_capture
745	}
746
747	/// Returns true if and only if this NFA can match the empty string.
748	/// When it returns false, all possible matches are guaranteed to have a
749	/// non-zero length.
750	///
751	/// This is useful as cheap way to know whether code needs to handle the
752	/// case of a zero length match. This is particularly important when UTF-8
753	/// modes are enabled, as when UTF-8 mode is enabled, empty matches that
754	/// split a codepoint must never be reported. This extra handling can
755	/// sometimes be costly, and since regexes matching an empty string are
756	/// somewhat rare, it can be beneficial to treat such regexes specially.
757	///
758	/// # Example
759	///
760	/// This example shows a few different NFAs and whether they match the
761	/// empty string or not. Notice the empty string isn't merely a matter
762	/// of a string of length literally `0`, but rather, whether a match can
763	/// occur between specific pairs of bytes.
764	///
765	/// ```
766	/// use regex_automata::{nfa::thompson::NFA, util::syntax};
767	///
768	/// // The empty regex matches the empty string.
769	/// let nfa = NFA::new("")?;
770	/// assert!(nfa.has_empty(), "empty matches empty");
771	/// // The '+' repetition operator requires at least one match, and so
772	/// // does not match the empty string.
773	/// let nfa = NFA::new("a+")?;
774	/// assert!(!nfa.has_empty(), "+ does not match empty");
775	/// // But the '' repetition operator does.*
776	/// let nfa = NFA::new("a*")?;
777	/// assert!(nfa.has_empty(), "* does match empty");
778	/// // And wrapping '+' in an operator that can match an empty string also
779	/// // causes it to match the empty string too.
780	/// let nfa = NFA::new("(a+)*")?;
781	/// assert!(nfa.has_empty(), "+ inside of * matches empty");
782	///
783	/// // If a regex is just made of a look-around assertion, even if the
784	/// // assertion requires some kind of non-empty string around it (such as
785	/// // \b), then it is still treated as if it matches the empty string.
786	/// // Namely, if a match occurs of just a look-around assertion, then the
787	/// // match returned is empty.
788	/// let nfa = NFA::compiler()
789	/// .syntax(syntax::Config::new().utf8(`false`))
790	/// .build(r"^$\A\z\b\B(?-u:\b\B)")?;
791	/// assert!(nfa.has_empty(), "assertions match empty");
792	/// // Even when an assertion is wrapped in a '+', it still matches the
793	/// // empty string.
794	/// let nfa = NFA::new(r"\b+")?;
795	/// assert!(nfa.has_empty(), "+ of an assertion matches empty");
796	///
797	/// // An alternation with even one branch that can match the empty string
798	/// // is also said to match the empty string overall.
799	/// let nfa = NFA::new("foo\|(bar)?\|quux")?;
800	/// assert!(nfa.has_empty(), "alternations can match empty");
801	///
802	/// // An NFA that matches nothing does not match the empty string.
803	/// let nfa = NFA::new("[a&&b]")?;
804	/// assert!(!nfa.has_empty(), "never matching means not matching empty");
805	/// // But if it's wrapped in something that doesn't require a match at
806	/// // all, then it can match the empty string!
807	/// let nfa = NFA::new("[a&&b]*")?;
808	/// assert!(nfa.has_empty(), "* on never-match still matches empty");
809	/// // Since a '+' requires a match, using it on something that can never
810	/// // match will itself produce a regex that can never match anything,
811	/// // and thus does not match the empty string.
812	/// let nfa = NFA::new("[a&&b]+")?;
813	/// assert!(!nfa.has_empty(), "+ on never-match still matches nothing");
814	///
815	/// # Ok::<(), Box<dyn std::error::Error>>(())
816	/// ```
817	#[inline]
818	pub fn has_empty(&self) -> bool {
819	self.0.has_empty
820	}
821
822	/// Whether UTF-8 mode is enabled for this NFA or not.
823	///
824	/// When UTF-8 mode is enabled, all matches reported by a regex engine
825	/// derived from this NFA are guaranteed to correspond to spans of valid
826	/// UTF-8. This includes zero-width matches. For example, the regex engine
827	/// must guarantee that the empty regex will not match at the positions
828	/// between code units in the UTF-8 encoding of a single codepoint.
829	///
830	/// See [`Config::utf8`] for more information.
831	///
832	/// This is enabled by default.
833	///
834	/// # Example
835	///
836	/// This example shows how UTF-8 mode can impact the match spans that may
837	/// be reported in certain cases.
838	///
839	/// ```
840	/// use regex_automata::{
841	/// nfa::thompson::{self, pikevm::PikeVM},
842	/// Match, Input,
843	/// };
844	///
845	/// let re = PikeVM::new("")?;
846	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
847	///
848	/// // UTF-8 mode is enabled by default.
849	/// let mut input = Input::new("☃");
850	/// re.search(&mut cache, &input, &mut caps);
851	/// assert_eq!(Some(Match::must(`0`, `0`..`0`)), caps.get_match());
852	///
853	/// // Even though an empty regex matches at 1..1, our next match is
854	/// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
855	/// // three bytes long).
856	/// input.set_start(`1`);
857	/// re.search(&mut cache, &input, &mut caps);
858	/// assert_eq!(Some(Match::must(`0`, `3`..`3`)), caps.get_match());
859	///
860	/// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
861	/// let re = PikeVM::builder()
862	/// .thompson(thompson::Config::new().utf8(`false`))
863	/// .build("")?;
864	/// re.search(&mut cache, &input, &mut caps);
865	/// assert_eq!(Some(Match::must(`0`, `1`..`1`)), caps.get_match());
866	///
867	/// input.set_start(`2`);
868	/// re.search(&mut cache, &input, &mut caps);
869	/// assert_eq!(Some(Match::must(`0`, `2`..`2`)), caps.get_match());
870	///
871	/// input.set_start(`3`);
872	/// re.search(&mut cache, &input, &mut caps);
873	/// assert_eq!(Some(Match::must(`0`, `3`..`3`)), caps.get_match());
874	///
875	/// input.set_start(`4`);
876	/// re.search(&mut cache, &input, &mut caps);
877	/// assert_eq!(None, caps.get_match());
878	///
879	/// # Ok::<(), Box<dyn std::error::Error>>(())
880	/// ```
881	#[inline]
882	pub fn is_utf8(&self) -> bool {
883	self.0.utf8
884	}
885
886	/// Returns true when this NFA is meant to be matched in reverse.
887	///
888	/// Generally speaking, when this is true, it means the NFA is supposed to
889	/// be used in conjunction with moving backwards through the haystack. That
890	/// is, from a higher memory address to a lower memory address.
891	///
892	/// It is often the case that lower level routines dealing with an NFA
893	/// don't need to care about whether it is "meant" to be matched in reverse
894	/// or not. However, there are some specific cases where it matters. For
895	/// example, the implementation of CRLF-aware `^` and `$` line anchors
896	/// needs to know whether the search is in the forward or reverse
897	/// direction. In the forward direction, neither `^` nor `$` should match
898	/// when a `\r` has been seen previously and a `\n` is next. However, in
899	/// the reverse direction, neither `^` nor `$` should match when a `\n`
900	/// has been seen previously and a `\r` is next. This fundamentally changes
901	/// how the state machine is constructed, and thus needs to be altered
902	/// based on the direction of the search.
903	///
904	/// This is automatically set when using a [`Compiler`] with a configuration
905	/// where [`Config::reverse`] is enabled. If you're building your own NFA
906	/// by hand via a [`Builder`]
907	#[inline]
908	pub fn is_reverse(&self) -> bool {
909	self.0.reverse
910	}
911
912	/// Returns true if and only if all starting states for this NFA correspond
913	/// to the beginning of an anchored search.
914	///
915	/// Typically, an NFA will have both an anchored and an unanchored starting
916	/// state. Namely, because it tends to be useful to have both and the cost
917	/// of having an unanchored starting state is almost zero (for an NFA).
918	/// However, if all patterns in the NFA are themselves anchored, then even
919	/// the unanchored starting state will correspond to an anchored search
920	/// since the pattern doesn't permit anything else.
921	///
922	/// # Example
923	///
924	/// This example shows a few different scenarios where this method's
925	/// return value varies.
926	///
927	/// ```
928	/// use regex_automata::nfa::thompson::NFA;
929	///
930	/// // The unanchored starting state permits matching this pattern anywhere
931	/// // in a haystack, instead of just at the beginning.
932	/// let nfa = NFA::new("a")?;
933	/// assert!(!nfa.is_always_start_anchored());
934	///
935	/// // In this case, the pattern is itself anchored, so there is no way
936	/// // to run an unanchored search.
937	/// let nfa = NFA::new("^a")?;
938	/// assert!(nfa.is_always_start_anchored());
939	///
940	/// // When multiline mode is enabled, '^' can match at the start of a line
941	/// // in addition to the start of a haystack, so an unanchored search is
942	/// // actually possible.
943	/// let nfa = NFA::new("(?m)^a")?;
944	/// assert!(!nfa.is_always_start_anchored());
945	///
946	/// // Weird cases also work. A pattern is only considered anchored if all
947	/// // matches may only occur at the start of a haystack.
948	/// let nfa = NFA::new("(^a)\|a")?;
949	/// assert!(!nfa.is_always_start_anchored());
950	///
951	/// // When multiple patterns are present, if they are all anchored, then
952	/// // the NFA is always anchored too.
953	/// let nfa = NFA::new_many(&["^a", "^b", "^c"])?;
954	/// assert!(nfa.is_always_start_anchored());
955	///
956	/// // But if one pattern is unanchored, then the NFA must permit an
957	/// // unanchored search.
958	/// let nfa = NFA::new_many(&["^a", "b", "^c"])?;
959	/// assert!(!nfa.is_always_start_anchored());
960	///
961	/// # Ok::<(), Box<dyn std::error::Error>>(())
962	/// ```
963	#[inline]
964	pub fn is_always_start_anchored(&self) -> bool {
965	self.start_anchored() == self.start_unanchored()
966	}
967
968	/// Returns the look-around matcher associated with this NFA.
969	///
970	/// A look-around matcher determines how to match look-around assertions.
971	/// In particular, some assertions are configurable. For example, the
972	/// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
973	/// from the default of `\n` to any other byte.
974	///
975	/// If the NFA was built using a [`Compiler`], then this matcher
976	/// can be set via the [`Config::look_matcher`] configuration
977	/// knob. Otherwise, if you've built an NFA by hand, it is set via
978	/// [`Builder::set_look_matcher`].
979	///
980	/// # Example
981	///
982	/// This shows how to change the line terminator for multi-line assertions.
983	///
984	/// ```
985	/// use regex_automata::{
986	/// nfa::thompson::{self, pikevm::PikeVM},
987	/// util::look::LookMatcher,
988	/// Match, Input,
989	/// };
990	///
991	/// let mut lookm = LookMatcher::new();
992	/// lookm.set_line_terminator(b'`\x00`');
993	///
994	/// let re = PikeVM::builder()
995	/// .thompson(thompson::Config::new().look_matcher(lookm))
996	/// .build(r"(?m)^[a-z]+$")?;
997	/// let mut cache = re.create_cache();
998	///
999	/// // Multi-line assertions now use NUL as a terminator.
1000	/// assert_eq!(
1001	/// Some(Match::must(`0`, `1`..`4`)),
1002	/// re.find(&mut cache, b"`\x00`abc`\x00`"),
1003	/// );
1004	/// // ... and \n is no longer recognized as a terminator.
1005	/// assert_eq!(
1006	/// None,
1007	/// re.find(&mut cache, b"`\n`abc`\n`"),
1008	/// );
1009	///
1010	/// # Ok::<(), Box<dyn std::error::Error>>(())
1011	/// ```
1012	#[inline]
1013	pub fn look_matcher(&self) -> &LookMatcher {
1014	&self.0.look_matcher
1015	}
1016
1017	/// Returns the union of all look-around assertions used throughout this
1018	/// NFA. When the returned set is empty, it implies that the NFA has no
1019	/// look-around assertions and thus zero conditional epsilon transitions.
1020	///
1021	/// This is useful in some cases enabling optimizations. It is not
1022	/// unusual, for example, for optimizations to be of the form, "for any
1023	/// regex with zero conditional epsilon transitions, do ..." where "..."
1024	/// is some kind of optimization.
1025	///
1026	/// This isn't only helpful for optimizations either. Sometimes look-around
1027	/// assertions are difficult to support. For example, many of the DFAs in
1028	/// this crate don't support Unicode word boundaries or handle them using
1029	/// heuristics. Handling that correctly typically requires some kind of
1030	/// cheap check of whether the NFA has a Unicode word boundary in the first
1031	/// place.
1032	///
1033	/// # Example
1034	///
1035	/// This example shows how this routine varies based on the regex pattern:
1036	///
1037	/// ```
1038	/// use regex_automata::{nfa::thompson::NFA, util::look::Look};
1039	///
1040	/// // No look-around at all.
1041	/// let nfa = NFA::new("a")?;
1042	/// assert!(nfa.look_set_any().is_empty());
1043	///
1044	/// // When multiple patterns are present, since this returns the union,
1045	/// // it will include look-around assertions that only appear in one
1046	/// // pattern.
1047	/// let nfa = NFA::new_many(&["a", "b", "a^b", "c"])?;
1048	/// assert!(nfa.look_set_any().contains(Look::Start));
1049	///
1050	/// // Some groups of assertions have various shortcuts. For example:
1051	/// let nfa = NFA::new(r"(?-u:\b)")?;
1052	/// assert!(nfa.look_set_any().contains_word());
1053	/// assert!(!nfa.look_set_any().contains_word_unicode());
1054	/// assert!(nfa.look_set_any().contains_word_ascii());
1055	///
1056	/// # Ok::<(), Box<dyn std::error::Error>>(())
1057	/// ```
1058	#[inline]
1059	pub fn look_set_any(&self) -> LookSet {
1060	self.0.look_set_any
1061	}
1062
1063	/// Returns the union of all prefix look-around assertions for every
1064	/// pattern in this NFA. When the returned set is empty, it implies none of
1065	/// the patterns require moving through a conditional epsilon transition
1066	/// before inspecting the first byte in the haystack.
1067	///
1068	/// This can be useful for determining what kinds of assertions need to be
1069	/// satisfied at the beginning of a search. For example, typically DFAs
1070	/// in this crate will build a distinct starting state for each possible
1071	/// starting configuration that might result in look-around assertions
1072	/// being satisfied differently. However, if the set returned here is
1073	/// empty, then you know that the start state is invariant because there
1074	/// are no conditional epsilon transitions to consider.
1075	///
1076	/// # Example
1077	///
1078	/// This example shows how this routine varies based on the regex pattern:
1079	///
1080	/// ```
1081	/// use regex_automata::{nfa::thompson::NFA, util::look::Look};
1082	///
1083	/// // No look-around at all.
1084	/// let nfa = NFA::new("a")?;
1085	/// assert!(nfa.look_set_prefix_any().is_empty());
1086	///
1087	/// // When multiple patterns are present, since this returns the union,
1088	/// // it will include look-around assertions that only appear in one
1089	/// // pattern. But it will only include assertions that are in the prefix
1090	/// // of a pattern. For example, this includes '^' but not '$' even though
1091	/// // '$' does appear.
1092	/// let nfa = NFA::new_many(&["a", "b", "^ab$", "c"])?;
1093	/// assert!(nfa.look_set_prefix_any().contains(Look::Start));
1094	/// assert!(!nfa.look_set_prefix_any().contains(Look::End));
1095	///
1096	/// # Ok::<(), Box<dyn std::error::Error>>(())
1097	/// ```
1098	#[inline]
1099	pub fn look_set_prefix_any(&self) -> LookSet {
1100	self.0.look_set_prefix_any
1101	}
1102
1103	// FIXME: The `look_set_prefix_all` computation was not correct, and it
1104	// seemed a little tricky to fix it. Since I wasn't actually using it for
1105	// anything, I just decided to remove it in the run up to the regex 1.9
1106	// release. If you need this, please file an issue.
1107	/*
1108	/// Returns the intersection of all prefix look-around assertions for every
1109	/// pattern in this NFA. When the returned set is empty, it implies at
1110	/// least one of the patterns does not require moving through a conditional
1111	/// epsilon transition before inspecting the first byte in the haystack.
1112	/// Conversely, when the set contains an assertion, it implies that every
1113	/// pattern in the NFA also contains that assertion in its prefix.
1114	///
1115	/// This can be useful for determining what kinds of assertions need to be
1116	/// satisfied at the beginning of a search. For example, if you know that
1117	/// [`Look::Start`] is in the prefix intersection set returned here, then
1118	/// you know that all searches, regardless of input configuration, will be
1119	/// anchored.
1120	///
1121	/// # Example
1122	///
1123	/// This example shows how this routine varies based on the regex pattern:
1124	///
1125	/// ```
1126	/// use regex_automata::{nfa::thompson::NFA, util::look::Look};
1127	///
1128	/// // No look-around at all.
1129	/// let nfa = NFA::new("a")?;
1130	/// assert!(nfa.look_set_prefix_all().is_empty());
1131	///
1132	/// // When multiple patterns are present, since this returns the
1133	/// // intersection, it will only include assertions present in every
1134	/// // prefix, and only the prefix.
1135	/// let nfa = NFA::new_many(&["^a$", "^b$", "$^ab$", "^c$"])?;
1136	/// assert!(nfa.look_set_prefix_all().contains(Look::Start));
1137	/// assert!(!nfa.look_set_prefix_all().contains(Look::End));
1138	///
1139	/// # Ok::<(), Box<dyn std::error::Error>>(())
1140	/// ```
1141	#[inline]
1142	pub fn look_set_prefix_all(&self) -> LookSet {
1143	self.0.look_set_prefix_all
1144	}
1145	*/
1146
1147	/// Returns the memory usage, in bytes, of this NFA.
1148	///
1149	/// This does not* include the stack size used up by this NFA. To*
1150	/// compute that, use `std::mem::size_of::<NFA>()`.
1151	///
1152	/// # Example
1153	///
1154	/// This example shows that large Unicode character classes can use quite
1155	/// a bit of memory.
1156	///
1157	/// ```
1158	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1159	/// use regex_automata::nfa::thompson::NFA;
1160	///
1161	/// let nfa_unicode = NFA::new(r"\w")?;
1162	/// let nfa_ascii = NFA::new(r"(?-u:\w)")?;
1163	///
1164	/// assert!(`10` * nfa_ascii.memory_usage() < nfa_unicode.memory_usage());
1165	///
1166	/// # Ok::<(), Box<dyn std::error::Error>>(())
1167	/// ```
1168	#[inline]
1169	pub fn memory_usage(&self) -> usize {
1170	use core::mem::size_of;
1171
1172	size_of::<Inner>() // allocated on the heap via Arc
1173	+ self.0.states.len() * size_of::<State>()
1174	+ self.0.start_pattern.len() * size_of::<StateID>()
1175	+ self.0.group_info.memory_usage()
1176	+ self.0.memory_extra
1177	}
1178	}
1179
1180	impl fmt::Debug for NFA {
1181	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1182	self.0.fmt(f)
1183	}
1184	}
1185
1186	/// The "inner" part of the NFA. We split this part out so that we can easily
1187	/// wrap it in an `Arc` above in the definition of `NFA`.
1188	///
1189	/// See builder.rs for the code that actually builds this type. This module
1190	/// does provide (internal) mutable methods for adding things to this
1191	/// NFA before finalizing it, but the high level construction process is
1192	/// controlled by the builder abstraction. (Which is complicated enough to
1193	/// get its own module.)
1194	#[derive(Default)]
1195	pub(super) struct Inner {
1196	/// The state sequence. This sequence is guaranteed to be indexable by all
1197	/// starting state IDs, and it is also guaranteed to contain at most one
1198	/// `Match` state for each pattern compiled into this NFA. (A pattern may
1199	/// not have a corresponding `Match` state if a `Match` state is impossible
1200	/// to reach.)
1201	states: Vec<State>,
1202	/// The anchored starting state of this NFA.
1203	start_anchored: StateID,
1204	/// The unanchored starting state of this NFA.
1205	start_unanchored: StateID,
1206	/// The starting states for each individual pattern. Starting at any
1207	/// of these states will result in only an anchored search for the
1208	/// corresponding pattern. The vec is indexed by pattern ID. When the NFA
1209	/// contains a single regex, then `start_pattern[0]` and `start_anchored`
1210	/// are always equivalent.
1211	start_pattern: Vec<StateID>,
1212	/// Info about the capturing groups in this NFA. This is responsible for
1213	/// mapping groups to slots, mapping groups to names and names to groups.
1214	group_info: GroupInfo,
1215	/// A representation of equivalence classes over the transitions in this
1216	/// NFA. Two bytes in the same equivalence class must not discriminate
1217	/// between a match or a non-match. This map can be used to shrink the
1218	/// total size of a DFA's transition table with a small match-time cost.
1219	///
1220	/// Note that the NFA's transitions are not* defined in terms of these*
1221	/// equivalence classes. The NFA's transitions are defined on the original
1222	/// byte values. For the most part, this is because they wouldn't really
1223	/// help the NFA much since the NFA already uses a sparse representation
1224	/// to represent transitions. Byte classes are most effective in a dense
1225	/// representation.
1226	byte_class_set: ByteClassSet,
1227	/// This is generated from `byte_class_set`, and essentially represents the
1228	/// same thing but supports different access patterns. Namely, this permits
1229	/// looking up the equivalence class of a byte very cheaply.
1230	///
1231	/// Ideally we would just store this, but because of annoying code
1232	/// structure reasons, we keep both this and `byte_class_set` around for
1233	/// now. I think I would prefer that `byte_class_set` were computed in the
1234	/// `Builder`, but right now, we compute it as states are added to the
1235	/// `NFA`.
1236	byte_classes: ByteClasses,
1237	/// Whether this NFA has a `Capture` state anywhere.
1238	has_capture: bool,
1239	/// When the empty string is in the language matched by this NFA.
1240	has_empty: bool,
1241	/// Whether UTF-8 mode is enabled for this NFA. Briefly, this means that
1242	/// all non-empty matches produced by this NFA correspond to spans of valid
1243	/// UTF-8, and any empty matches produced by this NFA that split a UTF-8
1244	/// encoded codepoint should be filtered out by the corresponding regex
1245	/// engine.
1246	utf8: bool,
1247	/// Whether this NFA is meant to be matched in reverse or not.
1248	reverse: bool,
1249	/// The matcher to be used for look-around assertions.
1250	look_matcher: LookMatcher,
1251	/// The union of all look-around assertions that occur anywhere within
1252	/// this NFA. If this set is empty, then it means there are precisely zero
1253	/// conditional epsilon transitions in the NFA.
1254	look_set_any: LookSet,
1255	/// The union of all look-around assertions that occur as a zero-length
1256	/// prefix for any of the patterns in this NFA.
1257	look_set_prefix_any: LookSet,
1258	/*
1259	/// The intersection of all look-around assertions that occur as a
1260	/// zero-length prefix for any of the patterns in this NFA.
1261	look_set_prefix_all: LookSet,
1262	*/
1263	/// Heap memory used indirectly by NFA states and other things (like the
1264	/// various capturing group representations above). Since each state
1265	/// might use a different amount of heap, we need to keep track of this
1266	/// incrementally.
1267	memory_extra: usize,
1268	}
1269
1270	impl Inner {
1271	/// Runs any last finalization bits and turns this into a full NFA.
1272	pub(super) fn into_nfa(mut self) -> NFA {
1273	self.byte_classes = self.byte_class_set.byte_classes();
1274	// Do epsilon closure from the start state of every pattern in order
1275	// to compute various properties such as look-around assertions and
1276	// whether the empty string can be matched.
1277	let mut stack = vec![];
1278	let mut seen = SparseSet::new(self.states.len());
1279	for &start_id in self.start_pattern.iter() {
1280	stack.push(start_id);
1281	seen.clear();
1282	// let mut prefix_all = LookSet::full();
1283	let mut prefix_any = LookSet::empty();
1284	while let Some(sid) = stack.pop() {
1285	if !seen.insert(sid) {
1286	continue;
1287	}
1288	match self.states[sid] {
1289	State::ByteRange { .. }
1290	\| State::Dense { .. }
1291	\| State::Fail => continue,
1292	State::Sparse(_) => {
1293	// This snippet below will rewrite this sparse state
1294	// as a dense state. By doing it here, we apply this
1295	// optimization to all hot "sparse" states since these
1296	// are the states that are reachable from the start
1297	// state via an epsilon closure.
1298	//
1299	// Unfortunately, this optimization did not seem to
1300	// help much in some very limited ad hoc benchmarking.
1301	//
1302	// I left the 'Dense' state type in place in case we
1303	// want to revisit this, but I suspect the real way
1304	// to make forward progress is a more fundamental
1305	// rearchitecting of how data in the NFA is laid out.
1306	// I think we should consider a single contiguous
1307	// allocation instead of all this indirection and
1308	// potential heap allocations for every state. But this
1309	// is a large re-design and will require API breaking
1310	// changes.
1311	// self.memory_extra -= self.states[sid].memory_usage();
1312	// let trans = DenseTransitions::from_sparse(sparse);
1313	// self.states[sid] = State::Dense(trans);
1314	// self.memory_extra += self.states[sid].memory_usage();
1315	continue;
1316	}
1317	State::Match { .. } => self.has_empty = `true`,
1318	State::Look { look, next } => {
1319	prefix_any = prefix_any.insert(look);
1320	stack.push(next);
1321	}
1322	State::Union { ref alternates } => {
1323	// Order doesn't matter here, since we're just dealing
1324	// with look-around sets. But if we do richer analysis
1325	// here that needs to care about preference order, then
1326	// this should be done in reverse.
1327	stack.extend(alternates.iter());
1328	}
1329	State::BinaryUnion { alt1, alt2 } => {
1330	stack.push(alt2);
1331	stack.push(alt1);
1332	}
1333	State::Capture { next, .. } => {
1334	stack.push(next);
1335	}
1336	}
1337	}
1338	self.look_set_prefix_any =
1339	self.look_set_prefix_any.union(prefix_any);
1340	}
1341	NFA(Arc::new(self))
1342	}
1343
1344	/// Returns the capturing group info for this NFA.
1345	pub(super) fn group_info(&self) -> &GroupInfo {
1346	&self.group_info
1347	}
1348
1349	/// Add the given state to this NFA after allocating a fresh identifier for
1350	/// it.
1351	///
1352	/// This panics if too many states are added such that a fresh identifier
1353	/// could not be created. (Currently, the only caller of this routine is
1354	/// a `Builder`, and it upholds this invariant.)
1355	pub(super) fn add(&mut self, state: State) -> StateID {
1356	match state {
1357	State::ByteRange { ref trans } => {
1358	self.byte_class_set.set_range(trans.start, trans.end);
1359	}
1360	State::Sparse(ref sparse) => {
1361	for trans in sparse.transitions.iter() {
1362	self.byte_class_set.set_range(trans.start, trans.end);
1363	}
1364	}
1365	State::Dense { .. } => unreachable!(),
1366	State::Look { look, .. } => {
1367	self.look_matcher
1368	.add_to_byteset(look, &mut self.byte_class_set);
1369	self.look_set_any = self.look_set_any.insert(look);
1370	}
1371	State::Capture { .. } => {
1372	self.has_capture = `true`;
1373	}
1374	State::Union { .. }
1375	\| State::BinaryUnion { .. }
1376	\| State::Fail
1377	\| State::Match { .. } => {}
1378	}
1379
1380	let id = StateID::new(self.states.len()).unwrap();
1381	self.memory_extra += state.memory_usage();
1382	self.states.push(state);
1383	id
1384	}
1385
1386	/// Set the starting state identifiers for this NFA.
1387	///
1388	/// `start_anchored` and `start_unanchored` may be equivalent. When they
1389	/// are, then the NFA can only execute anchored searches. This might
1390	/// occur, for example, for patterns that are unconditionally anchored.
1391	/// e.g., `^foo`.
1392	pub(super) fn set_starts(
1393	&mut self,
1394	start_anchored: StateID,
1395	start_unanchored: StateID,
1396	start_pattern: &[StateID],
1397	) {
1398	self.start_anchored = start_anchored;
1399	self.start_unanchored = start_unanchored;
1400	self.start_pattern = start_pattern.to_vec();
1401	}
1402
1403	/// Sets the UTF-8 mode of this NFA.
1404	pub(super) fn set_utf8(&mut self, yes: bool) {
1405	self.utf8 = yes;
1406	}
1407
1408	/// Sets the reverse mode of this NFA.
1409	pub(super) fn set_reverse(&mut self, yes: bool) {
1410	self.reverse = yes;
1411	}
1412
1413	/// Sets the look-around assertion matcher for this NFA.
1414	pub(super) fn set_look_matcher(&mut self, m: LookMatcher) {
1415	self.look_matcher = m;
1416	}
1417
1418	/// Set the capturing groups for this NFA.
1419	///
1420	/// The given slice should contain the capturing groups for each pattern,
1421	/// The capturing groups in turn should correspond to the total number of
1422	/// capturing groups in the pattern, including the anonymous first capture
1423	/// group for each pattern. If a capturing group does have a name, then it
1424	/// should be provided as a Arc<str>.
1425	///
1426	/// This returns an error if a corresponding `GroupInfo` could not be
1427	/// built.
1428	pub(super) fn set_captures(
1429	&mut self,
1430	captures: &[Vec<Option<Arc<str>>>],
1431	) -> Result<(), GroupInfoError> {
1432	self.group_info = GroupInfo::new(
1433	captures.iter().map(\|x\| x.iter().map(\|y\| y.as_ref())),
1434	)?;
1435	Ok(())
1436	}
1437
1438	/// Remap the transitions in every state of this NFA using the given map.
1439	/// The given map should be indexed according to state ID namespace used by
1440	/// the transitions of the states currently in this NFA.
1441	///
1442	/// This is particularly useful to the NFA builder, since it is convenient
1443	/// to add NFA states in order to produce their final IDs. Then, after all
1444	/// of the intermediate "empty" states (unconditional epsilon transitions)
1445	/// have been removed from the builder's representation, we can re-map all
1446	/// of the transitions in the states already added to their final IDs.
1447	pub(super) fn remap(&mut self, old_to_new: &[StateID]) {
1448	for state in &mut self.states {
1449	state.remap(old_to_new);
1450	}
1451	self.start_anchored = old_to_new[self.start_anchored];
1452	self.start_unanchored = old_to_new[self.start_unanchored];
1453	for id in self.start_pattern.iter_mut() {
1454	id = old_to_new[id];
1455	}
1456	}
1457	}
1458
1459	impl fmt::Debug for Inner {
1460	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1461	writeln!(f, "thompson::NFA(")?;
1462	for (sid, state) in self.states.iter().with_state_ids() {
1463	let status = if sid == self.start_anchored {
1464	'^'
1465	} else if sid == self.start_unanchored {
1466	'>'
1467	} else {
1468	' '
1469	};
1470	writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?;
1471	}
1472	let pattern_len = self.start_pattern.len();
1473	if pattern_len > `1` {
1474	writeln!(f, "")?;
1475	for pid in `0`..pattern_len {
1476	let sid = self.start_pattern[pid];
1477	writeln!(f, "START({:06?}): {:?}", pid, sid.as_usize())?;
1478	}
1479	}
1480	writeln!(f, "")?;
1481	writeln!(
1482	f,
1483	"transition equivalence classes: {:?}",
1484	self.byte_classes,
1485	)?;
1486	writeln!(f, ")")?;
1487	Ok(())
1488	}
1489	}
1490
1491	/// A state in an NFA.
1492	///
1493	/// In theory, it can help to conceptualize an `NFA` as a graph consisting of
1494	/// `State`s. Each `State` contains its complete set of outgoing transitions.
1495	///
1496	/// In practice, it can help to conceptualize an `NFA` as a sequence of
1497	/// instructions for a virtual machine. Each `State` says what to do and where
1498	/// to go next.
1499	///
1500	/// Strictly speaking, the practical interpretation is the most correct one,
1501	/// because of the [`Capture`](State::Capture) state. Namely, a `Capture`
1502	/// state always forwards execution to another state unconditionally. Its only
1503	/// purpose is to cause a side effect: the recording of the current input
1504	/// position at a particular location in memory. In this sense, an `NFA`
1505	/// has more power than a theoretical non-deterministic finite automaton.
1506	///
1507	/// For most uses of this crate, it is likely that one may never even need to
1508	/// be aware of this type at all. The main use cases for looking at `State`s
1509	/// directly are if you need to write your own search implementation or if you
1510	/// need to do some kind of analysis on the NFA.
1511	#[derive(Clone, Eq, PartialEq)]
1512	pub enum State {
1513	/// A state with a single transition that can only be taken if the current
1514	/// input symbol is in a particular range of bytes.
1515	ByteRange {
1516	/// The transition from this state to the next.
1517	trans: Transition,
1518	},
1519	/// A state with possibly many transitions represented in a sparse fashion.
1520	/// Transitions are non-overlapping and ordered lexicographically by input
1521	/// range.
1522	///
1523	/// In practice, this is used for encoding UTF-8 automata. Its presence is
1524	/// primarily an optimization that avoids many additional unconditional
1525	/// epsilon transitions (via [`Union`](State::Union) states), and thus
1526	/// decreases the overhead of traversing the NFA. This can improve both
1527	/// matching time and DFA construction time.
1528	Sparse(SparseTransitions),
1529	/// A dense representation of a state with multiple transitions.
1530	Dense(DenseTransitions),
1531	/// A conditional epsilon transition satisfied via some sort of
1532	/// look-around. Look-around is limited to anchor and word boundary
1533	/// assertions.
1534	///
1535	/// Look-around states are meant to be evaluated while performing epsilon
1536	/// closure (computing the set of states reachable from a particular state
1537	/// via only epsilon transitions). If the current position in the haystack
1538	/// satisfies the look-around assertion, then you're permitted to follow
1539	/// that epsilon transition.
1540	Look {
1541	/// The look-around assertion that must be satisfied before moving
1542	/// to `next`.
1543	look: Look,
1544	/// The state to transition to if the look-around assertion is
1545	/// satisfied.
1546	next: StateID,
1547	},
1548	/// An alternation such that there exists an epsilon transition to all
1549	/// states in `alternates`, where matches found via earlier transitions
1550	/// are preferred over later transitions.
1551	Union {
1552	/// An ordered sequence of unconditional epsilon transitions to other
1553	/// states. Transitions earlier in the sequence are preferred over
1554	/// transitions later in the sequence.
1555	alternates: Box<[StateID]>,
1556	},
1557	/// An alternation such that there exists precisely two unconditional
1558	/// epsilon transitions, where matches found via `alt1` are preferred over
1559	/// matches found via `alt2`.
1560	///
1561	/// This state exists as a common special case of Union where there are
1562	/// only two alternates. In this case, we don't need any allocations to
1563	/// represent the state. This saves a bit of memory and also saves an
1564	/// additional memory access when traversing the NFA.
1565	BinaryUnion {
1566	/// An unconditional epsilon transition to another NFA state. This
1567	/// is preferred over `alt2`.
1568	alt1: StateID,
1569	/// An unconditional epsilon transition to another NFA state. Matches
1570	/// reported via this transition should only be reported if no matches
1571	/// were found by following `alt1`.
1572	alt2: StateID,
1573	},
1574	/// An empty state that records a capture location.
1575	///
1576	/// From the perspective of finite automata, this is precisely equivalent
1577	/// to an unconditional epsilon transition, but serves the purpose of
1578	/// instructing NFA simulations to record additional state when the finite
1579	/// state machine passes through this epsilon transition.
1580	///
1581	/// `slot` in this context refers to the specific capture group slot
1582	/// offset that is being recorded. Each capturing group has two slots
1583	/// corresponding to the start and end of the matching portion of that
1584	/// group.
1585	///
1586	/// The pattern ID and capture group index are also included in this state
1587	/// in case they are useful. But mostly, all you'll need is `next` and
1588	/// `slot`.
1589	Capture {
1590	/// The state to transition to, unconditionally.
1591	next: StateID,
1592	/// The pattern ID that this capture belongs to.
1593	pattern_id: PatternID,
1594	/// The capture group index that this capture belongs to. Capture group
1595	/// indices are local to each pattern. For example, when capturing
1596	/// groups are enabled, every pattern has a capture group at index
1597	/// `0`.
1598	group_index: SmallIndex,
1599	/// The slot index for this capture. Every capturing group has two
1600	/// slots: one for the start haystack offset and one for the end
1601	/// haystack offset. Unlike capture group indices, slot indices are
1602	/// global across all patterns in this NFA. That is, each slot belongs
1603	/// to a single pattern, but there is only one slot at index `i`.
1604	slot: SmallIndex,
1605	},
1606	/// A state that cannot be transitioned out of. This is useful for cases
1607	/// where you want to prevent matching from occurring. For example, if your
1608	/// regex parser permits empty character classes, then one could choose
1609	/// a `Fail` state to represent them. (An empty character class can be
1610	/// thought of as an empty set. Since nothing is in an empty set, they can
1611	/// never match anything.)
1612	Fail,
1613	/// A match state. There is at least one such occurrence of this state for
1614	/// each regex that can match that is in this NFA.
1615	Match {
1616	/// The matching pattern ID.
1617	pattern_id: PatternID,
1618	},
1619	}
1620
1621	impl State {
1622	/// Returns true if and only if this state contains one or more epsilon
1623	/// transitions.
1624	///
1625	/// In practice, a state has no outgoing transitions (like `Match`), has
1626	/// only non-epsilon transitions (like `ByteRange`) or has only epsilon
1627	/// transitions (like `Union`).
1628	///
1629	/// # Example
1630	///
1631	/// ```
1632	/// use regex_automata::{
1633	/// nfa::thompson::{State, Transition},
1634	/// util::primitives::{PatternID, StateID, SmallIndex},
1635	/// };
1636	///
1637	/// // Capture states are epsilon transitions.
1638	/// let state = State::Capture {
1639	/// next: StateID::ZERO,
1640	/// pattern_id: PatternID::ZERO,
1641	/// group_index: SmallIndex::ZERO,
1642	/// slot: SmallIndex::ZERO,
1643	/// };
1644	/// assert!(state.is_epsilon());
1645	///
1646	/// // ByteRange states are not.
1647	/// let state = State::ByteRange {
1648	/// trans: Transition { start: b'a', end: b'z', next: StateID::ZERO },
1649	/// };
1650	/// assert!(!state.is_epsilon());
1651	///
1652	/// # Ok::<(), Box<dyn std::error::Error>>(())
1653	/// ```
1654	#[inline]
1655	pub fn is_epsilon(&self) -> bool {
1656	match *self {
1657	State::ByteRange { .. }
1658	\| State::Sparse { .. }
1659	\| State::Dense { .. }
1660	\| State::Fail
1661	\| State::Match { .. } => `false`,
1662	State::Look { .. }
1663	\| State::Union { .. }
1664	\| State::BinaryUnion { .. }
1665	\| State::Capture { .. } => `true`,
1666	}
1667	}
1668
1669	/// Returns the heap memory usage of this NFA state in bytes.
1670	fn memory_usage(&self) -> usize {
1671	match *self {
1672	State::ByteRange { .. }
1673	\| State::Look { .. }
1674	\| State::BinaryUnion { .. }
1675	\| State::Capture { .. }
1676	\| State::Match { .. }
1677	\| State::Fail => `0`,
1678	State::Sparse(SparseTransitions { ref transitions }) => {
1679	transitions.len() * mem::size_of::<Transition>()
1680	}
1681	State::Dense { .. } => `256` * mem::size_of::<StateID>(),
1682	State::Union { ref alternates } => {
1683	alternates.len() * mem::size_of::<StateID>()
1684	}
1685	}
1686	}
1687
1688	/// Remap the transitions in this state using the given map. Namely, the
1689	/// given map should be indexed according to the transitions currently
1690	/// in this state.
1691	///
1692	/// This is used during the final phase of the NFA compiler, which turns
1693	/// its intermediate NFA into the final NFA.
1694	fn remap(&mut self, remap: &[StateID]) {
1695	match *self {
1696	State::ByteRange { ref mut trans } => {
1697	trans.next = remap[trans.next]
1698	}
1699	State::Sparse(SparseTransitions { ref mut transitions }) => {
1700	for t in transitions.iter_mut() {
1701	t.next = remap[t.next];
1702	}
1703	}
1704	State::Dense(DenseTransitions { ref mut transitions }) => {
1705	for sid in transitions.iter_mut() {
1706	sid = remap[sid];
1707	}
1708	}
1709	State::Look { ref mut next, .. } => next = remap[next],
1710	State::Union { ref mut alternates } => {
1711	for alt in alternates.iter_mut() {
1712	alt = remap[alt];
1713	}
1714	}
1715	State::BinaryUnion { ref mut alt1, ref mut alt2 } => {
1716	alt1 = remap[alt1];
1717	alt2 = remap[alt2];
1718	}
1719	State::Capture { ref mut next, .. } => next = remap[next],
1720	State::Fail => {}
1721	State::Match { .. } => {}
1722	}
1723	}
1724	}
1725
1726	impl fmt::Debug for State {
1727	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1728	match *self {
1729	State::ByteRange { ref trans } => trans.fmt(f),
1730	State::Sparse(SparseTransitions { ref transitions }) => {
1731	let rs = transitions
1732	.iter()
1733	.map(\|t\| format!("{:?}", t))
1734	.collect::<Vec<String>>()
1735	.join(", ");
1736	write!(f, "sparse({})", rs)
1737	}
1738	State::Dense(ref dense) => {
1739	write!(f, "dense(")?;
1740	for (i, t) in dense.iter().enumerate() {
1741	if i > `0` {
1742	write!(f, ", ")?;
1743	}
1744	write!(f, "{:?}", t)?;
1745	}
1746	write!(f, ")")
1747	}
1748	State::Look { ref look, next } => {
1749	write!(f, "{:?} => {:?}", look, next.as_usize())
1750	}
1751	State::Union { ref alternates } => {
1752	let alts = alternates
1753	.iter()
1754	.map(\|id\| format!("{:?}", id.as_usize()))
1755	.collect::<Vec<String>>()
1756	.join(", ");
1757	write!(f, "union({})", alts)
1758	}
1759	State::BinaryUnion { alt1, alt2 } => {
1760	write!(
1761	f,
1762	"binary-union({}, {})",
1763	alt1.as_usize(),
1764	alt2.as_usize()
1765	)
1766	}
1767	State::Capture { next, pattern_id, group_index, slot } => {
1768	write!(
1769	f,
1770	"capture(pid={:?}, group={:?}, slot={:?}) => {:?}",
1771	pattern_id.as_usize(),
1772	group_index.as_usize(),
1773	slot.as_usize(),
1774	next.as_usize(),
1775	)
1776	}
1777	State::Fail => write!(f, "FAIL"),
1778	State::Match { pattern_id } => {
1779	write!(f, "MATCH({:?})", pattern_id.as_usize())
1780	}
1781	}
1782	}
1783	}
1784
1785	/// A sequence of transitions used to represent a sparse state.
1786	///
1787	/// This is the primary representation of a [`Sparse`](State::Sparse) state.
1788	/// It corresponds to a sorted sequence of transitions with non-overlapping
1789	/// byte ranges. If the byte at the current position in the haystack matches
1790	/// one of the byte ranges, then the finite state machine should take the
1791	/// corresponding transition.
1792	#[derive(Clone, Debug, Eq, PartialEq)]
1793	pub struct SparseTransitions {
1794	/// The sorted sequence of non-overlapping transitions.
1795	pub transitions: Box<[Transition]>,
1796	}
1797
1798	impl SparseTransitions {
1799	/// This follows the matching transition for a particular byte.
1800	///
1801	/// The matching transition is found by looking for a matching byte
1802	/// range (there is at most one) corresponding to the position `at` in
1803	/// `haystack`.
1804	///
1805	/// If `at >= haystack.len()`, then this returns `None`.
1806	#[inline]
1807	pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
1808	haystack.get(at).and_then(\|&b\| self.matches_byte(b))
1809	}
1810
1811	/// This follows the matching transition for any member of the alphabet.
1812	///
1813	/// The matching transition is found by looking for a matching byte
1814	/// range (there is at most one) corresponding to the position `at` in
1815	/// `haystack`. If the given alphabet unit is [`EOI`](alphabet::Unit::eoi),
1816	/// then this always returns `None`.
1817	#[inline]
1818	pub(crate) fn matches_unit(
1819	&self,
1820	unit: alphabet::Unit,
1821	) -> Option<StateID> {
1822	unit.as_u8().map_or(None, \|byte\| self.matches_byte(byte))
1823	}
1824
1825	/// This follows the matching transition for a particular byte.
1826	///
1827	/// The matching transition is found by looking for a matching byte range
1828	/// (there is at most one) corresponding to the byte given.
1829	#[inline]
1830	pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
1831	for t in self.transitions.iter() {
1832	if t.start > byte {
1833	break;
1834	} else if t.matches_byte(byte) {
1835	return Some(t.next);
1836	}
1837	}
1838	None
1839
1840	/*
1841	// This is an alternative implementation that uses binary search. In
1842	// some ad hoc experiments, like
1843	//
1844	// regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file
1845	//
1846	// I could not observe any improvement, and in fact, things seemed to
1847	// be a bit slower. I can see an improvement in at least one benchmark:
1848	//
1849	// regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8
1850	//
1851	// Where total search time goes from 3.2s to 2.4s when using binary
1852	// search.
1853	self.transitions
1854	.binary_search_by(\|t\| {
1855	if t.end < byte {
1856	core::cmp::Ordering::Less
1857	} else if t.start > byte {
1858	core::cmp::Ordering::Greater
1859	} else {
1860	core::cmp::Ordering::Equal
1861	}
1862	})
1863	.ok()
1864	.map(\|i\| self.transitions[i].next)
1865	*/
1866	}
1867	}
1868
1869	/// A sequence of transitions used to represent a dense state.
1870	///
1871	/// This is the primary representation of a [`Dense`](State::Dense) state. It
1872	/// provides constant time matching. That is, given a byte in a haystack and
1873	/// a `DenseTransitions`, one can determine if the state matches in constant
1874	/// time.
1875	///
1876	/// This is in contrast to `SparseTransitions`, whose time complexity is
1877	/// necessarily bigger than constant time. Also in contrast, `DenseTransitions`
1878	/// usually requires (much) more heap memory.
1879	#[derive(Clone, Debug, Eq, PartialEq)]
1880	pub struct DenseTransitions {
1881	/// A dense representation of this state's transitions on the heap. This
1882	/// always has length 256.
1883	pub transitions: Box<[StateID]>,
1884	}
1885
1886	impl DenseTransitions {
1887	/// This follows the matching transition for a particular byte.
1888	///
1889	/// The matching transition is found by looking for a transition that
1890	/// doesn't correspond to `StateID::ZERO` for the byte `at` the given
1891	/// position in `haystack`.
1892	///
1893	/// If `at >= haystack.len()`, then this returns `None`.
1894	#[inline]
1895	pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
1896	haystack.get(at).and_then(\|&b\| self.matches_byte(b))
1897	}
1898
1899	/// This follows the matching transition for any member of the alphabet.
1900	///
1901	/// The matching transition is found by looking for a transition that
1902	/// doesn't correspond to `StateID::ZERO` for the byte `at` the given
1903	/// position in `haystack`.
1904	///
1905	/// If `at >= haystack.len()` or if the given alphabet unit is
1906	/// [`EOI`](alphabet::Unit::eoi), then this returns `None`.
1907	#[inline]
1908	pub(crate) fn matches_unit(
1909	&self,
1910	unit: alphabet::Unit,
1911	) -> Option<StateID> {
1912	unit.as_u8().map_or(None, \|byte\| self.matches_byte(byte))
1913	}
1914
1915	/// This follows the matching transition for a particular byte.
1916	///
1917	/// The matching transition is found by looking for a transition that
1918	/// doesn't correspond to `StateID::ZERO` for the given `byte`.
1919	///
1920	/// If `at >= haystack.len()`, then this returns `None`.
1921	#[inline]
1922	pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
1923	let next = self.transitions[usize::from(byte)];
1924	if next == StateID::ZERO {
1925	None
1926	} else {
1927	Some(next)
1928	}
1929	}
1930
1931	/*
1932	/// The dense state optimization isn't currently enabled, so permit a
1933	/// little bit of dead code.
1934	pub(crate) fn from_sparse(sparse: &SparseTransitions) -> DenseTransitions {
1935	let mut dense = vec![StateID::ZERO; 256];
1936	for t in sparse.transitions.iter() {
1937	for b in t.start..=t.end {
1938	dense[usize::from(b)] = t.next;
1939	}
1940	}
1941	DenseTransitions { transitions: dense.into_boxed_slice() }
1942	}
1943	*/
1944
1945	/// Returns an iterator over all transitions that don't point to
1946	/// `StateID::ZERO`.
1947	pub(crate) fn iter(&self) -> impl Iterator<Item = Transition> + '_ {
1948	use crate::util::int::Usize;
1949	self.transitions
1950	.iter()
1951	.enumerate()
1952	.filter(\|&(_, &sid)\| sid != StateID::ZERO)
1953	.map(\|(byte, &next)\| Transition {
1954	start: byte.as_u8(),
1955	end: byte.as_u8(),
1956	next,
1957	})
1958	}
1959	}
1960
1961	/// A single transition to another state.
1962	///
1963	/// This transition may only be followed if the current byte in the haystack
1964	/// falls in the inclusive range of bytes specified.
1965	#[derive(Clone, Copy, Eq, Hash, PartialEq)]
1966	pub struct Transition {
1967	/// The inclusive start of the byte range.
1968	pub start: u8,
1969	/// The inclusive end of the byte range.
1970	pub end: u8,
1971	/// The identifier of the state to transition to.
1972	pub next: StateID,
1973	}
1974
1975	impl Transition {
1976	/// Returns true if the position `at` in `haystack` falls in this
1977	/// transition's range of bytes.
1978	///
1979	/// If `at >= haystack.len()`, then this returns `false`.
1980	pub fn matches(&self, haystack: &[u8], at: usize) -> bool {
1981	haystack.get(at).map_or(`false`, \|&b\| self.matches_byte(b))
1982	}
1983
1984	/// Returns true if the given alphabet unit falls in this transition's
1985	/// range of bytes. If the given unit is [`EOI`](alphabet::Unit::eoi), then
1986	/// this returns `false`.
1987	pub fn matches_unit(&self, unit: alphabet::Unit) -> bool {
1988	unit.as_u8().map_or(`false`, \|byte\| self.matches_byte(byte))
1989	}
1990
1991	/// Returns true if the given byte falls in this transition's range of
1992	/// bytes.
1993	pub fn matches_byte(&self, byte: u8) -> bool {
1994	self.start <= byte && byte <= self.end
1995	}
1996	}
1997
1998	impl fmt::Debug for Transition {
1999	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2000	use crate::util::escape::DebugByte;
2001
2002	let Transition { start, end, next } = *self;
2003	if self.start == self.end {
2004	write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())
2005	} else {
2006	write!(
2007	f,
2008	"{:?}-{:?} => {:?}",
2009	DebugByte(start),
2010	DebugByte(end),
2011	next.as_usize(),
2012	)
2013	}
2014	}
2015	}
2016
2017	/// An iterator over all pattern IDs in an NFA.
2018	///
2019	/// This iterator is created by [`NFA::patterns`].
2020	///
2021	/// The lifetime parameter `'a` refers to the lifetime of the NFA from which
2022	/// this pattern iterator was created.
2023	#[derive(Debug)]
2024	pub struct PatternIter<'a> {
2025	it: PatternIDIter,
2026	/// We explicitly associate a lifetime with this iterator even though we
2027	/// don't actually borrow anything from the NFA. We do this for backward
2028	/// compatibility purposes. If we ever do need to borrow something from
2029	/// the NFA, then we can and just get rid of this marker without breaking
2030	/// the public API.
2031	_marker: core::marker::PhantomData<&'a ()>,
2032	}
2033
2034	impl<'a> Iterator for PatternIter<'a> {
2035	type Item = PatternID;
2036
2037	fn next(&mut self) -> Option<PatternID> {
2038	self.it.next()
2039	}
2040	}
2041
2042	#[cfg(all(test, feature = "nfa-pikevm"))]
2043	mod tests {
2044	use super::*;
2045	use crate::{nfa::thompson::pikevm::PikeVM, Input};
2046
2047	// This asserts that an NFA state doesn't have its size changed. It is
2048	// really* easy to accidentally increase the size, and thus potentially*
2049	// dramatically increase the memory usage of every NFA.
2050	//
2051	// This assert doesn't mean we absolutely cannot increase the size of an
2052	// NFA state. We can. It's just here to make sure we do it knowingly and
2053	// intentionally.
2054	#[test]
2055	fn state_has_small_size() {
2056	#[cfg(target_pointer_width = "64")]
2057	assert_eq!(`24`, core::mem::size_of::<State>());
2058	#[cfg(target_pointer_width = "32")]
2059	assert_eq!(`20`, core::mem::size_of::<State>());
2060	}
2061
2062	#[test]
2063	fn always_match() {
2064	let re = PikeVM::new_from_nfa(NFA::always_match()).unwrap();
2065	let mut cache = re.create_cache();
2066	let mut caps = re.create_captures();
2067	let mut find = \|haystack, start, end\| {
2068	let input = Input::new(haystack).range(start..end);
2069	re.search(&mut cache, &input, &mut caps);
2070	caps.get_match().map(\|m\| m.end())
2071	};
2072
2073	assert_eq!(Some(`0`), find("", `0`, `0`));
2074	assert_eq!(Some(`0`), find("a", `0`, `1`));
2075	assert_eq!(Some(`1`), find("a", `1`, `1`));
2076	assert_eq!(Some(`0`), find("ab", `0`, `2`));
2077	assert_eq!(Some(`1`), find("ab", `1`, `2`));
2078	assert_eq!(Some(`2`), find("ab", `2`, `2`));
2079	}
2080
2081	#[test]
2082	fn never_match() {
2083	let re = PikeVM::new_from_nfa(NFA::never_match()).unwrap();
2084	let mut cache = re.create_cache();
2085	let mut caps = re.create_captures();
2086	let mut find = \|haystack, start, end\| {
2087	let input = Input::new(haystack).range(start..end);
2088	re.search(&mut cache, &input, &mut caps);
2089	caps.get_match().map(\|m\| m.end())
2090	};
2091
2092	assert_eq!(None, find("", `0`, `0`));
2093	assert_eq!(None, find("a", `0`, `1`));
2094	assert_eq!(None, find("a", `1`, `1`));
2095	assert_eq!(None, find("ab", `0`, `2`));
2096	assert_eq!(None, find("ab", `1`, `2`));
2097	assert_eq!(None, find("ab", `2`, `2`));
2098	}
2099	}
2100