1use core::{fmt, mem};
2
3use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec};
4
5#[cfg(feature = "syntax")]
6use crate::nfa::thompson::{
7 compiler::{Compiler, Config},
8 error::BuildError,
9};
10use crate::{
11 nfa::thompson::builder::Builder,
12 util::{
13 alphabet::{self, ByteClassSet, ByteClasses},
14 captures::{GroupInfo, GroupInfoError},
15 look::{Look, LookMatcher, LookSet},
16 primitives::{
17 IteratorIndexExt, PatternID, PatternIDIter, SmallIndex, StateID,
18 },
19 sparse_set::SparseSet,
20 },
21};
22
23/// A byte oriented Thompson non-deterministic finite automaton (NFA).
24///
25/// A Thompson NFA is a finite state machine that permits unconditional epsilon
26/// transitions, but guarantees that there exists at most one non-epsilon
27/// transition for each element in the alphabet for each state.
28///
29/// An NFA may be used directly for searching, for analysis or to build
30/// a deterministic finite automaton (DFA).
31///
32/// # Cheap clones
33///
34/// Since an NFA is a core data type in this crate that many other regex
/// engines are built on top of, it is convenient to give ownership of an NFA
36/// to said regex engines. Because of this, an NFA uses reference counting
37/// internally. Therefore, it is cheap to clone and it is encouraged to do so.
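///
/// For example, here is a small sketch (using the same `PikeVM` constructor
/// shown in the examples below) of sharing one NFA between two regex engines
/// via cheap clones:
///
/// ```
/// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM};
///
/// let nfa = NFA::new(r"foo[0-9]+")?;
/// // Cloning the NFA is cheap: both engines share the same reference
/// // counted internals.
/// let re1 = PikeVM::new_from_nfa(nfa.clone())?;
/// let re2 = PikeVM::new_from_nfa(nfa)?;
/// let mut cache = re1.create_cache();
/// assert!(re1.is_match(&mut cache, "foo123"));
/// let mut cache = re2.create_cache();
/// assert!(re2.is_match(&mut cache, "foo123"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```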
38///
39/// # Capabilities
40///
41/// Using an NFA for searching via the
/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) provides the most "power"
/// of any regex engine in this crate. Namely, it supports the
44/// following in all cases:
45///
46/// 1. Detection of a match.
47/// 2. Location of a match, including both the start and end offset, in a
48/// single pass of the haystack.
49/// 3. Location of matching capturing groups.
50/// 4. Handles multiple patterns, including (1)-(3) when multiple patterns are
51/// present.
52///
53/// # Capturing Groups
54///
55/// Groups refer to parenthesized expressions inside a regex pattern. They look
56/// like this, where `exp` is an arbitrary regex:
57///
58/// * `(exp)` - An unnamed capturing group.
59/// * `(?P<name>exp)` or `(?<name>exp)` - A named capturing group.
60/// * `(?:exp)` - A non-capturing group.
61/// * `(?i:exp)` - A non-capturing group that sets flags.
62///
63/// Only the first two forms are said to be _capturing_. Capturing
64/// means that the last position at which they match is reportable. The
65/// [`Captures`](crate::util::captures::Captures) type provides convenient
66/// access to the match positions of capturing groups, which includes looking
67/// up capturing groups by their name.
68///
69/// # Byte oriented
70///
71/// This NFA is byte oriented, which means that all of its transitions are
72/// defined on bytes. In other words, the alphabet of an NFA consists of the
73/// 256 different byte values.
74///
75/// While DFAs nearly demand that they be byte oriented for performance
76/// reasons, an NFA could conceivably be *Unicode codepoint* oriented. Indeed,
77/// a previous version of this NFA supported both byte and codepoint oriented
78/// modes. A codepoint oriented mode can work because an NFA fundamentally uses
79/// a sparse representation of transitions, which works well with the large
80/// sparse space of Unicode codepoints.
81///
82/// Nevertheless, this NFA is only byte oriented. This choice is primarily
83/// driven by implementation simplicity, and also in part memory usage. In
84/// practice, performance between the two is roughly comparable. However,
85/// building a DFA (including a hybrid DFA) really wants a byte oriented NFA.
/// So if we do have a codepoint oriented NFA, then we also need to generate a
/// byte oriented NFA in order to build a hybrid NFA/DFA. Thus, by only
/// generating byte oriented NFAs, we produce one fewer NFA. In other words,
89/// if we made our NFA codepoint oriented, we'd need to *also* make it support
90/// a byte oriented mode, which is more complicated. But a byte oriented mode
91/// can support everything.
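///
/// As a quick sketch of what "byte oriented" means in practice, even a regex
/// for a single non-ASCII codepoint compiles down to transitions over the
/// bytes of its UTF-8 encoding. (The exact state layout is not an API
/// guarantee, but at least one byte range transition must be present.)
///
/// ```
/// use regex_automata::nfa::thompson::{NFA, State};
///
/// // '☃' is encoded as the three bytes \xE2\x98\x83.
/// let nfa = NFA::new("☃")?;
/// assert!(nfa
///     .states()
///     .iter()
///     .any(|s| matches!(s, State::ByteRange { .. })));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```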
92///
93/// # Differences with DFAs
94///
95/// At the theoretical level, the precise difference between an NFA and a DFA
96/// is that, in a DFA, for every state, an input symbol unambiguously refers
97/// to a single transition _and_ that an input symbol is required for each
98/// transition. At a practical level, this permits DFA implementations to be
99/// implemented at their core with a small constant number of CPU instructions
100/// for each byte of input searched. In practice, this makes them quite a bit
101/// faster than NFAs _in general_. Namely, in order to execute a search for any
102/// Thompson NFA, one needs to keep track of a _set_ of states, and execute
103/// the possible transitions on all of those states for each input symbol.
104/// Overall, this results in much more overhead. To a first approximation, one
105/// can expect DFA searches to be about an order of magnitude faster.
106///
107/// So why use an NFA at all? The main advantage of an NFA is that it takes
108/// linear time (in the size of the pattern string after repetitions have been
/// expanded) to build and uses linear memory. A DFA, on the other hand, may
110/// take exponential time and/or space to build. Even in non-pathological
111/// cases, DFAs often take quite a bit more memory than their NFA counterparts,
112/// _especially_ if large Unicode character classes are involved. Of course,
113/// an NFA also provides additional capabilities. For example, it can match
114/// Unicode word boundaries on non-ASCII text and resolve the positions of
115/// capturing groups.
116///
117/// Note that a [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) strikes a
118/// good balance between an NFA and a DFA. It avoids the exponential build time
119/// of a DFA while maintaining its fast search time. The downside of a hybrid
120/// NFA/DFA is that in some cases it can be slower at search time than the NFA.
121/// (It also has less functionality than a pure NFA. It cannot handle Unicode
122/// word boundaries on non-ASCII text and cannot resolve capturing groups.)
123///
124/// # Example
125///
126/// This shows how to build an NFA with the default configuration and execute a
127/// search using the Pike VM.
128///
129/// ```
130/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
131///
132/// let re = PikeVM::new(r"foo[0-9]+")?;
133/// let mut cache = re.create_cache();
134/// let mut caps = re.create_captures();
135///
136/// let expected = Some(Match::must(0, 0..8));
137/// re.captures(&mut cache, b"foo12345", &mut caps);
138/// assert_eq!(expected, caps.get_match());
139///
140/// # Ok::<(), Box<dyn std::error::Error>>(())
141/// ```
142///
143/// # Example: resolving capturing groups
144///
145/// This example shows how to parse some simple dates and extract the
146/// components of each date via capturing groups.
147///
148/// ```
149/// # if cfg!(miri) { return Ok(()); } // miri takes too long
150/// use regex_automata::{
151/// nfa::thompson::pikevm::PikeVM,
152/// util::captures::Captures,
153/// };
154///
155/// let vm = PikeVM::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})")?;
156/// let mut cache = vm.create_cache();
157///
158/// let haystack = "2012-03-14, 2013-01-01 and 2014-07-05";
159/// let all: Vec<Captures> = vm.captures_iter(
160/// &mut cache, haystack.as_bytes()
161/// ).collect();
162/// // There should be a total of 3 matches.
163/// assert_eq!(3, all.len());
164/// // The year from the second match is '2013'.
165/// let span = all[1].get_group_by_name("y").unwrap();
166/// assert_eq!("2013", &haystack[span]);
167///
168/// # Ok::<(), Box<dyn std::error::Error>>(())
169/// ```
170///
171/// This example shows that only the last match of a capturing group is
172/// reported, even if it had to match multiple times for an overall match
173/// to occur.
174///
175/// ```
176/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
177///
178/// let re = PikeVM::new(r"([a-z]){4}")?;
179/// let mut cache = re.create_cache();
180/// let mut caps = re.create_captures();
181///
182/// let haystack = b"quux";
183/// re.captures(&mut cache, haystack, &mut caps);
184/// assert!(caps.is_match());
185/// assert_eq!(Some(Span::from(3..4)), caps.get_group(1));
186///
187/// # Ok::<(), Box<dyn std::error::Error>>(())
188/// ```
189#[derive(Clone)]
190pub struct NFA(
191 // We make NFAs reference counted primarily for two reasons. First is that
192 // the NFA type itself is quite large (at least 0.5KB), and so it makes
193 // sense to put it on the heap by default anyway. Second is that, for Arc
194 // specifically, this enables cheap clones. This tends to be useful because
195 // several structures (the backtracker, the Pike VM, the hybrid NFA/DFA)
196 // all want to hang on to an NFA for use during search time. We could
197 // provide the NFA at search time via a function argument, but this makes
198 // for an unnecessarily annoying API. Instead, we just let each structure
199 // share ownership of the NFA. Using a deep clone would not be smart, since
200 // the NFA can use quite a bit of heap space.
201 Arc<Inner>,
202);
203
204impl NFA {
205 /// Parse the given regular expression using a default configuration and
206 /// build an NFA from it.
207 ///
208 /// If you want a non-default configuration, then use the NFA
209 /// [`Compiler`] with a [`Config`].
210 ///
211 /// # Example
212 ///
213 /// ```
214 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
215 ///
216 /// let re = PikeVM::new(r"foo[0-9]+")?;
217 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
218 ///
219 /// let expected = Some(Match::must(0, 0..8));
220 /// re.captures(&mut cache, b"foo12345", &mut caps);
221 /// assert_eq!(expected, caps.get_match());
222 ///
223 /// # Ok::<(), Box<dyn std::error::Error>>(())
224 /// ```
225 #[cfg(feature = "syntax")]
226 pub fn new(pattern: &str) -> Result<NFA, BuildError> {
227 NFA::compiler().build(pattern)
228 }
229
230 /// Parse the given regular expressions using a default configuration and
231 /// build a multi-NFA from them.
232 ///
233 /// If you want a non-default configuration, then use the NFA
234 /// [`Compiler`] with a [`Config`].
235 ///
236 /// # Example
237 ///
238 /// ```
239 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
240 ///
241 /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+"])?;
242 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
243 ///
244 /// let expected = Some(Match::must(1, 0..3));
245 /// re.captures(&mut cache, b"foo12345bar", &mut caps);
246 /// assert_eq!(expected, caps.get_match());
247 ///
248 /// # Ok::<(), Box<dyn std::error::Error>>(())
249 /// ```
250 #[cfg(feature = "syntax")]
251 pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<NFA, BuildError> {
252 NFA::compiler().build_many(patterns)
253 }
254
255 /// Returns an NFA with a single regex pattern that always matches at every
256 /// position.
257 ///
258 /// # Example
259 ///
260 /// ```
261 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
262 ///
263 /// let re = PikeVM::new_from_nfa(NFA::always_match())?;
264 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
265 ///
266 /// let expected = Some(Match::must(0, 0..0));
267 /// re.captures(&mut cache, b"", &mut caps);
268 /// assert_eq!(expected, caps.get_match());
269 /// re.captures(&mut cache, b"foo", &mut caps);
270 /// assert_eq!(expected, caps.get_match());
271 ///
272 /// # Ok::<(), Box<dyn std::error::Error>>(())
273 /// ```
274 pub fn always_match() -> NFA {
275 // We could use NFA::new("") here and we'd get the same semantics, but
        // hand-assembling the NFA (as below) does the same thing with fewer
        // states. It also avoids needing the 'syntax' feature
278 // enabled.
279 //
280 // Technically all we need is the "match" state, but we add the
281 // "capture" states so that the PikeVM can use this NFA.
282 //
283 // The unwraps below are OK because we add so few states that they will
284 // never exhaust any default limits in any environment.
285 let mut builder = Builder::new();
286 let pid = builder.start_pattern().unwrap();
287 assert_eq!(pid.as_usize(), 0);
288 let start_id =
289 builder.add_capture_start(StateID::ZERO, 0, None).unwrap();
290 let end_id = builder.add_capture_end(StateID::ZERO, 0).unwrap();
291 let match_id = builder.add_match().unwrap();
292 builder.patch(start_id, end_id).unwrap();
293 builder.patch(end_id, match_id).unwrap();
294 let pid = builder.finish_pattern(start_id).unwrap();
295 assert_eq!(pid.as_usize(), 0);
296 builder.build(start_id, start_id).unwrap()
297 }
298
299 /// Returns an NFA that never matches at any position.
300 ///
301 /// This is a convenience routine for creating an NFA with zero patterns.
302 ///
303 /// # Example
304 ///
305 /// ```
306 /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM};
307 ///
308 /// let re = PikeVM::new_from_nfa(NFA::never_match())?;
309 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
310 ///
311 /// re.captures(&mut cache, b"", &mut caps);
312 /// assert!(!caps.is_match());
313 /// re.captures(&mut cache, b"foo", &mut caps);
314 /// assert!(!caps.is_match());
315 ///
316 /// # Ok::<(), Box<dyn std::error::Error>>(())
317 /// ```
318 pub fn never_match() -> NFA {
319 // This always succeeds because it only requires one NFA state, which
320 // will never exhaust any (default) limits.
321 let mut builder = Builder::new();
322 let sid = builder.add_fail().unwrap();
323 builder.build(sid, sid).unwrap()
324 }
325
326 /// Return a default configuration for an `NFA`.
327 ///
328 /// This is a convenience routine to avoid needing to import the `Config`
329 /// type when customizing the construction of an NFA.
330 ///
331 /// # Example
332 ///
333 /// This example shows how to build an NFA with a small size limit that
334 /// results in a compilation error for any regex that tries to use more
335 /// heap memory than the configured limit.
336 ///
337 /// ```
338 /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM};
339 ///
340 /// let result = PikeVM::builder()
341 /// .thompson(NFA::config().nfa_size_limit(Some(1_000)))
342 /// // Remember, \w is Unicode-aware by default and thus huge.
343 /// .build(r"\w+");
344 /// assert!(result.is_err());
345 /// ```
346 #[cfg(feature = "syntax")]
347 pub fn config() -> Config {
348 Config::new()
349 }
350
351 /// Return a compiler for configuring the construction of an `NFA`.
352 ///
353 /// This is a convenience routine to avoid needing to import the
354 /// [`Compiler`] type in common cases.
355 ///
356 /// # Example
357 ///
    /// This example shows how to build an NFA that is permitted to match
    /// invalid UTF-8. Without the additional syntax configuration here,
    /// compilation of `(?-u:.)` would fail because it is permitted to match
    /// invalid UTF-8.
361 ///
362 /// ```
363 /// use regex_automata::{
364 /// nfa::thompson::pikevm::PikeVM,
365 /// util::syntax,
366 /// Match,
367 /// };
368 ///
369 /// let re = PikeVM::builder()
370 /// .syntax(syntax::Config::new().utf8(false))
371 /// .build(r"[a-z]+(?-u:.)")?;
372 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
373 ///
374 /// let expected = Some(Match::must(0, 1..5));
375 /// re.captures(&mut cache, b"\xFFabc\xFF", &mut caps);
376 /// assert_eq!(expected, caps.get_match());
377 ///
378 /// # Ok::<(), Box<dyn std::error::Error>>(())
379 /// ```
380 #[cfg(feature = "syntax")]
381 pub fn compiler() -> Compiler {
382 Compiler::new()
383 }
384
385 /// Returns an iterator over all pattern identifiers in this NFA.
386 ///
387 /// Pattern IDs are allocated in sequential order starting from zero,
388 /// where the order corresponds to the order of patterns provided to the
389 /// [`NFA::new_many`] constructor.
390 ///
391 /// # Example
392 ///
393 /// ```
394 /// use regex_automata::{nfa::thompson::NFA, PatternID};
395 ///
396 /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
397 /// let pids: Vec<PatternID> = nfa.patterns().collect();
398 /// assert_eq!(pids, vec![
399 /// PatternID::must(0),
400 /// PatternID::must(1),
401 /// PatternID::must(2),
402 /// ]);
403 ///
404 /// # Ok::<(), Box<dyn std::error::Error>>(())
405 /// ```
406 pub fn patterns(&self) -> PatternIter<'_> {
407 PatternIter {
408 it: PatternID::iter(self.pattern_len()),
409 _marker: core::marker::PhantomData,
410 }
411 }
412
413 /// Returns the total number of regex patterns in this NFA.
414 ///
415 /// This may return zero if the NFA was constructed with no patterns. In
416 /// this case, the NFA can never produce a match for any input.
417 ///
418 /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because
419 /// NFA construction will fail if too many patterns are added.
420 ///
421 /// It is always true that `nfa.patterns().count() == nfa.pattern_len()`.
422 ///
423 /// # Example
424 ///
425 /// ```
426 /// use regex_automata::nfa::thompson::NFA;
427 ///
428 /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
429 /// assert_eq!(3, nfa.pattern_len());
430 ///
431 /// let nfa = NFA::never_match();
432 /// assert_eq!(0, nfa.pattern_len());
433 ///
434 /// let nfa = NFA::always_match();
435 /// assert_eq!(1, nfa.pattern_len());
436 ///
437 /// # Ok::<(), Box<dyn std::error::Error>>(())
438 /// ```
439 #[inline]
440 pub fn pattern_len(&self) -> usize {
441 self.0.start_pattern.len()
442 }
443
444 /// Return the state identifier of the initial anchored state of this NFA.
445 ///
446 /// The returned identifier is guaranteed to be a valid index into the
447 /// slice returned by [`NFA::states`], and is also a valid argument to
448 /// [`NFA::state`].
449 ///
450 /// # Example
451 ///
452 /// This example shows a somewhat contrived example where we can easily
453 /// predict the anchored starting state.
454 ///
455 /// ```
456 /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures};
457 ///
458 /// let nfa = NFA::compiler()
459 /// .configure(NFA::config().which_captures(WhichCaptures::None))
460 /// .build("a")?;
461 /// let state = nfa.state(nfa.start_anchored());
462 /// match *state {
463 /// State::ByteRange { trans } => {
464 /// assert_eq!(b'a', trans.start);
465 /// assert_eq!(b'a', trans.end);
466 /// }
467 /// _ => unreachable!("unexpected state"),
468 /// }
469 ///
470 /// # Ok::<(), Box<dyn std::error::Error>>(())
471 /// ```
472 #[inline]
473 pub fn start_anchored(&self) -> StateID {
474 self.0.start_anchored
475 }
476
477 /// Return the state identifier of the initial unanchored state of this
478 /// NFA.
479 ///
480 /// This is equivalent to the identifier returned by
481 /// [`NFA::start_anchored`] when the NFA has no unanchored starting state.
482 ///
483 /// The returned identifier is guaranteed to be a valid index into the
484 /// slice returned by [`NFA::states`], and is also a valid argument to
485 /// [`NFA::state`].
486 ///
487 /// # Example
488 ///
489 /// This example shows that the anchored and unanchored starting states
490 /// are equivalent when an anchored NFA is built.
491 ///
492 /// ```
493 /// use regex_automata::nfa::thompson::NFA;
494 ///
495 /// let nfa = NFA::new("^a")?;
496 /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored());
497 ///
498 /// # Ok::<(), Box<dyn std::error::Error>>(())
499 /// ```
500 #[inline]
501 pub fn start_unanchored(&self) -> StateID {
502 self.0.start_unanchored
503 }
504
505 /// Return the state identifier of the initial anchored state for the given
506 /// pattern, or `None` if there is no pattern corresponding to the given
507 /// identifier.
508 ///
509 /// If one uses the starting state for a particular pattern, then the only
510 /// match that can be returned is for the corresponding pattern.
511 ///
512 /// The returned identifier is guaranteed to be a valid index into the
513 /// slice returned by [`NFA::states`], and is also a valid argument to
514 /// [`NFA::state`].
515 ///
    /// This returns `None` when the pattern doesn't exist in this NFA. That
    /// occurs when `pid.as_usize() >= nfa.pattern_len()`.
520 ///
521 /// # Example
522 ///
    /// This example shows that the per-pattern anchored starting states are
    /// distinct from the overall anchored starting state, even though all of
    /// the patterns are themselves anchored.
525 ///
526 /// ```
527 /// use regex_automata::{nfa::thompson::NFA, PatternID};
528 ///
529 /// let nfa = NFA::new_many(&["^a", "^b"])?;
530 /// // The anchored and unanchored states for the entire NFA are the same,
531 /// // since all of the patterns are anchored.
532 /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored());
533 /// // But the anchored starting states for each pattern are distinct,
534 /// // because these starting states can only lead to matches for the
535 /// // corresponding pattern.
536 /// let anchored = Some(nfa.start_anchored());
537 /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(0)));
538 /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(1)));
539 /// // Requesting a pattern not in the NFA will result in None:
540 /// assert_eq!(None, nfa.start_pattern(PatternID::must(2)));
541 ///
542 /// # Ok::<(), Box<dyn std::error::Error>>(())
543 /// ```
544 #[inline]
545 pub fn start_pattern(&self, pid: PatternID) -> Option<StateID> {
546 self.0.start_pattern.get(pid.as_usize()).copied()
547 }
548
549 /// Get the byte class set for this NFA.
550 ///
551 /// A byte class set is a partitioning of this NFA's alphabet into
552 /// equivalence classes. Any two bytes in the same equivalence class are
    /// guaranteed to never discriminate between a match and a non-match. (The
554 /// partitioning may not be minimal.)
555 ///
556 /// Byte classes are used internally by this crate when building DFAs.
557 /// Namely, among other optimizations, they enable a space optimization
558 /// where the DFA's internal alphabet is defined over the equivalence
559 /// classes of bytes instead of all possible byte values. The former is
560 /// often quite a bit smaller than the latter, which permits the DFA to use
561 /// less space for its transition table.
562 #[inline]
563 pub(crate) fn byte_class_set(&self) -> &ByteClassSet {
564 &self.0.byte_class_set
565 }
566
567 /// Get the byte classes for this NFA.
568 ///
569 /// Byte classes represent a partitioning of this NFA's alphabet into
570 /// equivalence classes. Any two bytes in the same equivalence class are
    /// guaranteed to never discriminate between a match and a non-match. (The
572 /// partitioning may not be minimal.)
573 ///
574 /// Byte classes are used internally by this crate when building DFAs.
575 /// Namely, among other optimizations, they enable a space optimization
576 /// where the DFA's internal alphabet is defined over the equivalence
577 /// classes of bytes instead of all possible byte values. The former is
578 /// often quite a bit smaller than the latter, which permits the DFA to use
579 /// less space for its transition table.
580 ///
581 /// # Example
582 ///
583 /// This example shows how to query the class of various bytes.
584 ///
585 /// ```
586 /// use regex_automata::nfa::thompson::NFA;
587 ///
588 /// let nfa = NFA::new("[a-z]+")?;
589 /// let classes = nfa.byte_classes();
590 /// // 'a' and 'z' are in the same class for this regex.
591 /// assert_eq!(classes.get(b'a'), classes.get(b'z'));
592 /// // But 'a' and 'A' are not.
593 /// assert_ne!(classes.get(b'a'), classes.get(b'A'));
594 ///
595 /// # Ok::<(), Box<dyn std::error::Error>>(())
596 /// ```
597 #[inline]
598 pub fn byte_classes(&self) -> &ByteClasses {
599 &self.0.byte_classes
600 }
601
602 /// Return a reference to the NFA state corresponding to the given ID.
603 ///
604 /// This is a convenience routine for `nfa.states()[id]`.
605 ///
606 /// # Panics
607 ///
608 /// This panics when the given identifier does not reference a valid state.
609 /// That is, when `id.as_usize() >= nfa.states().len()`.
610 ///
611 /// # Example
612 ///
613 /// The anchored state for a pattern will typically correspond to a
614 /// capturing state for that pattern. (Although, this is not an API
615 /// guarantee!)
616 ///
617 /// ```
618 /// use regex_automata::{nfa::thompson::{NFA, State}, PatternID};
619 ///
620 /// let nfa = NFA::new("a")?;
621 /// let state = nfa.state(nfa.start_pattern(PatternID::ZERO).unwrap());
622 /// match *state {
623 /// State::Capture { slot, .. } => {
624 /// assert_eq!(0, slot.as_usize());
625 /// }
626 /// _ => unreachable!("unexpected state"),
627 /// }
628 ///
629 /// # Ok::<(), Box<dyn std::error::Error>>(())
630 /// ```
631 #[inline]
632 pub fn state(&self, id: StateID) -> &State {
633 &self.states()[id]
634 }
635
636 /// Returns a slice of all states in this NFA.
637 ///
638 /// The slice returned is indexed by `StateID`. This provides a convenient
639 /// way to access states while following transitions among those states.
640 ///
641 /// # Example
642 ///
    /// This demonstrates that disabling Unicode mode can shrink the size of
    /// the NFA considerably in some cases, especially when Unicode character
    /// classes are involved.
646 ///
647 /// ```
648 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
649 /// use regex_automata::nfa::thompson::NFA;
650 ///
651 /// let nfa_unicode = NFA::new(r"\w")?;
652 /// let nfa_ascii = NFA::new(r"(?-u)\w")?;
653 /// // Yes, a factor of 45 difference. No lie.
654 /// assert!(40 * nfa_ascii.states().len() < nfa_unicode.states().len());
655 ///
656 /// # Ok::<(), Box<dyn std::error::Error>>(())
657 /// ```
658 #[inline]
659 pub fn states(&self) -> &[State] {
660 &self.0.states
661 }
662
663 /// Returns the capturing group info for this NFA.
664 ///
665 /// The [`GroupInfo`] provides a way to map to and from capture index
666 /// and capture name for each pattern. It also provides a mapping from
667 /// each of the capturing groups in every pattern to their corresponding
668 /// slot offsets encoded in [`State::Capture`] states.
669 ///
670 /// Note that `GroupInfo` uses reference counting internally, such that
671 /// cloning a `GroupInfo` is very cheap.
672 ///
673 /// # Example
674 ///
675 /// This example shows how to get a list of all capture group names for
676 /// a particular pattern.
677 ///
678 /// ```
679 /// use regex_automata::{nfa::thompson::NFA, PatternID};
680 ///
681 /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?;
    /// // The first is the implicit group that is always unnamed. The next
683 /// // 5 groups are the explicit groups found in the concrete syntax above.
684 /// let expected = vec![None, None, Some("foo"), None, None, Some("bar")];
685 /// let got: Vec<Option<&str>> =
686 /// nfa.group_info().pattern_names(PatternID::ZERO).collect();
687 /// assert_eq!(expected, got);
688 ///
689 /// // Using an invalid pattern ID will result in nothing yielded.
690 /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count();
691 /// assert_eq!(0, got);
692 ///
693 /// # Ok::<(), Box<dyn std::error::Error>>(())
694 /// ```
695 #[inline]
696 pub fn group_info(&self) -> &GroupInfo {
        self.0.group_info()
698 }
699
700 /// Returns true if and only if this NFA has at least one
701 /// [`Capture`](State::Capture) in its sequence of states.
702 ///
703 /// This is useful as a way to perform a quick test before attempting
704 /// something that does or does not require capture states. For example,
705 /// some regex engines (like the PikeVM) require capture states in order to
706 /// work at all.
707 ///
708 /// # Example
709 ///
710 /// This example shows a few different NFAs and whether they have captures
711 /// or not.
712 ///
713 /// ```
714 /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
715 ///
716 /// // Obviously has capture states.
717 /// let nfa = NFA::new("(a)")?;
718 /// assert!(nfa.has_capture());
719 ///
720 /// // Less obviously has capture states, because every pattern has at
721 /// // least one anonymous capture group corresponding to the match for the
722 /// // entire pattern.
723 /// let nfa = NFA::new("a")?;
724 /// assert!(nfa.has_capture());
725 ///
726 /// // Other than hand building your own NFA, this is the only way to build
727 /// // an NFA without capturing groups. In general, you should only do this
728 /// // if you don't intend to use any of the NFA-oriented regex engines.
729 /// // Overall, capturing groups don't have many downsides. Although they
730 /// // can add a bit of noise to simple NFAs, so it can be nice to disable
731 /// // them for debugging purposes.
732 /// //
733 /// // Notice that 'has_capture' is false here even when we have an
734 /// // explicit capture group in the pattern.
735 /// let nfa = NFA::compiler()
736 /// .configure(NFA::config().which_captures(WhichCaptures::None))
737 /// .build("(a)")?;
738 /// assert!(!nfa.has_capture());
739 ///
740 /// # Ok::<(), Box<dyn std::error::Error>>(())
741 /// ```
742 #[inline]
743 pub fn has_capture(&self) -> bool {
744 self.0.has_capture
745 }
746
747 /// Returns true if and only if this NFA can match the empty string.
748 /// When it returns false, all possible matches are guaranteed to have a
749 /// non-zero length.
750 ///
    /// This is useful as a cheap way to know whether code needs to handle the
752 /// case of a zero length match. This is particularly important when UTF-8
753 /// modes are enabled, as when UTF-8 mode is enabled, empty matches that
754 /// split a codepoint must never be reported. This extra handling can
755 /// sometimes be costly, and since regexes matching an empty string are
756 /// somewhat rare, it can be beneficial to treat such regexes specially.
757 ///
758 /// # Example
759 ///
760 /// This example shows a few different NFAs and whether they match the
761 /// empty string or not. Notice the empty string isn't merely a matter
762 /// of a string of length literally `0`, but rather, whether a match can
763 /// occur between specific pairs of bytes.
764 ///
765 /// ```
766 /// use regex_automata::{nfa::thompson::NFA, util::syntax};
767 ///
768 /// // The empty regex matches the empty string.
769 /// let nfa = NFA::new("")?;
770 /// assert!(nfa.has_empty(), "empty matches empty");
771 /// // The '+' repetition operator requires at least one match, and so
772 /// // does not match the empty string.
773 /// let nfa = NFA::new("a+")?;
774 /// assert!(!nfa.has_empty(), "+ does not match empty");
775 /// // But the '*' repetition operator does.
776 /// let nfa = NFA::new("a*")?;
777 /// assert!(nfa.has_empty(), "* does match empty");
778 /// // And wrapping '+' in an operator that can match an empty string also
779 /// // causes it to match the empty string too.
780 /// let nfa = NFA::new("(a+)*")?;
781 /// assert!(nfa.has_empty(), "+ inside of * matches empty");
782 ///
783 /// // If a regex is just made of a look-around assertion, even if the
784 /// // assertion requires some kind of non-empty string around it (such as
785 /// // \b), then it is still treated as if it matches the empty string.
786 /// // Namely, if a match occurs of just a look-around assertion, then the
787 /// // match returned is empty.
788 /// let nfa = NFA::compiler()
789 /// .syntax(syntax::Config::new().utf8(false))
790 /// .build(r"^$\A\z\b\B(?-u:\b\B)")?;
791 /// assert!(nfa.has_empty(), "assertions match empty");
792 /// // Even when an assertion is wrapped in a '+', it still matches the
793 /// // empty string.
794 /// let nfa = NFA::new(r"\b+")?;
795 /// assert!(nfa.has_empty(), "+ of an assertion matches empty");
796 ///
797 /// // An alternation with even one branch that can match the empty string
798 /// // is also said to match the empty string overall.
799 /// let nfa = NFA::new("foo|(bar)?|quux")?;
800 /// assert!(nfa.has_empty(), "alternations can match empty");
801 ///
802 /// // An NFA that matches nothing does not match the empty string.
803 /// let nfa = NFA::new("[a&&b]")?;
804 /// assert!(!nfa.has_empty(), "never matching means not matching empty");
805 /// // But if it's wrapped in something that doesn't require a match at
806 /// // all, then it can match the empty string!
807 /// let nfa = NFA::new("[a&&b]*")?;
808 /// assert!(nfa.has_empty(), "* on never-match still matches empty");
809 /// // Since a '+' requires a match, using it on something that can never
810 /// // match will itself produce a regex that can never match anything,
811 /// // and thus does not match the empty string.
812 /// let nfa = NFA::new("[a&&b]+")?;
813 /// assert!(!nfa.has_empty(), "+ on never-match still matches nothing");
814 ///
815 /// # Ok::<(), Box<dyn std::error::Error>>(())
816 /// ```
817 #[inline]
818 pub fn has_empty(&self) -> bool {
819 self.0.has_empty
820 }
821
822 /// Whether UTF-8 mode is enabled for this NFA or not.
823 ///
824 /// When UTF-8 mode is enabled, all matches reported by a regex engine
825 /// derived from this NFA are guaranteed to correspond to spans of valid
826 /// UTF-8. This includes zero-width matches. For example, the regex engine
827 /// must guarantee that the empty regex will not match at the positions
828 /// between code units in the UTF-8 encoding of a single codepoint.
829 ///
830 /// See [`Config::utf8`] for more information.
831 ///
832 /// This is enabled by default.
833 ///
834 /// # Example
835 ///
836 /// This example shows how UTF-8 mode can impact the match spans that may
837 /// be reported in certain cases.
838 ///
839 /// ```
840 /// use regex_automata::{
841 /// nfa::thompson::{self, pikevm::PikeVM},
842 /// Match, Input,
843 /// };
844 ///
845 /// let re = PikeVM::new("")?;
846 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
847 ///
848 /// // UTF-8 mode is enabled by default.
849 /// let mut input = Input::new("☃");
850 /// re.search(&mut cache, &input, &mut caps);
851 /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
852 ///
853 /// // Even though an empty regex matches at 1..1, our next match is
854 /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
855 /// // three bytes long).
856 /// input.set_start(1);
857 /// re.search(&mut cache, &input, &mut caps);
858 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
859 ///
860 /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
861 /// let re = PikeVM::builder()
862 /// .thompson(thompson::Config::new().utf8(false))
863 /// .build("")?;
864 /// re.search(&mut cache, &input, &mut caps);
865 /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
866 ///
867 /// input.set_start(2);
868 /// re.search(&mut cache, &input, &mut caps);
869 /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
870 ///
871 /// input.set_start(3);
872 /// re.search(&mut cache, &input, &mut caps);
873 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
874 ///
875 /// input.set_start(4);
876 /// re.search(&mut cache, &input, &mut caps);
877 /// assert_eq!(None, caps.get_match());
878 ///
879 /// # Ok::<(), Box<dyn std::error::Error>>(())
880 /// ```
881 #[inline]
882 pub fn is_utf8(&self) -> bool {
883 self.0.utf8
884 }
885
886 /// Returns true when this NFA is meant to be matched in reverse.
887 ///
888 /// Generally speaking, when this is true, it means the NFA is supposed to
889 /// be used in conjunction with moving backwards through the haystack. That
890 /// is, from a higher memory address to a lower memory address.
891 ///
892 /// It is often the case that lower level routines dealing with an NFA
893 /// don't need to care about whether it is "meant" to be matched in reverse
894 /// or not. However, there are some specific cases where it matters. For
895 /// example, the implementation of CRLF-aware `^` and `$` line anchors
896 /// needs to know whether the search is in the forward or reverse
897 /// direction. In the forward direction, neither `^` nor `$` should match
898 /// when a `\r` has been seen previously and a `\n` is next. However, in
899 /// the reverse direction, neither `^` nor `$` should match when a `\n`
900 /// has been seen previously and a `\r` is next. This fundamentally changes
901 /// how the state machine is constructed, and thus needs to be altered
902 /// based on the direction of the search.
903 ///
    /// This is automatically set when using a [`Compiler`] with a
    /// configuration where [`Config::reverse`] is enabled. Otherwise, if
    /// you're building your own NFA by hand via a [`Builder`], it is set via
    /// [`Builder::set_reverse`].
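    ///
    /// # Example
    ///
    /// A small sketch showing how the compiler's [`Config::reverse`] option
    /// is reflected by this method. (Capture states are disabled here, since
    /// reverse NFAs are generally built without them.)
    ///
    /// ```
    /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
    ///
    /// let nfa = NFA::new("abc")?;
    /// assert!(!nfa.is_reverse());
    ///
    /// let nfa = NFA::compiler()
    ///     .configure(NFA::config()
    ///         .which_captures(WhichCaptures::None)
    ///         .reverse(true))
    ///     .build("abc")?;
    /// assert!(nfa.is_reverse());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```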
907 #[inline]
908 pub fn is_reverse(&self) -> bool {
909 self.0.reverse
910 }
911
912 /// Returns true if and only if all starting states for this NFA correspond
913 /// to the beginning of an anchored search.
914 ///
915 /// Typically, an NFA will have both an anchored and an unanchored starting
916 /// state. Namely, because it tends to be useful to have both and the cost
917 /// of having an unanchored starting state is almost zero (for an NFA).
918 /// However, if all patterns in the NFA are themselves anchored, then even
919 /// the unanchored starting state will correspond to an anchored search
920 /// since the pattern doesn't permit anything else.
921 ///
922 /// # Example
923 ///
924 /// This example shows a few different scenarios where this method's
925 /// return value varies.
926 ///
927 /// ```
928 /// use regex_automata::nfa::thompson::NFA;
929 ///
930 /// // The unanchored starting state permits matching this pattern anywhere
931 /// // in a haystack, instead of just at the beginning.
932 /// let nfa = NFA::new("a")?;
933 /// assert!(!nfa.is_always_start_anchored());
934 ///
935 /// // In this case, the pattern is itself anchored, so there is no way
936 /// // to run an unanchored search.
937 /// let nfa = NFA::new("^a")?;
938 /// assert!(nfa.is_always_start_anchored());
939 ///
940 /// // When multiline mode is enabled, '^' can match at the start of a line
941 /// // in addition to the start of a haystack, so an unanchored search is
942 /// // actually possible.
943 /// let nfa = NFA::new("(?m)^a")?;
944 /// assert!(!nfa.is_always_start_anchored());
945 ///
946 /// // Weird cases also work. A pattern is only considered anchored if all
947 /// // matches may only occur at the start of a haystack.
948 /// let nfa = NFA::new("(^a)|a")?;
949 /// assert!(!nfa.is_always_start_anchored());
950 ///
951 /// // When multiple patterns are present, if they are all anchored, then
952 /// // the NFA is always anchored too.
953 /// let nfa = NFA::new_many(&["^a", "^b", "^c"])?;
954 /// assert!(nfa.is_always_start_anchored());
955 ///
956 /// // But if one pattern is unanchored, then the NFA must permit an
957 /// // unanchored search.
958 /// let nfa = NFA::new_many(&["^a", "b", "^c"])?;
959 /// assert!(!nfa.is_always_start_anchored());
960 ///
961 /// # Ok::<(), Box<dyn std::error::Error>>(())
962 /// ```
963 #[inline]
964 pub fn is_always_start_anchored(&self) -> bool {
965 self.start_anchored() == self.start_unanchored()
966 }
967
968 /// Returns the look-around matcher associated with this NFA.
969 ///
970 /// A look-around matcher determines how to match look-around assertions.
971 /// In particular, some assertions are configurable. For example, the
972 /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
973 /// from the default of `\n` to any other byte.
974 ///
975 /// If the NFA was built using a [`Compiler`], then this matcher
976 /// can be set via the [`Config::look_matcher`] configuration
977 /// knob. Otherwise, if you've built an NFA by hand, it is set via
978 /// [`Builder::set_look_matcher`].
979 ///
980 /// # Example
981 ///
982 /// This shows how to change the line terminator for multi-line assertions.
983 ///
984 /// ```
985 /// use regex_automata::{
986 /// nfa::thompson::{self, pikevm::PikeVM},
987 /// util::look::LookMatcher,
988 /// Match, Input,
989 /// };
990 ///
991 /// let mut lookm = LookMatcher::new();
992 /// lookm.set_line_terminator(b'\x00');
993 ///
994 /// let re = PikeVM::builder()
995 /// .thompson(thompson::Config::new().look_matcher(lookm))
996 /// .build(r"(?m)^[a-z]+$")?;
997 /// let mut cache = re.create_cache();
998 ///
999 /// // Multi-line assertions now use NUL as a terminator.
1000 /// assert_eq!(
1001 /// Some(Match::must(0, 1..4)),
1002 /// re.find(&mut cache, b"\x00abc\x00"),
1003 /// );
1004 /// // ... and \n is no longer recognized as a terminator.
1005 /// assert_eq!(
1006 /// None,
1007 /// re.find(&mut cache, b"\nabc\n"),
1008 /// );
1009 ///
1010 /// # Ok::<(), Box<dyn std::error::Error>>(())
1011 /// ```
1012 #[inline]
1013 pub fn look_matcher(&self) -> &LookMatcher {
1014 &self.0.look_matcher
1015 }
1016
1017 /// Returns the union of all look-around assertions used throughout this
1018 /// NFA. When the returned set is empty, it implies that the NFA has no
1019 /// look-around assertions and thus zero conditional epsilon transitions.
1020 ///
    /// This is useful in some cases for enabling optimizations. It is not
1022 /// unusual, for example, for optimizations to be of the form, "for any
1023 /// regex with zero conditional epsilon transitions, do ..." where "..."
1024 /// is some kind of optimization.
1025 ///
1026 /// This isn't only helpful for optimizations either. Sometimes look-around
1027 /// assertions are difficult to support. For example, many of the DFAs in
1028 /// this crate don't support Unicode word boundaries or handle them using
1029 /// heuristics. Handling that correctly typically requires some kind of
1030 /// cheap check of whether the NFA has a Unicode word boundary in the first
1031 /// place.
1032 ///
1033 /// # Example
1034 ///
1035 /// This example shows how this routine varies based on the regex pattern:
1036 ///
1037 /// ```
1038 /// use regex_automata::{nfa::thompson::NFA, util::look::Look};
1039 ///
1040 /// // No look-around at all.
1041 /// let nfa = NFA::new("a")?;
1042 /// assert!(nfa.look_set_any().is_empty());
1043 ///
1044 /// // When multiple patterns are present, since this returns the union,
1045 /// // it will include look-around assertions that only appear in one
1046 /// // pattern.
1047 /// let nfa = NFA::new_many(&["a", "b", "a^b", "c"])?;
1048 /// assert!(nfa.look_set_any().contains(Look::Start));
1049 ///
1050 /// // Some groups of assertions have various shortcuts. For example:
1051 /// let nfa = NFA::new(r"(?-u:\b)")?;
1052 /// assert!(nfa.look_set_any().contains_word());
1053 /// assert!(!nfa.look_set_any().contains_word_unicode());
1054 /// assert!(nfa.look_set_any().contains_word_ascii());
1055 ///
1056 /// # Ok::<(), Box<dyn std::error::Error>>(())
1057 /// ```
1058 #[inline]
1059 pub fn look_set_any(&self) -> LookSet {
1060 self.0.look_set_any
1061 }
1062
1063 /// Returns the union of all prefix look-around assertions for every
1064 /// pattern in this NFA. When the returned set is empty, it implies none of
1065 /// the patterns require moving through a conditional epsilon transition
1066 /// before inspecting the first byte in the haystack.
1067 ///
1068 /// This can be useful for determining what kinds of assertions need to be
1069 /// satisfied at the beginning of a search. For example, typically DFAs
1070 /// in this crate will build a distinct starting state for each possible
1071 /// starting configuration that might result in look-around assertions
1072 /// being satisfied differently. However, if the set returned here is
1073 /// empty, then you know that the start state is invariant because there
1074 /// are no conditional epsilon transitions to consider.
1075 ///
1076 /// # Example
1077 ///
1078 /// This example shows how this routine varies based on the regex pattern:
1079 ///
1080 /// ```
1081 /// use regex_automata::{nfa::thompson::NFA, util::look::Look};
1082 ///
1083 /// // No look-around at all.
1084 /// let nfa = NFA::new("a")?;
1085 /// assert!(nfa.look_set_prefix_any().is_empty());
1086 ///
1087 /// // When multiple patterns are present, since this returns the union,
1088 /// // it will include look-around assertions that only appear in one
1089 /// // pattern. But it will only include assertions that are in the prefix
1090 /// // of a pattern. For example, this includes '^' but not '$' even though
1091 /// // '$' does appear.
1092 /// let nfa = NFA::new_many(&["a", "b", "^ab$", "c"])?;
1093 /// assert!(nfa.look_set_prefix_any().contains(Look::Start));
1094 /// assert!(!nfa.look_set_prefix_any().contains(Look::End));
1095 ///
1096 /// # Ok::<(), Box<dyn std::error::Error>>(())
1097 /// ```
1098 #[inline]
1099 pub fn look_set_prefix_any(&self) -> LookSet {
1100 self.0.look_set_prefix_any
1101 }
1102
1103 // FIXME: The `look_set_prefix_all` computation was not correct, and it
1104 // seemed a little tricky to fix it. Since I wasn't actually using it for
1105 // anything, I just decided to remove it in the run up to the regex 1.9
1106 // release. If you need this, please file an issue.
1107 /*
1108 /// Returns the intersection of all prefix look-around assertions for every
1109 /// pattern in this NFA. When the returned set is empty, it implies at
1110 /// least one of the patterns does not require moving through a conditional
1111 /// epsilon transition before inspecting the first byte in the haystack.
1112 /// Conversely, when the set contains an assertion, it implies that every
1113 /// pattern in the NFA also contains that assertion in its prefix.
1114 ///
1115 /// This can be useful for determining what kinds of assertions need to be
1116 /// satisfied at the beginning of a search. For example, if you know that
1117 /// [`Look::Start`] is in the prefix intersection set returned here, then
1118 /// you know that all searches, regardless of input configuration, will be
1119 /// anchored.
1120 ///
1121 /// # Example
1122 ///
1123 /// This example shows how this routine varies based on the regex pattern:
1124 ///
1125 /// ```
1126 /// use regex_automata::{nfa::thompson::NFA, util::look::Look};
1127 ///
1128 /// // No look-around at all.
1129 /// let nfa = NFA::new("a")?;
1130 /// assert!(nfa.look_set_prefix_all().is_empty());
1131 ///
1132 /// // When multiple patterns are present, since this returns the
1133 /// // intersection, it will only include assertions present in every
1134 /// // prefix, and only the prefix.
1135 /// let nfa = NFA::new_many(&["^a$", "^b$", "$^ab$", "^c$"])?;
1136 /// assert!(nfa.look_set_prefix_all().contains(Look::Start));
1137 /// assert!(!nfa.look_set_prefix_all().contains(Look::End));
1138 ///
1139 /// # Ok::<(), Box<dyn std::error::Error>>(())
1140 /// ```
1141 #[inline]
1142 pub fn look_set_prefix_all(&self) -> LookSet {
1143 self.0.look_set_prefix_all
1144 }
1145 */
1146
1147 /// Returns the memory usage, in bytes, of this NFA.
1148 ///
1149 /// This does **not** include the stack size used up by this NFA. To
1150 /// compute that, use `std::mem::size_of::<NFA>()`.
1151 ///
1152 /// # Example
1153 ///
1154 /// This example shows that large Unicode character classes can use quite
1155 /// a bit of memory.
1156 ///
1157 /// ```
1158 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1159 /// use regex_automata::nfa::thompson::NFA;
1160 ///
1161 /// let nfa_unicode = NFA::new(r"\w")?;
1162 /// let nfa_ascii = NFA::new(r"(?-u:\w)")?;
1163 ///
1164 /// assert!(10 * nfa_ascii.memory_usage() < nfa_unicode.memory_usage());
1165 ///
1166 /// # Ok::<(), Box<dyn std::error::Error>>(())
1167 /// ```
1168 #[inline]
1169 pub fn memory_usage(&self) -> usize {
1170 use core::mem::size_of;
1171
1172 size_of::<Inner>() // allocated on the heap via Arc
1173 + self.0.states.len() * size_of::<State>()
1174 + self.0.start_pattern.len() * size_of::<StateID>()
1175 + self.0.group_info.memory_usage()
1176 + self.0.memory_extra
1177 }
1178}
1179
1180impl fmt::Debug for NFA {
1181 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1182 self.0.fmt(f)
1183 }
1184}
1185
1186/// The "inner" part of the NFA. We split this part out so that we can easily
1187/// wrap it in an `Arc` above in the definition of `NFA`.
1188///
1189/// See builder.rs for the code that actually builds this type. This module
1190/// does provide (internal) mutable methods for adding things to this
1191/// NFA before finalizing it, but the high level construction process is
1192/// controlled by the builder abstraction. (Which is complicated enough to
1193/// get its own module.)
1194#[derive(Default)]
1195pub(super) struct Inner {
1196 /// The state sequence. This sequence is guaranteed to be indexable by all
1197 /// starting state IDs, and it is also guaranteed to contain at most one
1198 /// `Match` state for each pattern compiled into this NFA. (A pattern may
1199 /// not have a corresponding `Match` state if a `Match` state is impossible
1200 /// to reach.)
1201 states: Vec<State>,
1202 /// The anchored starting state of this NFA.
1203 start_anchored: StateID,
1204 /// The unanchored starting state of this NFA.
1205 start_unanchored: StateID,
1206 /// The starting states for each individual pattern. Starting at any
1207 /// of these states will result in only an anchored search for the
1208 /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
1209 /// contains a single regex, then `start_pattern[0]` and `start_anchored`
1210 /// are always equivalent.
1211 start_pattern: Vec<StateID>,
1212 /// Info about the capturing groups in this NFA. This is responsible for
1213 /// mapping groups to slots, mapping groups to names and names to groups.
1214 group_info: GroupInfo,
1215 /// A representation of equivalence classes over the transitions in this
1216 /// NFA. Two bytes in the same equivalence class must not discriminate
1217 /// between a match or a non-match. This map can be used to shrink the
1218 /// total size of a DFA's transition table with a small match-time cost.
1219 ///
1220 /// Note that the NFA's transitions are *not* defined in terms of these
1221 /// equivalence classes. The NFA's transitions are defined on the original
1222 /// byte values. For the most part, this is because they wouldn't really
1223 /// help the NFA much since the NFA already uses a sparse representation
1224 /// to represent transitions. Byte classes are most effective in a dense
1225 /// representation.
1226 byte_class_set: ByteClassSet,
1227 /// This is generated from `byte_class_set`, and essentially represents the
1228 /// same thing but supports different access patterns. Namely, this permits
1229 /// looking up the equivalence class of a byte very cheaply.
1230 ///
1231 /// Ideally we would just store this, but because of annoying code
1232 /// structure reasons, we keep both this and `byte_class_set` around for
1233 /// now. I think I would prefer that `byte_class_set` were computed in the
1234 /// `Builder`, but right now, we compute it as states are added to the
1235 /// `NFA`.
1236 byte_classes: ByteClasses,
1237 /// Whether this NFA has a `Capture` state anywhere.
1238 has_capture: bool,
    /// Whether the empty string is in the language matched by this NFA.
1240 has_empty: bool,
1241 /// Whether UTF-8 mode is enabled for this NFA. Briefly, this means that
1242 /// all non-empty matches produced by this NFA correspond to spans of valid
1243 /// UTF-8, and any empty matches produced by this NFA that split a UTF-8
1244 /// encoded codepoint should be filtered out by the corresponding regex
1245 /// engine.
1246 utf8: bool,
1247 /// Whether this NFA is meant to be matched in reverse or not.
1248 reverse: bool,
1249 /// The matcher to be used for look-around assertions.
1250 look_matcher: LookMatcher,
1251 /// The union of all look-around assertions that occur anywhere within
1252 /// this NFA. If this set is empty, then it means there are precisely zero
1253 /// conditional epsilon transitions in the NFA.
1254 look_set_any: LookSet,
1255 /// The union of all look-around assertions that occur as a zero-length
1256 /// prefix for any of the patterns in this NFA.
1257 look_set_prefix_any: LookSet,
1258 /*
1259 /// The intersection of all look-around assertions that occur as a
1260 /// zero-length prefix for any of the patterns in this NFA.
1261 look_set_prefix_all: LookSet,
1262 */
1263 /// Heap memory used indirectly by NFA states and other things (like the
1264 /// various capturing group representations above). Since each state
1265 /// might use a different amount of heap, we need to keep track of this
1266 /// incrementally.
1267 memory_extra: usize,
1268}
1269
1270impl Inner {
1271 /// Runs any last finalization bits and turns this into a full NFA.
1272 pub(super) fn into_nfa(mut self) -> NFA {
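        // Finalize the byte equivalence classes from the set that was
        // accumulated as states were added via `add`.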
1273 self.byte_classes = self.byte_class_set.byte_classes();
1274 // Do epsilon closure from the start state of every pattern in order
1275 // to compute various properties such as look-around assertions and
1276 // whether the empty string can be matched.
1277 let mut stack = vec![];
1278 let mut seen = SparseSet::new(self.states.len());
1279 for &start_id in self.start_pattern.iter() {
1280 stack.push(start_id);
1281 seen.clear();
1282 // let mut prefix_all = LookSet::full();
1283 let mut prefix_any = LookSet::empty();
1284 while let Some(sid) = stack.pop() {
1285 if !seen.insert(sid) {
1286 continue;
1287 }
1288 match self.states[sid] {
1289 State::ByteRange { .. }
1290 | State::Dense { .. }
1291 | State::Fail => continue,
1292 State::Sparse(_) => {
1293 // This snippet below will rewrite this sparse state
1294 // as a dense state. By doing it here, we apply this
1295 // optimization to all hot "sparse" states since these
1296 // are the states that are reachable from the start
1297 // state via an epsilon closure.
1298 //
1299 // Unfortunately, this optimization did not seem to
1300 // help much in some very limited ad hoc benchmarking.
1301 //
1302 // I left the 'Dense' state type in place in case we
1303 // want to revisit this, but I suspect the real way
1304 // to make forward progress is a more fundamental
1305 // rearchitecting of how data in the NFA is laid out.
1306 // I think we should consider a single contiguous
1307 // allocation instead of all this indirection and
1308 // potential heap allocations for every state. But this
1309 // is a large re-design and will require API breaking
1310 // changes.
1311 // self.memory_extra -= self.states[sid].memory_usage();
1312 // let trans = DenseTransitions::from_sparse(sparse);
1313 // self.states[sid] = State::Dense(trans);
1314 // self.memory_extra += self.states[sid].memory_usage();
1315 continue;
1316 }
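                    // A match state reachable from a start state through
                    // epsilon transitions alone means this NFA can match the
                    // empty string.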
1317 State::Match { .. } => self.has_empty = true,
1318 State::Look { look, next } => {
1319 prefix_any = prefix_any.insert(look);
1320 stack.push(next);
1321 }
1322 State::Union { ref alternates } => {
1323 // Order doesn't matter here, since we're just dealing
1324 // with look-around sets. But if we do richer analysis
1325 // here that needs to care about preference order, then
1326 // this should be done in reverse.
1327 stack.extend(alternates.iter());
1328 }
1329 State::BinaryUnion { alt1, alt2 } => {
1330 stack.push(alt2);
1331 stack.push(alt1);
1332 }
1333 State::Capture { next, .. } => {
1334 stack.push(next);
1335 }
1336 }
1337 }
1338 self.look_set_prefix_any =
1339 self.look_set_prefix_any.union(prefix_any);
1340 }
1341 NFA(Arc::new(self))
1342 }
1343
1344 /// Returns the capturing group info for this NFA.
1345 pub(super) fn group_info(&self) -> &GroupInfo {
1346 &self.group_info
1347 }
1348
1349 /// Add the given state to this NFA after allocating a fresh identifier for
1350 /// it.
1351 ///
1352 /// This panics if too many states are added such that a fresh identifier
1353 /// could not be created. (Currently, the only caller of this routine is
1354 /// a `Builder`, and it upholds this invariant.)
1355 pub(super) fn add(&mut self, state: State) -> StateID {
1356 match state {
1357 State::ByteRange { ref trans } => {
1358 self.byte_class_set.set_range(trans.start, trans.end);
1359 }
1360 State::Sparse(ref sparse) => {
1361 for trans in sparse.transitions.iter() {
1362 self.byte_class_set.set_range(trans.start, trans.end);
1363 }
1364 }
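            // The builder is not expected to hand us dense states directly;
            // they are only ever derived from sparse states (see `into_nfa`),
            // so this arm should be unreachable here.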
1365 State::Dense { .. } => unreachable!(),
1366 State::Look { look, .. } => {
1367 self.look_matcher
1368 .add_to_byteset(look, &mut self.byte_class_set);
1369 self.look_set_any = self.look_set_any.insert(look);
1370 }
1371 State::Capture { .. } => {
1372 self.has_capture = true;
1373 }
1374 State::Union { .. }
1375 | State::BinaryUnion { .. }
1376 | State::Fail
1377 | State::Match { .. } => {}
1378 }
1379
1380 let id = StateID::new(self.states.len()).unwrap();
1381 self.memory_extra += state.memory_usage();
1382 self.states.push(state);
1383 id
1384 }
1385
1386 /// Set the starting state identifiers for this NFA.
1387 ///
1388 /// `start_anchored` and `start_unanchored` may be equivalent. When they
1389 /// are, then the NFA can only execute anchored searches. This might
1390 /// occur, for example, for patterns that are unconditionally anchored.
1391 /// e.g., `^foo`.
1392 pub(super) fn set_starts(
1393 &mut self,
1394 start_anchored: StateID,
1395 start_unanchored: StateID,
1396 start_pattern: &[StateID],
1397 ) {
1398 self.start_anchored = start_anchored;
1399 self.start_unanchored = start_unanchored;
1400 self.start_pattern = start_pattern.to_vec();
1401 }
1402
1403 /// Sets the UTF-8 mode of this NFA.
1404 pub(super) fn set_utf8(&mut self, yes: bool) {
1405 self.utf8 = yes;
1406 }
1407
1408 /// Sets the reverse mode of this NFA.
1409 pub(super) fn set_reverse(&mut self, yes: bool) {
1410 self.reverse = yes;
1411 }
1412
1413 /// Sets the look-around assertion matcher for this NFA.
1414 pub(super) fn set_look_matcher(&mut self, m: LookMatcher) {
1415 self.look_matcher = m;
1416 }
1417
1418 /// Set the capturing groups for this NFA.
1419 ///
    /// The given slice should contain the capturing groups for each pattern.
    /// The capturing groups in turn should correspond to the total number of
    /// capturing groups in each pattern, including the anonymous first
    /// capture group for each pattern. If a capturing group does have a
    /// name, then it should be provided as an `Arc<str>`.
1425 ///
1426 /// This returns an error if a corresponding `GroupInfo` could not be
1427 /// built.
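    ///
    /// # Example
    ///
    /// A sketch of the expected shape of `captures`, checked against the
    /// public `GroupInfo` constructor that this routine delegates to. The
    /// patterns and the group name used here are purely illustrative:
    ///
    /// ```
    /// use std::sync::Arc;
    ///
    /// use regex_automata::util::captures::GroupInfo;
    ///
    /// // One entry per pattern. Each entry starts with the anonymous group
    /// // 0 (which never has a name), followed by the pattern's explicit
    /// // capturing groups, named or not.
    /// let captures: Vec<Vec<Option<Arc<str>>>> = vec![
    ///     // pattern 0, e.g., ([0-9]{4})-(?<month>[0-9]{2})
    ///     vec![None, None, Some(Arc::from("month"))],
    ///     // pattern 1, e.g., a pattern with no explicit groups
    ///     vec![None],
    /// ];
    /// assert!(GroupInfo::new(
    ///     captures.iter().map(|x| x.iter().map(|y| y.as_ref())),
    /// )
    /// .is_ok());
    /// ```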
1428 pub(super) fn set_captures(
1429 &mut self,
1430 captures: &[Vec<Option<Arc<str>>>],
1431 ) -> Result<(), GroupInfoError> {
1432 self.group_info = GroupInfo::new(
1433 captures.iter().map(|x| x.iter().map(|y| y.as_ref())),
1434 )?;
1435 Ok(())
1436 }
1437
1438 /// Remap the transitions in every state of this NFA using the given map.
    /// The given map should be indexed according to the state ID namespace
    /// used by the transitions of the states currently in this NFA.
1441 ///
1442 /// This is particularly useful to the NFA builder, since it is convenient
1443 /// to add NFA states in order to produce their final IDs. Then, after all
1444 /// of the intermediate "empty" states (unconditional epsilon transitions)
1445 /// have been removed from the builder's representation, we can re-map all
1446 /// of the transitions in the states already added to their final IDs.
1447 pub(super) fn remap(&mut self, old_to_new: &[StateID]) {
1448 for state in &mut self.states {
1449 state.remap(old_to_new);
1450 }
1451 self.start_anchored = old_to_new[self.start_anchored];
1452 self.start_unanchored = old_to_new[self.start_unanchored];
1453 for id in self.start_pattern.iter_mut() {
1454 *id = old_to_new[*id];
1455 }
1456 }
1457}
1458
1459impl fmt::Debug for Inner {
1460 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1461 writeln!(f, "thompson::NFA(")?;
1462 for (sid, state) in self.states.iter().with_state_ids() {
1463 let status = if sid == self.start_anchored {
1464 '^'
1465 } else if sid == self.start_unanchored {
1466 '>'
1467 } else {
1468 ' '
1469 };
1470 writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?;
1471 }
1472 let pattern_len = self.start_pattern.len();
1473 if pattern_len > 1 {
1474 writeln!(f, "")?;
1475 for pid in 0..pattern_len {
1476 let sid = self.start_pattern[pid];
1477 writeln!(f, "START({:06?}): {:?}", pid, sid.as_usize())?;
1478 }
1479 }
1480 writeln!(f, "")?;
1481 writeln!(
1482 f,
1483 "transition equivalence classes: {:?}",
1484 self.byte_classes,
1485 )?;
1486 writeln!(f, ")")?;
1487 Ok(())
1488 }
1489}
1490
1491/// A state in an NFA.
1492///
1493/// In theory, it can help to conceptualize an `NFA` as a graph consisting of
1494/// `State`s. Each `State` contains its complete set of outgoing transitions.
1495///
1496/// In practice, it can help to conceptualize an `NFA` as a sequence of
1497/// instructions for a virtual machine. Each `State` says what to do and where
1498/// to go next.
1499///
/// Strictly speaking, the practical interpretation is the more correct one,
1501/// because of the [`Capture`](State::Capture) state. Namely, a `Capture`
1502/// state always forwards execution to another state unconditionally. Its only
1503/// purpose is to cause a side effect: the recording of the current input
1504/// position at a particular location in memory. In this sense, an `NFA`
1505/// has more power than a theoretical non-deterministic finite automaton.
1506///
1507/// For most uses of this crate, it is likely that one may never even need to
1508/// be aware of this type at all. The main use cases for looking at `State`s
1509/// directly are if you need to write your own search implementation or if you
1510/// need to do some kind of analysis on the NFA.
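///
/// # Example
///
/// A small sketch of the "virtual machine" interpretation above: a
/// hand-rolled sequence of states executed directly against a haystack. A
/// real search implementation (like the `PikeVM`) handles epsilon closures
/// and every `State` variant; this sketch only handles the two variants it
/// uses.
///
/// ```
/// use regex_automata::{
///     nfa::thompson::{State, Transition},
///     util::primitives::{PatternID, StateID},
/// };
///
/// // A tiny "program": consume the byte 'a', then report a match for
/// // pattern 0. State IDs index into `states`.
/// let states = vec![
///     State::ByteRange {
///         trans: Transition {
///             start: b'a',
///             end: b'a',
///             next: StateID::new(1).unwrap(),
///         },
///     },
///     State::Match { pattern_id: PatternID::ZERO },
/// ];
///
/// // "Execute" the program, starting at state 0 and position 0.
/// let (haystack, mut sid, mut at) = (b"a", StateID::ZERO, 0);
/// let matched = loop {
///     match states[sid.as_usize()] {
///         State::ByteRange { ref trans } if trans.matches(haystack, at) => {
///             at += 1;
///             sid = trans.next;
///         }
///         State::Match { .. } => break true,
///         _ => break false,
///     }
/// };
/// assert!(matched);
/// ```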
1511#[derive(Clone, Eq, PartialEq)]
1512pub enum State {
1513 /// A state with a single transition that can only be taken if the current
1514 /// input symbol is in a particular range of bytes.
1515 ByteRange {
1516 /// The transition from this state to the next.
1517 trans: Transition,
1518 },
1519 /// A state with possibly many transitions represented in a sparse fashion.
1520 /// Transitions are non-overlapping and ordered lexicographically by input
1521 /// range.
1522 ///
1523 /// In practice, this is used for encoding UTF-8 automata. Its presence is
1524 /// primarily an optimization that avoids many additional unconditional
1525 /// epsilon transitions (via [`Union`](State::Union) states), and thus
1526 /// decreases the overhead of traversing the NFA. This can improve both
1527 /// matching time and DFA construction time.
1528 Sparse(SparseTransitions),
1529 /// A dense representation of a state with multiple transitions.
1530 Dense(DenseTransitions),
1531 /// A conditional epsilon transition satisfied via some sort of
1532 /// look-around. Look-around is limited to anchor and word boundary
1533 /// assertions.
1534 ///
1535 /// Look-around states are meant to be evaluated while performing epsilon
1536 /// closure (computing the set of states reachable from a particular state
1537 /// via only epsilon transitions). If the current position in the haystack
1538 /// satisfies the look-around assertion, then you're permitted to follow
1539 /// that epsilon transition.
1540 Look {
1541 /// The look-around assertion that must be satisfied before moving
1542 /// to `next`.
1543 look: Look,
1544 /// The state to transition to if the look-around assertion is
1545 /// satisfied.
1546 next: StateID,
1547 },
1548 /// An alternation such that there exists an epsilon transition to all
1549 /// states in `alternates`, where matches found via earlier transitions
1550 /// are preferred over later transitions.
1551 Union {
1552 /// An ordered sequence of unconditional epsilon transitions to other
1553 /// states. Transitions earlier in the sequence are preferred over
1554 /// transitions later in the sequence.
1555 alternates: Box<[StateID]>,
1556 },
1557 /// An alternation such that there exists precisely two unconditional
1558 /// epsilon transitions, where matches found via `alt1` are preferred over
1559 /// matches found via `alt2`.
1560 ///
1561 /// This state exists as a common special case of Union where there are
1562 /// only two alternates. In this case, we don't need any allocations to
1563 /// represent the state. This saves a bit of memory and also saves an
1564 /// additional memory access when traversing the NFA.
1565 BinaryUnion {
1566 /// An unconditional epsilon transition to another NFA state. This
1567 /// is preferred over `alt2`.
1568 alt1: StateID,
1569 /// An unconditional epsilon transition to another NFA state. Matches
1570 /// reported via this transition should only be reported if no matches
1571 /// were found by following `alt1`.
1572 alt2: StateID,
1573 },
1574 /// An empty state that records a capture location.
1575 ///
1576 /// From the perspective of finite automata, this is precisely equivalent
1577 /// to an unconditional epsilon transition, but serves the purpose of
1578 /// instructing NFA simulations to record additional state when the finite
1579 /// state machine passes through this epsilon transition.
1580 ///
1581 /// `slot` in this context refers to the specific capture group slot
1582 /// offset that is being recorded. Each capturing group has two slots
1583 /// corresponding to the start and end of the matching portion of that
1584 /// group.
1585 ///
1586 /// The pattern ID and capture group index are also included in this state
1587 /// in case they are useful. But mostly, all you'll need is `next` and
1588 /// `slot`.
1589 Capture {
1590 /// The state to transition to, unconditionally.
1591 next: StateID,
1592 /// The pattern ID that this capture belongs to.
1593 pattern_id: PatternID,
1594 /// The capture group index that this capture belongs to. Capture group
1595 /// indices are local to each pattern. For example, when capturing
1596 /// groups are enabled, every pattern has a capture group at index
1597 /// `0`.
1598 group_index: SmallIndex,
1599 /// The slot index for this capture. Every capturing group has two
1600 /// slots: one for the start haystack offset and one for the end
1601 /// haystack offset. Unlike capture group indices, slot indices are
1602 /// global across all patterns in this NFA. That is, each slot belongs
        /// to a single pattern, but there is only one slot at index `i` in
        /// the entire NFA.
1604 slot: SmallIndex,
1605 },
1606 /// A state that cannot be transitioned out of. This is useful for cases
1607 /// where you want to prevent matching from occurring. For example, if your
1608 /// regex parser permits empty character classes, then one could choose
1609 /// a `Fail` state to represent them. (An empty character class can be
1610 /// thought of as an empty set. Since nothing is in an empty set, they can
1611 /// never match anything.)
1612 Fail,
    /// A match state. There is at least one such state for each pattern in
    /// this NFA that can match.
1615 Match {
1616 /// The matching pattern ID.
1617 pattern_id: PatternID,
1618 },
1619}
1620
1621impl State {
1622 /// Returns true if and only if this state contains one or more epsilon
1623 /// transitions.
1624 ///
1625 /// In practice, a state has no outgoing transitions (like `Match`), has
1626 /// only non-epsilon transitions (like `ByteRange`) or has only epsilon
1627 /// transitions (like `Union`).
1628 ///
1629 /// # Example
1630 ///
1631 /// ```
1632 /// use regex_automata::{
1633 /// nfa::thompson::{State, Transition},
1634 /// util::primitives::{PatternID, StateID, SmallIndex},
1635 /// };
1636 ///
1637 /// // Capture states are epsilon transitions.
1638 /// let state = State::Capture {
1639 /// next: StateID::ZERO,
1640 /// pattern_id: PatternID::ZERO,
1641 /// group_index: SmallIndex::ZERO,
1642 /// slot: SmallIndex::ZERO,
1643 /// };
1644 /// assert!(state.is_epsilon());
1645 ///
1646 /// // ByteRange states are not.
1647 /// let state = State::ByteRange {
1648 /// trans: Transition { start: b'a', end: b'z', next: StateID::ZERO },
1649 /// };
1650 /// assert!(!state.is_epsilon());
1651 ///
1652 /// # Ok::<(), Box<dyn std::error::Error>>(())
1653 /// ```
1654 #[inline]
1655 pub fn is_epsilon(&self) -> bool {
1656 match *self {
1657 State::ByteRange { .. }
1658 | State::Sparse { .. }
1659 | State::Dense { .. }
1660 | State::Fail
1661 | State::Match { .. } => false,
1662 State::Look { .. }
1663 | State::Union { .. }
1664 | State::BinaryUnion { .. }
1665 | State::Capture { .. } => true,
1666 }
1667 }
1668
1669 /// Returns the heap memory usage of this NFA state in bytes.
1670 fn memory_usage(&self) -> usize {
1671 match *self {
1672 State::ByteRange { .. }
1673 | State::Look { .. }
1674 | State::BinaryUnion { .. }
1675 | State::Capture { .. }
1676 | State::Match { .. }
1677 | State::Fail => 0,
1678 State::Sparse(SparseTransitions { ref transitions }) => {
1679 transitions.len() * mem::size_of::<Transition>()
1680 }
1681 State::Dense { .. } => 256 * mem::size_of::<StateID>(),
1682 State::Union { ref alternates } => {
1683 alternates.len() * mem::size_of::<StateID>()
1684 }
1685 }
1686 }
1687
1688 /// Remap the transitions in this state using the given map. Namely, the
1689 /// given map should be indexed according to the transitions currently
1690 /// in this state.
1691 ///
1692 /// This is used during the final phase of the NFA compiler, which turns
1693 /// its intermediate NFA into the final NFA.
1694 fn remap(&mut self, remap: &[StateID]) {
1695 match *self {
1696 State::ByteRange { ref mut trans } => {
1697 trans.next = remap[trans.next]
1698 }
1699 State::Sparse(SparseTransitions { ref mut transitions }) => {
1700 for t in transitions.iter_mut() {
1701 t.next = remap[t.next];
1702 }
1703 }
1704 State::Dense(DenseTransitions { ref mut transitions }) => {
1705 for sid in transitions.iter_mut() {
1706 *sid = remap[*sid];
1707 }
1708 }
1709 State::Look { ref mut next, .. } => *next = remap[*next],
1710 State::Union { ref mut alternates } => {
1711 for alt in alternates.iter_mut() {
1712 *alt = remap[*alt];
1713 }
1714 }
1715 State::BinaryUnion { ref mut alt1, ref mut alt2 } => {
1716 *alt1 = remap[*alt1];
1717 *alt2 = remap[*alt2];
1718 }
1719 State::Capture { ref mut next, .. } => *next = remap[*next],
1720 State::Fail => {}
1721 State::Match { .. } => {}
1722 }
1723 }
1724}
1725
1726impl fmt::Debug for State {
1727 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1728 match *self {
1729 State::ByteRange { ref trans } => trans.fmt(f),
1730 State::Sparse(SparseTransitions { ref transitions }) => {
1731 let rs = transitions
1732 .iter()
1733 .map(|t| format!("{:?}", t))
1734 .collect::<Vec<String>>()
1735 .join(", ");
1736 write!(f, "sparse({})", rs)
1737 }
1738 State::Dense(ref dense) => {
1739 write!(f, "dense(")?;
1740 for (i, t) in dense.iter().enumerate() {
1741 if i > 0 {
1742 write!(f, ", ")?;
1743 }
1744 write!(f, "{:?}", t)?;
1745 }
1746 write!(f, ")")
1747 }
1748 State::Look { ref look, next } => {
1749 write!(f, "{:?} => {:?}", look, next.as_usize())
1750 }
1751 State::Union { ref alternates } => {
1752 let alts = alternates
1753 .iter()
1754 .map(|id| format!("{:?}", id.as_usize()))
1755 .collect::<Vec<String>>()
1756 .join(", ");
1757 write!(f, "union({})", alts)
1758 }
1759 State::BinaryUnion { alt1, alt2 } => {
1760 write!(
1761 f,
1762 "binary-union({}, {})",
1763 alt1.as_usize(),
1764 alt2.as_usize()
1765 )
1766 }
1767 State::Capture { next, pattern_id, group_index, slot } => {
1768 write!(
1769 f,
1770 "capture(pid={:?}, group={:?}, slot={:?}) => {:?}",
1771 pattern_id.as_usize(),
1772 group_index.as_usize(),
1773 slot.as_usize(),
1774 next.as_usize(),
1775 )
1776 }
1777 State::Fail => write!(f, "FAIL"),
1778 State::Match { pattern_id } => {
1779 write!(f, "MATCH({:?})", pattern_id.as_usize())
1780 }
1781 }
1782 }
1783}
1784
1785/// A sequence of transitions used to represent a sparse state.
1786///
1787/// This is the primary representation of a [`Sparse`](State::Sparse) state.
1788/// It corresponds to a sorted sequence of transitions with non-overlapping
1789/// byte ranges. If the byte at the current position in the haystack matches
1790/// one of the byte ranges, then the finite state machine should take the
1791/// corresponding transition.
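///
/// # Example
///
/// A short sketch of constructing a sparse state by hand and querying it.
/// In practice, these are built by the NFA compiler; the state IDs below
/// (both `StateID::ZERO`) are arbitrary and purely illustrative:
///
/// ```
/// use regex_automata::{
///     nfa::thompson::{SparseTransitions, Transition},
///     util::primitives::StateID,
/// };
///
/// // Two sorted, non-overlapping byte ranges: 'a'-'c' and 'x'-'z'.
/// let sparse = SparseTransitions {
///     transitions: vec![
///         Transition { start: b'a', end: b'c', next: StateID::ZERO },
///         Transition { start: b'x', end: b'z', next: StateID::ZERO },
///     ]
///     .into_boxed_slice(),
/// };
/// assert_eq!(Some(StateID::ZERO), sparse.matches_byte(b'b'));
/// assert_eq!(Some(StateID::ZERO), sparse.matches_byte(b'y'));
/// assert_eq!(None, sparse.matches_byte(b'q'));
/// ```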
1792#[derive(Clone, Debug, Eq, PartialEq)]
1793pub struct SparseTransitions {
1794 /// The sorted sequence of non-overlapping transitions.
1795 pub transitions: Box<[Transition]>,
1796}
1797
1798impl SparseTransitions {
1799 /// This follows the matching transition for a particular byte.
1800 ///
    /// The matching transition is found by looking for a matching byte
    /// range (there is at most one) corresponding to the byte at position
    /// `at` in `haystack`.
1804 ///
1805 /// If `at >= haystack.len()`, then this returns `None`.
1806 #[inline]
1807 pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
1808 haystack.get(at).and_then(|&b| self.matches_byte(b))
1809 }
1810
1811 /// This follows the matching transition for any member of the alphabet.
1812 ///
    /// The matching transition is found by looking for a matching byte
    /// range (there is at most one) corresponding to the byte represented
    /// by the given alphabet unit. If the given alphabet unit is
    /// [`EOI`](alphabet::Unit::eoi), then this always returns `None`.
1817 #[inline]
1818 pub(crate) fn matches_unit(
1819 &self,
1820 unit: alphabet::Unit,
1821 ) -> Option<StateID> {
1822 unit.as_u8().map_or(None, |byte| self.matches_byte(byte))
1823 }
1824
1825 /// This follows the matching transition for a particular byte.
1826 ///
1827 /// The matching transition is found by looking for a matching byte range
1828 /// (there is at most one) corresponding to the byte given.
1829 #[inline]
1830 pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
1831 for t in self.transitions.iter() {
1832 if t.start > byte {
1833 break;
1834 } else if t.matches_byte(byte) {
1835 return Some(t.next);
1836 }
1837 }
1838 None
1839
1840 /*
1841 // This is an alternative implementation that uses binary search. In
1842 // some ad hoc experiments, like
1843 //
1844 // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file
1845 //
1846 // I could not observe any improvement, and in fact, things seemed to
1847 // be a bit slower. I can see an improvement in at least one benchmark:
1848 //
1849 // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8
1850 //
1851 // Where total search time goes from 3.2s to 2.4s when using binary
1852 // search.
1853 self.transitions
1854 .binary_search_by(|t| {
1855 if t.end < byte {
1856 core::cmp::Ordering::Less
1857 } else if t.start > byte {
1858 core::cmp::Ordering::Greater
1859 } else {
1860 core::cmp::Ordering::Equal
1861 }
1862 })
1863 .ok()
1864 .map(|i| self.transitions[i].next)
1865 */
1866 }
1867}
1868
1869/// A sequence of transitions used to represent a dense state.
1870///
1871/// This is the primary representation of a [`Dense`](State::Dense) state. It
1872/// provides constant time matching. That is, given a byte in a haystack and
1873/// a `DenseTransitions`, one can determine if the state matches in constant
1874/// time.
1875///
1876/// This is in contrast to `SparseTransitions`, whose time complexity is
1877/// necessarily bigger than constant time. Also in contrast, `DenseTransitions`
1878/// usually requires (much) more heap memory.
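///
/// # Example
///
/// A short sketch of constructing a dense state by hand. The transition
/// table always has length 256, with `StateID::ZERO` meaning "no
/// transition." The target state ID `1` below is arbitrary and purely
/// illustrative:
///
/// ```
/// use regex_automata::{
///     nfa::thompson::DenseTransitions,
///     util::primitives::StateID,
/// };
///
/// // Start with no transitions at all, then add one for the byte 'a'.
/// let mut table = vec![StateID::ZERO; 256];
/// table[usize::from(b'a')] = StateID::new(1).unwrap();
/// let dense = DenseTransitions { transitions: table.into_boxed_slice() };
///
/// assert_eq!(Some(StateID::new(1).unwrap()), dense.matches_byte(b'a'));
/// assert_eq!(None, dense.matches_byte(b'b'));
/// ```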
1879#[derive(Clone, Debug, Eq, PartialEq)]
1880pub struct DenseTransitions {
1881 /// A dense representation of this state's transitions on the heap. This
1882 /// always has length 256.
1883 pub transitions: Box<[StateID]>,
1884}
1885
1886impl DenseTransitions {
1887 /// This follows the matching transition for a particular byte.
1888 ///
    /// The matching transition is found by looking for a transition that
    /// doesn't correspond to `StateID::ZERO` for the byte at position `at`
    /// in `haystack`.
1892 ///
1893 /// If `at >= haystack.len()`, then this returns `None`.
1894 #[inline]
1895 pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
1896 haystack.get(at).and_then(|&b| self.matches_byte(b))
1897 }
1898
1899 /// This follows the matching transition for any member of the alphabet.
1900 ///
    /// The matching transition is found by looking for a transition that
    /// doesn't correspond to `StateID::ZERO` for the byte represented by the
    /// given alphabet unit.
    ///
    /// If the given alphabet unit is [`EOI`](alphabet::Unit::eoi), then this
    /// returns `None`.
1907 #[inline]
1908 pub(crate) fn matches_unit(
1909 &self,
1910 unit: alphabet::Unit,
1911 ) -> Option<StateID> {
1912 unit.as_u8().map_or(None, |byte| self.matches_byte(byte))
1913 }
1914
1915 /// This follows the matching transition for a particular byte.
1916 ///
1917 /// The matching transition is found by looking for a transition that
1918 /// doesn't correspond to `StateID::ZERO` for the given `byte`.
1919 ///
    /// If the transition for the given `byte` points to `StateID::ZERO`, then
    /// this returns `None`.
1921 #[inline]
1922 pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
1923 let next = self.transitions[usize::from(byte)];
1924 if next == StateID::ZERO {
1925 None
1926 } else {
1927 Some(next)
1928 }
1929 }
1930
1931 /*
1932 /// The dense state optimization isn't currently enabled, so permit a
1933 /// little bit of dead code.
1934 pub(crate) fn from_sparse(sparse: &SparseTransitions) -> DenseTransitions {
1935 let mut dense = vec![StateID::ZERO; 256];
1936 for t in sparse.transitions.iter() {
1937 for b in t.start..=t.end {
1938 dense[usize::from(b)] = t.next;
1939 }
1940 }
1941 DenseTransitions { transitions: dense.into_boxed_slice() }
1942 }
1943 */
1944
1945 /// Returns an iterator over all transitions that don't point to
1946 /// `StateID::ZERO`.
1947 pub(crate) fn iter(&self) -> impl Iterator<Item = Transition> + '_ {
1948 use crate::util::int::Usize;
1949 self.transitions
1950 .iter()
1951 .enumerate()
1952 .filter(|&(_, &sid)| sid != StateID::ZERO)
1953 .map(|(byte, &next)| Transition {
1954 start: byte.as_u8(),
1955 end: byte.as_u8(),
1956 next,
1957 })
1958 }
1959}
1960
1961/// A single transition to another state.
1962///
1963/// This transition may only be followed if the current byte in the haystack
1964/// falls in the inclusive range of bytes specified.
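///
/// # Example
///
/// A quick sketch of checking a transition against a byte and against a
/// position in a haystack. The target state (`StateID::ZERO`) is arbitrary
/// and purely illustrative:
///
/// ```
/// use regex_automata::{
///     nfa::thompson::Transition,
///     util::primitives::StateID,
/// };
///
/// // A transition that can be followed for any ASCII lowercase letter.
/// let trans = Transition { start: b'a', end: b'z', next: StateID::ZERO };
///
/// assert!(trans.matches_byte(b'q'));
/// assert!(!trans.matches_byte(b'Q'));
///
/// // `matches` looks at the byte at a particular haystack position, and
/// // returns false when the position is out of bounds.
/// assert!(trans.matches(b"xyz", 2));
/// assert!(!trans.matches(b"xyz", 3));
/// ```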
1965#[derive(Clone, Copy, Eq, Hash, PartialEq)]
1966pub struct Transition {
1967 /// The inclusive start of the byte range.
1968 pub start: u8,
1969 /// The inclusive end of the byte range.
1970 pub end: u8,
1971 /// The identifier of the state to transition to.
1972 pub next: StateID,
1973}
1974
1975impl Transition {
1976 /// Returns true if the position `at` in `haystack` falls in this
1977 /// transition's range of bytes.
1978 ///
1979 /// If `at >= haystack.len()`, then this returns `false`.
1980 pub fn matches(&self, haystack: &[u8], at: usize) -> bool {
1981 haystack.get(at).map_or(false, |&b| self.matches_byte(b))
1982 }
1983
1984 /// Returns true if the given alphabet unit falls in this transition's
1985 /// range of bytes. If the given unit is [`EOI`](alphabet::Unit::eoi), then
1986 /// this returns `false`.
1987 pub fn matches_unit(&self, unit: alphabet::Unit) -> bool {
1988 unit.as_u8().map_or(false, |byte| self.matches_byte(byte))
1989 }
1990
1991 /// Returns true if the given byte falls in this transition's range of
1992 /// bytes.
1993 pub fn matches_byte(&self, byte: u8) -> bool {
1994 self.start <= byte && byte <= self.end
1995 }
1996}
1997
1998impl fmt::Debug for Transition {
1999 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2000 use crate::util::escape::DebugByte;
2001
2002 let Transition { start, end, next } = *self;
2003 if self.start == self.end {
2004 write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())
2005 } else {
2006 write!(
2007 f,
2008 "{:?}-{:?} => {:?}",
2009 DebugByte(start),
2010 DebugByte(end),
2011 next.as_usize(),
2012 )
2013 }
2014 }
2015}
2016
2017/// An iterator over all pattern IDs in an NFA.
2018///
2019/// This iterator is created by [`NFA::patterns`].
2020///
2021/// The lifetime parameter `'a` refers to the lifetime of the NFA from which
2022/// this pattern iterator was created.
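///
/// # Example
///
/// A small sketch of iterating over the pattern IDs of a multi-pattern NFA.
/// The patterns themselves are arbitrary and purely illustrative:
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
/// let pids: Vec<usize> = nfa.patterns().map(|pid| pid.as_usize()).collect();
/// assert_eq!(pids, vec![0, 1, 2]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```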
2023#[derive(Debug)]
2024pub struct PatternIter<'a> {
2025 it: PatternIDIter,
2026 /// We explicitly associate a lifetime with this iterator even though we
2027 /// don't actually borrow anything from the NFA. We do this for backward
2028 /// compatibility purposes. If we ever do need to borrow something from
2029 /// the NFA, then we can and just get rid of this marker without breaking
2030 /// the public API.
2031 _marker: core::marker::PhantomData<&'a ()>,
2032}
2033
2034impl<'a> Iterator for PatternIter<'a> {
2035 type Item = PatternID;
2036
2037 fn next(&mut self) -> Option<PatternID> {
2038 self.it.next()
2039 }
2040}
2041
2042#[cfg(all(test, feature = "nfa-pikevm"))]
2043mod tests {
2044 use super::*;
2045 use crate::{nfa::thompson::pikevm::PikeVM, Input};
2046
2047 // This asserts that an NFA state doesn't have its size changed. It is
2048 // *really* easy to accidentally increase the size, and thus potentially
2049 // dramatically increase the memory usage of every NFA.
2050 //
2051 // This assert doesn't mean we absolutely cannot increase the size of an
2052 // NFA state. We can. It's just here to make sure we do it knowingly and
2053 // intentionally.
2054 #[test]
2055 fn state_has_small_size() {
2056 #[cfg(target_pointer_width = "64")]
2057 assert_eq!(24, core::mem::size_of::<State>());
2058 #[cfg(target_pointer_width = "32")]
2059 assert_eq!(20, core::mem::size_of::<State>());
2060 }
2061
2062 #[test]
2063 fn always_match() {
2064 let re = PikeVM::new_from_nfa(NFA::always_match()).unwrap();
2065 let mut cache = re.create_cache();
2066 let mut caps = re.create_captures();
2067 let mut find = |haystack, start, end| {
2068 let input = Input::new(haystack).range(start..end);
2069 re.search(&mut cache, &input, &mut caps);
2070 caps.get_match().map(|m| m.end())
2071 };
2072
2073 assert_eq!(Some(0), find("", 0, 0));
2074 assert_eq!(Some(0), find("a", 0, 1));
2075 assert_eq!(Some(1), find("a", 1, 1));
2076 assert_eq!(Some(0), find("ab", 0, 2));
2077 assert_eq!(Some(1), find("ab", 1, 2));
2078 assert_eq!(Some(2), find("ab", 2, 2));
2079 }
2080
2081 #[test]
2082 fn never_match() {
2083 let re = PikeVM::new_from_nfa(NFA::never_match()).unwrap();
2084 let mut cache = re.create_cache();
2085 let mut caps = re.create_captures();
2086 let mut find = |haystack, start, end| {
2087 let input = Input::new(haystack).range(start..end);
2088 re.search(&mut cache, &input, &mut caps);
2089 caps.get_match().map(|m| m.end())
2090 };
2091
2092 assert_eq!(None, find("", 0, 0));
2093 assert_eq!(None, find("a", 0, 1));
2094 assert_eq!(None, find("a", 1, 1));
2095 assert_eq!(None, find("ab", 0, 2));
2096 assert_eq!(None, find("ab", 1, 2));
2097 assert_eq!(None, find("ab", 2, 2));
2098 }
2099}
2100