1#[cfg(feature = "alloc")]
2use crate::util::search::PatternSet;
3use crate::{
4 dfa::search,
5 util::{
6 empty,
7 prefilter::Prefilter,
8 primitives::{PatternID, StateID},
9 search::{Anchored, HalfMatch, Input, MatchError},
10 start,
11 },
12};
13
14/// A trait describing the interface of a deterministic finite automaton (DFA).
15///
16/// The complexity of this trait probably means that it's unlikely for others
17/// to implement it. The primary purpose of the trait is to provide for a way
18/// of abstracting over different types of DFAs. In this crate, that means
19/// dense DFAs and sparse DFAs. (Dense DFAs are fast but memory hungry, where
20/// as sparse DFAs are slower but come with a smaller memory footprint. But
21/// they otherwise provide exactly equivalent expressive power.) For example, a
22/// [`dfa::regex::Regex`](crate::dfa::regex::Regex) is generic over this trait.
23///
24/// Normally, a DFA's execution model is very simple. You might have a single
25/// start state, zero or more final or "match" states and a function that
26/// transitions from one state to the next given the next byte of input.
27/// Unfortunately, the interface described by this trait is significantly
28/// more complicated than this. The complexity has a number of different
29/// reasons, mostly motivated by performance, functionality or space savings:
30///
31/// * A DFA can search for multiple patterns simultaneously. This
32/// means extra information is returned when a match occurs. Namely,
33/// a match is not just an offset, but an offset plus a pattern ID.
34/// [`Automaton::pattern_len`] returns the number of patterns compiled into
35/// the DFA, [`Automaton::match_len`] returns the total number of patterns
36/// that match in a particular state and [`Automaton::match_pattern`] permits
37/// iterating over the patterns that match in a particular state.
38/// * A DFA can have multiple start states, and the choice of which start
39/// state to use depends on the content of the string being searched and
40/// position of the search, as well as whether the search is an anchored
41/// search for a specific pattern in the DFA. Moreover, computing the start
42/// state also depends on whether you're doing a forward or a reverse search.
43/// [`Automaton::start_state_forward`] and [`Automaton::start_state_reverse`]
44/// are used to compute the start state for forward and reverse searches,
45/// respectively.
46/// * All matches are delayed by one byte to support things like `$` and `\b`
47/// at the end of a pattern. Therefore, every use of a DFA is required to use
48/// [`Automaton::next_eoi_state`]
49/// at the end of the search to compute the final transition.
50/// * For optimization reasons, some states are treated specially. Every
51/// state is either special or not, which can be determined via the
52/// [`Automaton::is_special_state`] method. If it's special, then the state
53/// must be at least one of a few possible types of states. (Note that some
54/// types can overlap, for example, a match state can also be an accel state.
55/// But some types can't. If a state is a dead state, then it can never be any
56/// other type of state.) Those types are:
57/// * A dead state. A dead state means the DFA will never enter a match
58/// state. This can be queried via the [`Automaton::is_dead_state`] method.
59/// * A quit state. A quit state occurs if the DFA had to stop the search
60/// prematurely for some reason. This can be queried via the
61/// [`Automaton::is_quit_state`] method.
62/// * A match state. A match state occurs when a match is found. When a DFA
63/// enters a match state, the search may stop immediately (when looking
64/// for the earliest match), or it may continue to find the leftmost-first
65/// match. This can be queried via the [`Automaton::is_match_state`]
66/// method.
67/// * A start state. A start state is where a search begins. For every
68/// search, there is exactly one start state that is used, however, a
69/// DFA may contain many start states. When the search is in a start
70/// state, it may use a prefilter to quickly skip to candidate matches
71/// without executing the DFA on every byte. This can be queried via the
72/// [`Automaton::is_start_state`] method.
73/// * An accel state. An accel state is a state that is accelerated.
74/// That is, it is a state where _most_ of its transitions loop back to
75/// itself and only a small number of transitions lead to other states.
76/// This kind of state is said to be accelerated because a search routine
77/// can quickly look for the bytes leading out of the state instead of
78/// continuing to execute the DFA on each byte. This can be queried via the
79/// [`Automaton::is_accel_state`] method. And the bytes that lead out of
80/// the state can be queried via the [`Automaton::accelerator`] method.
81///
82/// There are a number of provided methods on this trait that implement
83/// efficient searching (for forwards and backwards) with a DFA using
84/// all of the above features of this trait. In particular, given the
85/// complexity of all these features, implementing a search routine in
86/// this trait can be a little subtle. With that said, it is possible to
87/// somewhat simplify the search routine. For example, handling accelerated
88/// states is strictly optional, since it is always correct to assume that
89/// `Automaton::is_accel_state` returns false. However, one complex part of
90/// writing a search routine using this trait is handling the 1-byte delay of a
91/// match. That is not optional.
92///
93/// # Safety
94///
95/// This trait is not safe to implement so that code may rely on the
96/// correctness of implementations of this trait to avoid undefined behavior.
97/// The primary correctness guarantees are:
98///
99/// * `Automaton::start_state` always returns a valid state ID or an error or
100/// panics.
101/// * `Automaton::next_state`, when given a valid state ID, always returns
102/// a valid state ID for all values of `anchored` and `byte`, or otherwise
103/// panics.
104///
105/// In general, the rest of the methods on `Automaton` need to uphold their
106/// contracts as well. For example, `Automaton::is_dead` should only returns
107/// true if the given state ID is actually a dead state.
108pub unsafe trait Automaton {
109 /// Transitions from the current state to the next state, given the next
110 /// byte of input.
111 ///
112 /// Implementations must guarantee that the returned ID is always a valid
113 /// ID when `current` refers to a valid ID. Moreover, the transition
114 /// function must be defined for all possible values of `input`.
115 ///
116 /// # Panics
117 ///
118 /// If the given ID does not refer to a valid state, then this routine
119 /// may panic but it also may not panic and instead return an invalid ID.
120 /// However, if the caller provides an invalid ID then this must never
121 /// sacrifice memory safety.
122 ///
123 /// # Example
124 ///
125 /// This shows a simplistic example for walking a DFA for a given haystack
126 /// by using the `next_state` method.
127 ///
128 /// ```
129 /// use regex_automata::{dfa::{Automaton, dense}, Input};
130 ///
131 /// let dfa = dense::DFA::new(r"[a-z]+r")?;
132 /// let haystack = "bar".as_bytes();
133 ///
134 /// // The start state is determined by inspecting the position and the
135 /// // initial bytes of the haystack.
136 /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
137 /// // Walk all the bytes in the haystack.
138 /// for &b in haystack {
139 /// state = dfa.next_state(state, b);
140 /// }
141 /// // Matches are always delayed by 1 byte, so we must explicitly walk the
142 /// // special "EOI" transition at the end of the search.
143 /// state = dfa.next_eoi_state(state);
144 /// assert!(dfa.is_match_state(state));
145 ///
146 /// # Ok::<(), Box<dyn std::error::Error>>(())
147 /// ```
148 fn next_state(&self, current: StateID, input: u8) -> StateID;
149
150 /// Transitions from the current state to the next state, given the next
151 /// byte of input.
152 ///
153 /// Unlike [`Automaton::next_state`], implementations may implement this
154 /// more efficiently by assuming that the `current` state ID is valid.
155 /// Typically, this manifests by eliding bounds checks.
156 ///
157 /// # Safety
158 ///
159 /// Callers of this method must guarantee that `current` refers to a valid
160 /// state ID. If `current` is not a valid state ID for this automaton, then
161 /// calling this routine may result in undefined behavior.
162 ///
163 /// If `current` is valid, then implementations must guarantee that the ID
164 /// returned is valid for all possible values of `input`.
165 unsafe fn next_state_unchecked(
166 &self,
167 current: StateID,
168 input: u8,
169 ) -> StateID;
170
171 /// Transitions from the current state to the next state for the special
172 /// EOI symbol.
173 ///
174 /// Implementations must guarantee that the returned ID is always a valid
175 /// ID when `current` refers to a valid ID.
176 ///
177 /// This routine must be called at the end of every search in a correct
178 /// implementation of search. Namely, DFAs in this crate delay matches
179 /// by one byte in order to support look-around operators. Thus, after
180 /// reaching the end of a haystack, a search implementation must follow one
181 /// last EOI transition.
182 ///
183 /// It is best to think of EOI as an additional symbol in the alphabet of
184 /// a DFA that is distinct from every other symbol. That is, the alphabet
185 /// of DFAs in this crate has a logical size of 257 instead of 256, where
186 /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
187 /// physical alphabet size may be smaller because of alphabet compression
188 /// via equivalence classes, but EOI is always represented somehow in the
189 /// alphabet.)
190 ///
191 /// # Panics
192 ///
193 /// If the given ID does not refer to a valid state, then this routine
194 /// may panic but it also may not panic and instead return an invalid ID.
195 /// However, if the caller provides an invalid ID then this must never
196 /// sacrifice memory safety.
197 ///
198 /// # Example
199 ///
200 /// This shows a simplistic example for walking a DFA for a given haystack,
201 /// and then finishing the search with the final EOI transition.
202 ///
203 /// ```
204 /// use regex_automata::{dfa::{Automaton, dense}, Input};
205 ///
206 /// let dfa = dense::DFA::new(r"[a-z]+r")?;
207 /// let haystack = "bar".as_bytes();
208 ///
209 /// // The start state is determined by inspecting the position and the
210 /// // initial bytes of the haystack.
211 /// //
212 /// // The unwrap is OK because we aren't requesting a start state for a
213 /// // specific pattern.
214 /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
215 /// // Walk all the bytes in the haystack.
216 /// for &b in haystack {
217 /// state = dfa.next_state(state, b);
218 /// }
219 /// // Matches are always delayed by 1 byte, so we must explicitly walk
220 /// // the special "EOI" transition at the end of the search. Without this
221 /// // final transition, the assert below will fail since the DFA will not
222 /// // have entered a match state yet!
223 /// state = dfa.next_eoi_state(state);
224 /// assert!(dfa.is_match_state(state));
225 ///
226 /// # Ok::<(), Box<dyn std::error::Error>>(())
227 /// ```
228 fn next_eoi_state(&self, current: StateID) -> StateID;
229
230 /// Return the ID of the start state for this DFA for the given starting
231 /// configuration.
232 ///
233 /// Unlike typical DFA implementations, the start state for DFAs in this
234 /// crate is dependent on a few different factors:
235 ///
236 /// * The [`Anchored`] mode of the search. Unanchored, anchored and
237 /// anchored searches for a specific [`PatternID`] all use different start
238 /// states.
239 /// * Whether a "look-behind" byte exists. For example, the `^` anchor
240 /// matches if and only if there is no look-behind byte.
241 /// * The specific value of that look-behind byte. For example, a `(?m:^)`
242 /// assertion only matches when there is either no look-behind byte, or
243 /// when the look-behind byte is a line terminator.
244 ///
245 /// The [starting configuration](start::Config) provides the above
246 /// information.
247 ///
248 /// This routine can be used for either forward or reverse searches.
249 /// Although, as a convenience, if you have an [`Input`], then it may
250 /// be more succinct to use [`Automaton::start_state_forward`] or
251 /// [`Automaton::start_state_reverse`]. Note, for example, that the
252 /// convenience routines return a [`MatchError`] on failure where as this
253 /// routine returns a [`StartError`].
254 ///
255 /// # Errors
256 ///
257 /// This may return a [`StartError`] if the search needs to give up when
258 /// determining the start state (for example, if it sees a "quit" byte).
259 /// This can also return an error if the given configuration contains an
260 /// unsupported [`Anchored`] configuration.
261 fn start_state(
262 &self,
263 config: &start::Config,
264 ) -> Result<StateID, StartError>;
265
266 /// Return the ID of the start state for this DFA when executing a forward
267 /// search.
268 ///
269 /// This is a convenience routine for calling [`Automaton::start_state`]
270 /// that converts the given [`Input`] to a [start
271 /// configuration](start::Config). Additionally, if an error occurs, it is
272 /// converted from a [`StartError`] to a [`MatchError`] using the offset
273 /// information in the given [`Input`].
274 ///
275 /// # Errors
276 ///
277 /// This may return a [`MatchError`] if the search needs to give up
278 /// when determining the start state (for example, if it sees a "quit"
279 /// byte). This can also return an error if the given `Input` contains an
280 /// unsupported [`Anchored`] configuration.
281 fn start_state_forward(
282 &self,
283 input: &Input<'_>,
284 ) -> Result<StateID, MatchError> {
285 let config = start::Config::from_input_forward(input);
286 self.start_state(&config).map_err(|err| match err {
287 StartError::Quit { byte } => {
288 let offset = input
289 .start()
290 .checked_sub(1)
291 .expect("no quit in start without look-behind");
292 MatchError::quit(byte, offset)
293 }
294 StartError::UnsupportedAnchored { mode } => {
295 MatchError::unsupported_anchored(mode)
296 }
297 })
298 }
299
300 /// Return the ID of the start state for this DFA when executing a reverse
301 /// search.
302 ///
303 /// This is a convenience routine for calling [`Automaton::start_state`]
304 /// that converts the given [`Input`] to a [start
305 /// configuration](start::Config). Additionally, if an error occurs, it is
306 /// converted from a [`StartError`] to a [`MatchError`] using the offset
307 /// information in the given [`Input`].
308 ///
309 /// # Errors
310 ///
311 /// This may return a [`MatchError`] if the search needs to give up
312 /// when determining the start state (for example, if it sees a "quit"
313 /// byte). This can also return an error if the given `Input` contains an
314 /// unsupported [`Anchored`] configuration.
315 fn start_state_reverse(
316 &self,
317 input: &Input<'_>,
318 ) -> Result<StateID, MatchError> {
319 let config = start::Config::from_input_reverse(input);
320 self.start_state(&config).map_err(|err| match err {
321 StartError::Quit { byte } => {
322 let offset = input.end();
323 MatchError::quit(byte, offset)
324 }
325 StartError::UnsupportedAnchored { mode } => {
326 MatchError::unsupported_anchored(mode)
327 }
328 })
329 }
330
331 /// If this DFA has a universal starting state for the given anchor mode
332 /// and the DFA supports universal starting states, then this returns that
333 /// state's identifier.
334 ///
335 /// A DFA is said to have a universal starting state when the starting
336 /// state is invariant with respect to the haystack. Usually, the starting
337 /// state is chosen depending on the bytes immediately surrounding the
338 /// starting position of a search. However, the starting state only differs
339 /// when one or more of the patterns in the DFA have look-around assertions
340 /// in its prefix.
341 ///
342 /// Stated differently, if none of the patterns in a DFA have look-around
343 /// assertions in their prefix, then the DFA has a universal starting state
344 /// and _may_ be returned by this method.
345 ///
346 /// It is always correct for implementations to return `None`, and indeed,
347 /// this is what the default implementation does. When this returns `None`,
348 /// callers must use either `start_state_forward` or `start_state_reverse`
349 /// to get the starting state.
350 ///
351 /// # Use case
352 ///
353 /// There are a few reasons why one might want to use this:
354 ///
355 /// * If you know your regex patterns have no look-around assertions in
356 /// their prefix, then calling this routine is likely cheaper and perhaps
357 /// more semantically meaningful.
358 /// * When implementing prefilter support in a DFA regex implementation,
359 /// it is necessary to re-compute the start state after a candidate
360 /// is returned from the prefilter. However, this is only needed when
361 /// there isn't a universal start state. When one exists, one can avoid
362 /// re-computing the start state.
363 ///
364 /// # Example
365 ///
366 /// ```
367 /// use regex_automata::{
368 /// dfa::{Automaton, dense::DFA},
369 /// Anchored,
370 /// };
371 ///
372 /// // There are no look-around assertions in the prefixes of any of the
373 /// // patterns, so we get a universal start state.
374 /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+$", "[A-Z]+"])?;
375 /// assert!(dfa.universal_start_state(Anchored::No).is_some());
376 /// assert!(dfa.universal_start_state(Anchored::Yes).is_some());
377 ///
378 /// // One of the patterns has a look-around assertion in its prefix,
379 /// // so this means there is no longer a universal start state.
380 /// let dfa = DFA::new_many(&["[0-9]+", "^[a-z]+$", "[A-Z]+"])?;
381 /// assert!(!dfa.universal_start_state(Anchored::No).is_some());
382 /// assert!(!dfa.universal_start_state(Anchored::Yes).is_some());
383 /// # Ok::<(), Box<dyn std::error::Error>>(())
384 /// ```
385 #[inline]
386 fn universal_start_state(&self, _mode: Anchored) -> Option<StateID> {
387 None
388 }
389
390 /// Returns true if and only if the given identifier corresponds to a
391 /// "special" state. A special state is one or more of the following:
392 /// a dead state, a quit state, a match state, a start state or an
393 /// accelerated state.
394 ///
395 /// A correct implementation _may_ always return false for states that
396 /// are either start states or accelerated states, since that information
397 /// is only intended to be used for optimization purposes. Correct
398 /// implementations must return true if the state is a dead, quit or match
399 /// state. This is because search routines using this trait must be able
400 /// to rely on `is_special_state` as an indicator that a state may need
401 /// special treatment. (For example, when a search routine sees a dead
402 /// state, it must terminate.)
403 ///
404 /// This routine permits search implementations to use a single branch to
405 /// check whether a state needs special attention before executing the next
406 /// transition. The example below shows how to do this.
407 ///
408 /// # Example
409 ///
410 /// This example shows how `is_special_state` can be used to implement a
411 /// correct search routine with minimal branching. In particular, this
412 /// search routine implements "leftmost" matching, which means that it
413 /// doesn't immediately stop once a match is found. Instead, it continues
414 /// until it reaches a dead state.
415 ///
416 /// ```
417 /// use regex_automata::{
418 /// dfa::{Automaton, dense},
419 /// HalfMatch, MatchError, Input,
420 /// };
421 ///
422 /// fn find<A: Automaton>(
423 /// dfa: &A,
424 /// haystack: &[u8],
425 /// ) -> Result<Option<HalfMatch>, MatchError> {
426 /// // The start state is determined by inspecting the position and the
427 /// // initial bytes of the haystack. Note that start states can never
428 /// // be match states (since DFAs in this crate delay matches by 1
429 /// // byte), so we don't need to check if the start state is a match.
430 /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
431 /// let mut last_match = None;
432 /// // Walk all the bytes in the haystack. We can quit early if we see
433 /// // a dead or a quit state. The former means the automaton will
434 /// // never transition to any other state. The latter means that the
435 /// // automaton entered a condition in which its search failed.
436 /// for (i, &b) in haystack.iter().enumerate() {
437 /// state = dfa.next_state(state, b);
438 /// if dfa.is_special_state(state) {
439 /// if dfa.is_match_state(state) {
440 /// last_match = Some(HalfMatch::new(
441 /// dfa.match_pattern(state, 0),
442 /// i,
443 /// ));
444 /// } else if dfa.is_dead_state(state) {
445 /// return Ok(last_match);
446 /// } else if dfa.is_quit_state(state) {
447 /// // It is possible to enter into a quit state after
448 /// // observing a match has occurred. In that case, we
449 /// // should return the match instead of an error.
450 /// if last_match.is_some() {
451 /// return Ok(last_match);
452 /// }
453 /// return Err(MatchError::quit(b, i));
454 /// }
455 /// // Implementors may also want to check for start or accel
456 /// // states and handle them differently for performance
457 /// // reasons. But it is not necessary for correctness.
458 /// }
459 /// }
460 /// // Matches are always delayed by 1 byte, so we must explicitly walk
461 /// // the special "EOI" transition at the end of the search.
462 /// state = dfa.next_eoi_state(state);
463 /// if dfa.is_match_state(state) {
464 /// last_match = Some(HalfMatch::new(
465 /// dfa.match_pattern(state, 0),
466 /// haystack.len(),
467 /// ));
468 /// }
469 /// Ok(last_match)
470 /// }
471 ///
472 /// // We use a greedy '+' operator to show how the search doesn't just
473 /// // stop once a match is detected. It continues extending the match.
474 /// // Using '[a-z]+?' would also work as expected and stop the search
475 /// // early. Greediness is built into the automaton.
476 /// let dfa = dense::DFA::new(r"[a-z]+")?;
477 /// let haystack = "123 foobar 4567".as_bytes();
478 /// let mat = find(&dfa, haystack)?.unwrap();
479 /// assert_eq!(mat.pattern().as_usize(), 0);
480 /// assert_eq!(mat.offset(), 10);
481 ///
482 /// // Here's another example that tests our handling of the special EOI
483 /// // transition. This will fail to find a match if we don't call
484 /// // 'next_eoi_state' at the end of the search since the match isn't
485 /// // found until the final byte in the haystack.
486 /// let dfa = dense::DFA::new(r"[0-9]{4}")?;
487 /// let haystack = "123 foobar 4567".as_bytes();
488 /// let mat = find(&dfa, haystack)?.unwrap();
489 /// assert_eq!(mat.pattern().as_usize(), 0);
490 /// assert_eq!(mat.offset(), 15);
491 ///
492 /// // And note that our search implementation above automatically works
493 /// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects
494 /// // the appropriate pattern ID for us.
495 /// let dfa = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
496 /// let haystack = "123 foobar 4567".as_bytes();
497 /// let mat = find(&dfa, haystack)?.unwrap();
498 /// assert_eq!(mat.pattern().as_usize(), 1);
499 /// assert_eq!(mat.offset(), 3);
500 /// let mat = find(&dfa, &haystack[3..])?.unwrap();
501 /// assert_eq!(mat.pattern().as_usize(), 0);
502 /// assert_eq!(mat.offset(), 7);
503 /// let mat = find(&dfa, &haystack[10..])?.unwrap();
504 /// assert_eq!(mat.pattern().as_usize(), 1);
505 /// assert_eq!(mat.offset(), 5);
506 ///
507 /// # Ok::<(), Box<dyn std::error::Error>>(())
508 /// ```
509 fn is_special_state(&self, id: StateID) -> bool;
510
511 /// Returns true if and only if the given identifier corresponds to a dead
512 /// state. When a DFA enters a dead state, it is impossible to leave. That
513 /// is, every transition on a dead state by definition leads back to the
514 /// same dead state.
515 ///
516 /// In practice, the dead state always corresponds to the identifier `0`.
517 /// Moreover, in practice, there is only one dead state.
518 ///
519 /// The existence of a dead state is not strictly required in the classical
520 /// model of finite state machines, where one generally only cares about
521 /// the question of whether an input sequence matches or not. Dead states
522 /// are not needed to answer that question, since one can immediately quit
523 /// as soon as one enters a final or "match" state. However, we don't just
524 /// care about matches but also care about the location of matches, and
525 /// more specifically, care about semantics like "greedy" matching.
526 ///
527 /// For example, given the pattern `a+` and the input `aaaz`, the dead
528 /// state won't be entered until the state machine reaches `z` in the
529 /// input, at which point, the search routine can quit. But without the
530 /// dead state, the search routine wouldn't know when to quit. In a
531 /// classical representation, the search routine would stop after seeing
532 /// the first `a` (which is when the search would enter a match state). But
533 /// this wouldn't implement "greedy" matching where `a+` matches as many
534 /// `a`'s as possible.
535 ///
536 /// # Example
537 ///
538 /// See the example for [`Automaton::is_special_state`] for how to use this
539 /// method correctly.
540 fn is_dead_state(&self, id: StateID) -> bool;
541
542 /// Returns true if and only if the given identifier corresponds to a quit
543 /// state. A quit state is like a dead state (it has no transitions other
544 /// than to itself), except it indicates that the DFA failed to complete
545 /// the search. When this occurs, callers can neither accept or reject that
546 /// a match occurred.
547 ///
548 /// In practice, the quit state always corresponds to the state immediately
549 /// following the dead state. (Which is not usually represented by `1`,
550 /// since state identifiers are pre-multiplied by the state machine's
551 /// alphabet stride, and the alphabet stride varies between DFAs.)
552 ///
553 /// The typical way in which a quit state can occur is when heuristic
554 /// support for Unicode word boundaries is enabled via the
555 /// [`dense::Config::unicode_word_boundary`](crate::dfa::dense::Config::unicode_word_boundary)
556 /// option. But other options, like the lower level
557 /// [`dense::Config::quit`](crate::dfa::dense::Config::quit)
558 /// configuration, can also result in a quit state being entered. The
559 /// purpose of the quit state is to provide a way to execute a fast DFA
560 /// in common cases while delegating to slower routines when the DFA quits.
561 ///
562 /// The default search implementations provided by this crate will return a
563 /// [`MatchError::quit`] error when a quit state is entered.
564 ///
565 /// # Example
566 ///
567 /// See the example for [`Automaton::is_special_state`] for how to use this
568 /// method correctly.
569 fn is_quit_state(&self, id: StateID) -> bool;
570
571 /// Returns true if and only if the given identifier corresponds to a
572 /// match state. A match state is also referred to as a "final" state and
573 /// indicates that a match has been found.
574 ///
575 /// If all you care about is whether a particular pattern matches in the
576 /// input sequence, then a search routine can quit early as soon as the
577 /// machine enters a match state. However, if you're looking for the
578 /// standard "leftmost-first" match location, then search _must_ continue
579 /// until either the end of the input or until the machine enters a dead
580 /// state. (Since either condition implies that no other useful work can
581 /// be done.) Namely, when looking for the location of a match, then
582 /// search implementations should record the most recent location in
583 /// which a match state was entered, but otherwise continue executing the
584 /// search as normal. (The search may even leave the match state.) Once
585 /// the termination condition is reached, the most recently recorded match
586 /// location should be returned.
587 ///
588 /// Finally, one additional power given to match states in this crate
589 /// is that they are always associated with a specific pattern in order
590 /// to support multi-DFAs. See [`Automaton::match_pattern`] for more
591 /// details and an example for how to query the pattern associated with a
592 /// particular match state.
593 ///
594 /// # Example
595 ///
596 /// See the example for [`Automaton::is_special_state`] for how to use this
597 /// method correctly.
598 fn is_match_state(&self, id: StateID) -> bool;
599
600 /// Returns true only if the given identifier corresponds to a start
601 /// state
602 ///
603 /// A start state is a state in which a DFA begins a search.
604 /// All searches begin in a start state. Moreover, since all matches are
605 /// delayed by one byte, a start state can never be a match state.
606 ///
607 /// The main role of a start state is, as mentioned, to be a starting
608 /// point for a DFA. This starting point is determined via one of
609 /// [`Automaton::start_state_forward`] or
610 /// [`Automaton::start_state_reverse`], depending on whether one is doing
611 /// a forward or a reverse search, respectively.
612 ///
613 /// A secondary use of start states is for prefix acceleration. Namely,
614 /// while executing a search, if one detects that you're in a start state,
615 /// then it may be faster to look for the next match of a prefix of the
616 /// pattern, if one exists. If a prefix exists and since all matches must
617 /// begin with that prefix, then skipping ahead to occurrences of that
618 /// prefix may be much faster than executing the DFA.
619 ///
620 /// As mentioned in the documentation for
621 /// [`is_special_state`](Automaton::is_special_state) implementations
622 /// _may_ always return false, even if the given identifier is a start
623 /// state. This is because knowing whether a state is a start state or not
624 /// is not necessary for correctness and is only treated as a potential
625 /// performance optimization. (For example, the implementations of this
626 /// trait in this crate will only return true when the given identifier
627 /// corresponds to a start state and when [specialization of start
628 /// states](crate::dfa::dense::Config::specialize_start_states) was enabled
629 /// during DFA construction. If start state specialization is disabled
630 /// (which is the default), then this method will always return false.)
631 ///
632 /// # Example
633 ///
634 /// This example shows how to implement your own search routine that does
635 /// a prefix search whenever the search enters a start state.
636 ///
637 /// Note that you do not need to implement your own search routine
638 /// to make use of prefilters like this. The search routines
639 /// provided by this crate already implement prefilter support via
640 /// the [`Prefilter`](crate::util::prefilter::Prefilter) trait.
641 /// A prefilter can be added to your search configuration with
642 /// [`dense::Config::prefilter`](crate::dfa::dense::Config::prefilter) for
643 /// dense and sparse DFAs in this crate.
644 ///
645 /// This example is meant to show how you might deal with prefilters in a
646 /// simplified case if you are implementing your own search routine.
647 ///
648 /// ```
649 /// use regex_automata::{
650 /// dfa::{Automaton, dense},
651 /// HalfMatch, MatchError, Input,
652 /// };
653 ///
654 /// fn find_byte(slice: &[u8], at: usize, byte: u8) -> Option<usize> {
655 /// // Would be faster to use the memchr crate, but this is still
656 /// // faster than running through the DFA.
657 /// slice[at..].iter().position(|&b| b == byte).map(|i| at + i)
658 /// }
659 ///
660 /// fn find<A: Automaton>(
661 /// dfa: &A,
662 /// haystack: &[u8],
663 /// prefix_byte: Option<u8>,
664 /// ) -> Result<Option<HalfMatch>, MatchError> {
665 /// // See the Automaton::is_special_state example for similar code
666 /// // with more comments.
667 ///
668 /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
669 /// let mut last_match = None;
670 /// let mut pos = 0;
671 /// while pos < haystack.len() {
672 /// let b = haystack[pos];
673 /// state = dfa.next_state(state, b);
674 /// pos += 1;
675 /// if dfa.is_special_state(state) {
676 /// if dfa.is_match_state(state) {
677 /// last_match = Some(HalfMatch::new(
678 /// dfa.match_pattern(state, 0),
679 /// pos - 1,
680 /// ));
681 /// } else if dfa.is_dead_state(state) {
682 /// return Ok(last_match);
683 /// } else if dfa.is_quit_state(state) {
684 /// // It is possible to enter into a quit state after
685 /// // observing a match has occurred. In that case, we
686 /// // should return the match instead of an error.
687 /// if last_match.is_some() {
688 /// return Ok(last_match);
689 /// }
690 /// return Err(MatchError::quit(b, pos - 1));
691 /// } else if dfa.is_start_state(state) {
692 /// // If we're in a start state and know all matches begin
693 /// // with a particular byte, then we can quickly skip to
694 /// // candidate matches without running the DFA through
695 /// // every byte inbetween.
696 /// if let Some(prefix_byte) = prefix_byte {
697 /// pos = match find_byte(haystack, pos, prefix_byte) {
698 /// Some(pos) => pos,
699 /// None => break,
700 /// };
701 /// }
702 /// }
703 /// }
704 /// }
705 /// // Matches are always delayed by 1 byte, so we must explicitly walk
706 /// // the special "EOI" transition at the end of the search.
707 /// state = dfa.next_eoi_state(state);
708 /// if dfa.is_match_state(state) {
709 /// last_match = Some(HalfMatch::new(
710 /// dfa.match_pattern(state, 0),
711 /// haystack.len(),
712 /// ));
713 /// }
714 /// Ok(last_match)
715 /// }
716 ///
717 /// // In this example, it's obvious that all occurrences of our pattern
718 /// // begin with 'Z', so we pass in 'Z'. Note also that we need to
719 /// // enable start state specialization, or else it won't be possible to
720 /// // detect start states during a search. ('is_start_state' would always
721 /// // return false.)
722 /// let dfa = dense::DFA::builder()
723 /// .configure(dense::DFA::config().specialize_start_states(true))
724 /// .build(r"Z[a-z]+")?;
725 /// let haystack = "123 foobar Zbaz quux".as_bytes();
726 /// let mat = find(&dfa, haystack, Some(b'Z'))?.unwrap();
727 /// assert_eq!(mat.pattern().as_usize(), 0);
728 /// assert_eq!(mat.offset(), 15);
729 ///
730 /// // But note that we don't need to pass in a prefix byte. If we don't,
731 /// // then the search routine does no acceleration.
732 /// let mat = find(&dfa, haystack, None)?.unwrap();
733 /// assert_eq!(mat.pattern().as_usize(), 0);
734 /// assert_eq!(mat.offset(), 15);
735 ///
736 /// // However, if we pass an incorrect byte, then the prefix search will
737 /// // result in incorrect results.
738 /// assert_eq!(find(&dfa, haystack, Some(b'X'))?, None);
739 ///
740 /// # Ok::<(), Box<dyn std::error::Error>>(())
741 /// ```
742 fn is_start_state(&self, id: StateID) -> bool;
743
744 /// Returns true if and only if the given identifier corresponds to an
745 /// accelerated state.
746 ///
747 /// An accelerated state is a special optimization
748 /// trick implemented by this crate. Namely, if
749 /// [`dense::Config::accelerate`](crate::dfa::dense::Config::accelerate) is
750 /// enabled (and it is by default), then DFAs generated by this crate will
751 /// tag states meeting certain characteristics as accelerated. States meet
752 /// this criteria whenever most of their transitions are self-transitions.
753 /// That is, transitions that loop back to the same state. When a small
754 /// number of transitions aren't self-transitions, then it follows that
755 /// there are only a small number of bytes that can cause the DFA to leave
756 /// that state. Thus, there is an opportunity to look for those bytes
757 /// using more optimized routines rather than continuing to run through
758 /// the DFA. This trick is similar to the prefilter idea described in
759 /// the documentation of [`Automaton::is_start_state`] with two main
760 /// differences:
761 ///
762 /// 1. It is more limited since acceleration only applies to single bytes.
763 /// This means states are rarely accelerated when Unicode mode is enabled
764 /// (which is enabled by default).
765 /// 2. It can occur anywhere in the DFA, which increases optimization
766 /// opportunities.
767 ///
768 /// Like the prefilter idea, the main downside (and a possible reason to
769 /// disable it) is that it can lead to worse performance in some cases.
770 /// Namely, if a state is accelerated for very common bytes, then the
771 /// overhead of checking for acceleration and using the more optimized
772 /// routines to look for those bytes can cause overall performance to be
773 /// worse than if acceleration wasn't enabled at all.
774 ///
775 /// A simple example of a regex that has an accelerated state is
776 /// `(?-u)[^a]+a`. Namely, the `[^a]+` sub-expression gets compiled down
777 /// into a single state where all transitions except for `a` loop back to
778 /// itself, and where `a` is the only transition (other than the special
779 /// EOI transition) that goes to some other state. Thus, this state can
780 /// be accelerated and implemented more efficiently by calling an
781 /// optimized routine like `memchr` with `a` as the needle. Notice that
782 /// the `(?-u)` to disable Unicode is necessary here, as without it,
783 /// `[^a]` will match any UTF-8 encoding of any Unicode scalar value other
784 /// than `a`. This more complicated expression compiles down to many DFA
785 /// states and the simple acceleration optimization is no longer available.
786 ///
787 /// Typically, this routine is used to guard calls to
788 /// [`Automaton::accelerator`], which returns the accelerated bytes for
789 /// the specified state.
790 fn is_accel_state(&self, id: StateID) -> bool;
791
792 /// Returns the total number of patterns compiled into this DFA.
793 ///
794 /// In the case of a DFA that contains no patterns, this must return `0`.
795 ///
796 /// # Example
797 ///
798 /// This example shows the pattern length for a DFA that never matches:
799 ///
800 /// ```
801 /// use regex_automata::dfa::{Automaton, dense::DFA};
802 ///
803 /// let dfa: DFA<Vec<u32>> = DFA::never_match()?;
804 /// assert_eq!(dfa.pattern_len(), 0);
805 /// # Ok::<(), Box<dyn std::error::Error>>(())
806 /// ```
807 ///
808 /// And another example for a DFA that matches at every position:
809 ///
810 /// ```
811 /// use regex_automata::dfa::{Automaton, dense::DFA};
812 ///
813 /// let dfa: DFA<Vec<u32>> = DFA::always_match()?;
814 /// assert_eq!(dfa.pattern_len(), 1);
815 /// # Ok::<(), Box<dyn std::error::Error>>(())
816 /// ```
817 ///
818 /// And finally, a DFA that was constructed from multiple patterns:
819 ///
820 /// ```
821 /// use regex_automata::dfa::{Automaton, dense::DFA};
822 ///
823 /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
824 /// assert_eq!(dfa.pattern_len(), 3);
825 /// # Ok::<(), Box<dyn std::error::Error>>(())
826 /// ```
827 fn pattern_len(&self) -> usize;
828
829 /// Returns the total number of patterns that match in this state.
830 ///
831 /// If the given state is not a match state, then implementations may
832 /// panic.
833 ///
834 /// If the DFA was compiled with one pattern, then this must necessarily
835 /// always return `1` for all match states.
836 ///
837 /// Implementations must guarantee that [`Automaton::match_pattern`] can be
838 /// called with indices up to (but not including) the length returned by
839 /// this routine without panicking.
840 ///
841 /// # Panics
842 ///
843 /// Implementations are permitted to panic if the provided state ID does
844 /// not correspond to a match state.
845 ///
846 /// # Example
847 ///
848 /// This example shows a simple instance of implementing overlapping
849 /// matches. In particular, it shows not only how to determine how many
850 /// patterns have matched in a particular state, but also how to access
851 /// which specific patterns have matched.
852 ///
853 /// Notice that we must use
854 /// [`MatchKind::All`](crate::MatchKind::All)
855 /// when building the DFA. If we used
856 /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst)
857 /// instead, then the DFA would not be constructed in a way that
858 /// supports overlapping matches. (It would only report a single pattern
859 /// that matches at any particular point in time.)
860 ///
861 /// Another thing to take note of is the patterns used and the order in
862 /// which the pattern IDs are reported. In the example below, pattern `3`
863 /// is yielded first. Why? Because it corresponds to the match that
864 /// appears first. Namely, the `@` symbol is part of `\S+` but not part
865 /// of any of the other patterns. Since the `\S+` pattern has a match that
866 /// starts to the left of any other pattern, its ID is returned before any
867 /// other.
868 ///
869 /// ```
870 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
871 /// use regex_automata::{dfa::{Automaton, dense}, Input, MatchKind};
872 ///
873 /// let dfa = dense::Builder::new()
874 /// .configure(dense::Config::new().match_kind(MatchKind::All))
875 /// .build_many(&[
876 /// r"[[:word:]]+", r"[a-z]+", r"[A-Z]+", r"[[:^space:]]+",
877 /// ])?;
878 /// let haystack = "@bar".as_bytes();
879 ///
880 /// // The start state is determined by inspecting the position and the
881 /// // initial bytes of the haystack.
882 /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
883 /// // Walk all the bytes in the haystack.
884 /// for &b in haystack {
885 /// state = dfa.next_state(state, b);
886 /// }
887 /// state = dfa.next_eoi_state(state);
888 ///
889 /// assert!(dfa.is_match_state(state));
890 /// assert_eq!(dfa.match_len(state), 3);
891 /// // The following calls are guaranteed to not panic since `match_len`
892 /// // returned `3` above.
893 /// assert_eq!(dfa.match_pattern(state, 0).as_usize(), 3);
894 /// assert_eq!(dfa.match_pattern(state, 1).as_usize(), 0);
895 /// assert_eq!(dfa.match_pattern(state, 2).as_usize(), 1);
896 ///
897 /// # Ok::<(), Box<dyn std::error::Error>>(())
898 /// ```
899 fn match_len(&self, id: StateID) -> usize;
900
901 /// Returns the pattern ID corresponding to the given match index in the
902 /// given state.
903 ///
904 /// See [`Automaton::match_len`] for an example of how to use this
905 /// method correctly. Note that if you know your DFA is compiled with a
906 /// single pattern, then this routine is never necessary since it will
907 /// always return a pattern ID of `0` for an index of `0` when `id`
908 /// corresponds to a match state.
909 ///
910 /// Typically, this routine is used when implementing an overlapping
911 /// search, as the example for `Automaton::match_len` does.
912 ///
913 /// # Panics
914 ///
915 /// If the state ID is not a match state or if the match index is out
916 /// of bounds for the given state, then this routine may either panic
917 /// or produce an incorrect result. If the state ID is correct and the
918 /// match index is correct, then this routine must always produce a valid
919 /// `PatternID`.
920 fn match_pattern(&self, id: StateID, index: usize) -> PatternID;
921
922 /// Returns true if and only if this automaton can match the empty string.
923 /// When it returns false, all possible matches are guaranteed to have a
924 /// non-zero length.
925 ///
926 /// This is useful as cheap way to know whether code needs to handle the
927 /// case of a zero length match. This is particularly important when UTF-8
928 /// modes are enabled, as when UTF-8 mode is enabled, empty matches that
929 /// split a codepoint must never be reported. This extra handling can
930 /// sometimes be costly, and since regexes matching an empty string are
931 /// somewhat rare, it can be beneficial to treat such regexes specially.
932 ///
933 /// # Example
934 ///
935 /// This example shows a few different DFAs and whether they match the
936 /// empty string or not. Notice the empty string isn't merely a matter
937 /// of a string of length literally `0`, but rather, whether a match can
938 /// occur between specific pairs of bytes.
939 ///
940 /// ```
941 /// use regex_automata::{dfa::{dense::DFA, Automaton}, util::syntax};
942 ///
943 /// // The empty regex matches the empty string.
944 /// let dfa = DFA::new("")?;
945 /// assert!(dfa.has_empty(), "empty matches empty");
946 /// // The '+' repetition operator requires at least one match, and so
947 /// // does not match the empty string.
948 /// let dfa = DFA::new("a+")?;
949 /// assert!(!dfa.has_empty(), "+ does not match empty");
950 /// // But the '*' repetition operator does.
951 /// let dfa = DFA::new("a*")?;
952 /// assert!(dfa.has_empty(), "* does match empty");
953 /// // And wrapping '+' in an operator that can match an empty string also
954 /// // causes it to match the empty string too.
955 /// let dfa = DFA::new("(a+)*")?;
956 /// assert!(dfa.has_empty(), "+ inside of * matches empty");
957 ///
958 /// // If a regex is just made of a look-around assertion, even if the
959 /// // assertion requires some kind of non-empty string around it (such as
960 /// // \b), then it is still treated as if it matches the empty string.
961 /// // Namely, if a match occurs of just a look-around assertion, then the
962 /// // match returned is empty.
963 /// let dfa = DFA::builder()
964 /// .configure(DFA::config().unicode_word_boundary(true))
965 /// .syntax(syntax::Config::new().utf8(false))
966 /// .build(r"^$\A\z\b\B(?-u:\b\B)")?;
967 /// assert!(dfa.has_empty(), "assertions match empty");
968 /// // Even when an assertion is wrapped in a '+', it still matches the
969 /// // empty string.
970 /// let dfa = DFA::new(r"^+")?;
971 /// assert!(dfa.has_empty(), "+ of an assertion matches empty");
972 ///
973 /// // An alternation with even one branch that can match the empty string
974 /// // is also said to match the empty string overall.
975 /// let dfa = DFA::new("foo|(bar)?|quux")?;
976 /// assert!(dfa.has_empty(), "alternations can match empty");
977 ///
978 /// // An NFA that matches nothing does not match the empty string.
979 /// let dfa = DFA::new("[a&&b]")?;
980 /// assert!(!dfa.has_empty(), "never matching means not matching empty");
981 /// // But if it's wrapped in something that doesn't require a match at
982 /// // all, then it can match the empty string!
983 /// let dfa = DFA::new("[a&&b]*")?;
984 /// assert!(dfa.has_empty(), "* on never-match still matches empty");
985 /// // Since a '+' requires a match, using it on something that can never
986 /// // match will itself produce a regex that can never match anything,
987 /// // and thus does not match the empty string.
988 /// let dfa = DFA::new("[a&&b]+")?;
989 /// assert!(!dfa.has_empty(), "+ on never-match still matches nothing");
990 ///
991 /// # Ok::<(), Box<dyn std::error::Error>>(())
992 /// ```
993 fn has_empty(&self) -> bool;
994
995 /// Whether UTF-8 mode is enabled for this DFA or not.
996 ///
997 /// When UTF-8 mode is enabled, all matches reported by a DFA are
998 /// guaranteed to correspond to spans of valid UTF-8. This includes
999 /// zero-width matches. For example, the DFA must guarantee that the empty
1000 /// regex will not match at the positions between code units in the UTF-8
1001 /// encoding of a single codepoint.
1002 ///
1003 /// See [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) for
1004 /// more information.
1005 ///
1006 /// # Example
1007 ///
1008 /// This example shows how UTF-8 mode can impact the match spans that may
1009 /// be reported in certain cases.
1010 ///
1011 /// ```
1012 /// use regex_automata::{
1013 /// dfa::{dense::DFA, Automaton},
1014 /// nfa::thompson,
1015 /// HalfMatch, Input,
1016 /// };
1017 ///
1018 /// // UTF-8 mode is enabled by default.
1019 /// let re = DFA::new("")?;
1020 /// assert!(re.is_utf8());
1021 /// let mut input = Input::new("☃");
1022 /// let got = re.try_search_fwd(&input)?;
1023 /// assert_eq!(Some(HalfMatch::must(0, 0)), got);
1024 ///
1025 /// // Even though an empty regex matches at 1..1, our next match is
1026 /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
1027 /// // three bytes long).
1028 /// input.set_start(1);
1029 /// let got = re.try_search_fwd(&input)?;
1030 /// assert_eq!(Some(HalfMatch::must(0, 3)), got);
1031 ///
1032 /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
1033 /// let re = DFA::builder()
1034 /// .thompson(thompson::Config::new().utf8(false))
1035 /// .build("")?;
1036 /// assert!(!re.is_utf8());
1037 /// let got = re.try_search_fwd(&input)?;
1038 /// assert_eq!(Some(HalfMatch::must(0, 1)), got);
1039 ///
1040 /// input.set_start(2);
1041 /// let got = re.try_search_fwd(&input)?;
1042 /// assert_eq!(Some(HalfMatch::must(0, 2)), got);
1043 ///
1044 /// input.set_start(3);
1045 /// let got = re.try_search_fwd(&input)?;
1046 /// assert_eq!(Some(HalfMatch::must(0, 3)), got);
1047 ///
1048 /// input.set_start(4);
1049 /// let got = re.try_search_fwd(&input)?;
1050 /// assert_eq!(None, got);
1051 ///
1052 /// # Ok::<(), Box<dyn std::error::Error>>(())
1053 /// ```
1054 fn is_utf8(&self) -> bool;
1055
1056 /// Returns true if and only if this DFA is limited to returning matches
1057 /// whose start position is `0`.
1058 ///
1059 /// Note that if you're using DFAs provided by
1060 /// this crate, then this is _orthogonal_ to
1061 /// [`Config::start_kind`](crate::dfa::dense::Config::start_kind).
1062 ///
1063 /// This is useful in some cases because if a DFA is limited to producing
1064 /// matches that start at offset `0`, then a reverse search is never
1065 /// required for finding the start of a match.
1066 ///
1067 /// # Example
1068 ///
1069 /// ```
1070 /// use regex_automata::dfa::{dense::DFA, Automaton};
1071 ///
1072 /// // The empty regex matches anywhere
1073 /// let dfa = DFA::new("")?;
1074 /// assert!(!dfa.is_always_start_anchored(), "empty matches anywhere");
1075 /// // 'a' matches anywhere.
1076 /// let dfa = DFA::new("a")?;
1077 /// assert!(!dfa.is_always_start_anchored(), "'a' matches anywhere");
1078 /// // '^' only matches at offset 0!
1079 /// let dfa = DFA::new("^a")?;
1080 /// assert!(dfa.is_always_start_anchored(), "'^a' matches only at 0");
1081 /// // But '(?m:^)' matches at 0 but at other offsets too.
1082 /// let dfa = DFA::new("(?m:^)a")?;
1083 /// assert!(!dfa.is_always_start_anchored(), "'(?m:^)a' matches anywhere");
1084 ///
1085 /// # Ok::<(), Box<dyn std::error::Error>>(())
1086 /// ```
1087 fn is_always_start_anchored(&self) -> bool;
1088
1089 /// Return a slice of bytes to accelerate for the given state, if possible.
1090 ///
1091 /// If the given state has no accelerator, then an empty slice must be
1092 /// returned. If `Automaton::is_accel_state` returns true for the given ID,
1093 /// then this routine _must_ return a non-empty slice. But note that it is
1094 /// not required for an implementation of this trait to ever return `true`
1095 /// for `is_accel_state`, even if the state _could_ be accelerated. That
1096 /// is, acceleration is an optional optimization. But the return values of
1097 /// `is_accel_state` and `accelerator` must be in sync.
1098 ///
1099 /// If the given ID is not a valid state ID for this automaton, then
1100 /// implementations may panic or produce incorrect results.
1101 ///
1102 /// See [`Automaton::is_accel_state`] for more details on state
1103 /// acceleration.
1104 ///
1105 /// By default, this method will always return an empty slice.
1106 ///
1107 /// # Example
1108 ///
1109 /// This example shows a contrived case in which we build a regex that we
1110 /// know is accelerated and extract the accelerator from a state.
1111 ///
1112 /// ```
1113 /// use regex_automata::{
1114 /// dfa::{Automaton, dense},
1115 /// util::{primitives::StateID, syntax},
1116 /// };
1117 ///
1118 /// let dfa = dense::Builder::new()
1119 /// // We disable Unicode everywhere and permit the regex to match
1120 /// // invalid UTF-8. e.g., [^abc] matches \xFF, which is not valid
1121 /// // UTF-8. If we left Unicode enabled, [^abc] would match any UTF-8
1122 /// // encoding of any Unicode scalar value except for 'a', 'b' or 'c'.
1123 /// // That translates to a much more complicated DFA, and also
1124 /// // inhibits the 'accelerator' optimization that we are trying to
1125 /// // demonstrate in this example.
1126 /// .syntax(syntax::Config::new().unicode(false).utf8(false))
1127 /// .build("[^abc]+a")?;
1128 ///
1129 /// // Here we just pluck out the state that we know is accelerated.
1130 /// // While the stride calculations are something that can be relied
1131 /// // on by callers, the specific position of the accelerated state is
1132 /// // implementation defined.
1133 /// //
1134 /// // N.B. We get '3' by inspecting the state machine using 'regex-cli'.
1135 /// // e.g., try `regex-cli debug dense dfa -p '[^abc]+a' -BbUC`.
1136 /// let id = StateID::new(3 * dfa.stride()).unwrap();
1137 /// let accelerator = dfa.accelerator(id);
1138 /// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated.
1139 /// assert_eq!(accelerator, &[b'a', b'b', b'c']);
1140 /// # Ok::<(), Box<dyn std::error::Error>>(())
1141 /// ```
1142 #[inline]
1143 fn accelerator(&self, _id: StateID) -> &[u8] {
1144 &[]
1145 }
1146
1147 /// Returns the prefilter associated with a DFA, if one exists.
1148 ///
1149 /// The default implementation of this trait always returns `None`. And
1150 /// indeed, it is always correct to return `None`.
1151 ///
1152 /// For DFAs in this crate, a prefilter can be attached to a DFA via
1153 /// [`dense::Config::prefilter`](crate::dfa::dense::Config::prefilter).
1154 ///
1155 /// Do note that prefilters are not serialized by DFAs in this crate.
1156 /// So if you deserialize a DFA that had a prefilter attached to it
1157 /// at serialization time, then it will not have a prefilter after
1158 /// deserialization.
1159 #[inline]
1160 fn get_prefilter(&self) -> Option<&Prefilter> {
1161 None
1162 }
1163
1164 /// Executes a forward search and returns the end position of the leftmost
1165 /// match that is found. If no match exists, then `None` is returned.
1166 ///
1167 /// In particular, this method continues searching even after it enters
1168 /// a match state. The search only terminates once it has reached the
1169 /// end of the input or when it has entered a dead or quit state. Upon
1170 /// termination, the position of the last byte seen while still in a match
1171 /// state is returned.
1172 ///
1173 /// # Errors
1174 ///
1175 /// This routine errors if the search could not complete. This can occur
1176 /// in a number of circumstances:
1177 ///
1178 /// * The configuration of the DFA may permit it to "quit" the search.
1179 /// For example, setting quit bytes or enabling heuristic support for
1180 /// Unicode word boundaries. The default configuration does not enable any
1181 /// option that could result in the DFA quitting.
1182 /// * When the provided `Input` configuration is not supported. For
1183 /// example, by providing an unsupported anchor mode.
1184 ///
1185 /// When a search returns an error, callers cannot know whether a match
1186 /// exists or not.
1187 ///
1188 /// # Notes for implementors
1189 ///
1190 /// Implementors of this trait are not required to implement any particular
1191 /// match semantics (such as leftmost-first), which are instead manifest in
1192 /// the DFA's transitions. But this search routine should behave as a
1193 /// general "leftmost" search.
1194 ///
1195 /// In particular, this method must continue searching even after it enters
1196 /// a match state. The search should only terminate once it has reached
1197 /// the end of the input or when it has entered a dead or quit state. Upon
1198 /// termination, the position of the last byte seen while still in a match
1199 /// state is returned.
1200 ///
1201 /// Since this trait provides an implementation for this method by default,
1202 /// it's unlikely that one will need to implement this.
1203 ///
1204 /// # Example
1205 ///
1206 /// This example shows how to use this method with a
1207 /// [`dense::DFA`](crate::dfa::dense::DFA).
1208 ///
1209 /// ```
1210 /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
1211 ///
1212 /// let dfa = dense::DFA::new("foo[0-9]+")?;
1213 /// let expected = Some(HalfMatch::must(0, 8));
1214 /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"foo12345"))?);
1215 ///
1216 /// // Even though a match is found after reading the first byte (`a`),
1217 /// // the leftmost first match semantics demand that we find the earliest
1218 /// // match that prefers earlier parts of the pattern over latter parts.
1219 /// let dfa = dense::DFA::new("abc|a")?;
1220 /// let expected = Some(HalfMatch::must(0, 3));
1221 /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"abc"))?);
1222 ///
1223 /// # Ok::<(), Box<dyn std::error::Error>>(())
1224 /// ```
1225 ///
1226 /// # Example: specific pattern search
1227 ///
1228 /// This example shows how to build a multi-DFA that permits searching for
1229 /// specific patterns.
1230 ///
1231 /// ```
1232 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1233 /// use regex_automata::{
1234 /// dfa::{Automaton, dense},
1235 /// Anchored, HalfMatch, PatternID, Input,
1236 /// };
1237 ///
1238 /// let dfa = dense::Builder::new()
1239 /// .configure(dense::Config::new().starts_for_each_pattern(true))
1240 /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
1241 /// let haystack = "foo123".as_bytes();
1242 ///
1243 /// // Since we are using the default leftmost-first match and both
1244 /// // patterns match at the same starting position, only the first pattern
1245 /// // will be returned in this case when doing a search for any of the
1246 /// // patterns.
1247 /// let expected = Some(HalfMatch::must(0, 6));
1248 /// let got = dfa.try_search_fwd(&Input::new(haystack))?;
1249 /// assert_eq!(expected, got);
1250 ///
1251 /// // But if we want to check whether some other pattern matches, then we
1252 /// // can provide its pattern ID.
1253 /// let input = Input::new(haystack)
1254 /// .anchored(Anchored::Pattern(PatternID::must(1)));
1255 /// let expected = Some(HalfMatch::must(1, 6));
1256 /// let got = dfa.try_search_fwd(&input)?;
1257 /// assert_eq!(expected, got);
1258 ///
1259 /// # Ok::<(), Box<dyn std::error::Error>>(())
1260 /// ```
1261 ///
1262 /// # Example: specifying the bounds of a search
1263 ///
1264 /// This example shows how providing the bounds of a search can produce
1265 /// different results than simply sub-slicing the haystack.
1266 ///
1267 /// ```
1268 /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
1269 ///
1270 /// // N.B. We disable Unicode here so that we use a simple ASCII word
1271 /// // boundary. Alternatively, we could enable heuristic support for
1272 /// // Unicode word boundaries.
1273 /// let dfa = dense::DFA::new(r"(?-u)\b[0-9]{3}\b")?;
1274 /// let haystack = "foo123bar".as_bytes();
1275 ///
1276 /// // Since we sub-slice the haystack, the search doesn't know about the
1277 /// // larger context and assumes that `123` is surrounded by word
1278 /// // boundaries. And of course, the match position is reported relative
1279 /// // to the sub-slice as well, which means we get `3` instead of `6`.
1280 /// let input = Input::new(&haystack[3..6]);
1281 /// let expected = Some(HalfMatch::must(0, 3));
1282 /// let got = dfa.try_search_fwd(&input)?;
1283 /// assert_eq!(expected, got);
1284 ///
1285 /// // But if we provide the bounds of the search within the context of the
1286 /// // entire haystack, then the search can take the surrounding context
1287 /// // into account. (And if we did find a match, it would be reported
1288 /// // as a valid offset into `haystack` instead of its sub-slice.)
1289 /// let input = Input::new(haystack).range(3..6);
1290 /// let expected = None;
1291 /// let got = dfa.try_search_fwd(&input)?;
1292 /// assert_eq!(expected, got);
1293 ///
1294 /// # Ok::<(), Box<dyn std::error::Error>>(())
1295 /// ```
1296 #[inline]
1297 fn try_search_fwd(
1298 &self,
1299 input: &Input<'_>,
1300 ) -> Result<Option<HalfMatch>, MatchError> {
1301 let utf8empty = self.has_empty() && self.is_utf8();
1302 let hm = match search::find_fwd(&self, input)? {
1303 None => return Ok(None),
1304 Some(hm) if !utf8empty => return Ok(Some(hm)),
1305 Some(hm) => hm,
1306 };
1307 // We get to this point when we know our DFA can match the empty string
1308 // AND when UTF-8 mode is enabled. In this case, we skip any matches
1309 // whose offset splits a codepoint. Such a match is necessarily a
1310 // zero-width match, because UTF-8 mode requires the underlying NFA
1311 // to be built such that all non-empty matches span valid UTF-8.
1312 // Therefore, any match that ends in the middle of a codepoint cannot
1313 // be part of a span of valid UTF-8 and thus must be an empty match.
1314 // In such cases, we skip it, so as not to report matches that split a
1315 // codepoint.
1316 //
1317 // Note that this is not a checked assumption. Callers *can* provide an
1318 // NFA with UTF-8 mode enabled but produces non-empty matches that span
1319 // invalid UTF-8. But doing so is documented to result in unspecified
1320 // behavior.
1321 empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
1322 let got = search::find_fwd(&self, input)?;
1323 Ok(got.map(|hm| (hm, hm.offset())))
1324 })
1325 }
1326
1327 /// Executes a reverse search and returns the start of the position of the
1328 /// leftmost match that is found. If no match exists, then `None` is
1329 /// returned.
1330 ///
1331 /// # Errors
1332 ///
1333 /// This routine errors if the search could not complete. This can occur
1334 /// in a number of circumstances:
1335 ///
1336 /// * The configuration of the DFA may permit it to "quit" the search.
1337 /// For example, setting quit bytes or enabling heuristic support for
1338 /// Unicode word boundaries. The default configuration does not enable any
1339 /// option that could result in the DFA quitting.
1340 /// * When the provided `Input` configuration is not supported. For
1341 /// example, by providing an unsupported anchor mode.
1342 ///
1343 /// When a search returns an error, callers cannot know whether a match
1344 /// exists or not.
1345 ///
1346 /// # Example
1347 ///
1348 /// This example shows how to use this method with a
1349 /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, this
1350 /// routine is principally useful when used in conjunction with the
1351 /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
1352 /// configuration. In general, it's unlikely to be correct to use
1353 /// both `try_search_fwd` and `try_search_rev` with the same DFA since
1354 /// any particular DFA will only support searching in one direction with
1355 /// respect to the pattern.
1356 ///
1357 /// ```
1358 /// use regex_automata::{
1359 /// nfa::thompson,
1360 /// dfa::{Automaton, dense},
1361 /// HalfMatch, Input,
1362 /// };
1363 ///
1364 /// let dfa = dense::Builder::new()
1365 /// .thompson(thompson::Config::new().reverse(true))
1366 /// .build("foo[0-9]+")?;
1367 /// let expected = Some(HalfMatch::must(0, 0));
1368 /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"foo12345"))?);
1369 ///
1370 /// // Even though a match is found after reading the last byte (`c`),
1371 /// // the leftmost first match semantics demand that we find the earliest
1372 /// // match that prefers earlier parts of the pattern over latter parts.
1373 /// let dfa = dense::Builder::new()
1374 /// .thompson(thompson::Config::new().reverse(true))
1375 /// .build("abc|c")?;
1376 /// let expected = Some(HalfMatch::must(0, 0));
1377 /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"abc"))?);
1378 ///
1379 /// # Ok::<(), Box<dyn std::error::Error>>(())
1380 /// ```
1381 ///
1382 /// # Example: UTF-8 mode
1383 ///
1384 /// This examples demonstrates that UTF-8 mode applies to reverse
1385 /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
1386 /// matches reported must correspond to valid UTF-8 spans. This includes
1387 /// prohibiting zero-width matches that split a codepoint.
1388 ///
1389 /// UTF-8 mode is enabled by default. Notice below how the only zero-width
1390 /// matches reported are those at UTF-8 boundaries:
1391 ///
1392 /// ```
1393 /// use regex_automata::{
1394 /// dfa::{dense::DFA, Automaton},
1395 /// nfa::thompson,
1396 /// HalfMatch, Input, MatchKind,
1397 /// };
1398 ///
1399 /// let dfa = DFA::builder()
1400 /// .thompson(thompson::Config::new().reverse(true))
1401 /// .build(r"")?;
1402 ///
1403 /// // Run the reverse DFA to collect all matches.
1404 /// let mut input = Input::new("☃");
1405 /// let mut matches = vec![];
1406 /// loop {
1407 /// match dfa.try_search_rev(&input)? {
1408 /// None => break,
1409 /// Some(hm) => {
1410 /// matches.push(hm);
1411 /// if hm.offset() == 0 || input.end() == 0 {
1412 /// break;
1413 /// } else if hm.offset() < input.end() {
1414 /// input.set_end(hm.offset());
1415 /// } else {
1416 /// // This is only necessary to handle zero-width
1417 /// // matches, which of course occur in this example.
1418 /// // Without this, the search would never advance
1419 /// // backwards beyond the initial match.
1420 /// input.set_end(input.end() - 1);
1421 /// }
1422 /// }
1423 /// }
1424 /// }
1425 ///
1426 /// // No matches split a codepoint.
1427 /// let expected = vec![
1428 /// HalfMatch::must(0, 3),
1429 /// HalfMatch::must(0, 0),
1430 /// ];
1431 /// assert_eq!(expected, matches);
1432 ///
1433 /// # Ok::<(), Box<dyn std::error::Error>>(())
1434 /// ```
1435 ///
1436 /// Now let's look at the same example, but with UTF-8 mode on the
1437 /// original NFA disabled (which results in disabling UTF-8 mode on the
1438 /// DFA):
1439 ///
1440 /// ```
1441 /// use regex_automata::{
1442 /// dfa::{dense::DFA, Automaton},
1443 /// nfa::thompson,
1444 /// HalfMatch, Input, MatchKind,
1445 /// };
1446 ///
1447 /// let dfa = DFA::builder()
1448 /// .thompson(thompson::Config::new().reverse(true).utf8(false))
1449 /// .build(r"")?;
1450 ///
1451 /// // Run the reverse DFA to collect all matches.
1452 /// let mut input = Input::new("☃");
1453 /// let mut matches = vec![];
1454 /// loop {
1455 /// match dfa.try_search_rev(&input)? {
1456 /// None => break,
1457 /// Some(hm) => {
1458 /// matches.push(hm);
1459 /// if hm.offset() == 0 || input.end() == 0 {
1460 /// break;
1461 /// } else if hm.offset() < input.end() {
1462 /// input.set_end(hm.offset());
1463 /// } else {
1464 /// // This is only necessary to handle zero-width
1465 /// // matches, which of course occur in this example.
1466 /// // Without this, the search would never advance
1467 /// // backwards beyond the initial match.
1468 /// input.set_end(input.end() - 1);
1469 /// }
1470 /// }
1471 /// }
1472 /// }
1473 ///
1474 /// // No matches split a codepoint.
1475 /// let expected = vec![
1476 /// HalfMatch::must(0, 3),
1477 /// HalfMatch::must(0, 2),
1478 /// HalfMatch::must(0, 1),
1479 /// HalfMatch::must(0, 0),
1480 /// ];
1481 /// assert_eq!(expected, matches);
1482 ///
1483 /// # Ok::<(), Box<dyn std::error::Error>>(())
1484 /// ```
1485 #[inline]
1486 fn try_search_rev(
1487 &self,
1488 input: &Input<'_>,
1489 ) -> Result<Option<HalfMatch>, MatchError> {
1490 let utf8empty = self.has_empty() && self.is_utf8();
1491 let hm = match search::find_rev(self, input)? {
1492 None => return Ok(None),
1493 Some(hm) if !utf8empty => return Ok(Some(hm)),
1494 Some(hm) => hm,
1495 };
1496 empty::skip_splits_rev(input, hm, hm.offset(), |input| {
1497 let got = search::find_rev(self, input)?;
1498 Ok(got.map(|hm| (hm, hm.offset())))
1499 })
1500 }
1501
1502 /// Executes an overlapping forward search. Matches, if one exists, can be
1503 /// obtained via the [`OverlappingState::get_match`] method.
1504 ///
1505 /// This routine is principally only useful when searching for multiple
1506 /// patterns on inputs where multiple patterns may match the same regions
1507 /// of text. In particular, callers must preserve the automaton's search
1508 /// state from prior calls so that the implementation knows where the last
1509 /// match occurred.
1510 ///
1511 /// When using this routine to implement an iterator of overlapping
1512 /// matches, the `start` of the search should always be set to the end
1513 /// of the last match. If more patterns match at the previous location,
1514 /// then they will be immediately returned. (This is tracked by the given
1515 /// overlapping state.) Otherwise, the search continues at the starting
1516 /// position given.
1517 ///
1518 /// If for some reason you want the search to forget about its previous
1519 /// state and restart the search at a particular position, then setting the
1520 /// state to [`OverlappingState::start`] will accomplish that.
1521 ///
1522 /// # Errors
1523 ///
1524 /// This routine errors if the search could not complete. This can occur
1525 /// in a number of circumstances:
1526 ///
1527 /// * The configuration of the DFA may permit it to "quit" the search.
1528 /// For example, setting quit bytes or enabling heuristic support for
1529 /// Unicode word boundaries. The default configuration does not enable any
1530 /// option that could result in the DFA quitting.
1531 /// * When the provided `Input` configuration is not supported. For
1532 /// example, by providing an unsupported anchor mode.
1533 ///
1534 /// When a search returns an error, callers cannot know whether a match
1535 /// exists or not.
1536 ///
1537 /// # Example
1538 ///
1539 /// This example shows how to run a basic overlapping search with a
1540 /// [`dense::DFA`](crate::dfa::dense::DFA). Notice that we build the
1541 /// automaton with a `MatchKind::All` configuration. Overlapping searches
1542 /// are unlikely to work as one would expect when using the default
1543 /// `MatchKind::LeftmostFirst` match semantics, since leftmost-first
1544 /// matching is fundamentally incompatible with overlapping searches.
1545 /// Namely, overlapping searches need to report matches as they are seen,
1546 /// where as leftmost-first searches will continue searching even after a
1547 /// match has been observed in order to find the conventional end position
1548 /// of the match. More concretely, leftmost-first searches use dead states
1549 /// to terminate a search after a specific match can no longer be extended.
1550 /// Overlapping searches instead do the opposite by continuing the search
1551 /// to find totally new matches (potentially of other patterns).
1552 ///
1553 /// ```
1554 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1555 /// use regex_automata::{
1556 /// dfa::{Automaton, OverlappingState, dense},
1557 /// HalfMatch, Input, MatchKind,
1558 /// };
1559 ///
1560 /// let dfa = dense::Builder::new()
1561 /// .configure(dense::Config::new().match_kind(MatchKind::All))
1562 /// .build_many(&[r"[[:word:]]+$", r"[[:^space:]]+$"])?;
1563 /// let haystack = "@foo";
1564 /// let mut state = OverlappingState::start();
1565 ///
1566 /// let expected = Some(HalfMatch::must(1, 4));
1567 /// dfa.try_search_overlapping_fwd(&Input::new(haystack), &mut state)?;
1568 /// assert_eq!(expected, state.get_match());
1569 ///
1570 /// // The first pattern also matches at the same position, so re-running
1571 /// // the search will yield another match. Notice also that the first
1572 /// // pattern is returned after the second. This is because the second
1573 /// // pattern begins its match before the first, is therefore an earlier
1574 /// // match and is thus reported first.
1575 /// let expected = Some(HalfMatch::must(0, 4));
1576 /// dfa.try_search_overlapping_fwd(&Input::new(haystack), &mut state)?;
1577 /// assert_eq!(expected, state.get_match());
1578 ///
1579 /// # Ok::<(), Box<dyn std::error::Error>>(())
1580 /// ```
1581 #[inline]
1582 fn try_search_overlapping_fwd(
1583 &self,
1584 input: &Input<'_>,
1585 state: &mut OverlappingState,
1586 ) -> Result<(), MatchError> {
1587 let utf8empty = self.has_empty() && self.is_utf8();
1588 search::find_overlapping_fwd(self, input, state)?;
1589 match state.get_match() {
1590 None => Ok(()),
1591 Some(_) if !utf8empty => Ok(()),
1592 Some(_) => skip_empty_utf8_splits_overlapping(
1593 input,
1594 state,
1595 |input, state| {
1596 search::find_overlapping_fwd(self, input, state)
1597 },
1598 ),
1599 }
1600 }
1601
1602 /// Executes a reverse overlapping forward search. Matches, if one exists,
1603 /// can be obtained via the [`OverlappingState::get_match`] method.
1604 ///
1605 /// When using this routine to implement an iterator of overlapping
1606 /// matches, the `start` of the search should remain invariant throughout
1607 /// iteration. The `OverlappingState` given to the search will keep track
1608 /// of the current position of the search. (This is because multiple
1609 /// matches may be reported at the same position, so only the search
1610 /// implementation itself knows when to advance the position.)
1611 ///
1612 /// If for some reason you want the search to forget about its previous
1613 /// state and restart the search at a particular position, then setting the
1614 /// state to [`OverlappingState::start`] will accomplish that.
1615 ///
1616 /// # Errors
1617 ///
1618 /// This routine errors if the search could not complete. This can occur
1619 /// in a number of circumstances:
1620 ///
1621 /// * The configuration of the DFA may permit it to "quit" the search.
1622 /// For example, setting quit bytes or enabling heuristic support for
1623 /// Unicode word boundaries. The default configuration does not enable any
1624 /// option that could result in the DFA quitting.
1625 /// * When the provided `Input` configuration is not supported. For
1626 /// example, by providing an unsupported anchor mode.
1627 ///
1628 /// When a search returns an error, callers cannot know whether a match
1629 /// exists or not.
1630 ///
1631 /// # Example: UTF-8 mode
1632 ///
1633 /// This examples demonstrates that UTF-8 mode applies to reverse
1634 /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
1635 /// matches reported must correspond to valid UTF-8 spans. This includes
1636 /// prohibiting zero-width matches that split a codepoint.
1637 ///
1638 /// UTF-8 mode is enabled by default. Notice below how the only zero-width
1639 /// matches reported are those at UTF-8 boundaries:
1640 ///
1641 /// ```
1642 /// use regex_automata::{
1643 /// dfa::{dense::DFA, Automaton, OverlappingState},
1644 /// nfa::thompson,
1645 /// HalfMatch, Input, MatchKind,
1646 /// };
1647 ///
1648 /// let dfa = DFA::builder()
1649 /// .configure(DFA::config().match_kind(MatchKind::All))
1650 /// .thompson(thompson::Config::new().reverse(true))
1651 /// .build_many(&[r"", r"☃"])?;
1652 ///
1653 /// // Run the reverse DFA to collect all matches.
1654 /// let input = Input::new("☃");
1655 /// let mut state = OverlappingState::start();
1656 /// let mut matches = vec![];
1657 /// loop {
1658 /// dfa.try_search_overlapping_rev(&input, &mut state)?;
1659 /// match state.get_match() {
1660 /// None => break,
1661 /// Some(hm) => matches.push(hm),
1662 /// }
1663 /// }
1664 ///
1665 /// // No matches split a codepoint.
1666 /// let expected = vec![
1667 /// HalfMatch::must(0, 3),
1668 /// HalfMatch::must(1, 0),
1669 /// HalfMatch::must(0, 0),
1670 /// ];
1671 /// assert_eq!(expected, matches);
1672 ///
1673 /// # Ok::<(), Box<dyn std::error::Error>>(())
1674 /// ```
1675 ///
1676 /// Now let's look at the same example, but with UTF-8 mode on the
1677 /// original NFA disabled (which results in disabling UTF-8 mode on the
1678 /// DFA):
1679 ///
1680 /// ```
1681 /// use regex_automata::{
1682 /// dfa::{dense::DFA, Automaton, OverlappingState},
1683 /// nfa::thompson,
1684 /// HalfMatch, Input, MatchKind,
1685 /// };
1686 ///
1687 /// let dfa = DFA::builder()
1688 /// .configure(DFA::config().match_kind(MatchKind::All))
1689 /// .thompson(thompson::Config::new().reverse(true).utf8(false))
1690 /// .build_many(&[r"", r"☃"])?;
1691 ///
1692 /// // Run the reverse DFA to collect all matches.
1693 /// let input = Input::new("☃");
1694 /// let mut state = OverlappingState::start();
1695 /// let mut matches = vec![];
1696 /// loop {
1697 /// dfa.try_search_overlapping_rev(&input, &mut state)?;
1698 /// match state.get_match() {
1699 /// None => break,
1700 /// Some(hm) => matches.push(hm),
1701 /// }
1702 /// }
1703 ///
1704 /// // Now *all* positions match, even within a codepoint,
1705 /// // because we lifted the requirement that matches
1706 /// // correspond to valid UTF-8 spans.
1707 /// let expected = vec![
1708 /// HalfMatch::must(0, 3),
1709 /// HalfMatch::must(0, 2),
1710 /// HalfMatch::must(0, 1),
1711 /// HalfMatch::must(1, 0),
1712 /// HalfMatch::must(0, 0),
1713 /// ];
1714 /// assert_eq!(expected, matches);
1715 ///
1716 /// # Ok::<(), Box<dyn std::error::Error>>(())
1717 /// ```
1718 #[inline]
1719 fn try_search_overlapping_rev(
1720 &self,
1721 input: &Input<'_>,
1722 state: &mut OverlappingState,
1723 ) -> Result<(), MatchError> {
1724 let utf8empty = self.has_empty() && self.is_utf8();
1725 search::find_overlapping_rev(self, input, state)?;
1726 match state.get_match() {
1727 None => Ok(()),
1728 Some(_) if !utf8empty => Ok(()),
1729 Some(_) => skip_empty_utf8_splits_overlapping(
1730 input,
1731 state,
1732 |input, state| {
1733 search::find_overlapping_rev(self, input, state)
1734 },
1735 ),
1736 }
1737 }
1738
1739 /// Writes the set of patterns that match anywhere in the given search
1740 /// configuration to `patset`. If multiple patterns match at the same
1741 /// position and the underlying DFA supports overlapping matches, then all
1742 /// matching patterns are written to the given set.
1743 ///
1744 /// Unless all of the patterns in this DFA are anchored, then generally
1745 /// speaking, this will visit every byte in the haystack.
1746 ///
1747 /// This search routine *does not* clear the pattern set. This gives some
1748 /// flexibility to the caller (e.g., running multiple searches with the
1749 /// same pattern set), but does make the API bug-prone if you're reusing
1750 /// the same pattern set for multiple searches but intended them to be
1751 /// independent.
1752 ///
1753 /// If a pattern ID matched but the given `PatternSet` does not have
1754 /// sufficient capacity to store it, then it is not inserted and silently
1755 /// dropped.
1756 ///
1757 /// # Errors
1758 ///
1759 /// This routine errors if the search could not complete. This can occur
1760 /// in a number of circumstances:
1761 ///
1762 /// * The configuration of the DFA may permit it to "quit" the search.
1763 /// For example, setting quit bytes or enabling heuristic support for
1764 /// Unicode word boundaries. The default configuration does not enable any
1765 /// option that could result in the DFA quitting.
1766 /// * When the provided `Input` configuration is not supported. For
1767 /// example, by providing an unsupported anchor mode.
1768 ///
1769 /// When a search returns an error, callers cannot know whether a match
1770 /// exists or not.
1771 ///
1772 /// # Example
1773 ///
1774 /// This example shows how to find all matching patterns in a haystack,
1775 /// even when some patterns match at the same position as other patterns.
1776 ///
1777 /// ```
1778 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1779 /// use regex_automata::{
1780 /// dfa::{Automaton, dense::DFA},
1781 /// Input, MatchKind, PatternSet,
1782 /// };
1783 ///
1784 /// let patterns = &[
1785 /// r"[[:word:]]+",
1786 /// r"[0-9]+",
1787 /// r"[[:alpha:]]+",
1788 /// r"foo",
1789 /// r"bar",
1790 /// r"barfoo",
1791 /// r"foobar",
1792 /// ];
1793 /// let dfa = DFA::builder()
1794 /// .configure(DFA::config().match_kind(MatchKind::All))
1795 /// .build_many(patterns)?;
1796 ///
1797 /// let input = Input::new("foobar");
1798 /// let mut patset = PatternSet::new(dfa.pattern_len());
1799 /// dfa.try_which_overlapping_matches(&input, &mut patset)?;
1800 /// let expected = vec![0, 2, 3, 4, 6];
1801 /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
1802 /// assert_eq!(expected, got);
1803 ///
1804 /// # Ok::<(), Box<dyn std::error::Error>>(())
1805 /// ```
1806 #[cfg(feature = "alloc")]
1807 #[inline]
1808 fn try_which_overlapping_matches(
1809 &self,
1810 input: &Input<'_>,
1811 patset: &mut PatternSet,
1812 ) -> Result<(), MatchError> {
1813 let mut state = OverlappingState::start();
1814 while let Some(m) = {
1815 self.try_search_overlapping_fwd(input, &mut state)?;
1816 state.get_match()
1817 } {
1818 let _ = patset.insert(m.pattern());
1819 // There's nothing left to find, so we can stop. Or the caller
1820 // asked us to.
1821 if patset.is_full() || input.get_earliest() {
1822 break;
1823 }
1824 }
1825 Ok(())
1826 }
1827}
1828
1829unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
1830 #[inline]
1831 fn next_state(&self, current: StateID, input: u8) -> StateID {
1832 (**self).next_state(current, input)
1833 }
1834
1835 #[inline]
1836 unsafe fn next_state_unchecked(
1837 &self,
1838 current: StateID,
1839 input: u8,
1840 ) -> StateID {
1841 (**self).next_state_unchecked(current, input)
1842 }
1843
1844 #[inline]
1845 fn next_eoi_state(&self, current: StateID) -> StateID {
1846 (**self).next_eoi_state(current)
1847 }
1848
1849 #[inline]
1850 fn start_state(
1851 &self,
1852 config: &start::Config,
1853 ) -> Result<StateID, StartError> {
1854 (**self).start_state(config)
1855 }
1856
1857 #[inline]
1858 fn start_state_forward(
1859 &self,
1860 input: &Input<'_>,
1861 ) -> Result<StateID, MatchError> {
1862 (**self).start_state_forward(input)
1863 }
1864
1865 #[inline]
1866 fn start_state_reverse(
1867 &self,
1868 input: &Input<'_>,
1869 ) -> Result<StateID, MatchError> {
1870 (**self).start_state_reverse(input)
1871 }
1872
1873 #[inline]
1874 fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
1875 (**self).universal_start_state(mode)
1876 }
1877
1878 #[inline]
1879 fn is_special_state(&self, id: StateID) -> bool {
1880 (**self).is_special_state(id)
1881 }
1882
1883 #[inline]
1884 fn is_dead_state(&self, id: StateID) -> bool {
1885 (**self).is_dead_state(id)
1886 }
1887
1888 #[inline]
1889 fn is_quit_state(&self, id: StateID) -> bool {
1890 (**self).is_quit_state(id)
1891 }
1892
1893 #[inline]
1894 fn is_match_state(&self, id: StateID) -> bool {
1895 (**self).is_match_state(id)
1896 }
1897
1898 #[inline]
1899 fn is_start_state(&self, id: StateID) -> bool {
1900 (**self).is_start_state(id)
1901 }
1902
1903 #[inline]
1904 fn is_accel_state(&self, id: StateID) -> bool {
1905 (**self).is_accel_state(id)
1906 }
1907
1908 #[inline]
1909 fn pattern_len(&self) -> usize {
1910 (**self).pattern_len()
1911 }
1912
1913 #[inline]
1914 fn match_len(&self, id: StateID) -> usize {
1915 (**self).match_len(id)
1916 }
1917
1918 #[inline]
1919 fn match_pattern(&self, id: StateID, index: usize) -> PatternID {
1920 (**self).match_pattern(id, index)
1921 }
1922
1923 #[inline]
1924 fn has_empty(&self) -> bool {
1925 (**self).has_empty()
1926 }
1927
1928 #[inline]
1929 fn is_utf8(&self) -> bool {
1930 (**self).is_utf8()
1931 }
1932
1933 #[inline]
1934 fn is_always_start_anchored(&self) -> bool {
1935 (**self).is_always_start_anchored()
1936 }
1937
1938 #[inline]
1939 fn accelerator(&self, id: StateID) -> &[u8] {
1940 (**self).accelerator(id)
1941 }
1942
1943 #[inline]
1944 fn get_prefilter(&self) -> Option<&Prefilter> {
1945 (**self).get_prefilter()
1946 }
1947
1948 #[inline]
1949 fn try_search_fwd(
1950 &self,
1951 input: &Input<'_>,
1952 ) -> Result<Option<HalfMatch>, MatchError> {
1953 (**self).try_search_fwd(input)
1954 }
1955
1956 #[inline]
1957 fn try_search_rev(
1958 &self,
1959 input: &Input<'_>,
1960 ) -> Result<Option<HalfMatch>, MatchError> {
1961 (**self).try_search_rev(input)
1962 }
1963
1964 #[inline]
1965 fn try_search_overlapping_fwd(
1966 &self,
1967 input: &Input<'_>,
1968 state: &mut OverlappingState,
1969 ) -> Result<(), MatchError> {
1970 (**self).try_search_overlapping_fwd(input, state)
1971 }
1972
1973 #[inline]
1974 fn try_search_overlapping_rev(
1975 &self,
1976 input: &Input<'_>,
1977 state: &mut OverlappingState,
1978 ) -> Result<(), MatchError> {
1979 (**self).try_search_overlapping_rev(input, state)
1980 }
1981
1982 #[cfg(feature = "alloc")]
1983 #[inline]
1984 fn try_which_overlapping_matches(
1985 &self,
1986 input: &Input<'_>,
1987 patset: &mut PatternSet,
1988 ) -> Result<(), MatchError> {
1989 (**self).try_which_overlapping_matches(input, patset)
1990 }
1991}
1992
1993/// Represents the current state of an overlapping search.
1994///
1995/// This is used for overlapping searches since they need to know something
1996/// about the previous search. For example, when multiple patterns match at the
1997/// same position, this state tracks the last reported pattern so that the next
1998/// search knows whether to report another matching pattern or continue with
1999/// the search at the next position. Additionally, it also tracks which state
2000/// the last search call terminated in.
2001///
2002/// This type provides little introspection capabilities. The only thing a
2003/// caller can do is construct it and pass it around to permit search routines
2004/// to use it to track state, and also ask whether a match has been found.
2005///
2006/// Callers should always provide a fresh state constructed via
2007/// [`OverlappingState::start`] when starting a new search. Reusing state from
2008/// a previous search may result in incorrect results.
2009#[derive(Clone, Debug, Eq, PartialEq)]
2010pub struct OverlappingState {
2011 /// The match reported by the most recent overlapping search to use this
2012 /// state.
2013 ///
2014 /// If a search does not find any matches, then it is expected to clear
2015 /// this value.
2016 pub(crate) mat: Option<HalfMatch>,
2017 /// The state ID of the state at which the search was in when the call
2018 /// terminated. When this is a match state, `last_match` must be set to a
2019 /// non-None value.
2020 ///
2021 /// A `None` value indicates the start state of the corresponding
2022 /// automaton. We cannot use the actual ID, since any one automaton may
2023 /// have many start states, and which one is in use depends on several
2024 /// search-time factors.
2025 pub(crate) id: Option<StateID>,
2026 /// The position of the search.
2027 ///
2028 /// When `id` is None (i.e., we are starting a search), this is set to
2029 /// the beginning of the search as given by the caller regardless of its
2030 /// current value. Subsequent calls to an overlapping search pick up at
2031 /// this offset.
2032 pub(crate) at: usize,
2033 /// The index into the matching patterns of the next match to report if the
2034 /// current state is a match state. Note that this may be 1 greater than
2035 /// the total number of matches to report for the current match state. (In
2036 /// which case, no more matches should be reported at the current position
2037 /// and the search should advance to the next position.)
2038 pub(crate) next_match_index: Option<usize>,
2039 /// This is set to true when a reverse overlapping search has entered its
2040 /// EOI transitions.
2041 ///
2042 /// This isn't used in a forward search because it knows to stop once the
2043 /// position exceeds the end of the search range. In a reverse search,
2044 /// since we use unsigned offsets, we don't "know" once we've gone past
2045 /// `0`. So the only way to detect it is with this extra flag. The reverse
2046 /// overlapping search knows to terminate specifically after it has
2047 /// reported all matches after following the EOI transition.
2048 pub(crate) rev_eoi: bool,
2049}
2050
2051impl OverlappingState {
2052 /// Create a new overlapping state that begins at the start state of any
2053 /// automaton.
2054 pub fn start() -> OverlappingState {
2055 OverlappingState {
2056 mat: None,
2057 id: None,
2058 at: 0,
2059 next_match_index: None,
2060 rev_eoi: false,
2061 }
2062 }
2063
2064 /// Return the match result of the most recent search to execute with this
2065 /// state.
2066 ///
2067 /// A searches will clear this result automatically, such that if no
2068 /// match is found, this will correctly report `None`.
2069 pub fn get_match(&self) -> Option<HalfMatch> {
2070 self.mat
2071 }
2072}
2073
2074/// An error that can occur when computing the start state for a search.
2075///
2076/// Computing a start state can fail for a few reasons, either based on
2077/// incorrect configuration or even based on whether the look-behind byte
2078/// triggers a quit state. Typically one does not need to handle this error
2079/// if you're using [`Automaton::start_state_forward`] (or its reverse
2080/// counterpart), as that routine automatically converts `StartError` to a
2081/// [`MatchError`] for you.
2082///
2083/// This error may be returned by the [`Automaton::start_state`] routine.
2084///
2085/// This error implements the `std::error::Error` trait when the `std` feature
2086/// is enabled.
2087///
2088/// This error is marked as non-exhaustive. New variants may be added in a
2089/// semver compatible release.
2090#[non_exhaustive]
2091#[derive(Clone, Debug)]
2092pub enum StartError {
2093 /// An error that occurs when a starting configuration's look-behind byte
2094 /// is in this DFA's quit set.
2095 Quit {
2096 /// The quit byte that was found.
2097 byte: u8,
2098 },
2099 /// An error that occurs when the caller requests an anchored mode that
2100 /// isn't supported by the DFA.
2101 UnsupportedAnchored {
2102 /// The anchored mode given that is unsupported.
2103 mode: Anchored,
2104 },
2105}
2106
2107impl StartError {
2108 pub(crate) fn quit(byte: u8) -> StartError {
2109 StartError::Quit { byte }
2110 }
2111
2112 pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError {
2113 StartError::UnsupportedAnchored { mode }
2114 }
2115}
2116
2117#[cfg(feature = "std")]
2118impl std::error::Error for StartError {}
2119
2120impl core::fmt::Display for StartError {
2121 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
2122 match *self {
2123 StartError::Quit { byte } => write!(
2124 f,
2125 "error computing start state because the look-behind byte \
2126 {:?} triggered a quit state",
2127 crate::util::escape::DebugByte(byte),
2128 ),
2129 StartError::UnsupportedAnchored { mode: Anchored::Yes } => {
2130 write!(
2131 f,
2132 "error computing start state because \
2133 anchored searches are not supported or enabled"
2134 )
2135 }
2136 StartError::UnsupportedAnchored { mode: Anchored::No } => {
2137 write!(
2138 f,
2139 "error computing start state because \
2140 unanchored searches are not supported or enabled"
2141 )
2142 }
2143 StartError::UnsupportedAnchored {
2144 mode: Anchored::Pattern(pid),
2145 } => {
2146 write!(
2147 f,
2148 "error computing start state because \
2149 anchored searches for a specific pattern ({}) \
2150 are not supported or enabled",
2151 pid.as_usize(),
2152 )
2153 }
2154 }
2155 }
2156}
2157
2158/// Runs the given overlapping `search` function (forwards or backwards) until
2159/// a match is found whose offset does not split a codepoint.
2160///
2161/// This is *not* always correct to call. It should only be called when the DFA
2162/// has UTF-8 mode enabled *and* it can produce zero-width matches. Calling
2163/// this when both of those things aren't true might result in legitimate
2164/// matches getting skipped.
2165#[cold]
2166#[inline(never)]
2167fn skip_empty_utf8_splits_overlapping<F>(
2168 input: &Input<'_>,
2169 state: &mut OverlappingState,
2170 mut search: F,
2171) -> Result<(), MatchError>
2172where
2173 F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>,
2174{
2175 // Note that this routine works for forwards and reverse searches
2176 // even though there's no code here to handle those cases. That's
2177 // because overlapping searches drive themselves to completion via
2178 // `OverlappingState`. So all we have to do is push it until no matches are
2179 // found.
2180
2181 let mut hm = match state.get_match() {
2182 None => return Ok(()),
2183 Some(hm) => hm,
2184 };
2185 if input.get_anchored().is_anchored() {
2186 if !input.is_char_boundary(hm.offset()) {
2187 state.mat = None;
2188 }
2189 return Ok(());
2190 }
2191 while !input.is_char_boundary(hm.offset()) {
2192 search(input, state)?;
2193 hm = match state.get_match() {
2194 None => return Ok(()),
2195 Some(hm) => hm,
2196 };
2197 }
2198 Ok(())
2199}
2200
2201/// Write a prefix "state" indicator for fmt::Debug impls.
2202///
2203/// Specifically, this tries to succinctly distinguish the different types of
2204/// states: dead states, quit states, accelerated states, start states and
2205/// match states. It even accounts for the possible overlappings of different
2206/// state types.
2207pub(crate) fn fmt_state_indicator<A: Automaton>(
2208 f: &mut core::fmt::Formatter<'_>,
2209 dfa: A,
2210 id: StateID,
2211) -> core::fmt::Result {
2212 if dfa.is_dead_state(id) {
2213 write!(f, "D")?;
2214 if dfa.is_start_state(id) {
2215 write!(f, ">")?;
2216 } else {
2217 write!(f, " ")?;
2218 }
2219 } else if dfa.is_quit_state(id) {
2220 write!(f, "Q ")?;
2221 } else if dfa.is_start_state(id) {
2222 if dfa.is_accel_state(id) {
2223 write!(f, "A>")?;
2224 } else {
2225 write!(f, " >")?;
2226 }
2227 } else if dfa.is_match_state(id) {
2228 if dfa.is_accel_state(id) {
2229 write!(f, "A*")?;
2230 } else {
2231 write!(f, " *")?;
2232 }
2233 } else if dfa.is_accel_state(id) {
2234 write!(f, "A ")?;
2235 } else {
2236 write!(f, " ")?;
2237 }
2238 Ok(())
2239}
2240
2241#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
2242mod tests {
2243 // A basic test ensuring that our Automaton trait is object safe. (This is
2244 // the main reason why we don't define the search routines as generic over
2245 // Into<Input>.)
2246 #[test]
2247 fn object_safe() {
2248 use crate::{
2249 dfa::{dense, Automaton},
2250 HalfMatch, Input,
2251 };
2252
2253 let dfa = dense::DFA::new("abc").unwrap();
2254 let dfa: &dyn Automaton = &dfa;
2255 assert_eq!(
2256 Ok(Some(HalfMatch::must(0, 6))),
2257 dfa.try_search_fwd(&Input::new(b"xyzabcxyz")),
2258 );
2259 }
2260}
2261