1/*!
2A DFA-backed `Regex`.
3
4This module provides [`Regex`], which is defined generically over the
5[`Automaton`] trait. A `Regex` implements convenience routines you might have
6come to expect, such as finding the start/end of a match and iterating over
7all non-overlapping matches. This `Regex` type is limited in its capabilities
8to what a DFA can provide. Therefore, APIs involving capturing groups, for
9example, are not provided.
10
11Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
12finds the end offset of a match, where as the other is a "reverse" DFA that
13find the start offset of a match.
14
15See the [parent module](crate::dfa) for examples.
16*/
17
18#[cfg(feature = "alloc")]
19use alloc::vec::Vec;
20
21use crate::{
22 dfa::automaton::{Automaton, OverlappingState},
23 util::prefilter::{self, Prefilter},
24 MatchError, MultiMatch,
25};
26#[cfg(feature = "alloc")]
27use crate::{
28 dfa::{dense, error::Error, sparse},
29 nfa::thompson,
30 util::matchtypes::MatchKind,
31};
32
33// When the alloc feature is enabled, the regex type sets its A type parameter
34// to default to an owned dense DFA. But without alloc, we set no default. This
35// makes things a lot more convenient in the common case, since writing out the
36// DFA types is pretty annoying.
37//
38// Since we have two different definitions but only want to write one doc
39// string, we use a macro to capture the doc and other attributes once and then
40// repeat them for each definition.
41macro_rules! define_regex_type {
42 ($(#[$doc:meta])*) => {
43 #[cfg(feature = "alloc")]
44 $(#[$doc])*
45 pub struct Regex<A = dense::OwnedDFA, P = prefilter::None> {
46 prefilter: Option<P>,
47 forward: A,
48 reverse: A,
49 utf8: bool,
50 }
51
52 #[cfg(not(feature = "alloc"))]
53 $(#[$doc])*
54 pub struct Regex<A, P = prefilter::None> {
55 prefilter: Option<P>,
56 forward: A,
57 reverse: A,
58 utf8: bool,
59 }
60 };
61}
62
63define_regex_type!(
64 /// A regular expression that uses deterministic finite automata for fast
65 /// searching.
66 ///
67 /// A regular expression is comprised of two DFAs, a "forward" DFA and a
68 /// "reverse" DFA. The forward DFA is responsible for detecting the end of
69 /// a match while the reverse DFA is responsible for detecting the start
70 /// of a match. Thus, in order to find the bounds of any given match, a
71 /// forward search must first be run followed by a reverse search. A match
72 /// found by the forward DFA guarantees that the reverse DFA will also find
73 /// a match.
74 ///
75 /// The type of the DFA used by a `Regex` corresponds to the `A` type
76 /// parameter, which must satisfy the [`Automaton`] trait. Typically,
77 /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
78 /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
79 /// memory but search faster, while sparse DFAs use less memory but search
80 /// more slowly.
81 ///
82 /// By default, a regex's automaton type parameter is set to
83 /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
84 /// in-memory work loads, this is the most convenient type that gives the
85 /// best search performance. When the `alloc` feature is disabled, no
86 /// default type is used.
87 ///
88 /// A `Regex` also has a `P` type parameter, which is used to select the
89 /// prefilter used during search. By default, no prefilter is enabled by
90 /// setting the type to default to [`prefilter::None`]. A prefilter can be
91 /// enabled by using the [`Regex::prefilter`] method.
92 ///
93 /// # When should I use this?
94 ///
95 /// Generally speaking, if you can afford the overhead of building a full
96 /// DFA for your regex, and you don't need things like capturing groups,
97 /// then this is a good choice if you're looking to optimize for matching
98 /// speed. Note however that its speed may be worse than a general purpose
99 /// regex engine if you don't select a good [prefilter].
100 ///
101 /// # Earliest vs Leftmost vs Overlapping
102 ///
103 /// The search routines exposed on a `Regex` reflect three different ways
104 /// of searching:
105 ///
106 /// * "earliest" means to stop as soon as a match has been detected.
107 /// * "leftmost" means to continue matching until the underlying
108 /// automaton cannot advance. This reflects "standard" searching you
109 /// might be used to in other regex engines. e.g., This permits
110 /// non-greedy and greedy searching to work as you would expect.
111 /// * "overlapping" means to find all possible matches, even if they
112 /// overlap.
113 ///
114 /// Generally speaking, when doing an overlapping search, you'll want to
115 /// build your regex DFAs with [`MatchKind::All`] semantics. Using
116 /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is
117 /// likely to lead to odd behavior since `LeftmostFirst` specifically omits
118 /// some matches that can never be reported due to its semantics.
119 ///
120 /// The following example shows the differences between how these different
121 /// types of searches impact looking for matches of `[a-z]+` in the
122 /// haystack `abc`.
123 ///
124 /// ```
125 /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch};
126 ///
127 /// let pattern = r"[a-z]+";
128 /// let haystack = "abc".as_bytes();
129 ///
130 /// // With leftmost-first semantics, we test "earliest" and "leftmost".
131 /// let re = dfa::regex::Builder::new()
132 /// .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst))
133 /// .build(pattern)?;
134 ///
135 /// // "earliest" searching isn't impacted by greediness
136 /// let mut it = re.find_earliest_iter(haystack);
137 /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
138 /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
139 /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
140 /// assert_eq!(None, it.next());
141 ///
142 /// // "leftmost" searching supports greediness (and non-greediness)
143 /// let mut it = re.find_leftmost_iter(haystack);
144 /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
145 /// assert_eq!(None, it.next());
146 ///
147 /// // For overlapping, we want "all" match kind semantics.
148 /// let re = dfa::regex::Builder::new()
149 /// .dense(dense::Config::new().match_kind(MatchKind::All))
150 /// .build(pattern)?;
151 ///
152 /// // In the overlapping search, we find all three possible matches
153 /// // starting at the beginning of the haystack.
154 /// let mut it = re.find_overlapping_iter(haystack);
155 /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
156 /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next());
157 /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
158 /// assert_eq!(None, it.next());
159 ///
160 /// # Ok::<(), Box<dyn std::error::Error>>(())
161 /// ```
162 ///
163 /// # Sparse DFAs
164 ///
165 /// Since a `Regex` is generic over the [`Automaton`] trait, it can be
166 /// used with any kind of DFA. While this crate constructs dense DFAs by
167 /// default, it is easy enough to build corresponding sparse DFAs, and then
168 /// build a regex from them:
169 ///
170 /// ```
171 /// use regex_automata::dfa::regex::Regex;
172 ///
173 /// // First, build a regex that uses dense DFAs.
174 /// let dense_re = Regex::new("foo[0-9]+")?;
175 ///
176 /// // Second, build sparse DFAs from the forward and reverse dense DFAs.
177 /// let fwd = dense_re.forward().to_sparse()?;
178 /// let rev = dense_re.reverse().to_sparse()?;
179 ///
180 /// // Third, build a new regex from the constituent sparse DFAs.
181 /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
182 ///
183 /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
184 /// assert_eq!(true, sparse_re.is_match(b"foo123"));
185 ///
186 /// # Ok::<(), Box<dyn std::error::Error>>(())
187 /// ```
188 ///
189 /// Alternatively, one can use a [`Builder`] to construct a sparse DFA
190 /// more succinctly. (Note though that dense DFAs are still constructed
191 /// first internally, and then converted to sparse DFAs, as in the example
192 /// above.)
193 ///
194 /// ```
195 /// use regex_automata::dfa::regex::Regex;
196 ///
197 /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
198 /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
199 /// assert!(sparse_re.is_match(b"foo123"));
200 ///
201 /// # Ok::<(), Box<dyn std::error::Error>>(())
202 /// ```
203 ///
204 /// # Fallibility
205 ///
206 /// In non-default configurations, the DFAs generated in this module may
207 /// return an error during a search. (Currently, the only way this happens
208 /// is if quit bytes are added or Unicode word boundaries are heuristically
209 /// enabled, both of which are turned off by default.) For convenience, the
210 /// main search routines, like [`find_leftmost`](Regex::find_leftmost),
211 /// will panic if an error occurs. However, if you need to use DFAs
212 /// which may produce an error at search time, then there are fallible
213 /// equivalents of all search routines. For example, for `find_leftmost`,
214 /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost).
215 /// The routines prefixed with `try_` return `Result<Option<MultiMatch>,
216 /// MatchError>`, where as the infallible routines simply return
217 /// `Option<MultiMatch>`.
218 ///
219 /// # Example
220 ///
221 /// This example shows how to cause a search to terminate if it sees a
222 /// `\n` byte, and handle the error returned. This could be useful if, for
223 /// example, you wanted to prevent a user supplied pattern from matching
224 /// across a line boundary.
225 ///
226 /// ```
227 /// use regex_automata::{dfa::{self, regex::Regex}, MatchError};
228 ///
229 /// let re = Regex::builder()
230 /// .dense(dfa::dense::Config::new().quit(b'\n', true))
231 /// .build(r"foo\p{any}+bar")?;
232 ///
233 /// let haystack = "foo\nbar".as_bytes();
234 /// // Normally this would produce a match, since \p{any} contains '\n'.
235 /// // But since we instructed the automaton to enter a quit state if a
236 /// // '\n' is observed, this produces a match error instead.
237 /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
238 /// let got = re.try_find_leftmost(haystack).unwrap_err();
239 /// assert_eq!(expected, got);
240 ///
241 /// # Ok::<(), Box<dyn std::error::Error>>(())
242 /// ```
243 #[derive(Clone, Debug)]
244);
245
246#[cfg(feature = "alloc")]
247impl Regex {
248 /// Parse the given regular expression using the default configuration and
249 /// return the corresponding regex.
250 ///
251 /// If you want a non-default configuration, then use the [`Builder`] to
252 /// set your own configuration.
253 ///
254 /// # Example
255 ///
256 /// ```
257 /// use regex_automata::{MultiMatch, dfa::regex::Regex};
258 ///
259 /// let re = Regex::new("foo[0-9]+bar")?;
260 /// assert_eq!(
261 /// Some(MultiMatch::must(0, 3, 14)),
262 /// re.find_leftmost(b"zzzfoo12345barzzz"),
263 /// );
264 /// # Ok::<(), Box<dyn std::error::Error>>(())
265 /// ```
266 pub fn new(pattern: &str) -> Result<Regex, Error> {
267 Builder::new().build(pattern)
268 }
269
270 /// Like `new`, but parses multiple patterns into a single "regex set."
271 /// This similarly uses the default regex configuration.
272 ///
273 /// # Example
274 ///
275 /// ```
276 /// use regex_automata::{MultiMatch, dfa::regex::Regex};
277 ///
278 /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
279 ///
280 /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux");
281 /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
282 /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
283 /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
284 /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
285 /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
286 /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
287 /// assert_eq!(None, it.next());
288 /// # Ok::<(), Box<dyn std::error::Error>>(())
289 /// ```
290 pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<Regex, Error> {
291 Builder::new().build_many(patterns)
292 }
293}
294
295#[cfg(feature = "alloc")]
296impl Regex<sparse::DFA<Vec<u8>>> {
297 /// Parse the given regular expression using the default configuration,
298 /// except using sparse DFAs, and return the corresponding regex.
299 ///
300 /// If you want a non-default configuration, then use the [`Builder`] to
301 /// set your own configuration.
302 ///
303 /// # Example
304 ///
305 /// ```
306 /// use regex_automata::{MultiMatch, dfa::regex::Regex};
307 ///
308 /// let re = Regex::new_sparse("foo[0-9]+bar")?;
309 /// assert_eq!(
310 /// Some(MultiMatch::must(0, 3, 14)),
311 /// re.find_leftmost(b"zzzfoo12345barzzz"),
312 /// );
313 /// # Ok::<(), Box<dyn std::error::Error>>(())
314 /// ```
315 pub fn new_sparse(
316 pattern: &str,
317 ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
318 Builder::new().build_sparse(pattern)
319 }
320
321 /// Like `new`, but parses multiple patterns into a single "regex set"
322 /// using sparse DFAs. This otherwise similarly uses the default regex
323 /// configuration.
324 ///
325 /// # Example
326 ///
327 /// ```
328 /// use regex_automata::{MultiMatch, dfa::regex::Regex};
329 ///
330 /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
331 ///
332 /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux");
333 /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
334 /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
335 /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
336 /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
337 /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
338 /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
339 /// assert_eq!(None, it.next());
340 /// # Ok::<(), Box<dyn std::error::Error>>(())
341 /// ```
342 pub fn new_many_sparse<P: AsRef<str>>(
343 patterns: &[P],
344 ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
345 Builder::new().build_many_sparse(patterns)
346 }
347}
348
349/// Convenience routines for regex construction.
350#[cfg(feature = "alloc")]
351impl Regex {
352 /// Return a default configuration for a `Regex`.
353 ///
354 /// This is a convenience routine to avoid needing to import the `Config`
355 /// type when customizing the construction of a regex.
356 ///
357 /// # Example
358 ///
359 /// This example shows how to disable UTF-8 mode for `Regex` iteration.
360 /// When UTF-8 mode is disabled, the position immediately following an
361 /// empty match is where the next search begins, instead of the next
362 /// position of a UTF-8 encoded codepoint.
363 ///
364 /// ```
365 /// use regex_automata::{dfa::regex::Regex, MultiMatch};
366 ///
367 /// let re = Regex::builder()
368 /// .configure(Regex::config().utf8(false))
369 /// .build(r"")?;
370 /// let haystack = "a☃z".as_bytes();
371 /// let mut it = re.find_leftmost_iter(haystack);
372 /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
373 /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
374 /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
375 /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
376 /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
377 /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
378 /// assert_eq!(None, it.next());
379 ///
380 /// # Ok::<(), Box<dyn std::error::Error>>(())
381 /// ```
382 pub fn config() -> Config {
383 Config::new()
384 }
385
386 /// Return a builder for configuring the construction of a `Regex`.
387 ///
388 /// This is a convenience routine to avoid needing to import the
389 /// [`Builder`] type in common cases.
390 ///
391 /// # Example
392 ///
393 /// This example shows how to use the builder to disable UTF-8 mode
394 /// everywhere.
395 ///
396 /// ```
397 /// use regex_automata::{
398 /// dfa::regex::Regex,
399 /// nfa::thompson,
400 /// MultiMatch, SyntaxConfig,
401 /// };
402 ///
403 /// let re = Regex::builder()
404 /// .configure(Regex::config().utf8(false))
405 /// .syntax(SyntaxConfig::new().utf8(false))
406 /// .thompson(thompson::Config::new().utf8(false))
407 /// .build(r"foo(?-u:[^b])ar.*")?;
408 /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
409 /// let expected = Some(MultiMatch::must(0, 1, 9));
410 /// let got = re.find_leftmost(haystack);
411 /// assert_eq!(expected, got);
412 ///
413 /// # Ok::<(), Box<dyn std::error::Error>>(())
414 /// ```
415 pub fn builder() -> Builder {
416 Builder::new()
417 }
418}
419
420/// Standard search routines for finding and iterating over matches.
421impl<A: Automaton, P: Prefilter> Regex<A, P> {
422 /// Returns true if and only if this regex matches the given haystack.
423 ///
424 /// This routine may short circuit if it knows that scanning future input
425 /// will never lead to a different result. In particular, if the underlying
426 /// DFA enters a match state or a dead state, then this routine will return
427 /// `true` or `false`, respectively, without inspecting any future input.
428 ///
429 /// # Panics
430 ///
431 /// If the underlying DFAs return an error, then this routine panics. This
432 /// only occurs in non-default configurations where quit bytes are used or
433 /// Unicode word boundaries are heuristically enabled.
434 ///
435 /// The fallible version of this routine is
436 /// [`try_is_match`](Regex::try_is_match).
437 ///
438 /// # Example
439 ///
440 /// ```
441 /// use regex_automata::dfa::regex::Regex;
442 ///
443 /// let re = Regex::new("foo[0-9]+bar")?;
444 /// assert_eq!(true, re.is_match(b"foo12345bar"));
445 /// assert_eq!(false, re.is_match(b"foobar"));
446 /// # Ok::<(), Box<dyn std::error::Error>>(())
447 /// ```
448 pub fn is_match(&self, haystack: &[u8]) -> bool {
449 self.is_match_at(haystack, 0, haystack.len())
450 }
451
452 /// Returns the first position at which a match is found.
453 ///
454 /// This routine stops scanning input in precisely the same circumstances
455 /// as `is_match`. The key difference is that this routine returns the
456 /// position at which it stopped scanning input if and only if a match
457 /// was found. If no match is found, then `None` is returned.
458 ///
459 /// # Panics
460 ///
461 /// If the underlying DFAs return an error, then this routine panics. This
462 /// only occurs in non-default configurations where quit bytes are used or
463 /// Unicode word boundaries are heuristically enabled.
464 ///
465 /// The fallible version of this routine is
466 /// [`try_find_earliest`](Regex::try_find_earliest).
467 ///
468 /// # Example
469 ///
470 /// ```
471 /// use regex_automata::{MultiMatch, dfa::regex::Regex};
472 ///
473 /// // Normally, the leftmost first match would greedily consume as many
474 /// // decimal digits as it could. But a match is detected as soon as one
475 /// // digit is seen.
476 /// let re = Regex::new("foo[0-9]+")?;
477 /// assert_eq!(
478 /// Some(MultiMatch::must(0, 0, 4)),
479 /// re.find_earliest(b"foo12345"),
480 /// );
481 ///
482 /// // Normally, the end of the leftmost first match here would be 3,
483 /// // but the "earliest" match semantics detect a match earlier.
484 /// let re = Regex::new("abc|a")?;
485 /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc"));
486 /// # Ok::<(), Box<dyn std::error::Error>>(())
487 /// ```
488 pub fn find_earliest(&self, haystack: &[u8]) -> Option<MultiMatch> {
489 self.find_earliest_at(haystack, 0, haystack.len())
490 }
491
492 /// Returns the start and end offset of the leftmost match. If no match
493 /// exists, then `None` is returned.
494 ///
495 /// # Panics
496 ///
497 /// If the underlying DFAs return an error, then this routine panics. This
498 /// only occurs in non-default configurations where quit bytes are used or
499 /// Unicode word boundaries are heuristically enabled.
500 ///
501 /// The fallible version of this routine is
502 /// [`try_find_leftmost`](Regex::try_find_leftmost).
503 ///
504 /// # Example
505 ///
506 /// ```
507 /// use regex_automata::{MultiMatch, dfa::regex::Regex};
508 ///
509 /// // Greediness is applied appropriately when compared to find_earliest.
510 /// let re = Regex::new("foo[0-9]+")?;
511 /// assert_eq!(
512 /// Some(MultiMatch::must(0, 3, 11)),
513 /// re.find_leftmost(b"zzzfoo12345zzz"),
514 /// );
515 ///
516 /// // Even though a match is found after reading the first byte (`a`),
517 /// // the default leftmost-first match semantics demand that we find the
518 /// // earliest match that prefers earlier parts of the pattern over latter
519 /// // parts.
520 /// let re = Regex::new("abc|a")?;
521 /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc"));
522 /// # Ok::<(), Box<dyn std::error::Error>>(())
523 /// ```
524 pub fn find_leftmost(&self, haystack: &[u8]) -> Option<MultiMatch> {
525 self.find_leftmost_at(haystack, 0, haystack.len())
526 }
527
528 /// Search for the first overlapping match in `haystack`.
529 ///
530 /// This routine is principally useful when searching for multiple patterns
531 /// on inputs where multiple patterns may match the same regions of text.
532 /// In particular, callers must preserve the automaton's search state from
533 /// prior calls so that the implementation knows where the last match
534 /// occurred and which pattern was reported.
535 ///
536 /// # Panics
537 ///
538 /// If the underlying DFAs return an error, then this routine panics. This
539 /// only occurs in non-default configurations where quit bytes are used or
540 /// Unicode word boundaries are heuristically enabled.
541 ///
542 /// The fallible version of this routine is
543 /// [`try_find_overlapping`](Regex::try_find_overlapping).
544 ///
545 /// # Example
546 ///
547 /// This example shows how to run an overlapping search with multiple
548 /// regexes.
549 ///
550 /// ```
551 /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch};
552 ///
553 /// let re = Regex::builder()
554 /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All))
555 /// .build_many(&[r"\w+$", r"\S+$"])?;
556 /// let haystack = "@foo".as_bytes();
557 /// let mut state = dfa::OverlappingState::start();
558 ///
559 /// let expected = Some(MultiMatch::must(1, 0, 4));
560 /// let got = re.find_overlapping(haystack, &mut state);
561 /// assert_eq!(expected, got);
562 ///
563 /// // The first pattern also matches at the same position, so re-running
564 /// // the search will yield another match. Notice also that the first
565 /// // pattern is returned after the second. This is because the second
566 /// // pattern begins its match before the first, is therefore an earlier
567 /// // match and is thus reported first.
568 /// let expected = Some(MultiMatch::must(0, 1, 4));
569 /// let got = re.find_overlapping(haystack, &mut state);
570 /// assert_eq!(expected, got);
571 ///
572 /// # Ok::<(), Box<dyn std::error::Error>>(())
573 /// ```
574 pub fn find_overlapping(
575 &self,
576 haystack: &[u8],
577 state: &mut OverlappingState,
578 ) -> Option<MultiMatch> {
579 self.find_overlapping_at(haystack, 0, haystack.len(), state)
580 }
581
582 /// Returns an iterator over all non-overlapping "earliest" matches.
583 ///
584 /// Match positions are reported as soon as a match is known to occur, even
585 /// if the standard leftmost match would be longer.
586 ///
587 /// # Panics
588 ///
589 /// If the underlying DFAs return an error during iteration, then iteration
590 /// panics. This only occurs in non-default configurations where quit bytes
591 /// are used or Unicode word boundaries are heuristically enabled.
592 ///
593 /// The fallible version of this routine is
594 /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter).
595 ///
596 /// # Example
597 ///
598 /// This example shows how to run an "earliest" iterator.
599 ///
600 /// ```
601 /// use regex_automata::{dfa::regex::Regex, MultiMatch};
602 ///
603 /// let re = Regex::new("[0-9]+")?;
604 /// let haystack = "123".as_bytes();
605 ///
606 /// // Normally, a standard leftmost iterator would return a single
607 /// // match, but since "earliest" detects matches earlier, we get
608 /// // three matches.
609 /// let mut it = re.find_earliest_iter(haystack);
610 /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
611 /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
612 /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
613 /// assert_eq!(None, it.next());
614 ///
615 /// # Ok::<(), Box<dyn std::error::Error>>(())
616 /// ```
617 pub fn find_earliest_iter<'r, 't>(
618 &'r self,
619 haystack: &'t [u8],
620 ) -> FindEarliestMatches<'r, 't, A, P> {
621 FindEarliestMatches::new(self, haystack)
622 }
623
624 /// Returns an iterator over all non-overlapping leftmost matches in the
625 /// given bytes. If no match exists, then the iterator yields no elements.
626 ///
627 /// This corresponds to the "standard" regex search iterator.
628 ///
629 /// # Panics
630 ///
631 /// If the underlying DFAs return an error during iteration, then iteration
632 /// panics. This only occurs in non-default configurations where quit bytes
633 /// are used or Unicode word boundaries are heuristically enabled.
634 ///
635 /// The fallible version of this routine is
636 /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter).
637 ///
638 /// # Example
639 ///
640 /// ```
641 /// use regex_automata::{MultiMatch, dfa::regex::Regex};
642 ///
643 /// let re = Regex::new("foo[0-9]+")?;
644 /// let text = b"foo1 foo12 foo123";
645 /// let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
646 /// assert_eq!(matches, vec![
647 /// MultiMatch::must(0, 0, 4),
648 /// MultiMatch::must(0, 5, 10),
649 /// MultiMatch::must(0, 11, 17),
650 /// ]);
651 /// # Ok::<(), Box<dyn std::error::Error>>(())
652 /// ```
653 pub fn find_leftmost_iter<'r, 't>(
654 &'r self,
655 haystack: &'t [u8],
656 ) -> FindLeftmostMatches<'r, 't, A, P> {
657 FindLeftmostMatches::new(self, haystack)
658 }
659
660 /// Returns an iterator over all overlapping matches in the given haystack.
661 ///
662 /// This routine is principally useful when searching for multiple patterns
663 /// on inputs where multiple patterns may match the same regions of text.
664 /// The iterator takes care of handling the overlapping state that must be
665 /// threaded through every search.
666 ///
667 /// # Panics
668 ///
669 /// If the underlying DFAs return an error during iteration, then iteration
670 /// panics. This only occurs in non-default configurations where quit bytes
671 /// are used or Unicode word boundaries are heuristically enabled.
672 ///
673 /// The fallible version of this routine is
674 /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter).
675 ///
676 /// # Example
677 ///
678 /// This example shows how to run an overlapping search with multiple
679 /// regexes.
680 ///
681 /// ```
682 /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch};
683 ///
684 /// let re = Regex::builder()
685 /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All))
686 /// .build_many(&[r"\w+$", r"\S+$"])?;
687 /// let haystack = "@foo".as_bytes();
688 ///
689 /// let mut it = re.find_overlapping_iter(haystack);
690 /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next());
691 /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next());
692 /// assert_eq!(None, it.next());
693 ///
694 /// # Ok::<(), Box<dyn std::error::Error>>(())
695 /// ```
696 pub fn find_overlapping_iter<'r, 't>(
697 &'r self,
698 haystack: &'t [u8],
699 ) -> FindOverlappingMatches<'r, 't, A, P> {
700 FindOverlappingMatches::new(self, haystack)
701 }
702}
703
704/// Lower level infallible search routines that permit controlling where
705/// the search starts and ends in a particular sequence. This is useful for
706/// executing searches that need to take surrounding context into account. This
707/// is required for correctly implementing iteration because of look-around
708/// operators (`^`, `$`, `\b`).
709impl<A: Automaton, P: Prefilter> Regex<A, P> {
710 /// Returns true if and only if this regex matches the given haystack.
711 ///
712 /// This routine may short circuit if it knows that scanning future input
713 /// will never lead to a different result. In particular, if the underlying
714 /// DFA enters a match state or a dead state, then this routine will return
715 /// `true` or `false`, respectively, without inspecting any future input.
716 ///
717 /// # Searching a substring of the haystack
718 ///
719 /// Being an "at" search routine, this permits callers to search a
720 /// substring of `haystack` by specifying a range in `haystack`.
721 /// Why expose this as an API instead of just asking callers to use
722 /// `&input[start..end]`? The reason is that regex matching often wants
723 /// to take the surrounding context into account in order to handle
724 /// look-around (`^`, `$` and `\b`).
725 ///
726 /// # Panics
727 ///
728 /// If the underlying DFAs return an error, then this routine panics. This
729 /// only occurs in non-default configurations where quit bytes are used or
730 /// Unicode word boundaries are heuristically enabled.
731 ///
732 /// The fallible version of this routine is
733 /// [`try_is_match_at`](Regex::try_is_match_at).
734 pub fn is_match_at(
735 &self,
736 haystack: &[u8],
737 start: usize,
738 end: usize,
739 ) -> bool {
740 self.try_is_match_at(haystack, start, end).unwrap()
741 }
742
743 /// Returns the first position at which a match is found.
744 ///
745 /// This routine stops scanning input in precisely the same circumstances
746 /// as `is_match`. The key difference is that this routine returns the
747 /// position at which it stopped scanning input if and only if a match
748 /// was found. If no match is found, then `None` is returned.
749 ///
750 /// # Searching a substring of the haystack
751 ///
752 /// Being an "at" search routine, this permits callers to search a
753 /// substring of `haystack` by specifying a range in `haystack`.
754 /// Why expose this as an API instead of just asking callers to use
755 /// `&input[start..end]`? The reason is that regex matching often wants
756 /// to take the surrounding context into account in order to handle
757 /// look-around (`^`, `$` and `\b`).
758 ///
759 /// This is useful when implementing an iterator over matches
760 /// within the same haystack, which cannot be done correctly by simply
761 /// providing a subslice of `haystack`.
762 ///
763 /// # Panics
764 ///
765 /// If the underlying DFAs return an error, then this routine panics. This
766 /// only occurs in non-default configurations where quit bytes are used or
767 /// Unicode word boundaries are heuristically enabled.
768 ///
769 /// The fallible version of this routine is
770 /// [`try_find_earliest_at`](Regex::try_find_earliest_at).
771 pub fn find_earliest_at(
772 &self,
773 haystack: &[u8],
774 start: usize,
775 end: usize,
776 ) -> Option<MultiMatch> {
777 self.try_find_earliest_at(haystack, start, end).unwrap()
778 }
779
780 /// Returns the same as `find_leftmost`, but starts the search at the given
781 /// offset.
782 ///
783 /// The significance of the starting point is that it takes the surrounding
784 /// context into consideration. For example, if the DFA is anchored, then
785 /// a match can only occur when `start == 0`.
786 ///
787 /// # Searching a substring of the haystack
788 ///
789 /// Being an "at" search routine, this permits callers to search a
790 /// substring of `haystack` by specifying a range in `haystack`.
791 /// Why expose this as an API instead of just asking callers to use
792 /// `&input[start..end]`? The reason is that regex matching often wants
793 /// to take the surrounding context into account in order to handle
794 /// look-around (`^`, `$` and `\b`).
795 ///
796 /// This is useful when implementing an iterator over matches within the
797 /// same haystack, which cannot be done correctly by simply providing a
798 /// subslice of `haystack`.
799 ///
800 /// # Panics
801 ///
802 /// If the underlying DFAs return an error, then this routine panics. This
803 /// only occurs in non-default configurations where quit bytes are used or
804 /// Unicode word boundaries are heuristically enabled.
805 ///
806 /// The fallible version of this routine is
807 /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at).
808 pub fn find_leftmost_at(
809 &self,
810 haystack: &[u8],
811 start: usize,
812 end: usize,
813 ) -> Option<MultiMatch> {
814 self.try_find_leftmost_at(haystack, start, end).unwrap()
815 }
816
817 /// Search for the first overlapping match within a given range of
818 /// `haystack`.
819 ///
820 /// This routine is principally useful when searching for multiple patterns
821 /// on inputs where multiple patterns may match the same regions of text.
822 /// In particular, callers must preserve the automaton's search state from
823 /// prior calls so that the implementation knows where the last match
824 /// occurred and which pattern was reported.
825 ///
826 /// # Searching a substring of the haystack
827 ///
828 /// Being an "at" search routine, this permits callers to search a
829 /// substring of `haystack` by specifying a range in `haystack`.
830 /// Why expose this as an API instead of just asking callers to use
831 /// `&input[start..end]`? The reason is that regex matching often wants
832 /// to take the surrounding context into account in order to handle
833 /// look-around (`^`, `$` and `\b`).
834 ///
835 /// This is useful when implementing an iterator over matches
836 /// within the same haystack, which cannot be done correctly by simply
837 /// providing a subslice of `haystack`.
838 ///
839 /// # Panics
840 ///
841 /// If the underlying DFAs return an error, then this routine panics. This
842 /// only occurs in non-default configurations where quit bytes are used or
843 /// Unicode word boundaries are heuristically enabled.
844 ///
845 /// The fallible version of this routine is
846 /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at).
847 pub fn find_overlapping_at(
848 &self,
849 haystack: &[u8],
850 start: usize,
851 end: usize,
852 state: &mut OverlappingState,
853 ) -> Option<MultiMatch> {
854 self.try_find_overlapping_at(haystack, start, end, state).unwrap()
855 }
856}
857
858/// Fallible search routines. These may return an error when the underlying
859/// DFAs have been configured in a way that permits them to fail during a
860/// search.
861///
862/// Errors during search only occur when the DFA has been explicitly
863/// configured to do so, usually by specifying one or more "quit" bytes or by
864/// heuristically enabling Unicode word boundaries.
865///
866/// Errors will never be returned using the default configuration. So these
867/// fallible routines are only needed for particular configurations.
868impl<A: Automaton, P: Prefilter> Regex<A, P> {
869 /// Returns true if and only if this regex matches the given haystack.
870 ///
871 /// This routine may short circuit if it knows that scanning future input
872 /// will never lead to a different result. In particular, if the underlying
873 /// DFA enters a match state or a dead state, then this routine will return
874 /// `true` or `false`, respectively, without inspecting any future input.
875 ///
876 /// # Errors
877 ///
878 /// This routine only errors if the search could not complete. For
879 /// DFA-based regexes, this only occurs in a non-default configuration
880 /// where quit bytes are used or Unicode word boundaries are heuristically
881 /// enabled.
882 ///
883 /// When a search cannot complete, callers cannot know whether a match
884 /// exists or not.
885 ///
886 /// The infallible (panics on error) version of this routine is
887 /// [`is_match`](Regex::is_match).
888 pub fn try_is_match(&self, haystack: &[u8]) -> Result<bool, MatchError> {
889 self.try_is_match_at(haystack, 0, haystack.len())
890 }
891
892 /// Returns the first position at which a match is found.
893 ///
894 /// This routine stops scanning input in precisely the same circumstances
895 /// as `is_match`. The key difference is that this routine returns the
896 /// position at which it stopped scanning input if and only if a match
897 /// was found. If no match is found, then `None` is returned.
898 ///
899 /// # Errors
900 ///
901 /// This routine only errors if the search could not complete. For
902 /// DFA-based regexes, this only occurs in a non-default configuration
903 /// where quit bytes are used or Unicode word boundaries are heuristically
904 /// enabled.
905 ///
906 /// When a search cannot complete, callers cannot know whether a match
907 /// exists or not.
908 ///
909 /// The infallible (panics on error) version of this routine is
910 /// [`find_earliest`](Regex::find_earliest).
911 pub fn try_find_earliest(
912 &self,
913 haystack: &[u8],
914 ) -> Result<Option<MultiMatch>, MatchError> {
915 self.try_find_earliest_at(haystack, 0, haystack.len())
916 }
917
918 /// Returns the start and end offset of the leftmost match. If no match
919 /// exists, then `None` is returned.
920 ///
921 /// # Errors
922 ///
923 /// This routine only errors if the search could not complete. For
924 /// DFA-based regexes, this only occurs in a non-default configuration
925 /// where quit bytes are used or Unicode word boundaries are heuristically
926 /// enabled.
927 ///
928 /// When a search cannot complete, callers cannot know whether a match
929 /// exists or not.
930 ///
931 /// The infallible (panics on error) version of this routine is
932 /// [`find_leftmost`](Regex::find_leftmost).
933 pub fn try_find_leftmost(
934 &self,
935 haystack: &[u8],
936 ) -> Result<Option<MultiMatch>, MatchError> {
937 self.try_find_leftmost_at(haystack, 0, haystack.len())
938 }
939
940 /// Search for the first overlapping match in `haystack`.
941 ///
942 /// This routine is principally useful when searching for multiple patterns
943 /// on inputs where multiple patterns may match the same regions of text.
944 /// In particular, callers must preserve the automaton's search state from
945 /// prior calls so that the implementation knows where the last match
946 /// occurred and which pattern was reported.
947 ///
948 /// # Errors
949 ///
950 /// This routine only errors if the search could not complete. For
951 /// DFA-based regexes, this only occurs in a non-default configuration
952 /// where quit bytes are used or Unicode word boundaries are heuristically
953 /// enabled.
954 ///
955 /// When a search cannot complete, callers cannot know whether a match
956 /// exists or not.
957 ///
958 /// The infallible (panics on error) version of this routine is
959 /// [`find_overlapping`](Regex::find_overlapping).
960 pub fn try_find_overlapping(
961 &self,
962 haystack: &[u8],
963 state: &mut OverlappingState,
964 ) -> Result<Option<MultiMatch>, MatchError> {
965 self.try_find_overlapping_at(haystack, 0, haystack.len(), state)
966 }
967
968 /// Returns an iterator over all non-overlapping "earliest" matches.
969 ///
970 /// Match positions are reported as soon as a match is known to occur, even
971 /// if the standard leftmost match would be longer.
972 ///
973 /// # Errors
974 ///
975 /// This iterator only yields errors if the search could not complete. For
976 /// DFA-based regexes, this only occurs in a non-default configuration
977 /// where quit bytes are used or Unicode word boundaries are heuristically
978 /// enabled.
979 ///
980 /// When a search cannot complete, callers cannot know whether a match
981 /// exists or not.
982 ///
983 /// The infallible (panics on error) version of this routine is
984 /// [`find_earliest_iter`](Regex::find_earliest_iter).
985 pub fn try_find_earliest_iter<'r, 't>(
986 &'r self,
987 haystack: &'t [u8],
988 ) -> TryFindEarliestMatches<'r, 't, A, P> {
989 TryFindEarliestMatches::new(self, haystack)
990 }
991
992 /// Returns an iterator over all non-overlapping leftmost matches in the
993 /// given bytes. If no match exists, then the iterator yields no elements.
994 ///
995 /// This corresponds to the "standard" regex search iterator.
996 ///
997 /// # Errors
998 ///
999 /// This iterator only yields errors if the search could not complete. For
1000 /// DFA-based regexes, this only occurs in a non-default configuration
1001 /// where quit bytes are used or Unicode word boundaries are heuristically
1002 /// enabled.
1003 ///
1004 /// When a search cannot complete, callers cannot know whether a match
1005 /// exists or not.
1006 ///
1007 /// The infallible (panics on error) version of this routine is
1008 /// [`find_leftmost_iter`](Regex::find_leftmost_iter).
1009 pub fn try_find_leftmost_iter<'r, 't>(
1010 &'r self,
1011 haystack: &'t [u8],
1012 ) -> TryFindLeftmostMatches<'r, 't, A, P> {
1013 TryFindLeftmostMatches::new(self, haystack)
1014 }
1015
1016 /// Returns an iterator over all overlapping matches in the given haystack.
1017 ///
1018 /// This routine is principally useful when searching for multiple patterns
1019 /// on inputs where multiple patterns may match the same regions of text.
1020 /// The iterator takes care of handling the overlapping state that must be
1021 /// threaded through every search.
1022 ///
1023 /// # Errors
1024 ///
1025 /// This iterator only yields errors if the search could not complete. For
1026 /// DFA-based regexes, this only occurs in a non-default configuration
1027 /// where quit bytes are used or Unicode word boundaries are heuristically
1028 /// enabled.
1029 ///
1030 /// When a search cannot complete, callers cannot know whether a match
1031 /// exists or not.
1032 ///
1033 /// The infallible (panics on error) version of this routine is
1034 /// [`find_overlapping_iter`](Regex::find_overlapping_iter).
1035 pub fn try_find_overlapping_iter<'r, 't>(
1036 &'r self,
1037 haystack: &'t [u8],
1038 ) -> TryFindOverlappingMatches<'r, 't, A, P> {
1039 TryFindOverlappingMatches::new(self, haystack)
1040 }
1041}
1042
1043/// Lower level fallible search routines that permit controlling where the
1044/// search starts and ends in a particular sequence.
1045impl<A: Automaton, P: Prefilter> Regex<A, P> {
1046 /// Returns true if and only if this regex matches the given haystack.
1047 ///
1048 /// This routine may short circuit if it knows that scanning future input
1049 /// will never lead to a different result. In particular, if the underlying
1050 /// DFA enters a match state or a dead state, then this routine will return
1051 /// `true` or `false`, respectively, without inspecting any future input.
1052 ///
1053 /// # Searching a substring of the haystack
1054 ///
1055 /// Being an "at" search routine, this permits callers to search a
1056 /// substring of `haystack` by specifying a range in `haystack`.
1057 /// Why expose this as an API instead of just asking callers to use
1058 /// `&input[start..end]`? The reason is that regex matching often wants
1059 /// to take the surrounding context into account in order to handle
1060 /// look-around (`^`, `$` and `\b`).
1061 ///
1062 /// # Errors
1063 ///
1064 /// This routine only errors if the search could not complete. For
1065 /// DFA-based regexes, this only occurs in a non-default configuration
1066 /// where quit bytes are used, Unicode word boundaries are heuristically
1067 /// enabled or limits are set on the number of times the lazy DFA's cache
1068 /// may be cleared.
1069 ///
1070 /// When a search cannot complete, callers cannot know whether a match
1071 /// exists or not.
1072 ///
1073 /// The infallible (panics on error) version of this routine is
1074 /// [`is_match_at`](Regex::is_match_at).
1075 pub fn try_is_match_at(
1076 &self,
1077 haystack: &[u8],
1078 start: usize,
1079 end: usize,
1080 ) -> Result<bool, MatchError> {
1081 self.forward()
1082 .find_earliest_fwd_at(
1083 self.scanner().as_mut(),
1084 None,
1085 haystack,
1086 start,
1087 end,
1088 )
1089 .map(|x| x.is_some())
1090 }
1091
1092 /// Returns the first position at which a match is found.
1093 ///
1094 /// This routine stops scanning input in precisely the same circumstances
1095 /// as `is_match`. The key difference is that this routine returns the
1096 /// position at which it stopped scanning input if and only if a match
1097 /// was found. If no match is found, then `None` is returned.
1098 ///
1099 /// # Searching a substring of the haystack
1100 ///
1101 /// Being an "at" search routine, this permits callers to search a
1102 /// substring of `haystack` by specifying a range in `haystack`.
1103 /// Why expose this as an API instead of just asking callers to use
1104 /// `&input[start..end]`? The reason is that regex matching often wants
1105 /// to take the surrounding context into account in order to handle
1106 /// look-around (`^`, `$` and `\b`).
1107 ///
1108 /// This is useful when implementing an iterator over matches
1109 /// within the same haystack, which cannot be done correctly by simply
1110 /// providing a subslice of `haystack`.
1111 ///
1112 /// # Errors
1113 ///
1114 /// This routine only errors if the search could not complete. For
1115 /// DFA-based regexes, this only occurs in a non-default configuration
1116 /// where quit bytes are used or Unicode word boundaries are heuristically
1117 /// enabled.
1118 ///
1119 /// When a search cannot complete, callers cannot know whether a match
1120 /// exists or not.
1121 ///
1122 /// The infallible (panics on error) version of this routine is
1123 /// [`find_earliest_at`](Regex::find_earliest_at).
1124 pub fn try_find_earliest_at(
1125 &self,
1126 haystack: &[u8],
1127 start: usize,
1128 end: usize,
1129 ) -> Result<Option<MultiMatch>, MatchError> {
1130 self.try_find_earliest_at_imp(
1131 self.scanner().as_mut(),
1132 haystack,
1133 start,
1134 end,
1135 )
1136 }
1137
1138 /// The implementation of "earliest" searching, where a prefilter scanner
1139 /// may be given.
1140 fn try_find_earliest_at_imp(
1141 &self,
1142 pre: Option<&mut prefilter::Scanner>,
1143 haystack: &[u8],
1144 start: usize,
1145 end: usize,
1146 ) -> Result<Option<MultiMatch>, MatchError> {
1147 // N.B. We use `&&A` here to call `Automaton` methods, which ensures
1148 // that we always use the `impl Automaton for &A` for calling methods.
1149 // Since this is the usual way that automata are used, this helps
1150 // reduce the number of monomorphized copies of the search code.
1151 let (fwd, rev) = (self.forward(), self.reverse());
1152 let end = match (&fwd)
1153 .find_earliest_fwd_at(pre, None, haystack, start, end)?
1154 {
1155 None => return Ok(None),
1156 Some(end) => end,
1157 };
1158 // N.B. The only time we need to tell the reverse searcher the pattern
1159 // to match is in the overlapping case, since it's ambiguous. In the
1160 // leftmost case, I have tentatively convinced myself that it isn't
1161 // necessary and the reverse search will always find the same pattern
1162 // to match as the forward search. But I lack a rigorous proof.
1163 let start = (&rev)
1164 .find_earliest_rev_at(None, haystack, start, end.offset())?
1165 .expect("reverse search must match if forward search does");
1166 assert_eq!(
1167 start.pattern(),
1168 end.pattern(),
1169 "forward and reverse search must match same pattern"
1170 );
1171 assert!(start.offset() <= end.offset());
1172 Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
1173 }
1174
1175 /// Returns the start and end offset of the leftmost match. If no match
1176 /// exists, then `None` is returned.
1177 ///
1178 /// # Searching a substring of the haystack
1179 ///
1180 /// Being an "at" search routine, this permits callers to search a
1181 /// substring of `haystack` by specifying a range in `haystack`.
1182 /// Why expose this as an API instead of just asking callers to use
1183 /// `&input[start..end]`? The reason is that regex matching often wants
1184 /// to take the surrounding context into account in order to handle
1185 /// look-around (`^`, `$` and `\b`).
1186 ///
1187 /// This is useful when implementing an iterator over matches
1188 /// within the same haystack, which cannot be done correctly by simply
1189 /// providing a subslice of `haystack`.
1190 ///
1191 /// # Errors
1192 ///
1193 /// This routine only errors if the search could not complete. For
1194 /// DFA-based regexes, this only occurs in a non-default configuration
1195 /// where quit bytes are used or Unicode word boundaries are heuristically
1196 /// enabled.
1197 ///
1198 /// When a search cannot complete, callers cannot know whether a match
1199 /// exists or not.
1200 ///
1201 /// The infallible (panics on error) version of this routine is
1202 /// [`find_leftmost_at`](Regex::find_leftmost_at).
1203 pub fn try_find_leftmost_at(
1204 &self,
1205 haystack: &[u8],
1206 start: usize,
1207 end: usize,
1208 ) -> Result<Option<MultiMatch>, MatchError> {
1209 self.try_find_leftmost_at_imp(
1210 self.scanner().as_mut(),
1211 haystack,
1212 start,
1213 end,
1214 )
1215 }
1216
1217 /// The implementation of leftmost searching, where a prefilter scanner
1218 /// may be given.
1219 fn try_find_leftmost_at_imp(
1220 &self,
1221 scanner: Option<&mut prefilter::Scanner>,
1222 haystack: &[u8],
1223 start: usize,
1224 end: usize,
1225 ) -> Result<Option<MultiMatch>, MatchError> {
1226 // N.B. We use `&&A` here to call `Automaton` methods, which ensures
1227 // that we always use the `impl Automaton for &A` for calling methods.
1228 // Since this is the usual way that automata are used, this helps
1229 // reduce the number of monomorphized copies of the search code.
1230 let (fwd, rev) = (self.forward(), self.reverse());
1231 let end = match (&fwd)
1232 .find_leftmost_fwd_at(scanner, None, haystack, start, end)?
1233 {
1234 None => return Ok(None),
1235 Some(end) => end,
1236 };
1237 // N.B. The only time we need to tell the reverse searcher the pattern
1238 // to match is in the overlapping case, since it's ambiguous. In the
1239 // leftmost case, I have tentatively convinced myself that it isn't
1240 // necessary and the reverse search will always find the same pattern
1241 // to match as the forward search. But I lack a rigorous proof. Why not
1242 // just provide the pattern anyway? Well, if it is needed, then leaving
1243 // it out gives us a chance to find a witness.
1244 let start = (&rev)
1245 .find_leftmost_rev_at(None, haystack, start, end.offset())?
1246 .expect("reverse search must match if forward search does");
1247 assert_eq!(
1248 start.pattern(),
1249 end.pattern(),
1250 "forward and reverse search must match same pattern",
1251 );
1252 assert!(start.offset() <= end.offset());
1253 Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
1254 }
1255
1256 /// Search for the first overlapping match within a given range of
1257 /// `haystack`.
1258 ///
1259 /// This routine is principally useful when searching for multiple patterns
1260 /// on inputs where multiple patterns may match the same regions of text.
1261 /// In particular, callers must preserve the automaton's search state from
1262 /// prior calls so that the implementation knows where the last match
1263 /// occurred and which pattern was reported.
1264 ///
1265 /// # Searching a substring of the haystack
1266 ///
1267 /// Being an "at" search routine, this permits callers to search a
1268 /// substring of `haystack` by specifying a range in `haystack`.
1269 /// Why expose this as an API instead of just asking callers to use
1270 /// `&input[start..end]`? The reason is that regex matching often wants
1271 /// to take the surrounding context into account in order to handle
1272 /// look-around (`^`, `$` and `\b`).
1273 ///
1274 /// This is useful when implementing an iterator over matches
1275 /// within the same haystack, which cannot be done correctly by simply
1276 /// providing a subslice of `haystack`.
1277 ///
1278 /// # Errors
1279 ///
1280 /// This routine only errors if the search could not complete. For
1281 /// DFA-based regexes, this only occurs in a non-default configuration
1282 /// where quit bytes are used or Unicode word boundaries are heuristically
1283 /// enabled.
1284 ///
1285 /// When a search cannot complete, callers cannot know whether a match
1286 /// exists or not.
1287 ///
1288 /// The infallible (panics on error) version of this routine is
1289 /// [`find_overlapping_at`](Regex::find_overlapping_at).
1290 pub fn try_find_overlapping_at(
1291 &self,
1292 haystack: &[u8],
1293 start: usize,
1294 end: usize,
1295 state: &mut OverlappingState,
1296 ) -> Result<Option<MultiMatch>, MatchError> {
1297 self.try_find_overlapping_at_imp(
1298 self.scanner().as_mut(),
1299 haystack,
1300 start,
1301 end,
1302 state,
1303 )
1304 }
1305
1306 /// The implementation of overlapping search at a given range in
1307 /// `haystack`, where `scanner` is a prefilter (if active) and `state` is
1308 /// the current state of the search.
1309 fn try_find_overlapping_at_imp(
1310 &self,
1311 scanner: Option<&mut prefilter::Scanner>,
1312 haystack: &[u8],
1313 start: usize,
1314 end: usize,
1315 state: &mut OverlappingState,
1316 ) -> Result<Option<MultiMatch>, MatchError> {
1317 // N.B. We use `&&A` here to call `Automaton` methods, which ensures
1318 // that we always use the `impl Automaton for &A` for calling methods.
1319 // Since this is the usual way that automata are used, this helps
1320 // reduce the number of monomorphized copies of the search code.
1321 let (fwd, rev) = (self.forward(), self.reverse());
1322 // TODO: Decide whether it's worth making this assert work. It doesn't
1323 // work currently because 'has_starts_for_each_pattern' isn't on the
1324 // Automaton trait. Without this assert, we still get a panic, but it's
1325 // a bit more inscrutable.
1326 // assert!(
1327 // rev.has_starts_for_each_pattern(),
1328 // "overlapping searches require that the reverse DFA is \
1329 // compiled with the 'starts_for_each_pattern' option",
1330 // );
1331 let end = match (&fwd).find_overlapping_fwd_at(
1332 scanner, None, haystack, start, end, state,
1333 )? {
1334 None => return Ok(None),
1335 Some(end) => end,
1336 };
1337 // Unlike the leftmost cases, the reverse overlapping search may match
1338 // a different pattern than the forward search. See test failures when
1339 // using `None` instead of `Some(end.pattern())` below. Thus, we must
1340 // run our reverse search using the pattern that matched in the forward
1341 // direction.
1342 let start = (&rev)
1343 .find_leftmost_rev_at(
1344 Some(end.pattern()),
1345 haystack,
1346 0,
1347 end.offset(),
1348 )?
1349 .expect("reverse search must match if forward search does");
1350 assert!(start.offset() <= end.offset());
1351 assert_eq!(start.pattern(), end.pattern());
1352 Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
1353 }
1354}
1355
1356/// Non-search APIs for querying information about the regex and setting a
1357/// prefilter.
1358impl<A: Automaton, P: Prefilter> Regex<A, P> {
1359 /// Attach the given prefilter to this regex.
1360 pub fn with_prefilter<Q: Prefilter>(self, prefilter: Q) -> Regex<A, Q> {
1361 Regex {
1362 prefilter: Some(prefilter),
1363 forward: self.forward,
1364 reverse: self.reverse,
1365 utf8: self.utf8,
1366 }
1367 }
1368
1369 /// Remove any prefilter from this regex.
1370 pub fn without_prefilter(self) -> Regex<A> {
1371 Regex {
1372 prefilter: None,
1373 forward: self.forward,
1374 reverse: self.reverse,
1375 utf8: self.utf8,
1376 }
1377 }
1378
1379 /// Return the underlying DFA responsible for forward matching.
1380 ///
1381 /// This is useful for accessing the underlying DFA and converting it to
1382 /// some other format or size. See the [`Builder::build_from_dfas`] docs
1383 /// for an example of where this might be useful.
1384 pub fn forward(&self) -> &A {
1385 &self.forward
1386 }
1387
1388 /// Return the underlying DFA responsible for reverse matching.
1389 ///
1390 /// This is useful for accessing the underlying DFA and converting it to
1391 /// some other format or size. See the [`Builder::build_from_dfas`] docs
1392 /// for an example of where this might be useful.
1393 pub fn reverse(&self) -> &A {
1394 &self.reverse
1395 }
1396
1397 /// Returns the total number of patterns matched by this regex.
1398 ///
1399 /// # Example
1400 ///
1401 /// ```
1402 /// use regex_automata::{MultiMatch, dfa::regex::Regex};
1403 ///
1404 /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
1405 /// assert_eq!(3, re.pattern_count());
1406 /// # Ok::<(), Box<dyn std::error::Error>>(())
1407 /// ```
1408 pub fn pattern_count(&self) -> usize {
1409 assert_eq!(
1410 self.forward().pattern_count(),
1411 self.reverse().pattern_count()
1412 );
1413 self.forward().pattern_count()
1414 }
1415
1416 /// Convenience function for returning this regex's prefilter as a trait
1417 /// object.
1418 ///
1419 /// If this regex doesn't have a prefilter, then `None` is returned.
1420 pub fn prefilter(&self) -> Option<&dyn Prefilter> {
1421 match self.prefilter {
1422 None => None,
1423 Some(ref x) => Some(&*x),
1424 }
1425 }
1426
1427 /// Convenience function for returning a prefilter scanner.
1428 fn scanner(&self) -> Option<prefilter::Scanner> {
1429 self.prefilter().map(prefilter::Scanner::new)
1430 }
1431}
1432
1433/// An iterator over all non-overlapping earliest matches for a particular
1434/// infallible search.
1435///
1436/// The iterator yields a [`MultiMatch`] value until no more matches could be
1437/// found. If the underlying search returns an error, then this panics.
1438///
1439/// `A` is the type used to represent the underlying DFAs used by the regex,
1440/// while `P` is the type of prefilter used, if any. The lifetime variables are
1441/// as follows:
1442///
1443/// * `'r` is the lifetime of the regular expression itself.
1444/// * `'t` is the lifetime of the text being searched.
1445#[derive(Clone, Debug)]
1446pub struct FindEarliestMatches<'r, 't, A, P>(
1447 TryFindEarliestMatches<'r, 't, A, P>,
1448);
1449
1450impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> {
1451 fn new(
1452 re: &'r Regex<A, P>,
1453 text: &'t [u8],
1454 ) -> FindEarliestMatches<'r, 't, A, P> {
1455 FindEarliestMatches(TryFindEarliestMatches::new(re, text))
1456 }
1457}
1458
1459impl<'r, 't, A: Automaton, P: Prefilter> Iterator
1460 for FindEarliestMatches<'r, 't, A, P>
1461{
1462 type Item = MultiMatch;
1463
1464 fn next(&mut self) -> Option<MultiMatch> {
1465 next_unwrap(self.0.next())
1466 }
1467}
1468
1469/// An iterator over all non-overlapping leftmost matches for a particular
1470/// infallible search.
1471///
1472/// The iterator yields a [`MultiMatch`] value until no more matches could be
1473/// found. If the underlying search returns an error, then this panics.
1474///
1475/// `A` is the type used to represent the underlying DFAs used by the regex,
1476/// while `P` is the type of prefilter used, if any. The lifetime variables are
1477/// as follows:
1478///
1479/// * `'r` is the lifetime of the regular expression itself.
1480/// * `'t` is the lifetime of the text being searched.
1481#[derive(Clone, Debug)]
1482pub struct FindLeftmostMatches<'r, 't, A, P>(
1483 TryFindLeftmostMatches<'r, 't, A, P>,
1484);
1485
1486impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> {
1487 fn new(
1488 re: &'r Regex<A, P>,
1489 text: &'t [u8],
1490 ) -> FindLeftmostMatches<'r, 't, A, P> {
1491 FindLeftmostMatches(TryFindLeftmostMatches::new(re, text))
1492 }
1493}
1494
1495impl<'r, 't, A: Automaton, P: Prefilter> Iterator
1496 for FindLeftmostMatches<'r, 't, A, P>
1497{
1498 type Item = MultiMatch;
1499
1500 fn next(&mut self) -> Option<MultiMatch> {
1501 next_unwrap(self.0.next())
1502 }
1503}
1504
1505/// An iterator over all overlapping matches for a particular infallible
1506/// search.
1507///
1508/// The iterator yields a [`MultiMatch`] value until no more matches could be
1509/// found. If the underlying search returns an error, then this panics.
1510///
1511/// `A` is the type used to represent the underlying DFAs used by the regex,
1512/// while `P` is the type of prefilter used, if any. The lifetime variables are
1513/// as follows:
1514///
1515/// * `'r` is the lifetime of the regular expression itself.
1516/// * `'t` is the lifetime of the text being searched.
1517#[derive(Clone, Debug)]
1518pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>(
1519 TryFindOverlappingMatches<'r, 't, A, P>,
1520);
1521
1522impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> {
1523 fn new(
1524 re: &'r Regex<A, P>,
1525 text: &'t [u8],
1526 ) -> FindOverlappingMatches<'r, 't, A, P> {
1527 FindOverlappingMatches(TryFindOverlappingMatches::new(re, text))
1528 }
1529}
1530
1531impl<'r, 't, A: Automaton, P: Prefilter> Iterator
1532 for FindOverlappingMatches<'r, 't, A, P>
1533{
1534 type Item = MultiMatch;
1535
1536 fn next(&mut self) -> Option<MultiMatch> {
1537 next_unwrap(self.0.next())
1538 }
1539}
1540
1541/// An iterator over all non-overlapping earliest matches for a particular
1542/// fallible search.
1543///
1544/// The iterator yields a [`MultiMatch`] value until no more matches could be
1545/// found.
1546///
1547/// `A` is the type used to represent the underlying DFAs used by the regex,
1548/// while `P` is the type of prefilter used, if any. The lifetime variables are
1549/// as follows:
1550///
1551/// * `'r` is the lifetime of the regular expression itself.
1552/// * `'t` is the lifetime of the text being searched.
1553#[derive(Clone, Debug)]
1554pub struct TryFindEarliestMatches<'r, 't, A, P> {
1555 re: &'r Regex<A, P>,
1556 scanner: Option<prefilter::Scanner<'r>>,
1557 text: &'t [u8],
1558 last_end: usize,
1559 last_match: Option<usize>,
1560}
1561
1562impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> {
1563 fn new(
1564 re: &'r Regex<A, P>,
1565 text: &'t [u8],
1566 ) -> TryFindEarliestMatches<'r, 't, A, P> {
1567 let scanner: Option> = re.scanner();
1568 TryFindEarliestMatches {
1569 re,
1570 scanner,
1571 text,
1572 last_end: 0,
1573 last_match: None,
1574 }
1575 }
1576}
1577
1578impl<'r, 't, A: Automaton, P: Prefilter> Iterator
1579 for TryFindEarliestMatches<'r, 't, A, P>
1580{
1581 type Item = Result<MultiMatch, MatchError>;
1582
1583 fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
1584 if self.last_end > self.text.len() {
1585 return None;
1586 }
1587 let result = self.re.try_find_earliest_at_imp(
1588 self.scanner.as_mut(),
1589 self.text,
1590 self.last_end,
1591 self.text.len(),
1592 );
1593 let m = match result {
1594 Err(err) => return Some(Err(err)),
1595 Ok(None) => return None,
1596 Ok(Some(m)) => m,
1597 };
1598 if m.is_empty() {
1599 // This is an empty match. To ensure we make progress, start
1600 // the next search at the smallest possible starting position
1601 // of the next match following this one.
1602 self.last_end = if self.re.utf8 {
1603 crate::util::next_utf8(self.text, m.end())
1604 } else {
1605 m.end() + 1
1606 };
1607 // Don't accept empty matches immediately following a match.
1608 // Just move on to the next match.
1609 if Some(m.end()) == self.last_match {
1610 return self.next();
1611 }
1612 } else {
1613 self.last_end = m.end();
1614 }
1615 self.last_match = Some(m.end());
1616 Some(Ok(m))
1617 }
1618}
1619
1620/// An iterator over all non-overlapping leftmost matches for a particular
1621/// fallible search.
1622///
1623/// The iterator yields a [`MultiMatch`] value until no more matches could be
1624/// found.
1625///
1626/// `A` is the type used to represent the underlying DFAs used by the regex,
1627/// while `P` is the type of prefilter used, if any. The lifetime variables are
1628/// as follows:
1629///
1630/// * `'r` is the lifetime of the regular expression itself.
1631/// * `'t` is the lifetime of the text being searched.
1632#[derive(Clone, Debug)]
1633pub struct TryFindLeftmostMatches<'r, 't, A, P> {
1634 re: &'r Regex<A, P>,
1635 scanner: Option<prefilter::Scanner<'r>>,
1636 text: &'t [u8],
1637 last_end: usize,
1638 last_match: Option<usize>,
1639}
1640
1641impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> {
1642 fn new(
1643 re: &'r Regex<A, P>,
1644 text: &'t [u8],
1645 ) -> TryFindLeftmostMatches<'r, 't, A, P> {
1646 let scanner: Option> = re.scanner();
1647 TryFindLeftmostMatches {
1648 re,
1649 scanner,
1650 text,
1651 last_end: 0,
1652 last_match: None,
1653 }
1654 }
1655}
1656
1657impl<'r, 't, A: Automaton, P: Prefilter> Iterator
1658 for TryFindLeftmostMatches<'r, 't, A, P>
1659{
1660 type Item = Result<MultiMatch, MatchError>;
1661
1662 fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
1663 if self.last_end > self.text.len() {
1664 return None;
1665 }
1666 let result = self.re.try_find_leftmost_at_imp(
1667 self.scanner.as_mut(),
1668 self.text,
1669 self.last_end,
1670 self.text.len(),
1671 );
1672 let m = match result {
1673 Err(err) => return Some(Err(err)),
1674 Ok(None) => return None,
1675 Ok(Some(m)) => m,
1676 };
1677 if m.is_empty() {
1678 // This is an empty match. To ensure we make progress, start
1679 // the next search at the smallest possible starting position
1680 // of the next match following this one.
1681 self.last_end = if self.re.utf8 {
1682 crate::util::next_utf8(self.text, m.end())
1683 } else {
1684 m.end() + 1
1685 };
1686 // Don't accept empty matches immediately following a match.
1687 // Just move on to the next match.
1688 if Some(m.end()) == self.last_match {
1689 return self.next();
1690 }
1691 } else {
1692 self.last_end = m.end();
1693 }
1694 self.last_match = Some(m.end());
1695 Some(Ok(m))
1696 }
1697}
1698
1699/// An iterator over all overlapping matches for a particular fallible search.
1700///
1701/// The iterator yields a [`MultiMatch`] value until no more matches could be
1702/// found.
1703///
1704/// `A` is the type used to represent the underlying DFAs used by the regex,
1705/// while `P` is the type of prefilter used, if any. The lifetime variables are
1706/// as follows:
1707///
1708/// * `'r` is the lifetime of the regular expression itself.
1709/// * `'t` is the lifetime of the text being searched.
1710#[derive(Clone, Debug)]
1711pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> {
1712 re: &'r Regex<A, P>,
1713 scanner: Option<prefilter::Scanner<'r>>,
1714 text: &'t [u8],
1715 last_end: usize,
1716 state: OverlappingState,
1717}
1718
1719impl<'r, 't, A: Automaton, P: Prefilter>
1720 TryFindOverlappingMatches<'r, 't, A, P>
1721{
1722 fn new(
1723 re: &'r Regex<A, P>,
1724 text: &'t [u8],
1725 ) -> TryFindOverlappingMatches<'r, 't, A, P> {
1726 let scanner: Option> = re.scanner();
1727 TryFindOverlappingMatches {
1728 re,
1729 scanner,
1730 text,
1731 last_end: 0,
1732 state: OverlappingState::start(),
1733 }
1734 }
1735}
1736
1737impl<'r, 't, A: Automaton, P: Prefilter> Iterator
1738 for TryFindOverlappingMatches<'r, 't, A, P>
1739{
1740 type Item = Result<MultiMatch, MatchError>;
1741
1742 fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
1743 if self.last_end > self.text.len() {
1744 return None;
1745 }
1746 let result = self.re.try_find_overlapping_at_imp(
1747 self.scanner.as_mut(),
1748 self.text,
1749 self.last_end,
1750 self.text.len(),
1751 &mut self.state,
1752 );
1753 let m = match result {
1754 Err(err) => return Some(Err(err)),
1755 Ok(None) => return None,
1756 Ok(Some(m)) => m,
1757 };
1758 // Unlike the non-overlapping case, we're OK with empty matches at this
1759 // level. In particular, the overlapping search algorithm is itself
1760 // responsible for ensuring that progress is always made.
1761 self.last_end = m.end();
1762 Some(Ok(m))
1763 }
1764}
1765
1766/// The configuration used for compiling a DFA-backed regex.
1767///
1768/// A regex configuration is a simple data object that is typically used with
1769/// [`Builder::configure`].
1770#[cfg(feature = "alloc")]
1771#[derive(Clone, Copy, Debug, Default)]
1772pub struct Config {
1773 utf8: Option<bool>,
1774}
1775
1776#[cfg(feature = "alloc")]
1777impl Config {
1778 /// Return a new default regex compiler configuration.
1779 pub fn new() -> Config {
1780 Config::default()
1781 }
1782
1783 /// Whether to enable UTF-8 mode or not.
1784 ///
1785 /// When UTF-8 mode is enabled (the default) and an empty match is seen,
1786 /// the iterators on [`Regex`] will always start the next search at the
1787 /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8
1788 /// mode is disabled, such searches are begun at the next byte offset.
1789 ///
1790 /// If this mode is enabled and invalid UTF-8 is given to search, then
1791 /// behavior is unspecified.
1792 ///
1793 /// Generally speaking, one should enable this when
1794 /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8)
1795 /// and
1796 /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
1797 /// are enabled, and disable it otherwise.
1798 ///
1799 /// # Example
1800 ///
1801 /// This example demonstrates the differences between when this option is
1802 /// enabled and disabled. The differences only arise when the regex can
1803 /// return matches of length zero.
1804 ///
1805 /// In this first snippet, we show the results when UTF-8 mode is disabled.
1806 ///
1807 /// ```
1808 /// use regex_automata::{dfa::regex::Regex, MultiMatch};
1809 ///
1810 /// let re = Regex::builder()
1811 /// .configure(Regex::config().utf8(false))
1812 /// .build(r"")?;
1813 /// let haystack = "a☃z".as_bytes();
1814 /// let mut it = re.find_leftmost_iter(haystack);
1815 /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
1816 /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
1817 /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
1818 /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
1819 /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
1820 /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
1821 /// assert_eq!(None, it.next());
1822 ///
1823 /// # Ok::<(), Box<dyn std::error::Error>>(())
1824 /// ```
1825 ///
1826 /// And in this snippet, we execute the same search on the same haystack,
1827 /// but with UTF-8 mode enabled. Notice that byte offsets that would
1828 /// otherwise split the encoding of `☃` are not returned.
1829 ///
1830 /// ```
1831 /// use regex_automata::{dfa::regex::Regex, MultiMatch};
1832 ///
1833 /// let re = Regex::builder()
1834 /// .configure(Regex::config().utf8(true))
1835 /// .build(r"")?;
1836 /// let haystack = "a☃z".as_bytes();
1837 /// let mut it = re.find_leftmost_iter(haystack);
1838 /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
1839 /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
1840 /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
1841 /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
1842 /// assert_eq!(None, it.next());
1843 ///
1844 /// # Ok::<(), Box<dyn std::error::Error>>(())
1845 /// ```
1846 pub fn utf8(mut self, yes: bool) -> Config {
1847 self.utf8 = Some(yes);
1848 self
1849 }
1850
1851 /// Returns true if and only if this configuration has UTF-8 mode enabled.
1852 ///
1853 /// When UTF-8 mode is enabled and an empty match is seen, the iterators on
1854 /// [`Regex`] will always start the next search at the next UTF-8 encoded
1855 /// codepoint. When UTF-8 mode is disabled, such searches are begun at the
1856 /// next byte offset.
1857 pub fn get_utf8(&self) -> bool {
1858 self.utf8.unwrap_or(true)
1859 }
1860
1861 /// Overwrite the default configuration such that the options in `o` are
1862 /// always used. If an option in `o` is not set, then the corresponding
1863 /// option in `self` is used. If it's not set in `self` either, then it
1864 /// remains not set.
1865 pub(crate) fn overwrite(self, o: Config) -> Config {
1866 Config { utf8: o.utf8.or(self.utf8) }
1867 }
1868}
1869
1870/// A builder for a regex based on deterministic finite automatons.
1871///
1872/// This builder permits configuring options for the syntax of a pattern, the
1873/// NFA construction, the DFA construction and finally the regex searching
1874/// itself. This builder is different from a general purpose regex builder in
1875/// that it permits fine grain configuration of the construction process. The
1876/// trade off for this is complexity, and the possibility of setting a
1877/// configuration that might not make sense. For example, there are three
1878/// different UTF-8 modes:
1879///
1880/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the
1881/// pattern itself can contain sub-expressions that match invalid UTF-8.
1882/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
1883/// controls whether the implicit unanchored prefix added to the NFA can
1884/// match through invalid UTF-8 or not.
1885/// * [`Config::utf8`] controls how the regex iterators themselves advance
1886/// the starting position of the next search when a match with zero length is
1887/// found.
1888///
1889/// Generally speaking, callers will want to either enable all of these or
1890/// disable all of these.
1891///
1892/// Internally, building a regex requires building two DFAs, where one is
1893/// responsible for finding the end of a match and the other is responsible
1894/// for finding the start of a match. If you only need to detect whether
1895/// something matched, or only the end of a match, then you should use a
1896/// [`dense::Builder`] to construct a single DFA, which is cheaper than
1897/// building two DFAs.
1898///
1899/// # Build methods
1900///
1901/// This builder has a few "build" methods. In general, it's the result of
1902/// combining the following parameters:
1903///
1904/// * Building one or many regexes.
1905/// * Building a regex with dense or sparse DFAs.
1906///
1907/// The simplest "build" method is [`Builder::build`]. It accepts a single
1908/// pattern and builds a dense DFA using `usize` for the state identifier
1909/// representation.
1910///
1911/// The most general "build" method is [`Builder::build_many`], which permits
1912/// building a regex that searches for multiple patterns simultaneously while
1913/// using a specific state identifier representation.
1914///
1915/// The most flexible "build" method, but hardest to use, is
1916/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
1917/// just a pair of DFAs, and this method allows you to specify those DFAs
1918/// exactly.
1919///
1920/// # Example
1921///
1922/// This example shows how to disable UTF-8 mode in the syntax, the NFA and
1923/// the regex itself. This is generally what you want for matching on
1924/// arbitrary bytes.
1925///
1926/// ```
1927/// use regex_automata::{
1928/// dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig
1929/// };
1930///
1931/// let re = Regex::builder()
1932/// .configure(Regex::config().utf8(false))
1933/// .syntax(SyntaxConfig::new().utf8(false))
1934/// .thompson(thompson::Config::new().utf8(false))
1935/// .build(r"foo(?-u:[^b])ar.*")?;
1936/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
1937/// let expected = Some(MultiMatch::must(0, 1, 9));
1938/// let got = re.find_leftmost(haystack);
1939/// assert_eq!(expected, got);
1940/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
1941/// // but the subsequent `.*` does not! Disabling UTF-8
1942/// // on the syntax permits this. Notice also that the
1943/// // search was unanchored and skipped over invalid UTF-8.
1944/// // Disabling UTF-8 on the Thompson NFA permits this.
1945/// //
1946/// // N.B. This example does not show the impact of
1947/// // disabling UTF-8 mode on Config, since that
1948/// // only impacts regexes that can produce matches of
1949/// // length 0.
1950/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
1951///
1952/// # Ok::<(), Box<dyn std::error::Error>>(())
1953/// ```
1954#[cfg(feature = "alloc")]
1955#[derive(Clone, Debug)]
1956pub struct Builder {
1957 config: Config,
1958 dfa: dense::Builder,
1959}
1960
1961#[cfg(feature = "alloc")]
1962impl Builder {
1963 /// Create a new regex builder with the default configuration.
1964 pub fn new() -> Builder {
1965 Builder { config: Config::default(), dfa: dense::Builder::new() }
1966 }
1967
1968 /// Build a regex from the given pattern.
1969 ///
1970 /// If there was a problem parsing or compiling the pattern, then an error
1971 /// is returned.
1972 pub fn build(&self, pattern: &str) -> Result<Regex, Error> {
1973 self.build_many(&[pattern])
1974 }
1975
1976 /// Build a regex from the given pattern using sparse DFAs.
1977 ///
1978 /// If there was a problem parsing or compiling the pattern, then an error
1979 /// is returned.
1980 pub fn build_sparse(
1981 &self,
1982 pattern: &str,
1983 ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
1984 self.build_many_sparse(&[pattern])
1985 }
1986
1987 /// Build a regex from the given patterns.
1988 pub fn build_many<P: AsRef<str>>(
1989 &self,
1990 patterns: &[P],
1991 ) -> Result<Regex, Error> {
1992 let forward = self.dfa.build_many(patterns)?;
1993 let reverse = self
1994 .dfa
1995 .clone()
1996 .configure(
1997 dense::Config::new()
1998 .anchored(true)
1999 .match_kind(MatchKind::All)
2000 .starts_for_each_pattern(true),
2001 )
2002 .thompson(thompson::Config::new().reverse(true))
2003 .build_many(patterns)?;
2004 Ok(self.build_from_dfas(forward, reverse))
2005 }
2006
2007 /// Build a sparse regex from the given patterns.
2008 pub fn build_many_sparse<P: AsRef<str>>(
2009 &self,
2010 patterns: &[P],
2011 ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
2012 let re = self.build_many(patterns)?;
2013 let forward = re.forward().to_sparse()?;
2014 let reverse = re.reverse().to_sparse()?;
2015 Ok(self.build_from_dfas(forward, reverse))
2016 }
2017
2018 /// Build a regex from its component forward and reverse DFAs.
2019 ///
2020 /// This is useful when deserializing a regex from some arbitrary
2021 /// memory region. This is also useful for building regexes from other
2022 /// types of DFAs.
2023 ///
2024 /// If you're building the DFAs from scratch instead of building new DFAs
2025 /// from other DFAs, then you'll need to make sure that the reverse DFA is
2026 /// configured correctly to match the intended semantics. Namely:
2027 ///
2028 /// * It should be anchored.
2029 /// * It should use [`MatchKind::All`] semantics.
2030 /// * It should match in reverse.
2031 /// * It should have anchored start states compiled for each pattern.
2032 /// * Otherwise, its configuration should match the forward DFA.
2033 ///
2034 /// If these conditions are satisfied, then behavior of searches is
2035 /// unspecified.
2036 ///
2037 /// Note that when using this constructor, only the configuration from
2038 /// [`Config`] is applied. The only configuration settings on this builder
2039 /// only apply when the builder owns the construction of the DFAs
2040 /// themselves.
2041 ///
2042 /// # Example
2043 ///
2044 /// This example is a bit a contrived. The usual use of these methods
2045 /// would involve serializing `initial_re` somewhere and then deserializing
2046 /// it later to build a regex. But in this case, we do everything in
2047 /// memory.
2048 ///
2049 /// ```
2050 /// use regex_automata::dfa::regex::Regex;
2051 ///
2052 /// let initial_re = Regex::new("foo[0-9]+")?;
2053 /// assert_eq!(true, initial_re.is_match(b"foo123"));
2054 ///
2055 /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
2056 /// let re = Regex::builder().build_from_dfas(fwd, rev);
2057 /// assert_eq!(true, re.is_match(b"foo123"));
2058 /// # Ok::<(), Box<dyn std::error::Error>>(())
2059 /// ```
2060 ///
2061 /// This example shows how to build a `Regex` that uses sparse DFAs instead
2062 /// of dense DFAs without using one of the convenience `build_sparse`
2063 /// routines:
2064 ///
2065 /// ```
2066 /// use regex_automata::dfa::regex::Regex;
2067 ///
2068 /// let initial_re = Regex::new("foo[0-9]+")?;
2069 /// assert_eq!(true, initial_re.is_match(b"foo123"));
2070 ///
2071 /// let fwd = initial_re.forward().to_sparse()?;
2072 /// let rev = initial_re.reverse().to_sparse()?;
2073 /// let re = Regex::builder().build_from_dfas(fwd, rev);
2074 /// assert_eq!(true, re.is_match(b"foo123"));
2075 /// # Ok::<(), Box<dyn std::error::Error>>(())
2076 /// ```
2077 pub fn build_from_dfas<A: Automaton>(
2078 &self,
2079 forward: A,
2080 reverse: A,
2081 ) -> Regex<A> {
2082 let utf8 = self.config.get_utf8();
2083 Regex { prefilter: None, forward, reverse, utf8 }
2084 }
2085
2086 /// Apply the given regex configuration options to this builder.
2087 pub fn configure(&mut self, config: Config) -> &mut Builder {
2088 self.config = self.config.overwrite(config);
2089 self
2090 }
2091
2092 /// Set the syntax configuration for this builder using
2093 /// [`SyntaxConfig`](crate::SyntaxConfig).
2094 ///
2095 /// This permits setting things like case insensitivity, Unicode and multi
2096 /// line mode.
2097 pub fn syntax(
2098 &mut self,
2099 config: crate::util::syntax::SyntaxConfig,
2100 ) -> &mut Builder {
2101 self.dfa.syntax(config);
2102 self
2103 }
2104
2105 /// Set the Thompson NFA configuration for this builder using
2106 /// [`nfa::thompson::Config`](thompson::Config).
2107 ///
2108 /// This permits setting things like whether additional time should be
2109 /// spent shrinking the size of the NFA.
2110 pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
2111 self.dfa.thompson(config);
2112 self
2113 }
2114
2115 /// Set the dense DFA compilation configuration for this builder using
2116 /// [`dense::Config`](dense::Config).
2117 ///
2118 /// This permits setting things like whether the underlying DFAs should
2119 /// be minimized.
2120 pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
2121 self.dfa.configure(config);
2122 self
2123 }
2124}
2125
2126#[cfg(feature = "alloc")]
2127impl Default for Builder {
2128 fn default() -> Builder {
2129 Builder::new()
2130 }
2131}
2132
2133#[inline(always)]
2134fn next_unwrap(
2135 item: Option<Result<MultiMatch, MatchError>>,
2136) -> Option<MultiMatch> {
2137 match item {
2138 None => None,
2139 Some(Ok(m: MultiMatch)) => Some(m),
2140 Some(Err(err: MatchError)) => panic!(
2141 "unexpected regex search error: {}\n\
2142 to handle search errors, use try_ methods",
2143 err,
2144 ),
2145 }
2146}
2147