1/*!
2A DFA-backed `Regex`.
3
4This module provides [`Regex`], which is defined generically over the
5[`Automaton`] trait. A `Regex` implements convenience routines you might have
6come to expect, such as finding the start/end of a match and iterating over
7all non-overlapping matches. This `Regex` type is limited in its capabilities
8to what a DFA can provide. Therefore, APIs involving capturing groups, for
9example, are not provided.
10
11Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
12finds the end offset of a match, where as the other is a "reverse" DFA that
13find the start offset of a match.
14
15See the [parent module](crate::dfa) for examples.
16*/
17
18#[cfg(feature = "alloc")]
19use alloc::vec::Vec;
20
21#[cfg(feature = "dfa-build")]
22use crate::dfa::dense::BuildError;
23use crate::{
24 dfa::{automaton::Automaton, dense},
25 util::{iter, search::Input},
26 Anchored, Match, MatchError,
27};
28#[cfg(feature = "alloc")]
29use crate::{
30 dfa::{sparse, StartKind},
31 util::search::MatchKind,
32};
33
34// When the alloc feature is enabled, the regex type sets its A type parameter
35// to default to an owned dense DFA. But without alloc, we set no default. This
36// makes things a lot more convenient in the common case, since writing out the
37// DFA types is pretty annoying.
38//
39// Since we have two different definitions but only want to write one doc
40// string, we use a macro to capture the doc and other attributes once and then
41// repeat them for each definition.
42macro_rules! define_regex_type {
43 ($(#[$doc:meta])*) => {
44 #[cfg(feature = "alloc")]
45 $(#[$doc])*
46 pub struct Regex<A = dense::OwnedDFA> {
47 forward: A,
48 reverse: A,
49 }
50
51 #[cfg(not(feature = "alloc"))]
52 $(#[$doc])*
53 pub struct Regex<A> {
54 forward: A,
55 reverse: A,
56 }
57 };
58}
59
60define_regex_type!(
61 /// A regular expression that uses deterministic finite automata for fast
62 /// searching.
63 ///
64 /// A regular expression is comprised of two DFAs, a "forward" DFA and a
65 /// "reverse" DFA. The forward DFA is responsible for detecting the end of
66 /// a match while the reverse DFA is responsible for detecting the start
67 /// of a match. Thus, in order to find the bounds of any given match, a
68 /// forward search must first be run followed by a reverse search. A match
69 /// found by the forward DFA guarantees that the reverse DFA will also find
70 /// a match.
71 ///
72 /// The type of the DFA used by a `Regex` corresponds to the `A` type
73 /// parameter, which must satisfy the [`Automaton`] trait. Typically,
74 /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
75 /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
76 /// memory but search faster, while sparse DFAs use less memory but search
77 /// more slowly.
78 ///
79 /// # Crate features
80 ///
81 /// Note that despite what the documentation auto-generates, the _only_
82 /// crate feature needed to use this type is `dfa-search`. You do _not_
83 /// need to enable the `alloc` feature.
84 ///
85 /// By default, a regex's automaton type parameter is set to
86 /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
87 /// in-memory work loads, this is the most convenient type that gives the
88 /// best search performance. When the `alloc` feature is disabled, no
89 /// default type is used.
90 ///
91 /// # When should I use this?
92 ///
93 /// Generally speaking, if you can afford the overhead of building a full
94 /// DFA for your regex, and you don't need things like capturing groups,
95 /// then this is a good choice if you're looking to optimize for matching
96 /// speed. Note however that its speed may be worse than a general purpose
97 /// regex engine if you don't provide a [`dense::Config::prefilter`] to the
98 /// underlying DFA.
99 ///
100 /// # Sparse DFAs
101 ///
102 /// Since a `Regex` is generic over the [`Automaton`] trait, it can be
103 /// used with any kind of DFA. While this crate constructs dense DFAs by
104 /// default, it is easy enough to build corresponding sparse DFAs, and then
105 /// build a regex from them:
106 ///
107 /// ```
108 /// use regex_automata::dfa::regex::Regex;
109 ///
110 /// // First, build a regex that uses dense DFAs.
111 /// let dense_re = Regex::new("foo[0-9]+")?;
112 ///
113 /// // Second, build sparse DFAs from the forward and reverse dense DFAs.
114 /// let fwd = dense_re.forward().to_sparse()?;
115 /// let rev = dense_re.reverse().to_sparse()?;
116 ///
117 /// // Third, build a new regex from the constituent sparse DFAs.
118 /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
119 ///
120 /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
121 /// assert_eq!(true, sparse_re.is_match(b"foo123"));
122 ///
123 /// # Ok::<(), Box<dyn std::error::Error>>(())
124 /// ```
125 ///
126 /// Alternatively, one can use a [`Builder`] to construct a sparse DFA
127 /// more succinctly. (Note though that dense DFAs are still constructed
128 /// first internally, and then converted to sparse DFAs, as in the example
129 /// above.)
130 ///
131 /// ```
132 /// use regex_automata::dfa::regex::Regex;
133 ///
134 /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
135 /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
136 /// assert!(sparse_re.is_match(b"foo123"));
137 ///
138 /// # Ok::<(), Box<dyn std::error::Error>>(())
139 /// ```
140 ///
141 /// # Fallibility
142 ///
143 /// Most of the search routines defined on this type will _panic_ when the
144 /// underlying search fails. This might be because the DFA gave up because
145 /// it saw a quit byte, whether configured explicitly or via heuristic
146 /// Unicode word boundary support, although neither are enabled by default.
147 /// Or it might fail because an invalid `Input` configuration is given,
148 /// for example, with an unsupported [`Anchored`] mode.
149 ///
150 /// If you need to handle these error cases instead of allowing them to
151 /// trigger a panic, then the lower level [`Regex::try_search`] provides
152 /// a fallible API that never panics.
153 ///
154 /// # Example
155 ///
156 /// This example shows how to cause a search to terminate if it sees a
157 /// `\n` byte, and handle the error returned. This could be useful if, for
158 /// example, you wanted to prevent a user supplied pattern from matching
159 /// across a line boundary.
160 ///
161 /// ```
162 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
163 /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};
164 ///
165 /// let re = Regex::builder()
166 /// .dense(dfa::dense::Config::new().quit(b'\n', true))
167 /// .build(r"foo\p{any}+bar")?;
168 ///
169 /// let input = Input::new("foo\nbar");
170 /// // Normally this would produce a match, since \p{any} contains '\n'.
171 /// // But since we instructed the automaton to enter a quit state if a
172 /// // '\n' is observed, this produces a match error instead.
173 /// let expected = MatchError::quit(b'\n', 3);
174 /// let got = re.try_search(&input).unwrap_err();
175 /// assert_eq!(expected, got);
176 ///
177 /// # Ok::<(), Box<dyn std::error::Error>>(())
178 /// ```
179 #[derive(Clone, Debug)]
180);
181
182#[cfg(all(feature = "syntax", feature = "dfa-build"))]
183impl Regex {
184 /// Parse the given regular expression using the default configuration and
185 /// return the corresponding regex.
186 ///
187 /// If you want a non-default configuration, then use the [`Builder`] to
188 /// set your own configuration.
189 ///
190 /// # Example
191 ///
192 /// ```
193 /// use regex_automata::{Match, dfa::regex::Regex};
194 ///
195 /// let re = Regex::new("foo[0-9]+bar")?;
196 /// assert_eq!(
197 /// Some(Match::must(0, 3..14)),
198 /// re.find(b"zzzfoo12345barzzz"),
199 /// );
200 /// # Ok::<(), Box<dyn std::error::Error>>(())
201 /// ```
202 pub fn new(pattern: &str) -> Result<Regex, BuildError> {
203 Builder::new().build(pattern)
204 }
205
206 /// Like `new`, but parses multiple patterns into a single "regex set."
207 /// This similarly uses the default regex configuration.
208 ///
209 /// # Example
210 ///
211 /// ```
212 /// use regex_automata::{Match, dfa::regex::Regex};
213 ///
214 /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
215 ///
216 /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
217 /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
218 /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
219 /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
220 /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
221 /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
222 /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
223 /// assert_eq!(None, it.next());
224 /// # Ok::<(), Box<dyn std::error::Error>>(())
225 /// ```
226 pub fn new_many<P: AsRef<str>>(
227 patterns: &[P],
228 ) -> Result<Regex, BuildError> {
229 Builder::new().build_many(patterns)
230 }
231}
232
233#[cfg(all(feature = "syntax", feature = "dfa-build"))]
234impl Regex<sparse::DFA<Vec<u8>>> {
235 /// Parse the given regular expression using the default configuration,
236 /// except using sparse DFAs, and return the corresponding regex.
237 ///
238 /// If you want a non-default configuration, then use the [`Builder`] to
239 /// set your own configuration.
240 ///
241 /// # Example
242 ///
243 /// ```
244 /// use regex_automata::{Match, dfa::regex::Regex};
245 ///
246 /// let re = Regex::new_sparse("foo[0-9]+bar")?;
247 /// assert_eq!(
248 /// Some(Match::must(0, 3..14)),
249 /// re.find(b"zzzfoo12345barzzz"),
250 /// );
251 /// # Ok::<(), Box<dyn std::error::Error>>(())
252 /// ```
253 pub fn new_sparse(
254 pattern: &str,
255 ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
256 Builder::new().build_sparse(pattern)
257 }
258
259 /// Like `new`, but parses multiple patterns into a single "regex set"
260 /// using sparse DFAs. This otherwise similarly uses the default regex
261 /// configuration.
262 ///
263 /// # Example
264 ///
265 /// ```
266 /// use regex_automata::{Match, dfa::regex::Regex};
267 ///
268 /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
269 ///
270 /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
271 /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
272 /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
273 /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
274 /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
275 /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
276 /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
277 /// assert_eq!(None, it.next());
278 /// # Ok::<(), Box<dyn std::error::Error>>(())
279 /// ```
280 pub fn new_many_sparse<P: AsRef<str>>(
281 patterns: &[P],
282 ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
283 Builder::new().build_many_sparse(patterns)
284 }
285}
286
287/// Convenience routines for regex construction.
288impl Regex<dense::DFA<&'static [u32]>> {
289 /// Return a builder for configuring the construction of a `Regex`.
290 ///
291 /// This is a convenience routine to avoid needing to import the
292 /// [`Builder`] type in common cases.
293 ///
294 /// # Example
295 ///
296 /// This example shows how to use the builder to disable UTF-8 mode
297 /// everywhere.
298 ///
299 /// ```
300 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
301 /// use regex_automata::{
302 /// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
303 /// };
304 ///
305 /// let re = Regex::builder()
306 /// .syntax(syntax::Config::new().utf8(false))
307 /// .thompson(thompson::Config::new().utf8(false))
308 /// .build(r"foo(?-u:[^b])ar.*")?;
309 /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
310 /// let expected = Some(Match::must(0, 1..9));
311 /// let got = re.find(haystack);
312 /// assert_eq!(expected, got);
313 ///
314 /// # Ok::<(), Box<dyn std::error::Error>>(())
315 /// ```
316 pub fn builder() -> Builder {
317 Builder::new()
318 }
319}
320
321/// Standard search routines for finding and iterating over matches.
322impl<A: Automaton> Regex<A> {
323 /// Returns true if and only if this regex matches the given haystack.
324 ///
325 /// This routine may short circuit if it knows that scanning future input
326 /// will never lead to a different result. In particular, if the underlying
327 /// DFA enters a match state or a dead state, then this routine will return
328 /// `true` or `false`, respectively, without inspecting any future input.
329 ///
330 /// # Panics
331 ///
332 /// This routine panics if the search could not complete. This can occur
333 /// in a number of circumstances:
334 ///
335 /// * The configuration of the DFA may permit it to "quit" the search.
336 /// For example, setting quit bytes or enabling heuristic support for
337 /// Unicode word boundaries. The default configuration does not enable any
338 /// option that could result in the DFA quitting.
339 /// * When the provided `Input` configuration is not supported. For
340 /// example, by providing an unsupported anchor mode.
341 ///
342 /// When a search panics, callers cannot know whether a match exists or
343 /// not.
344 ///
345 /// Use [`Regex::try_search`] if you want to handle these error conditions.
346 ///
347 /// # Example
348 ///
349 /// ```
350 /// use regex_automata::dfa::regex::Regex;
351 ///
352 /// let re = Regex::new("foo[0-9]+bar")?;
353 /// assert_eq!(true, re.is_match("foo12345bar"));
354 /// assert_eq!(false, re.is_match("foobar"));
355 /// # Ok::<(), Box<dyn std::error::Error>>(())
356 /// ```
357 #[inline]
358 pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
359 // Not only can we do an "earliest" search, but we can avoid doing a
360 // reverse scan too.
361 let input = input.into().earliest(true);
362 self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap()
363 }
364
365 /// Returns the start and end offset of the leftmost match. If no match
366 /// exists, then `None` is returned.
367 ///
368 /// # Panics
369 ///
370 /// This routine panics if the search could not complete. This can occur
371 /// in a number of circumstances:
372 ///
373 /// * The configuration of the DFA may permit it to "quit" the search.
374 /// For example, setting quit bytes or enabling heuristic support for
375 /// Unicode word boundaries. The default configuration does not enable any
376 /// option that could result in the DFA quitting.
377 /// * When the provided `Input` configuration is not supported. For
378 /// example, by providing an unsupported anchor mode.
379 ///
380 /// When a search panics, callers cannot know whether a match exists or
381 /// not.
382 ///
383 /// Use [`Regex::try_search`] if you want to handle these error conditions.
384 ///
385 /// # Example
386 ///
387 /// ```
388 /// use regex_automata::{Match, dfa::regex::Regex};
389 ///
390 /// // Greediness is applied appropriately.
391 /// let re = Regex::new("foo[0-9]+")?;
392 /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz"));
393 ///
394 /// // Even though a match is found after reading the first byte (`a`),
395 /// // the default leftmost-first match semantics demand that we find the
396 /// // earliest match that prefers earlier parts of the pattern over latter
397 /// // parts.
398 /// let re = Regex::new("abc|a")?;
399 /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));
400 /// # Ok::<(), Box<dyn std::error::Error>>(())
401 /// ```
402 #[inline]
403 pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
404 self.try_search(&input.into()).unwrap()
405 }
406
407 /// Returns an iterator over all non-overlapping leftmost matches in the
408 /// given bytes. If no match exists, then the iterator yields no elements.
409 ///
410 /// This corresponds to the "standard" regex search iterator.
411 ///
412 /// # Panics
413 ///
414 /// If the search returns an error during iteration, then iteration
415 /// panics. See [`Regex::find`] for the panic conditions.
416 ///
417 /// Use [`Regex::try_search`] with
418 /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to
419 /// handle these error conditions.
420 ///
421 /// # Example
422 ///
423 /// ```
424 /// use regex_automata::{Match, dfa::regex::Regex};
425 ///
426 /// let re = Regex::new("foo[0-9]+")?;
427 /// let text = "foo1 foo12 foo123";
428 /// let matches: Vec<Match> = re.find_iter(text).collect();
429 /// assert_eq!(matches, vec![
430 /// Match::must(0, 0..4),
431 /// Match::must(0, 5..10),
432 /// Match::must(0, 11..17),
433 /// ]);
434 /// # Ok::<(), Box<dyn std::error::Error>>(())
435 /// ```
436 #[inline]
437 pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
438 &'r self,
439 input: I,
440 ) -> FindMatches<'r, 'h, A> {
441 let it = iter::Searcher::new(input.into());
442 FindMatches { re: self, it }
443 }
444}
445
446/// Lower level fallible search routines that permit controlling where the
447/// search starts and ends in a particular sequence.
448impl<A: Automaton> Regex<A> {
449 /// Returns the start and end offset of the leftmost match. If no match
450 /// exists, then `None` is returned.
451 ///
452 /// This is like [`Regex::find`] but with two differences:
453 ///
454 /// 1. It is not generic over `Into<Input>` and instead accepts a
455 /// `&Input`. This permits reusing the same `Input` for multiple searches
456 /// without needing to create a new one. This _may_ help with latency.
457 /// 2. It returns an error if the search could not complete where as
458 /// [`Regex::find`] will panic.
459 ///
460 /// # Errors
461 ///
462 /// This routine errors if the search could not complete. This can occur
463 /// in the following circumstances:
464 ///
465 /// * The configuration of the DFA may permit it to "quit" the search.
466 /// For example, setting quit bytes or enabling heuristic support for
467 /// Unicode word boundaries. The default configuration does not enable any
468 /// option that could result in the DFA quitting.
469 /// * When the provided `Input` configuration is not supported. For
470 /// example, by providing an unsupported anchor mode.
471 ///
472 /// When a search returns an error, callers cannot know whether a match
473 /// exists or not.
474 #[inline]
475 pub fn try_search(
476 &self,
477 input: &Input<'_>,
478 ) -> Result<Option<Match>, MatchError> {
479 let (fwd, rev) = (self.forward(), self.reverse());
480 let end = match fwd.try_search_fwd(input)? {
481 None => return Ok(None),
482 Some(end) => end,
483 };
484 // This special cases an empty match at the beginning of the search. If
485 // our end matches our start, then since a reverse DFA can't match past
486 // the start, it must follow that our starting position is also our end
487 // position. So short circuit and skip the reverse search.
488 if input.start() == end.offset() {
489 return Ok(Some(Match::new(
490 end.pattern(),
491 end.offset()..end.offset(),
492 )));
493 }
494 // We can also skip the reverse search if we know our search was
495 // anchored. This occurs either when the input config is anchored or
496 // when we know the regex itself is anchored. In this case, we know the
497 // start of the match, if one is found, must be the start of the
498 // search.
499 if self.is_anchored(input) {
500 return Ok(Some(Match::new(
501 end.pattern(),
502 input.start()..end.offset(),
503 )));
504 }
505 // N.B. I have tentatively convinced myself that it isn't necessary
506 // to specify the specific pattern for the reverse search since the
507 // reverse search will always find the same pattern to match as the
508 // forward search. But I lack a rigorous proof. Why not just provide
509 // the pattern anyway? Well, if it is needed, then leaving it out
510 // gives us a chance to find a witness. (Also, if we don't need to
511 // specify the pattern, then we don't need to build the reverse DFA
512 // with 'starts_for_each_pattern' enabled.)
513 //
514 // We also need to be careful to disable 'earliest' for the reverse
515 // search, since it could be enabled for the forward search. In the
516 // reverse case, to satisfy "leftmost" criteria, we need to match
517 // as much as we can. We also need to be careful to make the search
518 // anchored. We don't want the reverse search to report any matches
519 // other than the one beginning at the end of our forward search.
520 let revsearch = input
521 .clone()
522 .span(input.start()..end.offset())
523 .anchored(Anchored::Yes)
524 .earliest(false);
525 let start = rev
526 .try_search_rev(&revsearch)?
527 .expect("reverse search must match if forward search does");
528 assert_eq!(
529 start.pattern(),
530 end.pattern(),
531 "forward and reverse search must match same pattern",
532 );
533 assert!(start.offset() <= end.offset());
534 Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
535 }
536
537 /// Returns true if either the given input specifies an anchored search
538 /// or if the underlying DFA is always anchored.
539 fn is_anchored(&self, input: &Input<'_>) -> bool {
540 match input.get_anchored() {
541 Anchored::No => self.forward().is_always_start_anchored(),
542 Anchored::Yes | Anchored::Pattern(_) => true,
543 }
544 }
545}
546
547/// Non-search APIs for querying information about the regex and setting a
548/// prefilter.
549impl<A: Automaton> Regex<A> {
550 /// Return the underlying DFA responsible for forward matching.
551 ///
552 /// This is useful for accessing the underlying DFA and converting it to
553 /// some other format or size. See the [`Builder::build_from_dfas`] docs
554 /// for an example of where this might be useful.
555 pub fn forward(&self) -> &A {
556 &self.forward
557 }
558
559 /// Return the underlying DFA responsible for reverse matching.
560 ///
561 /// This is useful for accessing the underlying DFA and converting it to
562 /// some other format or size. See the [`Builder::build_from_dfas`] docs
563 /// for an example of where this might be useful.
564 pub fn reverse(&self) -> &A {
565 &self.reverse
566 }
567
568 /// Returns the total number of patterns matched by this regex.
569 ///
570 /// # Example
571 ///
572 /// ```
573 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
574 /// use regex_automata::dfa::regex::Regex;
575 ///
576 /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
577 /// assert_eq!(3, re.pattern_len());
578 /// # Ok::<(), Box<dyn std::error::Error>>(())
579 /// ```
580 pub fn pattern_len(&self) -> usize {
581 assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
582 self.forward().pattern_len()
583 }
584}
585
586/// An iterator over all non-overlapping matches for an infallible search.
587///
588/// The iterator yields a [`Match`] value until no more matches could be found.
589/// If the underlying regex engine returns an error, then a panic occurs.
590///
591/// The type parameters are as follows:
592///
593/// * `A` represents the type of the underlying DFA that implements the
594/// [`Automaton`] trait.
595///
596/// The lifetime parameters are as follows:
597///
598/// * `'h` represents the lifetime of the haystack being searched.
599/// * `'r` represents the lifetime of the regex object itself.
600///
601/// This iterator can be created with the [`Regex::find_iter`] method.
602#[derive(Debug)]
603pub struct FindMatches<'r, 'h, A> {
604 re: &'r Regex<A>,
605 it: iter::Searcher<'h>,
606}
607
608impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {
609 type Item = Match;
610
611 #[inline]
612 fn next(&mut self) -> Option<Match> {
613 let FindMatches { re, ref mut it } = *self;
614 it.advance(|input| re.try_search(input))
615 }
616}
617
618/// A builder for a regex based on deterministic finite automatons.
619///
620/// This builder permits configuring options for the syntax of a pattern, the
621/// NFA construction, the DFA construction and finally the regex searching
622/// itself. This builder is different from a general purpose regex builder in
623/// that it permits fine grain configuration of the construction process. The
624/// trade off for this is complexity, and the possibility of setting a
625/// configuration that might not make sense. For example, there are two
626/// different UTF-8 modes:
627///
628/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
629/// whether the pattern itself can contain sub-expressions that match invalid
630/// UTF-8.
631/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls
632/// how the regex iterators themselves advance the starting position of the
633/// next search when a match with zero length is found.
634///
635/// Generally speaking, callers will want to either enable all of these or
636/// disable all of these.
637///
638/// Internally, building a regex requires building two DFAs, where one is
639/// responsible for finding the end of a match and the other is responsible
640/// for finding the start of a match. If you only need to detect whether
641/// something matched, or only the end of a match, then you should use a
642/// [`dense::Builder`] to construct a single DFA, which is cheaper than
643/// building two DFAs.
644///
645/// # Build methods
646///
647/// This builder has a few "build" methods. In general, it's the result of
648/// combining the following parameters:
649///
650/// * Building one or many regexes.
651/// * Building a regex with dense or sparse DFAs.
652///
653/// The simplest "build" method is [`Builder::build`]. It accepts a single
654/// pattern and builds a dense DFA using `usize` for the state identifier
655/// representation.
656///
657/// The most general "build" method is [`Builder::build_many`], which permits
658/// building a regex that searches for multiple patterns simultaneously while
659/// using a specific state identifier representation.
660///
661/// The most flexible "build" method, but hardest to use, is
662/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
663/// just a pair of DFAs, and this method allows you to specify those DFAs
664/// exactly.
665///
666/// # Example
667///
668/// This example shows how to disable UTF-8 mode in the syntax and the regex
669/// itself. This is generally what you want for matching on arbitrary bytes.
670///
671/// ```
672/// # if cfg!(miri) { return Ok(()); } // miri takes too long
673/// use regex_automata::{
674/// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
675/// };
676///
677/// let re = Regex::builder()
678/// .syntax(syntax::Config::new().utf8(false))
679/// .thompson(thompson::Config::new().utf8(false))
680/// .build(r"foo(?-u:[^b])ar.*")?;
681/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
682/// let expected = Some(Match::must(0, 1..9));
683/// let got = re.find(haystack);
684/// assert_eq!(expected, got);
685/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
686/// // but the subsequent `.*` does not! Disabling UTF-8
687/// // on the syntax permits this.
688/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
689///
690/// # Ok::<(), Box<dyn std::error::Error>>(())
691/// ```
692#[derive(Clone, Debug)]
693pub struct Builder {
694 #[cfg(feature = "dfa-build")]
695 dfa: dense::Builder,
696}
697
698impl Builder {
699 /// Create a new regex builder with the default configuration.
700 pub fn new() -> Builder {
701 Builder {
702 #[cfg(feature = "dfa-build")]
703 dfa: dense::Builder::new(),
704 }
705 }
706
707 /// Build a regex from the given pattern.
708 ///
709 /// If there was a problem parsing or compiling the pattern, then an error
710 /// is returned.
711 #[cfg(all(feature = "syntax", feature = "dfa-build"))]
712 pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
713 self.build_many(&[pattern])
714 }
715
716 /// Build a regex from the given pattern using sparse DFAs.
717 ///
718 /// If there was a problem parsing or compiling the pattern, then an error
719 /// is returned.
720 #[cfg(all(feature = "syntax", feature = "dfa-build"))]
721 pub fn build_sparse(
722 &self,
723 pattern: &str,
724 ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
725 self.build_many_sparse(&[pattern])
726 }
727
728 /// Build a regex from the given patterns.
729 #[cfg(all(feature = "syntax", feature = "dfa-build"))]
730 pub fn build_many<P: AsRef<str>>(
731 &self,
732 patterns: &[P],
733 ) -> Result<Regex, BuildError> {
734 let forward = self.dfa.build_many(patterns)?;
735 let reverse = self
736 .dfa
737 .clone()
738 .configure(
739 dense::Config::new()
740 .prefilter(None)
741 .specialize_start_states(false)
742 .start_kind(StartKind::Anchored)
743 .match_kind(MatchKind::All),
744 )
745 .thompson(crate::nfa::thompson::Config::new().reverse(true))
746 .build_many(patterns)?;
747 Ok(self.build_from_dfas(forward, reverse))
748 }
749
750 /// Build a sparse regex from the given patterns.
751 #[cfg(all(feature = "syntax", feature = "dfa-build"))]
752 pub fn build_many_sparse<P: AsRef<str>>(
753 &self,
754 patterns: &[P],
755 ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
756 let re = self.build_many(patterns)?;
757 let forward = re.forward().to_sparse()?;
758 let reverse = re.reverse().to_sparse()?;
759 Ok(self.build_from_dfas(forward, reverse))
760 }
761
762 /// Build a regex from its component forward and reverse DFAs.
763 ///
764 /// This is useful when deserializing a regex from some arbitrary
765 /// memory region. This is also useful for building regexes from other
766 /// types of DFAs.
767 ///
768 /// If you're building the DFAs from scratch instead of building new DFAs
769 /// from other DFAs, then you'll need to make sure that the reverse DFA is
770 /// configured correctly to match the intended semantics. Namely:
771 ///
772 /// * It should be anchored.
773 /// * It should use [`MatchKind::All`] semantics.
774 /// * It should match in reverse.
775 /// * Otherwise, its configuration should match the forward DFA.
776 ///
777 /// If these conditions aren't satisfied, then the behavior of searches is
778 /// unspecified.
779 ///
780 /// Note that when using this constructor, no configuration is applied.
781 /// Since this routine provides the DFAs to the builder, there is no
782 /// opportunity to apply other configuration options.
783 ///
784 /// # Example
785 ///
786 /// This example is a bit a contrived. The usual use of these methods
787 /// would involve serializing `initial_re` somewhere and then deserializing
788 /// it later to build a regex. But in this case, we do everything in
789 /// memory.
790 ///
791 /// ```
792 /// use regex_automata::dfa::regex::Regex;
793 ///
794 /// let initial_re = Regex::new("foo[0-9]+")?;
795 /// assert_eq!(true, initial_re.is_match(b"foo123"));
796 ///
797 /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
798 /// let re = Regex::builder().build_from_dfas(fwd, rev);
799 /// assert_eq!(true, re.is_match(b"foo123"));
800 /// # Ok::<(), Box<dyn std::error::Error>>(())
801 /// ```
802 ///
803 /// This example shows how to build a `Regex` that uses sparse DFAs instead
804 /// of dense DFAs without using one of the convenience `build_sparse`
805 /// routines:
806 ///
807 /// ```
808 /// use regex_automata::dfa::regex::Regex;
809 ///
810 /// let initial_re = Regex::new("foo[0-9]+")?;
811 /// assert_eq!(true, initial_re.is_match(b"foo123"));
812 ///
813 /// let fwd = initial_re.forward().to_sparse()?;
814 /// let rev = initial_re.reverse().to_sparse()?;
815 /// let re = Regex::builder().build_from_dfas(fwd, rev);
816 /// assert_eq!(true, re.is_match(b"foo123"));
817 /// # Ok::<(), Box<dyn std::error::Error>>(())
818 /// ```
819 pub fn build_from_dfas<A: Automaton>(
820 &self,
821 forward: A,
822 reverse: A,
823 ) -> Regex<A> {
824 Regex { forward, reverse }
825 }
826
827 /// Set the syntax configuration for this builder using
828 /// [`syntax::Config`](crate::util::syntax::Config).
829 ///
830 /// This permits setting things like case insensitivity, Unicode and multi
831 /// line mode.
832 #[cfg(all(feature = "syntax", feature = "dfa-build"))]
833 pub fn syntax(
834 &mut self,
835 config: crate::util::syntax::Config,
836 ) -> &mut Builder {
837 self.dfa.syntax(config);
838 self
839 }
840
841 /// Set the Thompson NFA configuration for this builder using
842 /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
843 ///
844 /// This permits setting things like whether additional time should be
845 /// spent shrinking the size of the NFA.
846 #[cfg(all(feature = "syntax", feature = "dfa-build"))]
847 pub fn thompson(
848 &mut self,
849 config: crate::nfa::thompson::Config,
850 ) -> &mut Builder {
851 self.dfa.thompson(config);
852 self
853 }
854
855 /// Set the dense DFA compilation configuration for this builder using
856 /// [`dense::Config`].
857 ///
858 /// This permits setting things like whether the underlying DFAs should
859 /// be minimized.
860 #[cfg(feature = "dfa-build")]
861 pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
862 self.dfa.configure(config);
863 self
864 }
865}
866
867impl Default for Builder {
868 fn default() -> Builder {
869 Builder::new()
870 }
871}
872