1/*!
2A lazy DFA backed `Regex`.
3
4This module provides a [`Regex`] backed by a lazy DFA. A `Regex` implements
5convenience routines you might have come to expect, such as finding a match
6and iterating over all non-overlapping matches. This `Regex` type is limited
7in its capabilities to what a lazy DFA can provide. Therefore, APIs involving
8capturing groups, for example, are not provided.
9
10Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
11finds the end offset of a match, where as the other is a "reverse" DFA that
12find the start offset of a match.
13
14See the [parent module](crate::hybrid) for examples.
15*/
16
17use crate::{
18 hybrid::{
19 dfa::{self, DFA},
20 error::BuildError,
21 },
22 nfa::thompson,
23 util::{
24 iter,
25 search::{Anchored, Input, Match, MatchError, MatchKind},
26 },
27};
28
29/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs")
30/// for searching.
31///
32/// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a
33/// "reverse" DFA. The forward DFA is responsible for detecting the end of
34/// a match while the reverse DFA is responsible for detecting the start
35/// of a match. Thus, in order to find the bounds of any given match, a
36/// forward search must first be run followed by a reverse search. A match
37/// found by the forward DFA guarantees that the reverse DFA will also find
38/// a match.
39///
40/// # Fallibility
41///
42/// Most of the search routines defined on this type will _panic_ when the
43/// underlying search fails. This might be because the DFA gave up because it
44/// saw a quit byte, whether configured explicitly or via heuristic Unicode
45/// word boundary support, although neither are enabled by default. It might
46/// also fail if the underlying DFA determines it isn't making effective use of
47/// the cache (which also never happens by default). Or it might fail because
48/// an invalid `Input` configuration is given, for example, with an unsupported
49/// [`Anchored`] mode.
50///
51/// If you need to handle these error cases instead of allowing them to trigger
52/// a panic, then the lower level [`Regex::try_search`] provides a fallible API
53/// that never panics.
54///
55/// # Example
56///
57/// This example shows how to cause a search to terminate if it sees a
58/// `\n` byte, and handle the error returned. This could be useful if, for
59/// example, you wanted to prevent a user supplied pattern from matching
60/// across a line boundary.
61///
62/// ```
63/// # if cfg!(miri) { return Ok(()); } // miri takes too long
64/// use regex_automata::{hybrid::{dfa, regex::Regex}, Input, MatchError};
65///
66/// let re = Regex::builder()
67/// .dfa(dfa::Config::new().quit(b'\n', true))
68/// .build(r"foo\p{any}+bar")?;
69/// let mut cache = re.create_cache();
70///
71/// let input = Input::new("foo\nbar");
72/// // Normally this would produce a match, since \p{any} contains '\n'.
73/// // But since we instructed the automaton to enter a quit state if a
74/// // '\n' is observed, this produces a match error instead.
75/// let expected = MatchError::quit(b'\n', 3);
76/// let got = re.try_search(&mut cache, &input).unwrap_err();
77/// assert_eq!(expected, got);
78///
79/// # Ok::<(), Box<dyn std::error::Error>>(())
80/// ```
81#[derive(Debug)]
82pub struct Regex {
83 /// The forward lazy DFA. This can only find the end of a match.
84 forward: DFA,
85 /// The reverse lazy DFA. This can only find the start of a match.
86 ///
87 /// This is built with 'all' match semantics (instead of leftmost-first)
88 /// so that it always finds the longest possible match (which corresponds
89 /// to the leftmost starting position). It is also compiled as an anchored
90 /// matcher and has 'starts_for_each_pattern' enabled. Including starting
91 /// states for each pattern is necessary to ensure that we only look for
92 /// matches of a pattern that matched in the forward direction. Otherwise,
93 /// we might wind up finding the "leftmost" starting position of a totally
94 /// different pattern!
95 reverse: DFA,
96}
97
98/// Convenience routines for regex and cache construction.
99impl Regex {
100 /// Parse the given regular expression using the default configuration and
101 /// return the corresponding regex.
102 ///
103 /// If you want a non-default configuration, then use the [`Builder`] to
104 /// set your own configuration.
105 ///
106 /// # Example
107 ///
108 /// ```
109 /// use regex_automata::{hybrid::regex::Regex, Match};
110 ///
111 /// let re = Regex::new("foo[0-9]+bar")?;
112 /// let mut cache = re.create_cache();
113 /// assert_eq!(
114 /// Some(Match::must(0, 3..14)),
115 /// re.find(&mut cache, "zzzfoo12345barzzz"),
116 /// );
117 /// # Ok::<(), Box<dyn std::error::Error>>(())
118 /// ```
119 #[cfg(feature = "syntax")]
120 pub fn new(pattern: &str) -> Result<Regex, BuildError> {
121 Regex::builder().build(pattern)
122 }
123
124 /// Like `new`, but parses multiple patterns into a single "multi regex."
125 /// This similarly uses the default regex configuration.
126 ///
127 /// # Example
128 ///
129 /// ```
130 /// use regex_automata::{hybrid::regex::Regex, Match};
131 ///
132 /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
133 /// let mut cache = re.create_cache();
134 ///
135 /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");
136 /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
137 /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
138 /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
139 /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
140 /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
141 /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
142 /// assert_eq!(None, it.next());
143 /// # Ok::<(), Box<dyn std::error::Error>>(())
144 /// ```
145 #[cfg(feature = "syntax")]
146 pub fn new_many<P: AsRef<str>>(
147 patterns: &[P],
148 ) -> Result<Regex, BuildError> {
149 Regex::builder().build_many(patterns)
150 }
151
152 /// Return a builder for configuring the construction of a `Regex`.
153 ///
154 /// This is a convenience routine to avoid needing to import the
155 /// [`Builder`] type in common cases.
156 ///
157 /// # Example
158 ///
159 /// This example shows how to use the builder to disable UTF-8 mode
160 /// everywhere.
161 ///
162 /// ```
163 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
164 /// use regex_automata::{
165 /// hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
166 /// };
167 ///
168 /// let re = Regex::builder()
169 /// .syntax(syntax::Config::new().utf8(false))
170 /// .thompson(thompson::Config::new().utf8(false))
171 /// .build(r"foo(?-u:[^b])ar.*")?;
172 /// let mut cache = re.create_cache();
173 ///
174 /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
175 /// let expected = Some(Match::must(0, 1..9));
176 /// let got = re.find(&mut cache, haystack);
177 /// assert_eq!(expected, got);
178 ///
179 /// # Ok::<(), Box<dyn std::error::Error>>(())
180 /// ```
181 pub fn builder() -> Builder {
182 Builder::new()
183 }
184
185 /// Create a new cache for this `Regex`.
186 ///
187 /// The cache returned should only be used for searches for this
188 /// `Regex`. If you want to reuse the cache for another `Regex`, then
189 /// you must call [`Cache::reset`] with that `Regex` (or, equivalently,
190 /// [`Regex::reset_cache`]).
191 pub fn create_cache(&self) -> Cache {
192 Cache::new(self)
193 }
194
195 /// Reset the given cache such that it can be used for searching with the
196 /// this `Regex` (and only this `Regex`).
197 ///
198 /// A cache reset permits reusing memory already allocated in this cache
199 /// with a different `Regex`.
200 ///
201 /// Resetting a cache sets its "clear count" to 0. This is relevant if the
202 /// `Regex` has been configured to "give up" after it has cleared the cache
203 /// a certain number of times.
204 ///
205 /// # Example
206 ///
207 /// This shows how to re-purpose a cache for use with a different `Regex`.
208 ///
209 /// ```
210 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
211 /// use regex_automata::{hybrid::regex::Regex, Match};
212 ///
213 /// let re1 = Regex::new(r"\w")?;
214 /// let re2 = Regex::new(r"\W")?;
215 ///
216 /// let mut cache = re1.create_cache();
217 /// assert_eq!(
218 /// Some(Match::must(0, 0..2)),
219 /// re1.find(&mut cache, "Δ"),
220 /// );
221 ///
222 /// // Using 'cache' with re2 is not allowed. It may result in panics or
223 /// // incorrect results. In order to re-purpose the cache, we must reset
224 /// // it with the Regex we'd like to use it with.
225 /// //
226 /// // Similarly, after this reset, using the cache with 're1' is also not
227 /// // allowed.
228 /// re2.reset_cache(&mut cache);
229 /// assert_eq!(
230 /// Some(Match::must(0, 0..3)),
231 /// re2.find(&mut cache, "☃"),
232 /// );
233 ///
234 /// # Ok::<(), Box<dyn std::error::Error>>(())
235 /// ```
236 pub fn reset_cache(&self, cache: &mut Cache) {
237 self.forward().reset_cache(&mut cache.forward);
238 self.reverse().reset_cache(&mut cache.reverse);
239 }
240}
241
242/// Standard infallible search routines for finding and iterating over matches.
243impl Regex {
244 /// Returns true if and only if this regex matches the given haystack.
245 ///
246 /// This routine may short circuit if it knows that scanning future input
247 /// will never lead to a different result. In particular, if the underlying
248 /// DFA enters a match state or a dead state, then this routine will return
249 /// `true` or `false`, respectively, without inspecting any future input.
250 ///
251 /// # Panics
252 ///
253 /// This routine panics if the search could not complete. This can occur
254 /// in a number of circumstances:
255 ///
256 /// * The configuration of the lazy DFA may permit it to "quit" the search.
257 /// For example, setting quit bytes or enabling heuristic support for
258 /// Unicode word boundaries. The default configuration does not enable any
259 /// option that could result in the lazy DFA quitting.
260 /// * The configuration of the lazy DFA may also permit it to "give up"
261 /// on a search if it makes ineffective use of its transition table
262 /// cache. The default configuration does not enable this by default,
263 /// although it is typically a good idea to.
264 /// * When the provided `Input` configuration is not supported. For
265 /// example, by providing an unsupported anchor mode.
266 ///
267 /// When a search panics, callers cannot know whether a match exists or
268 /// not.
269 ///
270 /// Use [`Regex::try_search`] if you want to handle these error conditions.
271 ///
272 /// # Example
273 ///
274 /// ```
275 /// use regex_automata::hybrid::regex::Regex;
276 ///
277 /// let re = Regex::new("foo[0-9]+bar")?;
278 /// let mut cache = re.create_cache();
279 ///
280 /// assert!(re.is_match(&mut cache, "foo12345bar"));
281 /// assert!(!re.is_match(&mut cache, "foobar"));
282 /// # Ok::<(), Box<dyn std::error::Error>>(())
283 /// ```
284 #[inline]
285 pub fn is_match<'h, I: Into<Input<'h>>>(
286 &self,
287 cache: &mut Cache,
288 input: I,
289 ) -> bool {
290 // Not only can we do an "earliest" search, but we can avoid doing a
291 // reverse scan too.
292 self.forward()
293 .try_search_fwd(&mut cache.forward, &input.into().earliest(true))
294 .unwrap()
295 .is_some()
296 }
297
298 /// Returns the start and end offset of the leftmost match. If no match
299 /// exists, then `None` is returned.
300 ///
301 /// # Panics
302 ///
303 /// This routine panics if the search could not complete. This can occur
304 /// in a number of circumstances:
305 ///
306 /// * The configuration of the lazy DFA may permit it to "quit" the search.
307 /// For example, setting quit bytes or enabling heuristic support for
308 /// Unicode word boundaries. The default configuration does not enable any
309 /// option that could result in the lazy DFA quitting.
310 /// * The configuration of the lazy DFA may also permit it to "give up"
311 /// on a search if it makes ineffective use of its transition table
312 /// cache. The default configuration does not enable this by default,
313 /// although it is typically a good idea to.
314 /// * When the provided `Input` configuration is not supported. For
315 /// example, by providing an unsupported anchor mode.
316 ///
317 /// When a search panics, callers cannot know whether a match exists or
318 /// not.
319 ///
320 /// Use [`Regex::try_search`] if you want to handle these error conditions.
321 ///
322 /// # Example
323 ///
324 /// ```
325 /// use regex_automata::{Match, hybrid::regex::Regex};
326 ///
327 /// let re = Regex::new("foo[0-9]+")?;
328 /// let mut cache = re.create_cache();
329 /// assert_eq!(
330 /// Some(Match::must(0, 3..11)),
331 /// re.find(&mut cache, "zzzfoo12345zzz"),
332 /// );
333 ///
334 /// // Even though a match is found after reading the first byte (`a`),
335 /// // the default leftmost-first match semantics demand that we find the
336 /// // earliest match that prefers earlier parts of the pattern over latter
337 /// // parts.
338 /// let re = Regex::new("abc|a")?;
339 /// let mut cache = re.create_cache();
340 /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc"));
341 /// # Ok::<(), Box<dyn std::error::Error>>(())
342 /// ```
343 #[inline]
344 pub fn find<'h, I: Into<Input<'h>>>(
345 &self,
346 cache: &mut Cache,
347 input: I,
348 ) -> Option<Match> {
349 self.try_search(cache, &input.into()).unwrap()
350 }
351
352 /// Returns an iterator over all non-overlapping leftmost matches in the
353 /// given bytes. If no match exists, then the iterator yields no elements.
354 ///
355 /// # Panics
356 ///
357 /// This routine panics if the search could not complete. This can occur
358 /// in a number of circumstances:
359 ///
360 /// * The configuration of the lazy DFA may permit it to "quit" the search.
361 /// For example, setting quit bytes or enabling heuristic support for
362 /// Unicode word boundaries. The default configuration does not enable any
363 /// option that could result in the lazy DFA quitting.
364 /// * The configuration of the lazy DFA may also permit it to "give up"
365 /// on a search if it makes ineffective use of its transition table
366 /// cache. The default configuration does not enable this by default,
367 /// although it is typically a good idea to.
368 /// * When the provided `Input` configuration is not supported. For
369 /// example, by providing an unsupported anchor mode.
370 ///
371 /// When a search panics, callers cannot know whether a match exists or
372 /// not.
373 ///
374 /// The above conditions also apply to the iterator returned as well. For
375 /// example, if the lazy DFA gives up or quits during a search using this
376 /// method, then a panic will occur during iteration.
377 ///
378 /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher)
379 /// if you want to handle these error conditions.
380 ///
381 /// # Example
382 ///
383 /// ```
384 /// use regex_automata::{hybrid::regex::Regex, Match};
385 ///
386 /// let re = Regex::new("foo[0-9]+")?;
387 /// let mut cache = re.create_cache();
388 ///
389 /// let text = "foo1 foo12 foo123";
390 /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
391 /// assert_eq!(matches, vec![
392 /// Match::must(0, 0..4),
393 /// Match::must(0, 5..10),
394 /// Match::must(0, 11..17),
395 /// ]);
396 /// # Ok::<(), Box<dyn std::error::Error>>(())
397 /// ```
398 #[inline]
399 pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
400 &'r self,
401 cache: &'c mut Cache,
402 input: I,
403 ) -> FindMatches<'r, 'c, 'h> {
404 let it = iter::Searcher::new(input.into());
405 FindMatches { re: self, cache, it }
406 }
407}
408
409/// Lower level "search" primitives that accept a `&Input` for cheap reuse
410/// and return an error if one occurs instead of panicking.
411impl Regex {
412 /// Returns the start and end offset of the leftmost match. If no match
413 /// exists, then `None` is returned.
414 ///
415 /// This is like [`Regex::find`] but with two differences:
416 ///
417 /// 1. It is not generic over `Into<Input>` and instead accepts a
418 /// `&Input`. This permits reusing the same `Input` for multiple searches
419 /// without needing to create a new one. This _may_ help with latency.
420 /// 2. It returns an error if the search could not complete where as
421 /// [`Regex::find`] will panic.
422 ///
423 /// # Errors
424 ///
425 /// This routine errors if the search could not complete. This can occur
426 /// in a number of circumstances:
427 ///
428 /// * The configuration of the lazy DFA may permit it to "quit" the search.
429 /// For example, setting quit bytes or enabling heuristic support for
430 /// Unicode word boundaries. The default configuration does not enable any
431 /// option that could result in the lazy DFA quitting.
432 /// * The configuration of the lazy DFA may also permit it to "give up"
433 /// on a search if it makes ineffective use of its transition table
434 /// cache. The default configuration does not enable this by default,
435 /// although it is typically a good idea to.
436 /// * When the provided `Input` configuration is not supported. For
437 /// example, by providing an unsupported anchor mode.
438 ///
439 /// When a search returns an error, callers cannot know whether a match
440 /// exists or not.
441 #[inline]
442 pub fn try_search(
443 &self,
444 cache: &mut Cache,
445 input: &Input<'_>,
446 ) -> Result<Option<Match>, MatchError> {
447 let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
448 let end = match self.forward().try_search_fwd(fcache, input)? {
449 None => return Ok(None),
450 Some(end) => end,
451 };
452 // This special cases an empty match at the beginning of the search. If
453 // our end matches our start, then since a reverse DFA can't match past
454 // the start, it must follow that our starting position is also our end
455 // position. So short circuit and skip the reverse search.
456 if input.start() == end.offset() {
457 return Ok(Some(Match::new(
458 end.pattern(),
459 end.offset()..end.offset(),
460 )));
461 }
462 // We can also skip the reverse search if we know our search was
463 // anchored. This occurs either when the input config is anchored or
464 // when we know the regex itself is anchored. In this case, we know the
465 // start of the match, if one is found, must be the start of the
466 // search.
467 if self.is_anchored(input) {
468 return Ok(Some(Match::new(
469 end.pattern(),
470 input.start()..end.offset(),
471 )));
472 }
473 // N.B. I have tentatively convinced myself that it isn't necessary
474 // to specify the specific pattern for the reverse search since the
475 // reverse search will always find the same pattern to match as the
476 // forward search. But I lack a rigorous proof. Why not just provide
477 // the pattern anyway? Well, if it is needed, then leaving it out
478 // gives us a chance to find a witness. (Also, if we don't need to
479 // specify the pattern, then we don't need to build the reverse DFA
480 // with 'starts_for_each_pattern' enabled. It doesn't matter too much
481 // for the lazy DFA, but does make the overall DFA bigger.)
482 //
483 // We also need to be careful to disable 'earliest' for the reverse
484 // search, since it could be enabled for the forward search. In the
485 // reverse case, to satisfy "leftmost" criteria, we need to match as
486 // much as we can. We also need to be careful to make the search
487 // anchored. We don't want the reverse search to report any matches
488 // other than the one beginning at the end of our forward search.
489 let revsearch = input
490 .clone()
491 .span(input.start()..end.offset())
492 .anchored(Anchored::Yes)
493 .earliest(false);
494 let start = self
495 .reverse()
496 .try_search_rev(rcache, &revsearch)?
497 .expect("reverse search must match if forward search does");
498 debug_assert_eq!(
499 start.pattern(),
500 end.pattern(),
501 "forward and reverse search must match same pattern",
502 );
503 debug_assert!(start.offset() <= end.offset());
504 Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
505 }
506
507 /// Returns true if either the given input specifies an anchored search
508 /// or if the underlying NFA is always anchored.
509 fn is_anchored(&self, input: &Input<'_>) -> bool {
510 match input.get_anchored() {
511 Anchored::No => {
512 self.forward().get_nfa().is_always_start_anchored()
513 }
514 Anchored::Yes | Anchored::Pattern(_) => true,
515 }
516 }
517}
518
519/// Non-search APIs for querying information about the regex and setting a
520/// prefilter.
521impl Regex {
522 /// Return the underlying lazy DFA responsible for forward matching.
523 ///
524 /// This is useful for accessing the underlying lazy DFA and using it
525 /// directly if the situation calls for it.
526 pub fn forward(&self) -> &DFA {
527 &self.forward
528 }
529
530 /// Return the underlying lazy DFA responsible for reverse matching.
531 ///
532 /// This is useful for accessing the underlying lazy DFA and using it
533 /// directly if the situation calls for it.
534 pub fn reverse(&self) -> &DFA {
535 &self.reverse
536 }
537
538 /// Returns the total number of patterns matched by this regex.
539 ///
540 /// # Example
541 ///
542 /// ```
543 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
544 /// use regex_automata::hybrid::regex::Regex;
545 ///
546 /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
547 /// assert_eq!(3, re.pattern_len());
548 /// # Ok::<(), Box<dyn std::error::Error>>(())
549 /// ```
550 pub fn pattern_len(&self) -> usize {
551 assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
552 self.forward().pattern_len()
553 }
554}
555
556/// An iterator over all non-overlapping matches for an infallible search.
557///
558/// The iterator yields a [`Match`] value until no more matches could be found.
559/// If the underlying regex engine returns an error, then a panic occurs.
560///
561/// The lifetime parameters are as follows:
562///
563/// * `'r` represents the lifetime of the regex object.
564/// * `'h` represents the lifetime of the haystack being searched.
565/// * `'c` represents the lifetime of the regex cache.
566///
567/// This iterator can be created with the [`Regex::find_iter`] method.
568#[derive(Debug)]
569pub struct FindMatches<'r, 'c, 'h> {
570 re: &'r Regex,
571 cache: &'c mut Cache,
572 it: iter::Searcher<'h>,
573}
574
575impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
576 type Item = Match;
577
578 #[inline]
579 fn next(&mut self) -> Option<Match> {
580 let FindMatches { re, ref mut cache, ref mut it } = *self;
581 it.advance(|input| re.try_search(cache, input))
582 }
583}
584
585/// A cache represents a partially computed forward and reverse DFA.
586///
587/// A cache is the key component that differentiates a classical DFA and a
588/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
589/// complete transition table that can handle all possible inputs, a hybrid
590/// NFA/DFA starts with an empty transition table and builds only the parts
591/// required during search. The parts that are built are stored in a cache. For
592/// this reason, a cache is a required parameter for nearly every operation on
593/// a [`Regex`].
594///
595/// Caches can be created from their corresponding `Regex` via
596/// [`Regex::create_cache`]. A cache can only be used with either the `Regex`
597/// that created it, or the `Regex` that was most recently used to reset it
598/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in
599/// panics or incorrect results.
600#[derive(Debug, Clone)]
601pub struct Cache {
602 forward: dfa::Cache,
603 reverse: dfa::Cache,
604}
605
606impl Cache {
607 /// Create a new cache for the given `Regex`.
608 ///
609 /// The cache returned should only be used for searches for the given
610 /// `Regex`. If you want to reuse the cache for another `Regex`, then you
611 /// must call [`Cache::reset`] with that `Regex`.
612 pub fn new(re: &Regex) -> Cache {
613 let forward = dfa::Cache::new(re.forward());
614 let reverse = dfa::Cache::new(re.reverse());
615 Cache { forward, reverse }
616 }
617
618 /// Reset this cache such that it can be used for searching with the given
619 /// `Regex` (and only that `Regex`).
620 ///
621 /// A cache reset permits reusing memory already allocated in this cache
622 /// with a different `Regex`.
623 ///
624 /// Resetting a cache sets its "clear count" to 0. This is relevant if the
625 /// `Regex` has been configured to "give up" after it has cleared the cache
626 /// a certain number of times.
627 ///
628 /// # Example
629 ///
630 /// This shows how to re-purpose a cache for use with a different `Regex`.
631 ///
632 /// ```
633 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
634 /// use regex_automata::{hybrid::regex::Regex, Match};
635 ///
636 /// let re1 = Regex::new(r"\w")?;
637 /// let re2 = Regex::new(r"\W")?;
638 ///
639 /// let mut cache = re1.create_cache();
640 /// assert_eq!(
641 /// Some(Match::must(0, 0..2)),
642 /// re1.find(&mut cache, "Δ"),
643 /// );
644 ///
645 /// // Using 'cache' with re2 is not allowed. It may result in panics or
646 /// // incorrect results. In order to re-purpose the cache, we must reset
647 /// // it with the Regex we'd like to use it with.
648 /// //
649 /// // Similarly, after this reset, using the cache with 're1' is also not
650 /// // allowed.
651 /// cache.reset(&re2);
652 /// assert_eq!(
653 /// Some(Match::must(0, 0..3)),
654 /// re2.find(&mut cache, "☃"),
655 /// );
656 ///
657 /// # Ok::<(), Box<dyn std::error::Error>>(())
658 /// ```
659 pub fn reset(&mut self, re: &Regex) {
660 self.forward.reset(re.forward());
661 self.reverse.reset(re.reverse());
662 }
663
664 /// Return a reference to the forward cache.
665 pub fn forward(&mut self) -> &dfa::Cache {
666 &self.forward
667 }
668
669 /// Return a reference to the reverse cache.
670 pub fn reverse(&mut self) -> &dfa::Cache {
671 &self.reverse
672 }
673
674 /// Return a mutable reference to the forward cache.
675 ///
676 /// If you need mutable references to both the forward and reverse caches,
677 /// then use [`Cache::as_parts_mut`].
678 pub fn forward_mut(&mut self) -> &mut dfa::Cache {
679 &mut self.forward
680 }
681
682 /// Return a mutable reference to the reverse cache.
683 ///
684 /// If you need mutable references to both the forward and reverse caches,
685 /// then use [`Cache::as_parts_mut`].
686 pub fn reverse_mut(&mut self) -> &mut dfa::Cache {
687 &mut self.reverse
688 }
689
690 /// Return references to the forward and reverse caches, respectively.
691 pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) {
692 (&self.forward, &self.reverse)
693 }
694
695 /// Return mutable references to the forward and reverse caches,
696 /// respectively.
697 pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) {
698 (&mut self.forward, &mut self.reverse)
699 }
700
701 /// Returns the heap memory usage, in bytes, as a sum of the forward and
702 /// reverse lazy DFA caches.
703 ///
704 /// This does **not** include the stack size used up by this cache. To
705 /// compute that, use `std::mem::size_of::<Cache>()`.
706 pub fn memory_usage(&self) -> usize {
707 self.forward.memory_usage() + self.reverse.memory_usage()
708 }
709}
710
711/// A builder for a regex based on a hybrid NFA/DFA.
712///
713/// This builder permits configuring options for the syntax of a pattern, the
714/// NFA construction, the lazy DFA construction and finally the regex searching
715/// itself. This builder is different from a general purpose regex builder
716/// in that it permits fine grain configuration of the construction process.
717/// The trade off for this is complexity, and the possibility of setting a
718/// configuration that might not make sense. For example, there are two
719/// different UTF-8 modes:
720///
721/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
722/// whether the pattern itself can contain sub-expressions that match invalid
723/// UTF-8.
724/// * [`thompson::Config::utf8`] controls how the regex iterators themselves
725/// advance the starting position of the next search when a match with zero
726/// length is found.
727///
728/// Generally speaking, callers will want to either enable all of these or
729/// disable all of these.
730///
731/// Internally, building a regex requires building two hybrid NFA/DFAs,
732/// where one is responsible for finding the end of a match and the other is
733/// responsible for finding the start of a match. If you only need to detect
734/// whether something matched, or only the end of a match, then you should use
735/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper
736/// than building two of them.
737///
738/// # Example
739///
740/// This example shows how to disable UTF-8 mode in the syntax and the regex
741/// itself. This is generally what you want for matching on arbitrary bytes.
742///
743/// ```
744/// # if cfg!(miri) { return Ok(()); } // miri takes too long
745/// use regex_automata::{
746/// hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
747/// };
748///
749/// let re = Regex::builder()
750/// .syntax(syntax::Config::new().utf8(false))
751/// .thompson(thompson::Config::new().utf8(false))
752/// .build(r"foo(?-u:[^b])ar.*")?;
753/// let mut cache = re.create_cache();
754///
755/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
756/// let expected = Some(Match::must(0, 1..9));
757/// let got = re.find(&mut cache, haystack);
758/// assert_eq!(expected, got);
759/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
760/// // but the subsequent `.*` does not! Disabling UTF-8
761/// // on the syntax permits this.
762/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
763///
764/// # Ok::<(), Box<dyn std::error::Error>>(())
765/// ```
766#[derive(Clone, Debug)]
767pub struct Builder {
768 dfa: dfa::Builder,
769}
770
771impl Builder {
772 /// Create a new regex builder with the default configuration.
773 pub fn new() -> Builder {
774 Builder { dfa: DFA::builder() }
775 }
776
777 /// Build a regex from the given pattern.
778 ///
779 /// If there was a problem parsing or compiling the pattern, then an error
780 /// is returned.
781 #[cfg(feature = "syntax")]
782 pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
783 self.build_many(&[pattern])
784 }
785
786 /// Build a regex from the given patterns.
787 #[cfg(feature = "syntax")]
788 pub fn build_many<P: AsRef<str>>(
789 &self,
790 patterns: &[P],
791 ) -> Result<Regex, BuildError> {
792 let forward = self.dfa.build_many(patterns)?;
793 let reverse = self
794 .dfa
795 .clone()
796 .configure(
797 DFA::config()
798 .prefilter(None)
799 .specialize_start_states(false)
800 .match_kind(MatchKind::All),
801 )
802 .thompson(thompson::Config::new().reverse(true))
803 .build_many(patterns)?;
804 Ok(self.build_from_dfas(forward, reverse))
805 }
806
807 /// Build a regex from its component forward and reverse hybrid NFA/DFAs.
808 ///
809 /// This is useful when you've built a forward and reverse lazy DFA
810 /// separately, and want to combine them into a single regex. Once build,
811 /// the individual DFAs given can still be accessed via [`Regex::forward`]
812 /// and [`Regex::reverse`].
813 ///
814 /// It is important that the reverse lazy DFA be compiled under the
815 /// following conditions:
816 ///
817 /// * It should use [`MatchKind::All`] semantics.
818 /// * It should match in reverse.
819 /// * Otherwise, its configuration should match the forward DFA.
820 ///
821 /// If these conditions aren't satisfied, then the behavior of searches is
822 /// unspecified.
823 ///
824 /// Note that when using this constructor, no configuration is applied.
825 /// Since this routine provides the DFAs to the builder, there is no
826 /// opportunity to apply other configuration options.
827 ///
828 /// # Example
829 ///
830 /// This shows how to build individual lazy forward and reverse DFAs, and
831 /// then combine them into a single `Regex`.
832 ///
833 /// ```
834 /// use regex_automata::{
835 /// hybrid::{dfa::DFA, regex::Regex},
836 /// nfa::thompson,
837 /// MatchKind,
838 /// };
839 ///
840 /// let fwd = DFA::new(r"foo[0-9]+")?;
841 /// let rev = DFA::builder()
842 /// .configure(DFA::config().match_kind(MatchKind::All))
843 /// .thompson(thompson::Config::new().reverse(true))
844 /// .build(r"foo[0-9]+")?;
845 ///
846 /// let re = Regex::builder().build_from_dfas(fwd, rev);
847 /// let mut cache = re.create_cache();
848 /// assert_eq!(true, re.is_match(&mut cache, "foo123"));
849 /// # Ok::<(), Box<dyn std::error::Error>>(())
850 /// ```
851 pub fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex {
852 Regex { forward, reverse }
853 }
854
855 /// Set the syntax configuration for this builder using
856 /// [`syntax::Config`](crate::util::syntax::Config).
857 ///
858 /// This permits setting things like case insensitivity, Unicode and multi
859 /// line mode.
860 #[cfg(feature = "syntax")]
861 pub fn syntax(
862 &mut self,
863 config: crate::util::syntax::Config,
864 ) -> &mut Builder {
865 self.dfa.syntax(config);
866 self
867 }
868
869 /// Set the Thompson NFA configuration for this builder using
870 /// [`nfa::thompson::Config`](thompson::Config).
871 ///
872 /// This permits setting things like whether additional time should be
873 /// spent shrinking the size of the NFA.
874 #[cfg(feature = "syntax")]
875 pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
876 self.dfa.thompson(config);
877 self
878 }
879
880 /// Set the lazy DFA compilation configuration for this builder using
881 /// [`dfa::Config`].
882 ///
883 /// This permits setting things like whether Unicode word boundaries should
884 /// be heuristically supported or settings how the behavior of the cache.
885 pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder {
886 self.dfa.configure(config);
887 self
888 }
889}
890
891impl Default for Builder {
892 fn default() -> Builder {
893 Builder::new()
894 }
895}
896