1 | /*! |
2 | A DFA-backed `Regex`. |
3 | |
4 | This module provides [`Regex`], which is defined generically over the |
5 | [`Automaton`] trait. A `Regex` implements convenience routines you might have |
6 | come to expect, such as finding the start/end of a match and iterating over |
7 | all non-overlapping matches. This `Regex` type is limited in its capabilities |
8 | to what a DFA can provide. Therefore, APIs involving capturing groups, for |
9 | example, are not provided. |
10 | |
11 | Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that |
12 | finds the end offset of a match, where as the other is a "reverse" DFA that |
13 | find the start offset of a match. |
14 | |
15 | See the [parent module](crate::dfa) for examples. |
16 | */ |
17 | |
18 | #[cfg (feature = "alloc" )] |
19 | use alloc::vec::Vec; |
20 | |
21 | #[cfg (feature = "dfa-build" )] |
22 | use crate::dfa::dense::BuildError; |
23 | use crate::{ |
24 | dfa::{automaton::Automaton, dense}, |
25 | util::{iter, search::Input}, |
26 | Anchored, Match, MatchError, |
27 | }; |
28 | #[cfg (feature = "alloc" )] |
29 | use crate::{ |
30 | dfa::{sparse, StartKind}, |
31 | util::search::MatchKind, |
32 | }; |
33 | |
34 | // When the alloc feature is enabled, the regex type sets its A type parameter |
35 | // to default to an owned dense DFA. But without alloc, we set no default. This |
36 | // makes things a lot more convenient in the common case, since writing out the |
37 | // DFA types is pretty annoying. |
38 | // |
39 | // Since we have two different definitions but only want to write one doc |
40 | // string, we use a macro to capture the doc and other attributes once and then |
41 | // repeat them for each definition. |
42 | macro_rules! define_regex_type { |
43 | ($(#[$doc:meta])*) => { |
44 | #[cfg(feature = "alloc" )] |
45 | $(#[$doc])* |
46 | pub struct Regex<A = dense::OwnedDFA> { |
47 | forward: A, |
48 | reverse: A, |
49 | } |
50 | |
51 | #[cfg(not(feature = "alloc" ))] |
52 | $(#[$doc])* |
53 | pub struct Regex<A> { |
54 | forward: A, |
55 | reverse: A, |
56 | } |
57 | }; |
58 | } |
59 | |
60 | define_regex_type!( |
61 | /// A regular expression that uses deterministic finite automata for fast |
62 | /// searching. |
63 | /// |
64 | /// A regular expression is comprised of two DFAs, a "forward" DFA and a |
65 | /// "reverse" DFA. The forward DFA is responsible for detecting the end of |
66 | /// a match while the reverse DFA is responsible for detecting the start |
67 | /// of a match. Thus, in order to find the bounds of any given match, a |
68 | /// forward search must first be run followed by a reverse search. A match |
69 | /// found by the forward DFA guarantees that the reverse DFA will also find |
70 | /// a match. |
71 | /// |
72 | /// The type of the DFA used by a `Regex` corresponds to the `A` type |
73 | /// parameter, which must satisfy the [`Automaton`] trait. Typically, |
74 | /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a |
75 | /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more |
76 | /// memory but search faster, while sparse DFAs use less memory but search |
77 | /// more slowly. |
78 | /// |
79 | /// # Crate features |
80 | /// |
81 | /// Note that despite what the documentation auto-generates, the _only_ |
82 | /// crate feature needed to use this type is `dfa-search`. You do _not_ |
83 | /// need to enable the `alloc` feature. |
84 | /// |
85 | /// By default, a regex's automaton type parameter is set to |
86 | /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most |
87 | /// in-memory work loads, this is the most convenient type that gives the |
88 | /// best search performance. When the `alloc` feature is disabled, no |
89 | /// default type is used. |
90 | /// |
91 | /// # When should I use this? |
92 | /// |
93 | /// Generally speaking, if you can afford the overhead of building a full |
94 | /// DFA for your regex, and you don't need things like capturing groups, |
95 | /// then this is a good choice if you're looking to optimize for matching |
96 | /// speed. Note however that its speed may be worse than a general purpose |
97 | /// regex engine if you don't provide a [`dense::Config::prefilter`] to the |
98 | /// underlying DFA. |
99 | /// |
100 | /// # Sparse DFAs |
101 | /// |
102 | /// Since a `Regex` is generic over the [`Automaton`] trait, it can be |
103 | /// used with any kind of DFA. While this crate constructs dense DFAs by |
104 | /// default, it is easy enough to build corresponding sparse DFAs, and then |
105 | /// build a regex from them: |
106 | /// |
107 | /// ``` |
108 | /// use regex_automata::dfa::regex::Regex; |
109 | /// |
110 | /// // First, build a regex that uses dense DFAs. |
111 | /// let dense_re = Regex::new("foo[0-9]+")?; |
112 | /// |
113 | /// // Second, build sparse DFAs from the forward and reverse dense DFAs. |
114 | /// let fwd = dense_re.forward().to_sparse()?; |
115 | /// let rev = dense_re.reverse().to_sparse()?; |
116 | /// |
117 | /// // Third, build a new regex from the constituent sparse DFAs. |
118 | /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev); |
119 | /// |
120 | /// // A regex that uses sparse DFAs can be used just like with dense DFAs. |
121 | /// assert_eq!(true, sparse_re.is_match(b"foo123")); |
122 | /// |
123 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
124 | /// ``` |
125 | /// |
126 | /// Alternatively, one can use a [`Builder`] to construct a sparse DFA |
127 | /// more succinctly. (Note though that dense DFAs are still constructed |
128 | /// first internally, and then converted to sparse DFAs, as in the example |
129 | /// above.) |
130 | /// |
131 | /// ``` |
132 | /// use regex_automata::dfa::regex::Regex; |
133 | /// |
134 | /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?; |
135 | /// // A regex that uses sparse DFAs can be used just like with dense DFAs. |
136 | /// assert!(sparse_re.is_match(b"foo123")); |
137 | /// |
138 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
139 | /// ``` |
140 | /// |
141 | /// # Fallibility |
142 | /// |
143 | /// Most of the search routines defined on this type will _panic_ when the |
144 | /// underlying search fails. This might be because the DFA gave up because |
145 | /// it saw a quit byte, whether configured explicitly or via heuristic |
146 | /// Unicode word boundary support, although neither are enabled by default. |
147 | /// Or it might fail because an invalid `Input` configuration is given, |
148 | /// for example, with an unsupported [`Anchored`] mode. |
149 | /// |
150 | /// If you need to handle these error cases instead of allowing them to |
151 | /// trigger a panic, then the lower level [`Regex::try_search`] provides |
152 | /// a fallible API that never panics. |
153 | /// |
154 | /// # Example |
155 | /// |
156 | /// This example shows how to cause a search to terminate if it sees a |
157 | /// `\n` byte, and handle the error returned. This could be useful if, for |
158 | /// example, you wanted to prevent a user supplied pattern from matching |
159 | /// across a line boundary. |
160 | /// |
161 | /// ``` |
162 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
163 | /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError}; |
164 | /// |
165 | /// let re = Regex::builder() |
166 | /// .dense(dfa::dense::Config::new().quit(b'\n', true)) |
167 | /// .build(r"foo\p{any}+bar")?; |
168 | /// |
169 | /// let input = Input::new("foo\nbar"); |
170 | /// // Normally this would produce a match, since \p{any} contains '\n'. |
171 | /// // But since we instructed the automaton to enter a quit state if a |
172 | /// // '\n' is observed, this produces a match error instead. |
173 | /// let expected = MatchError::quit(b'\n', 3); |
174 | /// let got = re.try_search(&input).unwrap_err(); |
175 | /// assert_eq!(expected, got); |
176 | /// |
177 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
178 | /// ``` |
179 | #[derive(Clone, Debug)] |
180 | ); |
181 | |
182 | #[cfg (all(feature = "syntax" , feature = "dfa-build" ))] |
183 | impl Regex { |
184 | /// Parse the given regular expression using the default configuration and |
185 | /// return the corresponding regex. |
186 | /// |
187 | /// If you want a non-default configuration, then use the [`Builder`] to |
188 | /// set your own configuration. |
189 | /// |
190 | /// # Example |
191 | /// |
192 | /// ``` |
193 | /// use regex_automata::{Match, dfa::regex::Regex}; |
194 | /// |
195 | /// let re = Regex::new("foo[0-9]+bar" )?; |
196 | /// assert_eq!( |
197 | /// Some(Match::must(0, 3..14)), |
198 | /// re.find(b"zzzfoo12345barzzz" ), |
199 | /// ); |
200 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
201 | /// ``` |
202 | pub fn new(pattern: &str) -> Result<Regex, BuildError> { |
203 | Builder::new().build(pattern) |
204 | } |
205 | |
206 | /// Like `new`, but parses multiple patterns into a single "regex set." |
207 | /// This similarly uses the default regex configuration. |
208 | /// |
209 | /// # Example |
210 | /// |
211 | /// ``` |
212 | /// use regex_automata::{Match, dfa::regex::Regex}; |
213 | /// |
214 | /// let re = Regex::new_many(&["[a-z]+" , "[0-9]+" ])?; |
215 | /// |
216 | /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux" ); |
217 | /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); |
218 | /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); |
219 | /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); |
220 | /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); |
221 | /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); |
222 | /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); |
223 | /// assert_eq!(None, it.next()); |
224 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
225 | /// ``` |
226 | pub fn new_many<P: AsRef<str>>( |
227 | patterns: &[P], |
228 | ) -> Result<Regex, BuildError> { |
229 | Builder::new().build_many(patterns) |
230 | } |
231 | } |
232 | |
233 | #[cfg (all(feature = "syntax" , feature = "dfa-build" ))] |
234 | impl Regex<sparse::DFA<Vec<u8>>> { |
235 | /// Parse the given regular expression using the default configuration, |
236 | /// except using sparse DFAs, and return the corresponding regex. |
237 | /// |
238 | /// If you want a non-default configuration, then use the [`Builder`] to |
239 | /// set your own configuration. |
240 | /// |
241 | /// # Example |
242 | /// |
243 | /// ``` |
244 | /// use regex_automata::{Match, dfa::regex::Regex}; |
245 | /// |
246 | /// let re = Regex::new_sparse("foo[0-9]+bar" )?; |
247 | /// assert_eq!( |
248 | /// Some(Match::must(0, 3..14)), |
249 | /// re.find(b"zzzfoo12345barzzz" ), |
250 | /// ); |
251 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
252 | /// ``` |
253 | pub fn new_sparse( |
254 | pattern: &str, |
255 | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { |
256 | Builder::new().build_sparse(pattern) |
257 | } |
258 | |
259 | /// Like `new`, but parses multiple patterns into a single "regex set" |
260 | /// using sparse DFAs. This otherwise similarly uses the default regex |
261 | /// configuration. |
262 | /// |
263 | /// # Example |
264 | /// |
265 | /// ``` |
266 | /// use regex_automata::{Match, dfa::regex::Regex}; |
267 | /// |
268 | /// let re = Regex::new_many_sparse(&["[a-z]+" , "[0-9]+" ])?; |
269 | /// |
270 | /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux" ); |
271 | /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); |
272 | /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); |
273 | /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); |
274 | /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); |
275 | /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); |
276 | /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); |
277 | /// assert_eq!(None, it.next()); |
278 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
279 | /// ``` |
280 | pub fn new_many_sparse<P: AsRef<str>>( |
281 | patterns: &[P], |
282 | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { |
283 | Builder::new().build_many_sparse(patterns) |
284 | } |
285 | } |
286 | |
287 | /// Convenience routines for regex construction. |
288 | impl Regex<dense::DFA<&'static [u32]>> { |
289 | /// Return a builder for configuring the construction of a `Regex`. |
290 | /// |
291 | /// This is a convenience routine to avoid needing to import the |
292 | /// [`Builder`] type in common cases. |
293 | /// |
294 | /// # Example |
295 | /// |
296 | /// This example shows how to use the builder to disable UTF-8 mode |
297 | /// everywhere. |
298 | /// |
299 | /// ``` |
300 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
301 | /// use regex_automata::{ |
302 | /// dfa::regex::Regex, nfa::thompson, util::syntax, Match, |
303 | /// }; |
304 | /// |
305 | /// let re = Regex::builder() |
306 | /// .syntax(syntax::Config::new().utf8(false)) |
307 | /// .thompson(thompson::Config::new().utf8(false)) |
308 | /// .build(r"foo(?-u:[^b])ar.*" )?; |
309 | /// let haystack = b" \xFEfoo \xFFarzz \xE2\x98\xFF\n" ; |
310 | /// let expected = Some(Match::must(0, 1..9)); |
311 | /// let got = re.find(haystack); |
312 | /// assert_eq!(expected, got); |
313 | /// |
314 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
315 | /// ``` |
316 | pub fn builder() -> Builder { |
317 | Builder::new() |
318 | } |
319 | } |
320 | |
321 | /// Standard search routines for finding and iterating over matches. |
322 | impl<A: Automaton> Regex<A> { |
323 | /// Returns true if and only if this regex matches the given haystack. |
324 | /// |
325 | /// This routine may short circuit if it knows that scanning future input |
326 | /// will never lead to a different result. In particular, if the underlying |
327 | /// DFA enters a match state or a dead state, then this routine will return |
328 | /// `true` or `false`, respectively, without inspecting any future input. |
329 | /// |
330 | /// # Panics |
331 | /// |
332 | /// This routine panics if the search could not complete. This can occur |
333 | /// in a number of circumstances: |
334 | /// |
335 | /// * The configuration of the DFA may permit it to "quit" the search. |
336 | /// For example, setting quit bytes or enabling heuristic support for |
337 | /// Unicode word boundaries. The default configuration does not enable any |
338 | /// option that could result in the DFA quitting. |
339 | /// * When the provided `Input` configuration is not supported. For |
340 | /// example, by providing an unsupported anchor mode. |
341 | /// |
342 | /// When a search panics, callers cannot know whether a match exists or |
343 | /// not. |
344 | /// |
345 | /// Use [`Regex::try_search`] if you want to handle these error conditions. |
346 | /// |
347 | /// # Example |
348 | /// |
349 | /// ``` |
350 | /// use regex_automata::dfa::regex::Regex; |
351 | /// |
352 | /// let re = Regex::new("foo[0-9]+bar" )?; |
353 | /// assert_eq!(true, re.is_match("foo12345bar" )); |
354 | /// assert_eq!(false, re.is_match("foobar" )); |
355 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
356 | /// ``` |
357 | #[inline ] |
358 | pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool { |
359 | // Not only can we do an "earliest" search, but we can avoid doing a |
360 | // reverse scan too. |
361 | let input = input.into().earliest(true); |
362 | self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap() |
363 | } |
364 | |
365 | /// Returns the start and end offset of the leftmost match. If no match |
366 | /// exists, then `None` is returned. |
367 | /// |
368 | /// # Panics |
369 | /// |
370 | /// This routine panics if the search could not complete. This can occur |
371 | /// in a number of circumstances: |
372 | /// |
373 | /// * The configuration of the DFA may permit it to "quit" the search. |
374 | /// For example, setting quit bytes or enabling heuristic support for |
375 | /// Unicode word boundaries. The default configuration does not enable any |
376 | /// option that could result in the DFA quitting. |
377 | /// * When the provided `Input` configuration is not supported. For |
378 | /// example, by providing an unsupported anchor mode. |
379 | /// |
380 | /// When a search panics, callers cannot know whether a match exists or |
381 | /// not. |
382 | /// |
383 | /// Use [`Regex::try_search`] if you want to handle these error conditions. |
384 | /// |
385 | /// # Example |
386 | /// |
387 | /// ``` |
388 | /// use regex_automata::{Match, dfa::regex::Regex}; |
389 | /// |
390 | /// // Greediness is applied appropriately. |
391 | /// let re = Regex::new("foo[0-9]+" )?; |
392 | /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz" )); |
393 | /// |
394 | /// // Even though a match is found after reading the first byte (`a`), |
395 | /// // the default leftmost-first match semantics demand that we find the |
396 | /// // earliest match that prefers earlier parts of the pattern over latter |
397 | /// // parts. |
398 | /// let re = Regex::new("abc|a" )?; |
399 | /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc" )); |
400 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
401 | /// ``` |
402 | #[inline ] |
403 | pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> { |
404 | self.try_search(&input.into()).unwrap() |
405 | } |
406 | |
407 | /// Returns an iterator over all non-overlapping leftmost matches in the |
408 | /// given bytes. If no match exists, then the iterator yields no elements. |
409 | /// |
410 | /// This corresponds to the "standard" regex search iterator. |
411 | /// |
412 | /// # Panics |
413 | /// |
414 | /// If the search returns an error during iteration, then iteration |
415 | /// panics. See [`Regex::find`] for the panic conditions. |
416 | /// |
417 | /// Use [`Regex::try_search`] with |
418 | /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to |
419 | /// handle these error conditions. |
420 | /// |
421 | /// # Example |
422 | /// |
423 | /// ``` |
424 | /// use regex_automata::{Match, dfa::regex::Regex}; |
425 | /// |
426 | /// let re = Regex::new("foo[0-9]+" )?; |
427 | /// let text = "foo1 foo12 foo123" ; |
428 | /// let matches: Vec<Match> = re.find_iter(text).collect(); |
429 | /// assert_eq!(matches, vec![ |
430 | /// Match::must(0, 0..4), |
431 | /// Match::must(0, 5..10), |
432 | /// Match::must(0, 11..17), |
433 | /// ]); |
434 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
435 | /// ``` |
436 | #[inline ] |
437 | pub fn find_iter<'r, 'h, I: Into<Input<'h>>>( |
438 | &'r self, |
439 | input: I, |
440 | ) -> FindMatches<'r, 'h, A> { |
441 | let it = iter::Searcher::new(input.into()); |
442 | FindMatches { re: self, it } |
443 | } |
444 | } |
445 | |
446 | /// Lower level fallible search routines that permit controlling where the |
447 | /// search starts and ends in a particular sequence. |
448 | impl<A: Automaton> Regex<A> { |
449 | /// Returns the start and end offset of the leftmost match. If no match |
450 | /// exists, then `None` is returned. |
451 | /// |
452 | /// This is like [`Regex::find`] but with two differences: |
453 | /// |
454 | /// 1. It is not generic over `Into<Input>` and instead accepts a |
455 | /// `&Input`. This permits reusing the same `Input` for multiple searches |
456 | /// without needing to create a new one. This _may_ help with latency. |
457 | /// 2. It returns an error if the search could not complete where as |
458 | /// [`Regex::find`] will panic. |
459 | /// |
460 | /// # Errors |
461 | /// |
462 | /// This routine errors if the search could not complete. This can occur |
463 | /// in the following circumstances: |
464 | /// |
465 | /// * The configuration of the DFA may permit it to "quit" the search. |
466 | /// For example, setting quit bytes or enabling heuristic support for |
467 | /// Unicode word boundaries. The default configuration does not enable any |
468 | /// option that could result in the DFA quitting. |
469 | /// * When the provided `Input` configuration is not supported. For |
470 | /// example, by providing an unsupported anchor mode. |
471 | /// |
472 | /// When a search returns an error, callers cannot know whether a match |
473 | /// exists or not. |
474 | #[inline ] |
475 | pub fn try_search( |
476 | &self, |
477 | input: &Input<'_>, |
478 | ) -> Result<Option<Match>, MatchError> { |
479 | let (fwd, rev) = (self.forward(), self.reverse()); |
480 | let end = match fwd.try_search_fwd(input)? { |
481 | None => return Ok(None), |
482 | Some(end) => end, |
483 | }; |
484 | // This special cases an empty match at the beginning of the search. If |
485 | // our end matches our start, then since a reverse DFA can't match past |
486 | // the start, it must follow that our starting position is also our end |
487 | // position. So short circuit and skip the reverse search. |
488 | if input.start() == end.offset() { |
489 | return Ok(Some(Match::new( |
490 | end.pattern(), |
491 | end.offset()..end.offset(), |
492 | ))); |
493 | } |
494 | // We can also skip the reverse search if we know our search was |
495 | // anchored. This occurs either when the input config is anchored or |
496 | // when we know the regex itself is anchored. In this case, we know the |
497 | // start of the match, if one is found, must be the start of the |
498 | // search. |
499 | if self.is_anchored(input) { |
500 | return Ok(Some(Match::new( |
501 | end.pattern(), |
502 | input.start()..end.offset(), |
503 | ))); |
504 | } |
505 | // N.B. I have tentatively convinced myself that it isn't necessary |
506 | // to specify the specific pattern for the reverse search since the |
507 | // reverse search will always find the same pattern to match as the |
508 | // forward search. But I lack a rigorous proof. Why not just provide |
509 | // the pattern anyway? Well, if it is needed, then leaving it out |
510 | // gives us a chance to find a witness. (Also, if we don't need to |
511 | // specify the pattern, then we don't need to build the reverse DFA |
512 | // with 'starts_for_each_pattern' enabled.) |
513 | // |
514 | // We also need to be careful to disable 'earliest' for the reverse |
515 | // search, since it could be enabled for the forward search. In the |
516 | // reverse case, to satisfy "leftmost" criteria, we need to match |
517 | // as much as we can. We also need to be careful to make the search |
518 | // anchored. We don't want the reverse search to report any matches |
519 | // other than the one beginning at the end of our forward search. |
520 | let revsearch = input |
521 | .clone() |
522 | .span(input.start()..end.offset()) |
523 | .anchored(Anchored::Yes) |
524 | .earliest(false); |
525 | let start = rev |
526 | .try_search_rev(&revsearch)? |
527 | .expect("reverse search must match if forward search does" ); |
528 | assert_eq!( |
529 | start.pattern(), |
530 | end.pattern(), |
531 | "forward and reverse search must match same pattern" , |
532 | ); |
533 | assert!(start.offset() <= end.offset()); |
534 | Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) |
535 | } |
536 | |
537 | /// Returns true if either the given input specifies an anchored search |
538 | /// or if the underlying DFA is always anchored. |
539 | fn is_anchored(&self, input: &Input<'_>) -> bool { |
540 | match input.get_anchored() { |
541 | Anchored::No => self.forward().is_always_start_anchored(), |
542 | Anchored::Yes | Anchored::Pattern(_) => true, |
543 | } |
544 | } |
545 | } |
546 | |
547 | /// Non-search APIs for querying information about the regex and setting a |
548 | /// prefilter. |
549 | impl<A: Automaton> Regex<A> { |
550 | /// Return the underlying DFA responsible for forward matching. |
551 | /// |
552 | /// This is useful for accessing the underlying DFA and converting it to |
553 | /// some other format or size. See the [`Builder::build_from_dfas`] docs |
554 | /// for an example of where this might be useful. |
555 | pub fn forward(&self) -> &A { |
556 | &self.forward |
557 | } |
558 | |
559 | /// Return the underlying DFA responsible for reverse matching. |
560 | /// |
561 | /// This is useful for accessing the underlying DFA and converting it to |
562 | /// some other format or size. See the [`Builder::build_from_dfas`] docs |
563 | /// for an example of where this might be useful. |
564 | pub fn reverse(&self) -> &A { |
565 | &self.reverse |
566 | } |
567 | |
568 | /// Returns the total number of patterns matched by this regex. |
569 | /// |
570 | /// # Example |
571 | /// |
572 | /// ``` |
573 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
574 | /// use regex_automata::dfa::regex::Regex; |
575 | /// |
576 | /// let re = Regex::new_many(&[r"[a-z]+" , r"[0-9]+" , r"\w+" ])?; |
577 | /// assert_eq!(3, re.pattern_len()); |
578 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
579 | /// ``` |
580 | pub fn pattern_len(&self) -> usize { |
581 | assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len()); |
582 | self.forward().pattern_len() |
583 | } |
584 | } |
585 | |
586 | /// An iterator over all non-overlapping matches for an infallible search. |
587 | /// |
588 | /// The iterator yields a [`Match`] value until no more matches could be found. |
589 | /// If the underlying regex engine returns an error, then a panic occurs. |
590 | /// |
591 | /// The type parameters are as follows: |
592 | /// |
593 | /// * `A` represents the type of the underlying DFA that implements the |
594 | /// [`Automaton`] trait. |
595 | /// |
596 | /// The lifetime parameters are as follows: |
597 | /// |
598 | /// * `'h` represents the lifetime of the haystack being searched. |
599 | /// * `'r` represents the lifetime of the regex object itself. |
600 | /// |
601 | /// This iterator can be created with the [`Regex::find_iter`] method. |
602 | #[derive(Debug)] |
603 | pub struct FindMatches<'r, 'h, A> { |
604 | re: &'r Regex<A>, |
605 | it: iter::Searcher<'h>, |
606 | } |
607 | |
608 | impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> { |
609 | type Item = Match; |
610 | |
611 | #[inline ] |
612 | fn next(&mut self) -> Option<Match> { |
613 | let FindMatches { re, ref mut it } = *self; |
614 | it.advance(|input| re.try_search(input)) |
615 | } |
616 | } |
617 | |
618 | /// A builder for a regex based on deterministic finite automatons. |
619 | /// |
620 | /// This builder permits configuring options for the syntax of a pattern, the |
621 | /// NFA construction, the DFA construction and finally the regex searching |
622 | /// itself. This builder is different from a general purpose regex builder in |
623 | /// that it permits fine grain configuration of the construction process. The |
624 | /// trade off for this is complexity, and the possibility of setting a |
625 | /// configuration that might not make sense. For example, there are two |
626 | /// different UTF-8 modes: |
627 | /// |
628 | /// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls |
629 | /// whether the pattern itself can contain sub-expressions that match invalid |
630 | /// UTF-8. |
631 | /// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls |
632 | /// how the regex iterators themselves advance the starting position of the |
633 | /// next search when a match with zero length is found. |
634 | /// |
635 | /// Generally speaking, callers will want to either enable all of these or |
636 | /// disable all of these. |
637 | /// |
638 | /// Internally, building a regex requires building two DFAs, where one is |
639 | /// responsible for finding the end of a match and the other is responsible |
640 | /// for finding the start of a match. If you only need to detect whether |
641 | /// something matched, or only the end of a match, then you should use a |
642 | /// [`dense::Builder`] to construct a single DFA, which is cheaper than |
643 | /// building two DFAs. |
644 | /// |
645 | /// # Build methods |
646 | /// |
647 | /// This builder has a few "build" methods. In general, it's the result of |
648 | /// combining the following parameters: |
649 | /// |
650 | /// * Building one or many regexes. |
651 | /// * Building a regex with dense or sparse DFAs. |
652 | /// |
653 | /// The simplest "build" method is [`Builder::build`]. It accepts a single |
654 | /// pattern and builds a dense DFA using `usize` for the state identifier |
655 | /// representation. |
656 | /// |
657 | /// The most general "build" method is [`Builder::build_many`], which permits |
658 | /// building a regex that searches for multiple patterns simultaneously while |
659 | /// using a specific state identifier representation. |
660 | /// |
661 | /// The most flexible "build" method, but hardest to use, is |
662 | /// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is |
663 | /// just a pair of DFAs, and this method allows you to specify those DFAs |
664 | /// exactly. |
665 | /// |
666 | /// # Example |
667 | /// |
668 | /// This example shows how to disable UTF-8 mode in the syntax and the regex |
669 | /// itself. This is generally what you want for matching on arbitrary bytes. |
670 | /// |
671 | /// ``` |
672 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
673 | /// use regex_automata::{ |
674 | /// dfa::regex::Regex, nfa::thompson, util::syntax, Match, |
675 | /// }; |
676 | /// |
677 | /// let re = Regex::builder() |
678 | /// .syntax(syntax::Config::new().utf8(false)) |
679 | /// .thompson(thompson::Config::new().utf8(false)) |
680 | /// .build(r"foo(?-u:[^b])ar.*" )?; |
681 | /// let haystack = b" \xFEfoo \xFFarzz \xE2\x98\xFF\n" ; |
682 | /// let expected = Some(Match::must(0, 1..9)); |
683 | /// let got = re.find(haystack); |
684 | /// assert_eq!(expected, got); |
685 | /// // Notice that `(?-u:[^b])` matches invalid UTF-8, |
686 | /// // but the subsequent `.*` does not! Disabling UTF-8 |
687 | /// // on the syntax permits this. |
688 | /// assert_eq!(b"foo \xFFarzz" , &haystack[got.unwrap().range()]); |
689 | /// |
690 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
691 | /// ``` |
692 | #[derive(Clone, Debug)] |
693 | pub struct Builder { |
694 | #[cfg (feature = "dfa-build" )] |
695 | dfa: dense::Builder, |
696 | } |
697 | |
698 | impl Builder { |
699 | /// Create a new regex builder with the default configuration. |
700 | pub fn new() -> Builder { |
701 | Builder { |
702 | #[cfg (feature = "dfa-build" )] |
703 | dfa: dense::Builder::new(), |
704 | } |
705 | } |
706 | |
707 | /// Build a regex from the given pattern. |
708 | /// |
709 | /// If there was a problem parsing or compiling the pattern, then an error |
710 | /// is returned. |
711 | #[cfg (all(feature = "syntax" , feature = "dfa-build" ))] |
712 | pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> { |
713 | self.build_many(&[pattern]) |
714 | } |
715 | |
716 | /// Build a regex from the given pattern using sparse DFAs. |
717 | /// |
718 | /// If there was a problem parsing or compiling the pattern, then an error |
719 | /// is returned. |
720 | #[cfg (all(feature = "syntax" , feature = "dfa-build" ))] |
721 | pub fn build_sparse( |
722 | &self, |
723 | pattern: &str, |
724 | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { |
725 | self.build_many_sparse(&[pattern]) |
726 | } |
727 | |
728 | /// Build a regex from the given patterns. |
729 | #[cfg (all(feature = "syntax" , feature = "dfa-build" ))] |
730 | pub fn build_many<P: AsRef<str>>( |
731 | &self, |
732 | patterns: &[P], |
733 | ) -> Result<Regex, BuildError> { |
734 | let forward = self.dfa.build_many(patterns)?; |
735 | let reverse = self |
736 | .dfa |
737 | .clone() |
738 | .configure( |
739 | dense::Config::new() |
740 | .prefilter(None) |
741 | .specialize_start_states(false) |
742 | .start_kind(StartKind::Anchored) |
743 | .match_kind(MatchKind::All), |
744 | ) |
745 | .thompson(crate::nfa::thompson::Config::new().reverse(true)) |
746 | .build_many(patterns)?; |
747 | Ok(self.build_from_dfas(forward, reverse)) |
748 | } |
749 | |
750 | /// Build a sparse regex from the given patterns. |
751 | #[cfg (all(feature = "syntax" , feature = "dfa-build" ))] |
752 | pub fn build_many_sparse<P: AsRef<str>>( |
753 | &self, |
754 | patterns: &[P], |
755 | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { |
756 | let re = self.build_many(patterns)?; |
757 | let forward = re.forward().to_sparse()?; |
758 | let reverse = re.reverse().to_sparse()?; |
759 | Ok(self.build_from_dfas(forward, reverse)) |
760 | } |
761 | |
762 | /// Build a regex from its component forward and reverse DFAs. |
763 | /// |
764 | /// This is useful when deserializing a regex from some arbitrary |
765 | /// memory region. This is also useful for building regexes from other |
766 | /// types of DFAs. |
767 | /// |
768 | /// If you're building the DFAs from scratch instead of building new DFAs |
769 | /// from other DFAs, then you'll need to make sure that the reverse DFA is |
770 | /// configured correctly to match the intended semantics. Namely: |
771 | /// |
772 | /// * It should be anchored. |
773 | /// * It should use [`MatchKind::All`] semantics. |
774 | /// * It should match in reverse. |
775 | /// * Otherwise, its configuration should match the forward DFA. |
776 | /// |
777 | /// If these conditions aren't satisfied, then the behavior of searches is |
778 | /// unspecified. |
779 | /// |
780 | /// Note that when using this constructor, no configuration is applied. |
781 | /// Since this routine provides the DFAs to the builder, there is no |
782 | /// opportunity to apply other configuration options. |
783 | /// |
784 | /// # Example |
785 | /// |
786 | /// This example is a bit a contrived. The usual use of these methods |
787 | /// would involve serializing `initial_re` somewhere and then deserializing |
788 | /// it later to build a regex. But in this case, we do everything in |
789 | /// memory. |
790 | /// |
791 | /// ``` |
792 | /// use regex_automata::dfa::regex::Regex; |
793 | /// |
794 | /// let initial_re = Regex::new("foo[0-9]+" )?; |
795 | /// assert_eq!(true, initial_re.is_match(b"foo123" )); |
796 | /// |
797 | /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); |
798 | /// let re = Regex::builder().build_from_dfas(fwd, rev); |
799 | /// assert_eq!(true, re.is_match(b"foo123" )); |
800 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
801 | /// ``` |
802 | /// |
803 | /// This example shows how to build a `Regex` that uses sparse DFAs instead |
804 | /// of dense DFAs without using one of the convenience `build_sparse` |
805 | /// routines: |
806 | /// |
807 | /// ``` |
808 | /// use regex_automata::dfa::regex::Regex; |
809 | /// |
810 | /// let initial_re = Regex::new("foo[0-9]+" )?; |
811 | /// assert_eq!(true, initial_re.is_match(b"foo123" )); |
812 | /// |
813 | /// let fwd = initial_re.forward().to_sparse()?; |
814 | /// let rev = initial_re.reverse().to_sparse()?; |
815 | /// let re = Regex::builder().build_from_dfas(fwd, rev); |
816 | /// assert_eq!(true, re.is_match(b"foo123" )); |
817 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
818 | /// ``` |
819 | pub fn build_from_dfas<A: Automaton>( |
820 | &self, |
821 | forward: A, |
822 | reverse: A, |
823 | ) -> Regex<A> { |
824 | Regex { forward, reverse } |
825 | } |
826 | |
827 | /// Set the syntax configuration for this builder using |
828 | /// [`syntax::Config`](crate::util::syntax::Config). |
829 | /// |
830 | /// This permits setting things like case insensitivity, Unicode and multi |
831 | /// line mode. |
832 | #[cfg (all(feature = "syntax" , feature = "dfa-build" ))] |
833 | pub fn syntax( |
834 | &mut self, |
835 | config: crate::util::syntax::Config, |
836 | ) -> &mut Builder { |
837 | self.dfa.syntax(config); |
838 | self |
839 | } |
840 | |
841 | /// Set the Thompson NFA configuration for this builder using |
842 | /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). |
843 | /// |
844 | /// This permits setting things like whether additional time should be |
845 | /// spent shrinking the size of the NFA. |
846 | #[cfg (all(feature = "syntax" , feature = "dfa-build" ))] |
847 | pub fn thompson( |
848 | &mut self, |
849 | config: crate::nfa::thompson::Config, |
850 | ) -> &mut Builder { |
851 | self.dfa.thompson(config); |
852 | self |
853 | } |
854 | |
855 | /// Set the dense DFA compilation configuration for this builder using |
856 | /// [`dense::Config`]. |
857 | /// |
858 | /// This permits setting things like whether the underlying DFAs should |
859 | /// be minimized. |
860 | #[cfg (feature = "dfa-build" )] |
861 | pub fn dense(&mut self, config: dense::Config) -> &mut Builder { |
862 | self.dfa.configure(config); |
863 | self |
864 | } |
865 | } |
866 | |
867 | impl Default for Builder { |
868 | fn default() -> Builder { |
869 | Builder::new() |
870 | } |
871 | } |
872 | |