1 | /*! |
2 | A DFA-backed `Regex`. |
3 | |
4 | This module provides [`Regex`], which is defined generically over the |
5 | [`Automaton`] trait. A `Regex` implements convenience routines you might have |
6 | come to expect, such as finding the start/end of a match and iterating over |
7 | all non-overlapping matches. This `Regex` type is limited in its capabilities |
8 | to what a DFA can provide. Therefore, APIs involving capturing groups, for |
9 | example, are not provided. |
10 | |
11 | Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that |
12 | finds the end offset of a match, where as the other is a "reverse" DFA that |
13 | find the start offset of a match. |
14 | |
15 | See the [parent module](crate::dfa) for examples. |
16 | */ |
17 | |
18 | #[cfg (feature = "alloc" )] |
19 | use alloc::vec::Vec; |
20 | |
21 | use crate::{ |
22 | dfa::automaton::{Automaton, OverlappingState}, |
23 | util::prefilter::{self, Prefilter}, |
24 | MatchError, MultiMatch, |
25 | }; |
26 | #[cfg (feature = "alloc" )] |
27 | use crate::{ |
28 | dfa::{dense, error::Error, sparse}, |
29 | nfa::thompson, |
30 | util::matchtypes::MatchKind, |
31 | }; |
32 | |
33 | // When the alloc feature is enabled, the regex type sets its A type parameter |
34 | // to default to an owned dense DFA. But without alloc, we set no default. This |
35 | // makes things a lot more convenient in the common case, since writing out the |
36 | // DFA types is pretty annoying. |
37 | // |
38 | // Since we have two different definitions but only want to write one doc |
39 | // string, we use a macro to capture the doc and other attributes once and then |
40 | // repeat them for each definition. |
41 | macro_rules! define_regex_type { |
42 | ($(#[$doc:meta])*) => { |
43 | #[cfg(feature = "alloc" )] |
44 | $(#[$doc])* |
45 | pub struct Regex<A = dense::OwnedDFA, P = prefilter::None> { |
46 | prefilter: Option<P>, |
47 | forward: A, |
48 | reverse: A, |
49 | utf8: bool, |
50 | } |
51 | |
52 | #[cfg(not(feature = "alloc" ))] |
53 | $(#[$doc])* |
54 | pub struct Regex<A, P = prefilter::None> { |
55 | prefilter: Option<P>, |
56 | forward: A, |
57 | reverse: A, |
58 | utf8: bool, |
59 | } |
60 | }; |
61 | } |
62 | |
63 | define_regex_type!( |
64 | /// A regular expression that uses deterministic finite automata for fast |
65 | /// searching. |
66 | /// |
67 | /// A regular expression is comprised of two DFAs, a "forward" DFA and a |
68 | /// "reverse" DFA. The forward DFA is responsible for detecting the end of |
69 | /// a match while the reverse DFA is responsible for detecting the start |
70 | /// of a match. Thus, in order to find the bounds of any given match, a |
71 | /// forward search must first be run followed by a reverse search. A match |
72 | /// found by the forward DFA guarantees that the reverse DFA will also find |
73 | /// a match. |
74 | /// |
75 | /// The type of the DFA used by a `Regex` corresponds to the `A` type |
76 | /// parameter, which must satisfy the [`Automaton`] trait. Typically, |
77 | /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a |
78 | /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more |
79 | /// memory but search faster, while sparse DFAs use less memory but search |
80 | /// more slowly. |
81 | /// |
82 | /// By default, a regex's automaton type parameter is set to |
83 | /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most |
84 | /// in-memory work loads, this is the most convenient type that gives the |
85 | /// best search performance. When the `alloc` feature is disabled, no |
86 | /// default type is used. |
87 | /// |
88 | /// A `Regex` also has a `P` type parameter, which is used to select the |
89 | /// prefilter used during search. By default, no prefilter is enabled by |
90 | /// setting the type to default to [`prefilter::None`]. A prefilter can be |
91 | /// enabled by using the [`Regex::prefilter`] method. |
92 | /// |
93 | /// # When should I use this? |
94 | /// |
95 | /// Generally speaking, if you can afford the overhead of building a full |
96 | /// DFA for your regex, and you don't need things like capturing groups, |
97 | /// then this is a good choice if you're looking to optimize for matching |
98 | /// speed. Note however that its speed may be worse than a general purpose |
99 | /// regex engine if you don't select a good [prefilter]. |
100 | /// |
101 | /// # Earliest vs Leftmost vs Overlapping |
102 | /// |
103 | /// The search routines exposed on a `Regex` reflect three different ways |
104 | /// of searching: |
105 | /// |
106 | /// * "earliest" means to stop as soon as a match has been detected. |
107 | /// * "leftmost" means to continue matching until the underlying |
108 | /// automaton cannot advance. This reflects "standard" searching you |
109 | /// might be used to in other regex engines. e.g., This permits |
110 | /// non-greedy and greedy searching to work as you would expect. |
111 | /// * "overlapping" means to find all possible matches, even if they |
112 | /// overlap. |
113 | /// |
114 | /// Generally speaking, when doing an overlapping search, you'll want to |
115 | /// build your regex DFAs with [`MatchKind::All`] semantics. Using |
116 | /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is |
117 | /// likely to lead to odd behavior since `LeftmostFirst` specifically omits |
118 | /// some matches that can never be reported due to its semantics. |
119 | /// |
120 | /// The following example shows the differences between how these different |
121 | /// types of searches impact looking for matches of `[a-z]+` in the |
122 | /// haystack `abc`. |
123 | /// |
124 | /// ``` |
125 | /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch}; |
126 | /// |
127 | /// let pattern = r"[a-z]+"; |
128 | /// let haystack = "abc".as_bytes(); |
129 | /// |
130 | /// // With leftmost-first semantics, we test "earliest" and "leftmost". |
131 | /// let re = dfa::regex::Builder::new() |
132 | /// .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst)) |
133 | /// .build(pattern)?; |
134 | /// |
135 | /// // "earliest" searching isn't impacted by greediness |
136 | /// let mut it = re.find_earliest_iter(haystack); |
137 | /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); |
138 | /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); |
139 | /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); |
140 | /// assert_eq!(None, it.next()); |
141 | /// |
142 | /// // "leftmost" searching supports greediness (and non-greediness) |
143 | /// let mut it = re.find_leftmost_iter(haystack); |
144 | /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); |
145 | /// assert_eq!(None, it.next()); |
146 | /// |
147 | /// // For overlapping, we want "all" match kind semantics. |
148 | /// let re = dfa::regex::Builder::new() |
149 | /// .dense(dense::Config::new().match_kind(MatchKind::All)) |
150 | /// .build(pattern)?; |
151 | /// |
152 | /// // In the overlapping search, we find all three possible matches |
153 | /// // starting at the beginning of the haystack. |
154 | /// let mut it = re.find_overlapping_iter(haystack); |
155 | /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); |
156 | /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next()); |
157 | /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); |
158 | /// assert_eq!(None, it.next()); |
159 | /// |
160 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
161 | /// ``` |
162 | /// |
163 | /// # Sparse DFAs |
164 | /// |
165 | /// Since a `Regex` is generic over the [`Automaton`] trait, it can be |
166 | /// used with any kind of DFA. While this crate constructs dense DFAs by |
167 | /// default, it is easy enough to build corresponding sparse DFAs, and then |
168 | /// build a regex from them: |
169 | /// |
170 | /// ``` |
171 | /// use regex_automata::dfa::regex::Regex; |
172 | /// |
173 | /// // First, build a regex that uses dense DFAs. |
174 | /// let dense_re = Regex::new("foo[0-9]+")?; |
175 | /// |
176 | /// // Second, build sparse DFAs from the forward and reverse dense DFAs. |
177 | /// let fwd = dense_re.forward().to_sparse()?; |
178 | /// let rev = dense_re.reverse().to_sparse()?; |
179 | /// |
180 | /// // Third, build a new regex from the constituent sparse DFAs. |
181 | /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev); |
182 | /// |
183 | /// // A regex that uses sparse DFAs can be used just like with dense DFAs. |
184 | /// assert_eq!(true, sparse_re.is_match(b"foo123")); |
185 | /// |
186 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
187 | /// ``` |
188 | /// |
189 | /// Alternatively, one can use a [`Builder`] to construct a sparse DFA |
190 | /// more succinctly. (Note though that dense DFAs are still constructed |
191 | /// first internally, and then converted to sparse DFAs, as in the example |
192 | /// above.) |
193 | /// |
194 | /// ``` |
195 | /// use regex_automata::dfa::regex::Regex; |
196 | /// |
197 | /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?; |
198 | /// // A regex that uses sparse DFAs can be used just like with dense DFAs. |
199 | /// assert!(sparse_re.is_match(b"foo123")); |
200 | /// |
201 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
202 | /// ``` |
203 | /// |
204 | /// # Fallibility |
205 | /// |
206 | /// In non-default configurations, the DFAs generated in this module may |
207 | /// return an error during a search. (Currently, the only way this happens |
208 | /// is if quit bytes are added or Unicode word boundaries are heuristically |
209 | /// enabled, both of which are turned off by default.) For convenience, the |
210 | /// main search routines, like [`find_leftmost`](Regex::find_leftmost), |
211 | /// will panic if an error occurs. However, if you need to use DFAs |
212 | /// which may produce an error at search time, then there are fallible |
213 | /// equivalents of all search routines. For example, for `find_leftmost`, |
214 | /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost). |
215 | /// The routines prefixed with `try_` return `Result<Option<MultiMatch>, |
216 | /// MatchError>`, where as the infallible routines simply return |
217 | /// `Option<MultiMatch>`. |
218 | /// |
219 | /// # Example |
220 | /// |
221 | /// This example shows how to cause a search to terminate if it sees a |
222 | /// `\n` byte, and handle the error returned. This could be useful if, for |
223 | /// example, you wanted to prevent a user supplied pattern from matching |
224 | /// across a line boundary. |
225 | /// |
226 | /// ``` |
227 | /// use regex_automata::{dfa::{self, regex::Regex}, MatchError}; |
228 | /// |
229 | /// let re = Regex::builder() |
230 | /// .dense(dfa::dense::Config::new().quit(b'\n', true)) |
231 | /// .build(r"foo\p{any}+bar")?; |
232 | /// |
233 | /// let haystack = "foo\nbar".as_bytes(); |
234 | /// // Normally this would produce a match, since \p{any} contains '\n'. |
235 | /// // But since we instructed the automaton to enter a quit state if a |
236 | /// // '\n' is observed, this produces a match error instead. |
237 | /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; |
238 | /// let got = re.try_find_leftmost(haystack).unwrap_err(); |
239 | /// assert_eq!(expected, got); |
240 | /// |
241 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
242 | /// ``` |
243 | #[derive (Clone, Debug)] |
244 | ); |
245 | |
246 | #[cfg (feature = "alloc" )] |
247 | impl Regex { |
248 | /// Parse the given regular expression using the default configuration and |
249 | /// return the corresponding regex. |
250 | /// |
251 | /// If you want a non-default configuration, then use the [`Builder`] to |
252 | /// set your own configuration. |
253 | /// |
254 | /// # Example |
255 | /// |
256 | /// ``` |
257 | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
258 | /// |
259 | /// let re = Regex::new("foo[0-9]+bar")?; |
260 | /// assert_eq!( |
261 | /// Some(MultiMatch::must(0, 3, 14)), |
262 | /// re.find_leftmost(b"zzzfoo12345barzzz"), |
263 | /// ); |
264 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
265 | /// ``` |
266 | pub fn new(pattern: &str) -> Result<Regex, Error> { |
267 | Builder::new().build(pattern) |
268 | } |
269 | |
270 | /// Like `new`, but parses multiple patterns into a single "regex set." |
271 | /// This similarly uses the default regex configuration. |
272 | /// |
273 | /// # Example |
274 | /// |
275 | /// ``` |
276 | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
277 | /// |
278 | /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; |
279 | /// |
280 | /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); |
281 | /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); |
282 | /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); |
283 | /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); |
284 | /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); |
285 | /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); |
286 | /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); |
287 | /// assert_eq!(None, it.next()); |
288 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
289 | /// ``` |
290 | pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<Regex, Error> { |
291 | Builder::new().build_many(patterns) |
292 | } |
293 | } |
294 | |
295 | #[cfg (feature = "alloc" )] |
296 | impl Regex<sparse::DFA<Vec<u8>>> { |
297 | /// Parse the given regular expression using the default configuration, |
298 | /// except using sparse DFAs, and return the corresponding regex. |
299 | /// |
300 | /// If you want a non-default configuration, then use the [`Builder`] to |
301 | /// set your own configuration. |
302 | /// |
303 | /// # Example |
304 | /// |
305 | /// ``` |
306 | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
307 | /// |
308 | /// let re = Regex::new_sparse("foo[0-9]+bar")?; |
309 | /// assert_eq!( |
310 | /// Some(MultiMatch::must(0, 3, 14)), |
311 | /// re.find_leftmost(b"zzzfoo12345barzzz"), |
312 | /// ); |
313 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
314 | /// ``` |
315 | pub fn new_sparse( |
316 | pattern: &str, |
317 | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { |
318 | Builder::new().build_sparse(pattern) |
319 | } |
320 | |
321 | /// Like `new`, but parses multiple patterns into a single "regex set" |
322 | /// using sparse DFAs. This otherwise similarly uses the default regex |
323 | /// configuration. |
324 | /// |
325 | /// # Example |
326 | /// |
327 | /// ``` |
328 | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
329 | /// |
330 | /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?; |
331 | /// |
332 | /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); |
333 | /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); |
334 | /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); |
335 | /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); |
336 | /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); |
337 | /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); |
338 | /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); |
339 | /// assert_eq!(None, it.next()); |
340 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
341 | /// ``` |
342 | pub fn new_many_sparse<P: AsRef<str>>( |
343 | patterns: &[P], |
344 | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { |
345 | Builder::new().build_many_sparse(patterns) |
346 | } |
347 | } |
348 | |
349 | /// Convenience routines for regex construction. |
350 | #[cfg (feature = "alloc" )] |
351 | impl Regex { |
352 | /// Return a default configuration for a `Regex`. |
353 | /// |
354 | /// This is a convenience routine to avoid needing to import the `Config` |
355 | /// type when customizing the construction of a regex. |
356 | /// |
357 | /// # Example |
358 | /// |
359 | /// This example shows how to disable UTF-8 mode for `Regex` iteration. |
360 | /// When UTF-8 mode is disabled, the position immediately following an |
361 | /// empty match is where the next search begins, instead of the next |
362 | /// position of a UTF-8 encoded codepoint. |
363 | /// |
364 | /// ``` |
365 | /// use regex_automata::{dfa::regex::Regex, MultiMatch}; |
366 | /// |
367 | /// let re = Regex::builder() |
368 | /// .configure(Regex::config().utf8(false)) |
369 | /// .build(r"")?; |
370 | /// let haystack = "a☃z".as_bytes(); |
371 | /// let mut it = re.find_leftmost_iter(haystack); |
372 | /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); |
373 | /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); |
374 | /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); |
375 | /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); |
376 | /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); |
377 | /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); |
378 | /// assert_eq!(None, it.next()); |
379 | /// |
380 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
381 | /// ``` |
382 | pub fn config() -> Config { |
383 | Config::new() |
384 | } |
385 | |
386 | /// Return a builder for configuring the construction of a `Regex`. |
387 | /// |
388 | /// This is a convenience routine to avoid needing to import the |
389 | /// [`Builder`] type in common cases. |
390 | /// |
391 | /// # Example |
392 | /// |
393 | /// This example shows how to use the builder to disable UTF-8 mode |
394 | /// everywhere. |
395 | /// |
396 | /// ``` |
397 | /// use regex_automata::{ |
398 | /// dfa::regex::Regex, |
399 | /// nfa::thompson, |
400 | /// MultiMatch, SyntaxConfig, |
401 | /// }; |
402 | /// |
403 | /// let re = Regex::builder() |
404 | /// .configure(Regex::config().utf8(false)) |
405 | /// .syntax(SyntaxConfig::new().utf8(false)) |
406 | /// .thompson(thompson::Config::new().utf8(false)) |
407 | /// .build(r"foo(?-u:[^b])ar.*")?; |
408 | /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; |
409 | /// let expected = Some(MultiMatch::must(0, 1, 9)); |
410 | /// let got = re.find_leftmost(haystack); |
411 | /// assert_eq!(expected, got); |
412 | /// |
413 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
414 | /// ``` |
415 | pub fn builder() -> Builder { |
416 | Builder::new() |
417 | } |
418 | } |
419 | |
420 | /// Standard search routines for finding and iterating over matches. |
421 | impl<A: Automaton, P: Prefilter> Regex<A, P> { |
422 | /// Returns true if and only if this regex matches the given haystack. |
423 | /// |
424 | /// This routine may short circuit if it knows that scanning future input |
425 | /// will never lead to a different result. In particular, if the underlying |
426 | /// DFA enters a match state or a dead state, then this routine will return |
427 | /// `true` or `false`, respectively, without inspecting any future input. |
428 | /// |
429 | /// # Panics |
430 | /// |
431 | /// If the underlying DFAs return an error, then this routine panics. This |
432 | /// only occurs in non-default configurations where quit bytes are used or |
433 | /// Unicode word boundaries are heuristically enabled. |
434 | /// |
435 | /// The fallible version of this routine is |
436 | /// [`try_is_match`](Regex::try_is_match). |
437 | /// |
438 | /// # Example |
439 | /// |
440 | /// ``` |
441 | /// use regex_automata::dfa::regex::Regex; |
442 | /// |
443 | /// let re = Regex::new("foo[0-9]+bar" )?; |
444 | /// assert_eq!(true, re.is_match(b"foo12345bar" )); |
445 | /// assert_eq!(false, re.is_match(b"foobar" )); |
446 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
447 | /// ``` |
448 | pub fn is_match(&self, haystack: &[u8]) -> bool { |
449 | self.is_match_at(haystack, 0, haystack.len()) |
450 | } |
451 | |
452 | /// Returns the first position at which a match is found. |
453 | /// |
454 | /// This routine stops scanning input in precisely the same circumstances |
455 | /// as `is_match`. The key difference is that this routine returns the |
456 | /// position at which it stopped scanning input if and only if a match |
457 | /// was found. If no match is found, then `None` is returned. |
458 | /// |
459 | /// # Panics |
460 | /// |
461 | /// If the underlying DFAs return an error, then this routine panics. This |
462 | /// only occurs in non-default configurations where quit bytes are used or |
463 | /// Unicode word boundaries are heuristically enabled. |
464 | /// |
465 | /// The fallible version of this routine is |
466 | /// [`try_find_earliest`](Regex::try_find_earliest). |
467 | /// |
468 | /// # Example |
469 | /// |
470 | /// ``` |
471 | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
472 | /// |
473 | /// // Normally, the leftmost first match would greedily consume as many |
474 | /// // decimal digits as it could. But a match is detected as soon as one |
475 | /// // digit is seen. |
476 | /// let re = Regex::new("foo[0-9]+" )?; |
477 | /// assert_eq!( |
478 | /// Some(MultiMatch::must(0, 0, 4)), |
479 | /// re.find_earliest(b"foo12345" ), |
480 | /// ); |
481 | /// |
482 | /// // Normally, the end of the leftmost first match here would be 3, |
483 | /// // but the "earliest" match semantics detect a match earlier. |
484 | /// let re = Regex::new("abc|a" )?; |
485 | /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc" )); |
486 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
487 | /// ``` |
488 | pub fn find_earliest(&self, haystack: &[u8]) -> Option<MultiMatch> { |
489 | self.find_earliest_at(haystack, 0, haystack.len()) |
490 | } |
491 | |
492 | /// Returns the start and end offset of the leftmost match. If no match |
493 | /// exists, then `None` is returned. |
494 | /// |
495 | /// # Panics |
496 | /// |
497 | /// If the underlying DFAs return an error, then this routine panics. This |
498 | /// only occurs in non-default configurations where quit bytes are used or |
499 | /// Unicode word boundaries are heuristically enabled. |
500 | /// |
501 | /// The fallible version of this routine is |
502 | /// [`try_find_leftmost`](Regex::try_find_leftmost). |
503 | /// |
504 | /// # Example |
505 | /// |
506 | /// ``` |
507 | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
508 | /// |
509 | /// // Greediness is applied appropriately when compared to find_earliest. |
510 | /// let re = Regex::new("foo[0-9]+" )?; |
511 | /// assert_eq!( |
512 | /// Some(MultiMatch::must(0, 3, 11)), |
513 | /// re.find_leftmost(b"zzzfoo12345zzz" ), |
514 | /// ); |
515 | /// |
516 | /// // Even though a match is found after reading the first byte (`a`), |
517 | /// // the default leftmost-first match semantics demand that we find the |
518 | /// // earliest match that prefers earlier parts of the pattern over latter |
519 | /// // parts. |
520 | /// let re = Regex::new("abc|a" )?; |
521 | /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc" )); |
522 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
523 | /// ``` |
524 | pub fn find_leftmost(&self, haystack: &[u8]) -> Option<MultiMatch> { |
525 | self.find_leftmost_at(haystack, 0, haystack.len()) |
526 | } |
527 | |
528 | /// Search for the first overlapping match in `haystack`. |
529 | /// |
530 | /// This routine is principally useful when searching for multiple patterns |
531 | /// on inputs where multiple patterns may match the same regions of text. |
532 | /// In particular, callers must preserve the automaton's search state from |
533 | /// prior calls so that the implementation knows where the last match |
534 | /// occurred and which pattern was reported. |
535 | /// |
536 | /// # Panics |
537 | /// |
538 | /// If the underlying DFAs return an error, then this routine panics. This |
539 | /// only occurs in non-default configurations where quit bytes are used or |
540 | /// Unicode word boundaries are heuristically enabled. |
541 | /// |
542 | /// The fallible version of this routine is |
543 | /// [`try_find_overlapping`](Regex::try_find_overlapping). |
544 | /// |
545 | /// # Example |
546 | /// |
547 | /// This example shows how to run an overlapping search with multiple |
548 | /// regexes. |
549 | /// |
550 | /// ``` |
551 | /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; |
552 | /// |
553 | /// let re = Regex::builder() |
554 | /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) |
555 | /// .build_many(&[r"\w+$" , r"\S+$" ])?; |
556 | /// let haystack = "@foo" .as_bytes(); |
557 | /// let mut state = dfa::OverlappingState::start(); |
558 | /// |
559 | /// let expected = Some(MultiMatch::must(1, 0, 4)); |
560 | /// let got = re.find_overlapping(haystack, &mut state); |
561 | /// assert_eq!(expected, got); |
562 | /// |
563 | /// // The first pattern also matches at the same position, so re-running |
564 | /// // the search will yield another match. Notice also that the first |
565 | /// // pattern is returned after the second. This is because the second |
566 | /// // pattern begins its match before the first, is therefore an earlier |
567 | /// // match and is thus reported first. |
568 | /// let expected = Some(MultiMatch::must(0, 1, 4)); |
569 | /// let got = re.find_overlapping(haystack, &mut state); |
570 | /// assert_eq!(expected, got); |
571 | /// |
572 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
573 | /// ``` |
574 | pub fn find_overlapping( |
575 | &self, |
576 | haystack: &[u8], |
577 | state: &mut OverlappingState, |
578 | ) -> Option<MultiMatch> { |
579 | self.find_overlapping_at(haystack, 0, haystack.len(), state) |
580 | } |
581 | |
582 | /// Returns an iterator over all non-overlapping "earliest" matches. |
583 | /// |
584 | /// Match positions are reported as soon as a match is known to occur, even |
585 | /// if the standard leftmost match would be longer. |
586 | /// |
587 | /// # Panics |
588 | /// |
589 | /// If the underlying DFAs return an error during iteration, then iteration |
590 | /// panics. This only occurs in non-default configurations where quit bytes |
591 | /// are used or Unicode word boundaries are heuristically enabled. |
592 | /// |
593 | /// The fallible version of this routine is |
594 | /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter). |
595 | /// |
596 | /// # Example |
597 | /// |
598 | /// This example shows how to run an "earliest" iterator. |
599 | /// |
600 | /// ``` |
601 | /// use regex_automata::{dfa::regex::Regex, MultiMatch}; |
602 | /// |
603 | /// let re = Regex::new("[0-9]+" )?; |
604 | /// let haystack = "123" .as_bytes(); |
605 | /// |
606 | /// // Normally, a standard leftmost iterator would return a single |
607 | /// // match, but since "earliest" detects matches earlier, we get |
608 | /// // three matches. |
609 | /// let mut it = re.find_earliest_iter(haystack); |
610 | /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); |
611 | /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); |
612 | /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); |
613 | /// assert_eq!(None, it.next()); |
614 | /// |
615 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
616 | /// ``` |
617 | pub fn find_earliest_iter<'r, 't>( |
618 | &'r self, |
619 | haystack: &'t [u8], |
620 | ) -> FindEarliestMatches<'r, 't, A, P> { |
621 | FindEarliestMatches::new(self, haystack) |
622 | } |
623 | |
624 | /// Returns an iterator over all non-overlapping leftmost matches in the |
625 | /// given bytes. If no match exists, then the iterator yields no elements. |
626 | /// |
627 | /// This corresponds to the "standard" regex search iterator. |
628 | /// |
629 | /// # Panics |
630 | /// |
631 | /// If the underlying DFAs return an error during iteration, then iteration |
632 | /// panics. This only occurs in non-default configurations where quit bytes |
633 | /// are used or Unicode word boundaries are heuristically enabled. |
634 | /// |
635 | /// The fallible version of this routine is |
636 | /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter). |
637 | /// |
638 | /// # Example |
639 | /// |
640 | /// ``` |
641 | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
642 | /// |
643 | /// let re = Regex::new("foo[0-9]+" )?; |
644 | /// let text = b"foo1 foo12 foo123" ; |
645 | /// let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); |
646 | /// assert_eq!(matches, vec![ |
647 | /// MultiMatch::must(0, 0, 4), |
648 | /// MultiMatch::must(0, 5, 10), |
649 | /// MultiMatch::must(0, 11, 17), |
650 | /// ]); |
651 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
652 | /// ``` |
653 | pub fn find_leftmost_iter<'r, 't>( |
654 | &'r self, |
655 | haystack: &'t [u8], |
656 | ) -> FindLeftmostMatches<'r, 't, A, P> { |
657 | FindLeftmostMatches::new(self, haystack) |
658 | } |
659 | |
660 | /// Returns an iterator over all overlapping matches in the given haystack. |
661 | /// |
662 | /// This routine is principally useful when searching for multiple patterns |
663 | /// on inputs where multiple patterns may match the same regions of text. |
664 | /// The iterator takes care of handling the overlapping state that must be |
665 | /// threaded through every search. |
666 | /// |
667 | /// # Panics |
668 | /// |
669 | /// If the underlying DFAs return an error during iteration, then iteration |
670 | /// panics. This only occurs in non-default configurations where quit bytes |
671 | /// are used or Unicode word boundaries are heuristically enabled. |
672 | /// |
673 | /// The fallible version of this routine is |
674 | /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter). |
675 | /// |
676 | /// # Example |
677 | /// |
678 | /// This example shows how to run an overlapping search with multiple |
679 | /// regexes. |
680 | /// |
681 | /// ``` |
682 | /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; |
683 | /// |
684 | /// let re = Regex::builder() |
685 | /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) |
686 | /// .build_many(&[r"\w+$" , r"\S+$" ])?; |
687 | /// let haystack = "@foo" .as_bytes(); |
688 | /// |
689 | /// let mut it = re.find_overlapping_iter(haystack); |
690 | /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next()); |
691 | /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next()); |
692 | /// assert_eq!(None, it.next()); |
693 | /// |
694 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
695 | /// ``` |
696 | pub fn find_overlapping_iter<'r, 't>( |
697 | &'r self, |
698 | haystack: &'t [u8], |
699 | ) -> FindOverlappingMatches<'r, 't, A, P> { |
700 | FindOverlappingMatches::new(self, haystack) |
701 | } |
702 | } |
703 | |
704 | /// Lower level infallible search routines that permit controlling where |
705 | /// the search starts and ends in a particular sequence. This is useful for |
706 | /// executing searches that need to take surrounding context into account. This |
707 | /// is required for correctly implementing iteration because of look-around |
708 | /// operators (`^`, `$`, `\b`). |
709 | impl<A: Automaton, P: Prefilter> Regex<A, P> { |
710 | /// Returns true if and only if this regex matches the given haystack. |
711 | /// |
712 | /// This routine may short circuit if it knows that scanning future input |
713 | /// will never lead to a different result. In particular, if the underlying |
714 | /// DFA enters a match state or a dead state, then this routine will return |
715 | /// `true` or `false`, respectively, without inspecting any future input. |
716 | /// |
717 | /// # Searching a substring of the haystack |
718 | /// |
719 | /// Being an "at" search routine, this permits callers to search a |
720 | /// substring of `haystack` by specifying a range in `haystack`. |
721 | /// Why expose this as an API instead of just asking callers to use |
722 | /// `&input[start..end]`? The reason is that regex matching often wants |
723 | /// to take the surrounding context into account in order to handle |
724 | /// look-around (`^`, `$` and `\b`). |
725 | /// |
726 | /// # Panics |
727 | /// |
728 | /// If the underlying DFAs return an error, then this routine panics. This |
729 | /// only occurs in non-default configurations where quit bytes are used or |
730 | /// Unicode word boundaries are heuristically enabled. |
731 | /// |
732 | /// The fallible version of this routine is |
733 | /// [`try_is_match_at`](Regex::try_is_match_at). |
734 | pub fn is_match_at( |
735 | &self, |
736 | haystack: &[u8], |
737 | start: usize, |
738 | end: usize, |
739 | ) -> bool { |
740 | self.try_is_match_at(haystack, start, end).unwrap() |
741 | } |
742 | |
743 | /// Returns the first position at which a match is found. |
744 | /// |
745 | /// This routine stops scanning input in precisely the same circumstances |
746 | /// as `is_match`. The key difference is that this routine returns the |
747 | /// position at which it stopped scanning input if and only if a match |
748 | /// was found. If no match is found, then `None` is returned. |
749 | /// |
750 | /// # Searching a substring of the haystack |
751 | /// |
752 | /// Being an "at" search routine, this permits callers to search a |
753 | /// substring of `haystack` by specifying a range in `haystack`. |
754 | /// Why expose this as an API instead of just asking callers to use |
755 | /// `&input[start..end]`? The reason is that regex matching often wants |
756 | /// to take the surrounding context into account in order to handle |
757 | /// look-around (`^`, `$` and `\b`). |
758 | /// |
759 | /// This is useful when implementing an iterator over matches |
760 | /// within the same haystack, which cannot be done correctly by simply |
761 | /// providing a subslice of `haystack`. |
762 | /// |
763 | /// # Panics |
764 | /// |
765 | /// If the underlying DFAs return an error, then this routine panics. This |
766 | /// only occurs in non-default configurations where quit bytes are used or |
767 | /// Unicode word boundaries are heuristically enabled. |
768 | /// |
769 | /// The fallible version of this routine is |
770 | /// [`try_find_earliest_at`](Regex::try_find_earliest_at). |
771 | pub fn find_earliest_at( |
772 | &self, |
773 | haystack: &[u8], |
774 | start: usize, |
775 | end: usize, |
776 | ) -> Option<MultiMatch> { |
777 | self.try_find_earliest_at(haystack, start, end).unwrap() |
778 | } |
779 | |
780 | /// Returns the same as `find_leftmost`, but starts the search at the given |
781 | /// offset. |
782 | /// |
783 | /// The significance of the starting point is that it takes the surrounding |
784 | /// context into consideration. For example, if the DFA is anchored, then |
785 | /// a match can only occur when `start == 0`. |
786 | /// |
787 | /// # Searching a substring of the haystack |
788 | /// |
789 | /// Being an "at" search routine, this permits callers to search a |
790 | /// substring of `haystack` by specifying a range in `haystack`. |
791 | /// Why expose this as an API instead of just asking callers to use |
792 | /// `&input[start..end]`? The reason is that regex matching often wants |
793 | /// to take the surrounding context into account in order to handle |
794 | /// look-around (`^`, `$` and `\b`). |
795 | /// |
796 | /// This is useful when implementing an iterator over matches within the |
797 | /// same haystack, which cannot be done correctly by simply providing a |
798 | /// subslice of `haystack`. |
799 | /// |
800 | /// # Panics |
801 | /// |
802 | /// If the underlying DFAs return an error, then this routine panics. This |
803 | /// only occurs in non-default configurations where quit bytes are used or |
804 | /// Unicode word boundaries are heuristically enabled. |
805 | /// |
806 | /// The fallible version of this routine is |
807 | /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at). |
808 | pub fn find_leftmost_at( |
809 | &self, |
810 | haystack: &[u8], |
811 | start: usize, |
812 | end: usize, |
813 | ) -> Option<MultiMatch> { |
814 | self.try_find_leftmost_at(haystack, start, end).unwrap() |
815 | } |
816 | |
817 | /// Search for the first overlapping match within a given range of |
818 | /// `haystack`. |
819 | /// |
820 | /// This routine is principally useful when searching for multiple patterns |
821 | /// on inputs where multiple patterns may match the same regions of text. |
822 | /// In particular, callers must preserve the automaton's search state from |
823 | /// prior calls so that the implementation knows where the last match |
824 | /// occurred and which pattern was reported. |
825 | /// |
826 | /// # Searching a substring of the haystack |
827 | /// |
828 | /// Being an "at" search routine, this permits callers to search a |
829 | /// substring of `haystack` by specifying a range in `haystack`. |
830 | /// Why expose this as an API instead of just asking callers to use |
831 | /// `&input[start..end]`? The reason is that regex matching often wants |
832 | /// to take the surrounding context into account in order to handle |
833 | /// look-around (`^`, `$` and `\b`). |
834 | /// |
835 | /// This is useful when implementing an iterator over matches |
836 | /// within the same haystack, which cannot be done correctly by simply |
837 | /// providing a subslice of `haystack`. |
838 | /// |
839 | /// # Panics |
840 | /// |
841 | /// If the underlying DFAs return an error, then this routine panics. This |
842 | /// only occurs in non-default configurations where quit bytes are used or |
843 | /// Unicode word boundaries are heuristically enabled. |
844 | /// |
845 | /// The fallible version of this routine is |
846 | /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at). |
847 | pub fn find_overlapping_at( |
848 | &self, |
849 | haystack: &[u8], |
850 | start: usize, |
851 | end: usize, |
852 | state: &mut OverlappingState, |
853 | ) -> Option<MultiMatch> { |
854 | self.try_find_overlapping_at(haystack, start, end, state).unwrap() |
855 | } |
856 | } |
857 | |
858 | /// Fallible search routines. These may return an error when the underlying |
859 | /// DFAs have been configured in a way that permits them to fail during a |
860 | /// search. |
861 | /// |
862 | /// Errors during search only occur when the DFA has been explicitly |
863 | /// configured to do so, usually by specifying one or more "quit" bytes or by |
864 | /// heuristically enabling Unicode word boundaries. |
865 | /// |
866 | /// Errors will never be returned using the default configuration. So these |
867 | /// fallible routines are only needed for particular configurations. |
868 | impl<A: Automaton, P: Prefilter> Regex<A, P> { |
869 | /// Returns true if and only if this regex matches the given haystack. |
870 | /// |
871 | /// This routine may short circuit if it knows that scanning future input |
872 | /// will never lead to a different result. In particular, if the underlying |
873 | /// DFA enters a match state or a dead state, then this routine will return |
874 | /// `true` or `false`, respectively, without inspecting any future input. |
875 | /// |
876 | /// # Errors |
877 | /// |
878 | /// This routine only errors if the search could not complete. For |
879 | /// DFA-based regexes, this only occurs in a non-default configuration |
880 | /// where quit bytes are used or Unicode word boundaries are heuristically |
881 | /// enabled. |
882 | /// |
883 | /// When a search cannot complete, callers cannot know whether a match |
884 | /// exists or not. |
885 | /// |
886 | /// The infallible (panics on error) version of this routine is |
887 | /// [`is_match`](Regex::is_match). |
888 | pub fn try_is_match(&self, haystack: &[u8]) -> Result<bool, MatchError> { |
889 | self.try_is_match_at(haystack, 0, haystack.len()) |
890 | } |
891 | |
892 | /// Returns the first position at which a match is found. |
893 | /// |
894 | /// This routine stops scanning input in precisely the same circumstances |
895 | /// as `is_match`. The key difference is that this routine returns the |
896 | /// position at which it stopped scanning input if and only if a match |
897 | /// was found. If no match is found, then `None` is returned. |
898 | /// |
899 | /// # Errors |
900 | /// |
901 | /// This routine only errors if the search could not complete. For |
902 | /// DFA-based regexes, this only occurs in a non-default configuration |
903 | /// where quit bytes are used or Unicode word boundaries are heuristically |
904 | /// enabled. |
905 | /// |
906 | /// When a search cannot complete, callers cannot know whether a match |
907 | /// exists or not. |
908 | /// |
909 | /// The infallible (panics on error) version of this routine is |
910 | /// [`find_earliest`](Regex::find_earliest). |
911 | pub fn try_find_earliest( |
912 | &self, |
913 | haystack: &[u8], |
914 | ) -> Result<Option<MultiMatch>, MatchError> { |
915 | self.try_find_earliest_at(haystack, 0, haystack.len()) |
916 | } |
917 | |
918 | /// Returns the start and end offset of the leftmost match. If no match |
919 | /// exists, then `None` is returned. |
920 | /// |
921 | /// # Errors |
922 | /// |
923 | /// This routine only errors if the search could not complete. For |
924 | /// DFA-based regexes, this only occurs in a non-default configuration |
925 | /// where quit bytes are used or Unicode word boundaries are heuristically |
926 | /// enabled. |
927 | /// |
928 | /// When a search cannot complete, callers cannot know whether a match |
929 | /// exists or not. |
930 | /// |
931 | /// The infallible (panics on error) version of this routine is |
932 | /// [`find_leftmost`](Regex::find_leftmost). |
933 | pub fn try_find_leftmost( |
934 | &self, |
935 | haystack: &[u8], |
936 | ) -> Result<Option<MultiMatch>, MatchError> { |
937 | self.try_find_leftmost_at(haystack, 0, haystack.len()) |
938 | } |
939 | |
940 | /// Search for the first overlapping match in `haystack`. |
941 | /// |
942 | /// This routine is principally useful when searching for multiple patterns |
943 | /// on inputs where multiple patterns may match the same regions of text. |
944 | /// In particular, callers must preserve the automaton's search state from |
945 | /// prior calls so that the implementation knows where the last match |
946 | /// occurred and which pattern was reported. |
947 | /// |
948 | /// # Errors |
949 | /// |
950 | /// This routine only errors if the search could not complete. For |
951 | /// DFA-based regexes, this only occurs in a non-default configuration |
952 | /// where quit bytes are used or Unicode word boundaries are heuristically |
953 | /// enabled. |
954 | /// |
955 | /// When a search cannot complete, callers cannot know whether a match |
956 | /// exists or not. |
957 | /// |
958 | /// The infallible (panics on error) version of this routine is |
959 | /// [`find_overlapping`](Regex::find_overlapping). |
960 | pub fn try_find_overlapping( |
961 | &self, |
962 | haystack: &[u8], |
963 | state: &mut OverlappingState, |
964 | ) -> Result<Option<MultiMatch>, MatchError> { |
965 | self.try_find_overlapping_at(haystack, 0, haystack.len(), state) |
966 | } |
967 | |
968 | /// Returns an iterator over all non-overlapping "earliest" matches. |
969 | /// |
970 | /// Match positions are reported as soon as a match is known to occur, even |
971 | /// if the standard leftmost match would be longer. |
972 | /// |
973 | /// # Errors |
974 | /// |
975 | /// This iterator only yields errors if the search could not complete. For |
976 | /// DFA-based regexes, this only occurs in a non-default configuration |
977 | /// where quit bytes are used or Unicode word boundaries are heuristically |
978 | /// enabled. |
979 | /// |
980 | /// When a search cannot complete, callers cannot know whether a match |
981 | /// exists or not. |
982 | /// |
983 | /// The infallible (panics on error) version of this routine is |
984 | /// [`find_earliest_iter`](Regex::find_earliest_iter). |
985 | pub fn try_find_earliest_iter<'r, 't>( |
986 | &'r self, |
987 | haystack: &'t [u8], |
988 | ) -> TryFindEarliestMatches<'r, 't, A, P> { |
989 | TryFindEarliestMatches::new(self, haystack) |
990 | } |
991 | |
992 | /// Returns an iterator over all non-overlapping leftmost matches in the |
993 | /// given bytes. If no match exists, then the iterator yields no elements. |
994 | /// |
995 | /// This corresponds to the "standard" regex search iterator. |
996 | /// |
997 | /// # Errors |
998 | /// |
999 | /// This iterator only yields errors if the search could not complete. For |
1000 | /// DFA-based regexes, this only occurs in a non-default configuration |
1001 | /// where quit bytes are used or Unicode word boundaries are heuristically |
1002 | /// enabled. |
1003 | /// |
1004 | /// When a search cannot complete, callers cannot know whether a match |
1005 | /// exists or not. |
1006 | /// |
1007 | /// The infallible (panics on error) version of this routine is |
1008 | /// [`find_leftmost_iter`](Regex::find_leftmost_iter). |
1009 | pub fn try_find_leftmost_iter<'r, 't>( |
1010 | &'r self, |
1011 | haystack: &'t [u8], |
1012 | ) -> TryFindLeftmostMatches<'r, 't, A, P> { |
1013 | TryFindLeftmostMatches::new(self, haystack) |
1014 | } |
1015 | |
1016 | /// Returns an iterator over all overlapping matches in the given haystack. |
1017 | /// |
1018 | /// This routine is principally useful when searching for multiple patterns |
1019 | /// on inputs where multiple patterns may match the same regions of text. |
1020 | /// The iterator takes care of handling the overlapping state that must be |
1021 | /// threaded through every search. |
1022 | /// |
1023 | /// # Errors |
1024 | /// |
1025 | /// This iterator only yields errors if the search could not complete. For |
1026 | /// DFA-based regexes, this only occurs in a non-default configuration |
1027 | /// where quit bytes are used or Unicode word boundaries are heuristically |
1028 | /// enabled. |
1029 | /// |
1030 | /// When a search cannot complete, callers cannot know whether a match |
1031 | /// exists or not. |
1032 | /// |
1033 | /// The infallible (panics on error) version of this routine is |
1034 | /// [`find_overlapping_iter`](Regex::find_overlapping_iter). |
1035 | pub fn try_find_overlapping_iter<'r, 't>( |
1036 | &'r self, |
1037 | haystack: &'t [u8], |
1038 | ) -> TryFindOverlappingMatches<'r, 't, A, P> { |
1039 | TryFindOverlappingMatches::new(self, haystack) |
1040 | } |
1041 | } |
1042 | |
1043 | /// Lower level fallible search routines that permit controlling where the |
1044 | /// search starts and ends in a particular sequence. |
1045 | impl<A: Automaton, P: Prefilter> Regex<A, P> { |
1046 | /// Returns true if and only if this regex matches the given haystack. |
1047 | /// |
1048 | /// This routine may short circuit if it knows that scanning future input |
1049 | /// will never lead to a different result. In particular, if the underlying |
1050 | /// DFA enters a match state or a dead state, then this routine will return |
1051 | /// `true` or `false`, respectively, without inspecting any future input. |
1052 | /// |
1053 | /// # Searching a substring of the haystack |
1054 | /// |
1055 | /// Being an "at" search routine, this permits callers to search a |
1056 | /// substring of `haystack` by specifying a range in `haystack`. |
1057 | /// Why expose this as an API instead of just asking callers to use |
1058 | /// `&input[start..end]`? The reason is that regex matching often wants |
1059 | /// to take the surrounding context into account in order to handle |
1060 | /// look-around (`^`, `$` and `\b`). |
1061 | /// |
1062 | /// # Errors |
1063 | /// |
1064 | /// This routine only errors if the search could not complete. For |
1065 | /// DFA-based regexes, this only occurs in a non-default configuration |
1066 | /// where quit bytes are used, Unicode word boundaries are heuristically |
1067 | /// enabled or limits are set on the number of times the lazy DFA's cache |
1068 | /// may be cleared. |
1069 | /// |
1070 | /// When a search cannot complete, callers cannot know whether a match |
1071 | /// exists or not. |
1072 | /// |
1073 | /// The infallible (panics on error) version of this routine is |
1074 | /// [`is_match_at`](Regex::is_match_at). |
1075 | pub fn try_is_match_at( |
1076 | &self, |
1077 | haystack: &[u8], |
1078 | start: usize, |
1079 | end: usize, |
1080 | ) -> Result<bool, MatchError> { |
1081 | self.forward() |
1082 | .find_earliest_fwd_at( |
1083 | self.scanner().as_mut(), |
1084 | None, |
1085 | haystack, |
1086 | start, |
1087 | end, |
1088 | ) |
1089 | .map(|x| x.is_some()) |
1090 | } |
1091 | |
1092 | /// Returns the first position at which a match is found. |
1093 | /// |
1094 | /// This routine stops scanning input in precisely the same circumstances |
1095 | /// as `is_match`. The key difference is that this routine returns the |
1096 | /// position at which it stopped scanning input if and only if a match |
1097 | /// was found. If no match is found, then `None` is returned. |
1098 | /// |
1099 | /// # Searching a substring of the haystack |
1100 | /// |
1101 | /// Being an "at" search routine, this permits callers to search a |
1102 | /// substring of `haystack` by specifying a range in `haystack`. |
1103 | /// Why expose this as an API instead of just asking callers to use |
1104 | /// `&input[start..end]`? The reason is that regex matching often wants |
1105 | /// to take the surrounding context into account in order to handle |
1106 | /// look-around (`^`, `$` and `\b`). |
1107 | /// |
1108 | /// This is useful when implementing an iterator over matches |
1109 | /// within the same haystack, which cannot be done correctly by simply |
1110 | /// providing a subslice of `haystack`. |
1111 | /// |
1112 | /// # Errors |
1113 | /// |
1114 | /// This routine only errors if the search could not complete. For |
1115 | /// DFA-based regexes, this only occurs in a non-default configuration |
1116 | /// where quit bytes are used or Unicode word boundaries are heuristically |
1117 | /// enabled. |
1118 | /// |
1119 | /// When a search cannot complete, callers cannot know whether a match |
1120 | /// exists or not. |
1121 | /// |
1122 | /// The infallible (panics on error) version of this routine is |
1123 | /// [`find_earliest_at`](Regex::find_earliest_at). |
1124 | pub fn try_find_earliest_at( |
1125 | &self, |
1126 | haystack: &[u8], |
1127 | start: usize, |
1128 | end: usize, |
1129 | ) -> Result<Option<MultiMatch>, MatchError> { |
1130 | self.try_find_earliest_at_imp( |
1131 | self.scanner().as_mut(), |
1132 | haystack, |
1133 | start, |
1134 | end, |
1135 | ) |
1136 | } |
1137 | |
1138 | /// The implementation of "earliest" searching, where a prefilter scanner |
1139 | /// may be given. |
1140 | fn try_find_earliest_at_imp( |
1141 | &self, |
1142 | pre: Option<&mut prefilter::Scanner>, |
1143 | haystack: &[u8], |
1144 | start: usize, |
1145 | end: usize, |
1146 | ) -> Result<Option<MultiMatch>, MatchError> { |
1147 | // N.B. We use `&&A` here to call `Automaton` methods, which ensures |
1148 | // that we always use the `impl Automaton for &A` for calling methods. |
1149 | // Since this is the usual way that automata are used, this helps |
1150 | // reduce the number of monomorphized copies of the search code. |
1151 | let (fwd, rev) = (self.forward(), self.reverse()); |
1152 | let end = match (&fwd) |
1153 | .find_earliest_fwd_at(pre, None, haystack, start, end)? |
1154 | { |
1155 | None => return Ok(None), |
1156 | Some(end) => end, |
1157 | }; |
1158 | // N.B. The only time we need to tell the reverse searcher the pattern |
1159 | // to match is in the overlapping case, since it's ambiguous. In the |
1160 | // leftmost case, I have tentatively convinced myself that it isn't |
1161 | // necessary and the reverse search will always find the same pattern |
1162 | // to match as the forward search. But I lack a rigorous proof. |
1163 | let start = (&rev) |
1164 | .find_earliest_rev_at(None, haystack, start, end.offset())? |
1165 | .expect("reverse search must match if forward search does" ); |
1166 | assert_eq!( |
1167 | start.pattern(), |
1168 | end.pattern(), |
1169 | "forward and reverse search must match same pattern" |
1170 | ); |
1171 | assert!(start.offset() <= end.offset()); |
1172 | Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) |
1173 | } |
1174 | |
1175 | /// Returns the start and end offset of the leftmost match. If no match |
1176 | /// exists, then `None` is returned. |
1177 | /// |
1178 | /// # Searching a substring of the haystack |
1179 | /// |
1180 | /// Being an "at" search routine, this permits callers to search a |
1181 | /// substring of `haystack` by specifying a range in `haystack`. |
1182 | /// Why expose this as an API instead of just asking callers to use |
1183 | /// `&input[start..end]`? The reason is that regex matching often wants |
1184 | /// to take the surrounding context into account in order to handle |
1185 | /// look-around (`^`, `$` and `\b`). |
1186 | /// |
1187 | /// This is useful when implementing an iterator over matches |
1188 | /// within the same haystack, which cannot be done correctly by simply |
1189 | /// providing a subslice of `haystack`. |
1190 | /// |
1191 | /// # Errors |
1192 | /// |
1193 | /// This routine only errors if the search could not complete. For |
1194 | /// DFA-based regexes, this only occurs in a non-default configuration |
1195 | /// where quit bytes are used or Unicode word boundaries are heuristically |
1196 | /// enabled. |
1197 | /// |
1198 | /// When a search cannot complete, callers cannot know whether a match |
1199 | /// exists or not. |
1200 | /// |
1201 | /// The infallible (panics on error) version of this routine is |
1202 | /// [`find_leftmost_at`](Regex::find_leftmost_at). |
1203 | pub fn try_find_leftmost_at( |
1204 | &self, |
1205 | haystack: &[u8], |
1206 | start: usize, |
1207 | end: usize, |
1208 | ) -> Result<Option<MultiMatch>, MatchError> { |
1209 | self.try_find_leftmost_at_imp( |
1210 | self.scanner().as_mut(), |
1211 | haystack, |
1212 | start, |
1213 | end, |
1214 | ) |
1215 | } |
1216 | |
1217 | /// The implementation of leftmost searching, where a prefilter scanner |
1218 | /// may be given. |
1219 | fn try_find_leftmost_at_imp( |
1220 | &self, |
1221 | scanner: Option<&mut prefilter::Scanner>, |
1222 | haystack: &[u8], |
1223 | start: usize, |
1224 | end: usize, |
1225 | ) -> Result<Option<MultiMatch>, MatchError> { |
1226 | // N.B. We use `&&A` here to call `Automaton` methods, which ensures |
1227 | // that we always use the `impl Automaton for &A` for calling methods. |
1228 | // Since this is the usual way that automata are used, this helps |
1229 | // reduce the number of monomorphized copies of the search code. |
1230 | let (fwd, rev) = (self.forward(), self.reverse()); |
1231 | let end = match (&fwd) |
1232 | .find_leftmost_fwd_at(scanner, None, haystack, start, end)? |
1233 | { |
1234 | None => return Ok(None), |
1235 | Some(end) => end, |
1236 | }; |
1237 | // N.B. The only time we need to tell the reverse searcher the pattern |
1238 | // to match is in the overlapping case, since it's ambiguous. In the |
1239 | // leftmost case, I have tentatively convinced myself that it isn't |
1240 | // necessary and the reverse search will always find the same pattern |
1241 | // to match as the forward search. But I lack a rigorous proof. Why not |
1242 | // just provide the pattern anyway? Well, if it is needed, then leaving |
1243 | // it out gives us a chance to find a witness. |
1244 | let start = (&rev) |
1245 | .find_leftmost_rev_at(None, haystack, start, end.offset())? |
1246 | .expect("reverse search must match if forward search does" ); |
1247 | assert_eq!( |
1248 | start.pattern(), |
1249 | end.pattern(), |
1250 | "forward and reverse search must match same pattern" , |
1251 | ); |
1252 | assert!(start.offset() <= end.offset()); |
1253 | Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) |
1254 | } |
1255 | |
1256 | /// Search for the first overlapping match within a given range of |
1257 | /// `haystack`. |
1258 | /// |
1259 | /// This routine is principally useful when searching for multiple patterns |
1260 | /// on inputs where multiple patterns may match the same regions of text. |
1261 | /// In particular, callers must preserve the automaton's search state from |
1262 | /// prior calls so that the implementation knows where the last match |
1263 | /// occurred and which pattern was reported. |
1264 | /// |
1265 | /// # Searching a substring of the haystack |
1266 | /// |
1267 | /// Being an "at" search routine, this permits callers to search a |
1268 | /// substring of `haystack` by specifying a range in `haystack`. |
1269 | /// Why expose this as an API instead of just asking callers to use |
1270 | /// `&input[start..end]`? The reason is that regex matching often wants |
1271 | /// to take the surrounding context into account in order to handle |
1272 | /// look-around (`^`, `$` and `\b`). |
1273 | /// |
1274 | /// This is useful when implementing an iterator over matches |
1275 | /// within the same haystack, which cannot be done correctly by simply |
1276 | /// providing a subslice of `haystack`. |
1277 | /// |
1278 | /// # Errors |
1279 | /// |
1280 | /// This routine only errors if the search could not complete. For |
1281 | /// DFA-based regexes, this only occurs in a non-default configuration |
1282 | /// where quit bytes are used or Unicode word boundaries are heuristically |
1283 | /// enabled. |
1284 | /// |
1285 | /// When a search cannot complete, callers cannot know whether a match |
1286 | /// exists or not. |
1287 | /// |
1288 | /// The infallible (panics on error) version of this routine is |
1289 | /// [`find_overlapping_at`](Regex::find_overlapping_at). |
1290 | pub fn try_find_overlapping_at( |
1291 | &self, |
1292 | haystack: &[u8], |
1293 | start: usize, |
1294 | end: usize, |
1295 | state: &mut OverlappingState, |
1296 | ) -> Result<Option<MultiMatch>, MatchError> { |
1297 | self.try_find_overlapping_at_imp( |
1298 | self.scanner().as_mut(), |
1299 | haystack, |
1300 | start, |
1301 | end, |
1302 | state, |
1303 | ) |
1304 | } |
1305 | |
1306 | /// The implementation of overlapping search at a given range in |
1307 | /// `haystack`, where `scanner` is a prefilter (if active) and `state` is |
1308 | /// the current state of the search. |
1309 | fn try_find_overlapping_at_imp( |
1310 | &self, |
1311 | scanner: Option<&mut prefilter::Scanner>, |
1312 | haystack: &[u8], |
1313 | start: usize, |
1314 | end: usize, |
1315 | state: &mut OverlappingState, |
1316 | ) -> Result<Option<MultiMatch>, MatchError> { |
1317 | // N.B. We use `&&A` here to call `Automaton` methods, which ensures |
1318 | // that we always use the `impl Automaton for &A` for calling methods. |
1319 | // Since this is the usual way that automata are used, this helps |
1320 | // reduce the number of monomorphized copies of the search code. |
1321 | let (fwd, rev) = (self.forward(), self.reverse()); |
1322 | // TODO: Decide whether it's worth making this assert work. It doesn't |
1323 | // work currently because 'has_starts_for_each_pattern' isn't on the |
1324 | // Automaton trait. Without this assert, we still get a panic, but it's |
1325 | // a bit more inscrutable. |
1326 | // assert!( |
1327 | // rev.has_starts_for_each_pattern(), |
1328 | // "overlapping searches require that the reverse DFA is \ |
1329 | // compiled with the 'starts_for_each_pattern' option", |
1330 | // ); |
1331 | let end = match (&fwd).find_overlapping_fwd_at( |
1332 | scanner, None, haystack, start, end, state, |
1333 | )? { |
1334 | None => return Ok(None), |
1335 | Some(end) => end, |
1336 | }; |
1337 | // Unlike the leftmost cases, the reverse overlapping search may match |
1338 | // a different pattern than the forward search. See test failures when |
1339 | // using `None` instead of `Some(end.pattern())` below. Thus, we must |
1340 | // run our reverse search using the pattern that matched in the forward |
1341 | // direction. |
1342 | let start = (&rev) |
1343 | .find_leftmost_rev_at( |
1344 | Some(end.pattern()), |
1345 | haystack, |
1346 | 0, |
1347 | end.offset(), |
1348 | )? |
1349 | .expect("reverse search must match if forward search does" ); |
1350 | assert!(start.offset() <= end.offset()); |
1351 | assert_eq!(start.pattern(), end.pattern()); |
1352 | Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) |
1353 | } |
1354 | } |
1355 | |
1356 | /// Non-search APIs for querying information about the regex and setting a |
1357 | /// prefilter. |
1358 | impl<A: Automaton, P: Prefilter> Regex<A, P> { |
1359 | /// Attach the given prefilter to this regex. |
1360 | pub fn with_prefilter<Q: Prefilter>(self, prefilter: Q) -> Regex<A, Q> { |
1361 | Regex { |
1362 | prefilter: Some(prefilter), |
1363 | forward: self.forward, |
1364 | reverse: self.reverse, |
1365 | utf8: self.utf8, |
1366 | } |
1367 | } |
1368 | |
1369 | /// Remove any prefilter from this regex. |
1370 | pub fn without_prefilter(self) -> Regex<A> { |
1371 | Regex { |
1372 | prefilter: None, |
1373 | forward: self.forward, |
1374 | reverse: self.reverse, |
1375 | utf8: self.utf8, |
1376 | } |
1377 | } |
1378 | |
1379 | /// Return the underlying DFA responsible for forward matching. |
1380 | /// |
1381 | /// This is useful for accessing the underlying DFA and converting it to |
1382 | /// some other format or size. See the [`Builder::build_from_dfas`] docs |
1383 | /// for an example of where this might be useful. |
1384 | pub fn forward(&self) -> &A { |
1385 | &self.forward |
1386 | } |
1387 | |
1388 | /// Return the underlying DFA responsible for reverse matching. |
1389 | /// |
1390 | /// This is useful for accessing the underlying DFA and converting it to |
1391 | /// some other format or size. See the [`Builder::build_from_dfas`] docs |
1392 | /// for an example of where this might be useful. |
1393 | pub fn reverse(&self) -> &A { |
1394 | &self.reverse |
1395 | } |
1396 | |
1397 | /// Returns the total number of patterns matched by this regex. |
1398 | /// |
1399 | /// # Example |
1400 | /// |
1401 | /// ``` |
1402 | /// use regex_automata::{MultiMatch, dfa::regex::Regex}; |
1403 | /// |
1404 | /// let re = Regex::new_many(&[r"[a-z]+" , r"[0-9]+" , r"\w+" ])?; |
1405 | /// assert_eq!(3, re.pattern_count()); |
1406 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1407 | /// ``` |
1408 | pub fn pattern_count(&self) -> usize { |
1409 | assert_eq!( |
1410 | self.forward().pattern_count(), |
1411 | self.reverse().pattern_count() |
1412 | ); |
1413 | self.forward().pattern_count() |
1414 | } |
1415 | |
1416 | /// Convenience function for returning this regex's prefilter as a trait |
1417 | /// object. |
1418 | /// |
1419 | /// If this regex doesn't have a prefilter, then `None` is returned. |
1420 | pub fn prefilter(&self) -> Option<&dyn Prefilter> { |
1421 | match self.prefilter { |
1422 | None => None, |
1423 | Some(ref x) => Some(&*x), |
1424 | } |
1425 | } |
1426 | |
1427 | /// Convenience function for returning a prefilter scanner. |
1428 | fn scanner(&self) -> Option<prefilter::Scanner> { |
1429 | self.prefilter().map(prefilter::Scanner::new) |
1430 | } |
1431 | } |
1432 | |
1433 | /// An iterator over all non-overlapping earliest matches for a particular |
1434 | /// infallible search. |
1435 | /// |
1436 | /// The iterator yields a [`MultiMatch`] value until no more matches could be |
1437 | /// found. If the underlying search returns an error, then this panics. |
1438 | /// |
1439 | /// `A` is the type used to represent the underlying DFAs used by the regex, |
1440 | /// while `P` is the type of prefilter used, if any. The lifetime variables are |
1441 | /// as follows: |
1442 | /// |
1443 | /// * `'r` is the lifetime of the regular expression itself. |
1444 | /// * `'t` is the lifetime of the text being searched. |
1445 | #[derive (Clone, Debug)] |
1446 | pub struct FindEarliestMatches<'r, 't, A, P>( |
1447 | TryFindEarliestMatches<'r, 't, A, P>, |
1448 | ); |
1449 | |
1450 | impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> { |
1451 | fn new( |
1452 | re: &'r Regex<A, P>, |
1453 | text: &'t [u8], |
1454 | ) -> FindEarliestMatches<'r, 't, A, P> { |
1455 | FindEarliestMatches(TryFindEarliestMatches::new(re, text)) |
1456 | } |
1457 | } |
1458 | |
1459 | impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
1460 | for FindEarliestMatches<'r, 't, A, P> |
1461 | { |
1462 | type Item = MultiMatch; |
1463 | |
1464 | fn next(&mut self) -> Option<MultiMatch> { |
1465 | next_unwrap(self.0.next()) |
1466 | } |
1467 | } |
1468 | |
1469 | /// An iterator over all non-overlapping leftmost matches for a particular |
1470 | /// infallible search. |
1471 | /// |
1472 | /// The iterator yields a [`MultiMatch`] value until no more matches could be |
1473 | /// found. If the underlying search returns an error, then this panics. |
1474 | /// |
1475 | /// `A` is the type used to represent the underlying DFAs used by the regex, |
1476 | /// while `P` is the type of prefilter used, if any. The lifetime variables are |
1477 | /// as follows: |
1478 | /// |
1479 | /// * `'r` is the lifetime of the regular expression itself. |
1480 | /// * `'t` is the lifetime of the text being searched. |
1481 | #[derive (Clone, Debug)] |
1482 | pub struct FindLeftmostMatches<'r, 't, A, P>( |
1483 | TryFindLeftmostMatches<'r, 't, A, P>, |
1484 | ); |
1485 | |
1486 | impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> { |
1487 | fn new( |
1488 | re: &'r Regex<A, P>, |
1489 | text: &'t [u8], |
1490 | ) -> FindLeftmostMatches<'r, 't, A, P> { |
1491 | FindLeftmostMatches(TryFindLeftmostMatches::new(re, text)) |
1492 | } |
1493 | } |
1494 | |
1495 | impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
1496 | for FindLeftmostMatches<'r, 't, A, P> |
1497 | { |
1498 | type Item = MultiMatch; |
1499 | |
1500 | fn next(&mut self) -> Option<MultiMatch> { |
1501 | next_unwrap(self.0.next()) |
1502 | } |
1503 | } |
1504 | |
1505 | /// An iterator over all overlapping matches for a particular infallible |
1506 | /// search. |
1507 | /// |
1508 | /// The iterator yields a [`MultiMatch`] value until no more matches could be |
1509 | /// found. If the underlying search returns an error, then this panics. |
1510 | /// |
1511 | /// `A` is the type used to represent the underlying DFAs used by the regex, |
1512 | /// while `P` is the type of prefilter used, if any. The lifetime variables are |
1513 | /// as follows: |
1514 | /// |
1515 | /// * `'r` is the lifetime of the regular expression itself. |
1516 | /// * `'t` is the lifetime of the text being searched. |
1517 | #[derive (Clone, Debug)] |
1518 | pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>( |
1519 | TryFindOverlappingMatches<'r, 't, A, P>, |
1520 | ); |
1521 | |
1522 | impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> { |
1523 | fn new( |
1524 | re: &'r Regex<A, P>, |
1525 | text: &'t [u8], |
1526 | ) -> FindOverlappingMatches<'r, 't, A, P> { |
1527 | FindOverlappingMatches(TryFindOverlappingMatches::new(re, text)) |
1528 | } |
1529 | } |
1530 | |
1531 | impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
1532 | for FindOverlappingMatches<'r, 't, A, P> |
1533 | { |
1534 | type Item = MultiMatch; |
1535 | |
1536 | fn next(&mut self) -> Option<MultiMatch> { |
1537 | next_unwrap(self.0.next()) |
1538 | } |
1539 | } |
1540 | |
1541 | /// An iterator over all non-overlapping earliest matches for a particular |
1542 | /// fallible search. |
1543 | /// |
1544 | /// The iterator yields a [`MultiMatch`] value until no more matches could be |
1545 | /// found. |
1546 | /// |
1547 | /// `A` is the type used to represent the underlying DFAs used by the regex, |
1548 | /// while `P` is the type of prefilter used, if any. The lifetime variables are |
1549 | /// as follows: |
1550 | /// |
1551 | /// * `'r` is the lifetime of the regular expression itself. |
1552 | /// * `'t` is the lifetime of the text being searched. |
1553 | #[derive (Clone, Debug)] |
1554 | pub struct TryFindEarliestMatches<'r, 't, A, P> { |
1555 | re: &'r Regex<A, P>, |
1556 | scanner: Option<prefilter::Scanner<'r>>, |
1557 | text: &'t [u8], |
1558 | last_end: usize, |
1559 | last_match: Option<usize>, |
1560 | } |
1561 | |
1562 | impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> { |
1563 | fn new( |
1564 | re: &'r Regex<A, P>, |
1565 | text: &'t [u8], |
1566 | ) -> TryFindEarliestMatches<'r, 't, A, P> { |
1567 | let scanner: Option> = re.scanner(); |
1568 | TryFindEarliestMatches { |
1569 | re, |
1570 | scanner, |
1571 | text, |
1572 | last_end: 0, |
1573 | last_match: None, |
1574 | } |
1575 | } |
1576 | } |
1577 | |
1578 | impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
1579 | for TryFindEarliestMatches<'r, 't, A, P> |
1580 | { |
1581 | type Item = Result<MultiMatch, MatchError>; |
1582 | |
1583 | fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { |
1584 | if self.last_end > self.text.len() { |
1585 | return None; |
1586 | } |
1587 | let result = self.re.try_find_earliest_at_imp( |
1588 | self.scanner.as_mut(), |
1589 | self.text, |
1590 | self.last_end, |
1591 | self.text.len(), |
1592 | ); |
1593 | let m = match result { |
1594 | Err(err) => return Some(Err(err)), |
1595 | Ok(None) => return None, |
1596 | Ok(Some(m)) => m, |
1597 | }; |
1598 | if m.is_empty() { |
1599 | // This is an empty match. To ensure we make progress, start |
1600 | // the next search at the smallest possible starting position |
1601 | // of the next match following this one. |
1602 | self.last_end = if self.re.utf8 { |
1603 | crate::util::next_utf8(self.text, m.end()) |
1604 | } else { |
1605 | m.end() + 1 |
1606 | }; |
1607 | // Don't accept empty matches immediately following a match. |
1608 | // Just move on to the next match. |
1609 | if Some(m.end()) == self.last_match { |
1610 | return self.next(); |
1611 | } |
1612 | } else { |
1613 | self.last_end = m.end(); |
1614 | } |
1615 | self.last_match = Some(m.end()); |
1616 | Some(Ok(m)) |
1617 | } |
1618 | } |
1619 | |
1620 | /// An iterator over all non-overlapping leftmost matches for a particular |
1621 | /// fallible search. |
1622 | /// |
1623 | /// The iterator yields a [`MultiMatch`] value until no more matches could be |
1624 | /// found. |
1625 | /// |
1626 | /// `A` is the type used to represent the underlying DFAs used by the regex, |
1627 | /// while `P` is the type of prefilter used, if any. The lifetime variables are |
1628 | /// as follows: |
1629 | /// |
1630 | /// * `'r` is the lifetime of the regular expression itself. |
1631 | /// * `'t` is the lifetime of the text being searched. |
1632 | #[derive (Clone, Debug)] |
1633 | pub struct TryFindLeftmostMatches<'r, 't, A, P> { |
1634 | re: &'r Regex<A, P>, |
1635 | scanner: Option<prefilter::Scanner<'r>>, |
1636 | text: &'t [u8], |
1637 | last_end: usize, |
1638 | last_match: Option<usize>, |
1639 | } |
1640 | |
1641 | impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> { |
1642 | fn new( |
1643 | re: &'r Regex<A, P>, |
1644 | text: &'t [u8], |
1645 | ) -> TryFindLeftmostMatches<'r, 't, A, P> { |
1646 | let scanner: Option> = re.scanner(); |
1647 | TryFindLeftmostMatches { |
1648 | re, |
1649 | scanner, |
1650 | text, |
1651 | last_end: 0, |
1652 | last_match: None, |
1653 | } |
1654 | } |
1655 | } |
1656 | |
1657 | impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
1658 | for TryFindLeftmostMatches<'r, 't, A, P> |
1659 | { |
1660 | type Item = Result<MultiMatch, MatchError>; |
1661 | |
1662 | fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { |
1663 | if self.last_end > self.text.len() { |
1664 | return None; |
1665 | } |
1666 | let result = self.re.try_find_leftmost_at_imp( |
1667 | self.scanner.as_mut(), |
1668 | self.text, |
1669 | self.last_end, |
1670 | self.text.len(), |
1671 | ); |
1672 | let m = match result { |
1673 | Err(err) => return Some(Err(err)), |
1674 | Ok(None) => return None, |
1675 | Ok(Some(m)) => m, |
1676 | }; |
1677 | if m.is_empty() { |
1678 | // This is an empty match. To ensure we make progress, start |
1679 | // the next search at the smallest possible starting position |
1680 | // of the next match following this one. |
1681 | self.last_end = if self.re.utf8 { |
1682 | crate::util::next_utf8(self.text, m.end()) |
1683 | } else { |
1684 | m.end() + 1 |
1685 | }; |
1686 | // Don't accept empty matches immediately following a match. |
1687 | // Just move on to the next match. |
1688 | if Some(m.end()) == self.last_match { |
1689 | return self.next(); |
1690 | } |
1691 | } else { |
1692 | self.last_end = m.end(); |
1693 | } |
1694 | self.last_match = Some(m.end()); |
1695 | Some(Ok(m)) |
1696 | } |
1697 | } |
1698 | |
1699 | /// An iterator over all overlapping matches for a particular fallible search. |
1700 | /// |
1701 | /// The iterator yields a [`MultiMatch`] value until no more matches could be |
1702 | /// found. |
1703 | /// |
1704 | /// `A` is the type used to represent the underlying DFAs used by the regex, |
1705 | /// while `P` is the type of prefilter used, if any. The lifetime variables are |
1706 | /// as follows: |
1707 | /// |
1708 | /// * `'r` is the lifetime of the regular expression itself. |
1709 | /// * `'t` is the lifetime of the text being searched. |
1710 | #[derive (Clone, Debug)] |
1711 | pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> { |
1712 | re: &'r Regex<A, P>, |
1713 | scanner: Option<prefilter::Scanner<'r>>, |
1714 | text: &'t [u8], |
1715 | last_end: usize, |
1716 | state: OverlappingState, |
1717 | } |
1718 | |
1719 | impl<'r, 't, A: Automaton, P: Prefilter> |
1720 | TryFindOverlappingMatches<'r, 't, A, P> |
1721 | { |
1722 | fn new( |
1723 | re: &'r Regex<A, P>, |
1724 | text: &'t [u8], |
1725 | ) -> TryFindOverlappingMatches<'r, 't, A, P> { |
1726 | let scanner: Option> = re.scanner(); |
1727 | TryFindOverlappingMatches { |
1728 | re, |
1729 | scanner, |
1730 | text, |
1731 | last_end: 0, |
1732 | state: OverlappingState::start(), |
1733 | } |
1734 | } |
1735 | } |
1736 | |
1737 | impl<'r, 't, A: Automaton, P: Prefilter> Iterator |
1738 | for TryFindOverlappingMatches<'r, 't, A, P> |
1739 | { |
1740 | type Item = Result<MultiMatch, MatchError>; |
1741 | |
1742 | fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { |
1743 | if self.last_end > self.text.len() { |
1744 | return None; |
1745 | } |
1746 | let result = self.re.try_find_overlapping_at_imp( |
1747 | self.scanner.as_mut(), |
1748 | self.text, |
1749 | self.last_end, |
1750 | self.text.len(), |
1751 | &mut self.state, |
1752 | ); |
1753 | let m = match result { |
1754 | Err(err) => return Some(Err(err)), |
1755 | Ok(None) => return None, |
1756 | Ok(Some(m)) => m, |
1757 | }; |
1758 | // Unlike the non-overlapping case, we're OK with empty matches at this |
1759 | // level. In particular, the overlapping search algorithm is itself |
1760 | // responsible for ensuring that progress is always made. |
1761 | self.last_end = m.end(); |
1762 | Some(Ok(m)) |
1763 | } |
1764 | } |
1765 | |
1766 | /// The configuration used for compiling a DFA-backed regex. |
1767 | /// |
1768 | /// A regex configuration is a simple data object that is typically used with |
1769 | /// [`Builder::configure`]. |
1770 | #[cfg (feature = "alloc" )] |
1771 | #[derive (Clone, Copy, Debug, Default)] |
1772 | pub struct Config { |
1773 | utf8: Option<bool>, |
1774 | } |
1775 | |
1776 | #[cfg (feature = "alloc" )] |
1777 | impl Config { |
1778 | /// Return a new default regex compiler configuration. |
1779 | pub fn new() -> Config { |
1780 | Config::default() |
1781 | } |
1782 | |
1783 | /// Whether to enable UTF-8 mode or not. |
1784 | /// |
1785 | /// When UTF-8 mode is enabled (the default) and an empty match is seen, |
1786 | /// the iterators on [`Regex`] will always start the next search at the |
1787 | /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8 |
1788 | /// mode is disabled, such searches are begun at the next byte offset. |
1789 | /// |
1790 | /// If this mode is enabled and invalid UTF-8 is given to search, then |
1791 | /// behavior is unspecified. |
1792 | /// |
1793 | /// Generally speaking, one should enable this when |
1794 | /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) |
1795 | /// and |
1796 | /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) |
1797 | /// are enabled, and disable it otherwise. |
1798 | /// |
1799 | /// # Example |
1800 | /// |
1801 | /// This example demonstrates the differences between when this option is |
1802 | /// enabled and disabled. The differences only arise when the regex can |
1803 | /// return matches of length zero. |
1804 | /// |
1805 | /// In this first snippet, we show the results when UTF-8 mode is disabled. |
1806 | /// |
1807 | /// ``` |
1808 | /// use regex_automata::{dfa::regex::Regex, MultiMatch}; |
1809 | /// |
1810 | /// let re = Regex::builder() |
1811 | /// .configure(Regex::config().utf8(false)) |
1812 | /// .build(r"")?; |
1813 | /// let haystack = "a☃z".as_bytes(); |
1814 | /// let mut it = re.find_leftmost_iter(haystack); |
1815 | /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); |
1816 | /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); |
1817 | /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); |
1818 | /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); |
1819 | /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); |
1820 | /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); |
1821 | /// assert_eq!(None, it.next()); |
1822 | /// |
1823 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1824 | /// ``` |
1825 | /// |
1826 | /// And in this snippet, we execute the same search on the same haystack, |
1827 | /// but with UTF-8 mode enabled. Notice that byte offsets that would |
1828 | /// otherwise split the encoding of `☃` are not returned. |
1829 | /// |
1830 | /// ``` |
1831 | /// use regex_automata::{dfa::regex::Regex, MultiMatch}; |
1832 | /// |
1833 | /// let re = Regex::builder() |
1834 | /// .configure(Regex::config().utf8(true)) |
1835 | /// .build(r"")?; |
1836 | /// let haystack = "a☃z".as_bytes(); |
1837 | /// let mut it = re.find_leftmost_iter(haystack); |
1838 | /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); |
1839 | /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); |
1840 | /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); |
1841 | /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); |
1842 | /// assert_eq!(None, it.next()); |
1843 | /// |
1844 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1845 | /// ``` |
1846 | pub fn utf8(mut self, yes: bool) -> Config { |
1847 | self.utf8 = Some(yes); |
1848 | self |
1849 | } |
1850 | |
1851 | /// Returns true if and only if this configuration has UTF-8 mode enabled. |
1852 | /// |
1853 | /// When UTF-8 mode is enabled and an empty match is seen, the iterators on |
1854 | /// [`Regex`] will always start the next search at the next UTF-8 encoded |
1855 | /// codepoint. When UTF-8 mode is disabled, such searches are begun at the |
1856 | /// next byte offset. |
1857 | pub fn get_utf8(&self) -> bool { |
1858 | self.utf8.unwrap_or(true) |
1859 | } |
1860 | |
1861 | /// Overwrite the default configuration such that the options in `o` are |
1862 | /// always used. If an option in `o` is not set, then the corresponding |
1863 | /// option in `self` is used. If it's not set in `self` either, then it |
1864 | /// remains not set. |
1865 | pub(crate) fn overwrite(self, o: Config) -> Config { |
1866 | Config { utf8: o.utf8.or(self.utf8) } |
1867 | } |
1868 | } |
1869 | |
1870 | /// A builder for a regex based on deterministic finite automatons. |
1871 | /// |
1872 | /// This builder permits configuring options for the syntax of a pattern, the |
1873 | /// NFA construction, the DFA construction and finally the regex searching |
1874 | /// itself. This builder is different from a general purpose regex builder in |
1875 | /// that it permits fine grain configuration of the construction process. The |
1876 | /// trade off for this is complexity, and the possibility of setting a |
1877 | /// configuration that might not make sense. For example, there are three |
1878 | /// different UTF-8 modes: |
1879 | /// |
1880 | /// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the |
1881 | /// pattern itself can contain sub-expressions that match invalid UTF-8. |
1882 | /// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) |
1883 | /// controls whether the implicit unanchored prefix added to the NFA can |
1884 | /// match through invalid UTF-8 or not. |
1885 | /// * [`Config::utf8`] controls how the regex iterators themselves advance |
1886 | /// the starting position of the next search when a match with zero length is |
1887 | /// found. |
1888 | /// |
1889 | /// Generally speaking, callers will want to either enable all of these or |
1890 | /// disable all of these. |
1891 | /// |
1892 | /// Internally, building a regex requires building two DFAs, where one is |
1893 | /// responsible for finding the end of a match and the other is responsible |
1894 | /// for finding the start of a match. If you only need to detect whether |
1895 | /// something matched, or only the end of a match, then you should use a |
1896 | /// [`dense::Builder`] to construct a single DFA, which is cheaper than |
1897 | /// building two DFAs. |
1898 | /// |
1899 | /// # Build methods |
1900 | /// |
1901 | /// This builder has a few "build" methods. In general, it's the result of |
1902 | /// combining the following parameters: |
1903 | /// |
1904 | /// * Building one or many regexes. |
1905 | /// * Building a regex with dense or sparse DFAs. |
1906 | /// |
1907 | /// The simplest "build" method is [`Builder::build`]. It accepts a single |
1908 | /// pattern and builds a dense DFA using `usize` for the state identifier |
1909 | /// representation. |
1910 | /// |
1911 | /// The most general "build" method is [`Builder::build_many`], which permits |
1912 | /// building a regex that searches for multiple patterns simultaneously while |
1913 | /// using a specific state identifier representation. |
1914 | /// |
1915 | /// The most flexible "build" method, but hardest to use, is |
1916 | /// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is |
1917 | /// just a pair of DFAs, and this method allows you to specify those DFAs |
1918 | /// exactly. |
1919 | /// |
1920 | /// # Example |
1921 | /// |
1922 | /// This example shows how to disable UTF-8 mode in the syntax, the NFA and |
1923 | /// the regex itself. This is generally what you want for matching on |
1924 | /// arbitrary bytes. |
1925 | /// |
1926 | /// ``` |
1927 | /// use regex_automata::{ |
1928 | /// dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig |
1929 | /// }; |
1930 | /// |
1931 | /// let re = Regex::builder() |
1932 | /// .configure(Regex::config().utf8(false)) |
1933 | /// .syntax(SyntaxConfig::new().utf8(false)) |
1934 | /// .thompson(thompson::Config::new().utf8(false)) |
1935 | /// .build(r"foo(?-u:[^b])ar.*")?; |
1936 | /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; |
1937 | /// let expected = Some(MultiMatch::must(0, 1, 9)); |
1938 | /// let got = re.find_leftmost(haystack); |
1939 | /// assert_eq!(expected, got); |
1940 | /// // Notice that `(?-u:[^b])` matches invalid UTF-8, |
1941 | /// // but the subsequent `.*` does not! Disabling UTF-8 |
1942 | /// // on the syntax permits this. Notice also that the |
1943 | /// // search was unanchored and skipped over invalid UTF-8. |
1944 | /// // Disabling UTF-8 on the Thompson NFA permits this. |
1945 | /// // |
1946 | /// // N.B. This example does not show the impact of |
1947 | /// // disabling UTF-8 mode on Config, since that |
1948 | /// // only impacts regexes that can produce matches of |
1949 | /// // length 0. |
1950 | /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); |
1951 | /// |
1952 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1953 | /// ``` |
1954 | #[cfg (feature = "alloc" )] |
1955 | #[derive (Clone, Debug)] |
1956 | pub struct Builder { |
1957 | config: Config, |
1958 | dfa: dense::Builder, |
1959 | } |
1960 | |
1961 | #[cfg (feature = "alloc" )] |
1962 | impl Builder { |
1963 | /// Create a new regex builder with the default configuration. |
1964 | pub fn new() -> Builder { |
1965 | Builder { config: Config::default(), dfa: dense::Builder::new() } |
1966 | } |
1967 | |
1968 | /// Build a regex from the given pattern. |
1969 | /// |
1970 | /// If there was a problem parsing or compiling the pattern, then an error |
1971 | /// is returned. |
1972 | pub fn build(&self, pattern: &str) -> Result<Regex, Error> { |
1973 | self.build_many(&[pattern]) |
1974 | } |
1975 | |
1976 | /// Build a regex from the given pattern using sparse DFAs. |
1977 | /// |
1978 | /// If there was a problem parsing or compiling the pattern, then an error |
1979 | /// is returned. |
1980 | pub fn build_sparse( |
1981 | &self, |
1982 | pattern: &str, |
1983 | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { |
1984 | self.build_many_sparse(&[pattern]) |
1985 | } |
1986 | |
1987 | /// Build a regex from the given patterns. |
1988 | pub fn build_many<P: AsRef<str>>( |
1989 | &self, |
1990 | patterns: &[P], |
1991 | ) -> Result<Regex, Error> { |
1992 | let forward = self.dfa.build_many(patterns)?; |
1993 | let reverse = self |
1994 | .dfa |
1995 | .clone() |
1996 | .configure( |
1997 | dense::Config::new() |
1998 | .anchored(true) |
1999 | .match_kind(MatchKind::All) |
2000 | .starts_for_each_pattern(true), |
2001 | ) |
2002 | .thompson(thompson::Config::new().reverse(true)) |
2003 | .build_many(patterns)?; |
2004 | Ok(self.build_from_dfas(forward, reverse)) |
2005 | } |
2006 | |
2007 | /// Build a sparse regex from the given patterns. |
2008 | pub fn build_many_sparse<P: AsRef<str>>( |
2009 | &self, |
2010 | patterns: &[P], |
2011 | ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { |
2012 | let re = self.build_many(patterns)?; |
2013 | let forward = re.forward().to_sparse()?; |
2014 | let reverse = re.reverse().to_sparse()?; |
2015 | Ok(self.build_from_dfas(forward, reverse)) |
2016 | } |
2017 | |
2018 | /// Build a regex from its component forward and reverse DFAs. |
2019 | /// |
2020 | /// This is useful when deserializing a regex from some arbitrary |
2021 | /// memory region. This is also useful for building regexes from other |
2022 | /// types of DFAs. |
2023 | /// |
2024 | /// If you're building the DFAs from scratch instead of building new DFAs |
2025 | /// from other DFAs, then you'll need to make sure that the reverse DFA is |
2026 | /// configured correctly to match the intended semantics. Namely: |
2027 | /// |
2028 | /// * It should be anchored. |
2029 | /// * It should use [`MatchKind::All`] semantics. |
2030 | /// * It should match in reverse. |
2031 | /// * It should have anchored start states compiled for each pattern. |
2032 | /// * Otherwise, its configuration should match the forward DFA. |
2033 | /// |
2034 | /// If these conditions are satisfied, then behavior of searches is |
2035 | /// unspecified. |
2036 | /// |
2037 | /// Note that when using this constructor, only the configuration from |
2038 | /// [`Config`] is applied. The only configuration settings on this builder |
2039 | /// only apply when the builder owns the construction of the DFAs |
2040 | /// themselves. |
2041 | /// |
2042 | /// # Example |
2043 | /// |
2044 | /// This example is a bit a contrived. The usual use of these methods |
2045 | /// would involve serializing `initial_re` somewhere and then deserializing |
2046 | /// it later to build a regex. But in this case, we do everything in |
2047 | /// memory. |
2048 | /// |
2049 | /// ``` |
2050 | /// use regex_automata::dfa::regex::Regex; |
2051 | /// |
2052 | /// let initial_re = Regex::new("foo[0-9]+")?; |
2053 | /// assert_eq!(true, initial_re.is_match(b"foo123")); |
2054 | /// |
2055 | /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); |
2056 | /// let re = Regex::builder().build_from_dfas(fwd, rev); |
2057 | /// assert_eq!(true, re.is_match(b"foo123")); |
2058 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
2059 | /// ``` |
2060 | /// |
2061 | /// This example shows how to build a `Regex` that uses sparse DFAs instead |
2062 | /// of dense DFAs without using one of the convenience `build_sparse` |
2063 | /// routines: |
2064 | /// |
2065 | /// ``` |
2066 | /// use regex_automata::dfa::regex::Regex; |
2067 | /// |
2068 | /// let initial_re = Regex::new("foo[0-9]+")?; |
2069 | /// assert_eq!(true, initial_re.is_match(b"foo123")); |
2070 | /// |
2071 | /// let fwd = initial_re.forward().to_sparse()?; |
2072 | /// let rev = initial_re.reverse().to_sparse()?; |
2073 | /// let re = Regex::builder().build_from_dfas(fwd, rev); |
2074 | /// assert_eq!(true, re.is_match(b"foo123")); |
2075 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
2076 | /// ``` |
2077 | pub fn build_from_dfas<A: Automaton>( |
2078 | &self, |
2079 | forward: A, |
2080 | reverse: A, |
2081 | ) -> Regex<A> { |
2082 | let utf8 = self.config.get_utf8(); |
2083 | Regex { prefilter: None, forward, reverse, utf8 } |
2084 | } |
2085 | |
2086 | /// Apply the given regex configuration options to this builder. |
2087 | pub fn configure(&mut self, config: Config) -> &mut Builder { |
2088 | self.config = self.config.overwrite(config); |
2089 | self |
2090 | } |
2091 | |
2092 | /// Set the syntax configuration for this builder using |
2093 | /// [`SyntaxConfig`](crate::SyntaxConfig). |
2094 | /// |
2095 | /// This permits setting things like case insensitivity, Unicode and multi |
2096 | /// line mode. |
2097 | pub fn syntax( |
2098 | &mut self, |
2099 | config: crate::util::syntax::SyntaxConfig, |
2100 | ) -> &mut Builder { |
2101 | self.dfa.syntax(config); |
2102 | self |
2103 | } |
2104 | |
2105 | /// Set the Thompson NFA configuration for this builder using |
2106 | /// [`nfa::thompson::Config`](thompson::Config). |
2107 | /// |
2108 | /// This permits setting things like whether additional time should be |
2109 | /// spent shrinking the size of the NFA. |
2110 | pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { |
2111 | self.dfa.thompson(config); |
2112 | self |
2113 | } |
2114 | |
2115 | /// Set the dense DFA compilation configuration for this builder using |
2116 | /// [`dense::Config`](dense::Config). |
2117 | /// |
2118 | /// This permits setting things like whether the underlying DFAs should |
2119 | /// be minimized. |
2120 | pub fn dense(&mut self, config: dense::Config) -> &mut Builder { |
2121 | self.dfa.configure(config); |
2122 | self |
2123 | } |
2124 | } |
2125 | |
2126 | #[cfg (feature = "alloc" )] |
2127 | impl Default for Builder { |
2128 | fn default() -> Builder { |
2129 | Builder::new() |
2130 | } |
2131 | } |
2132 | |
2133 | #[inline (always)] |
2134 | fn next_unwrap( |
2135 | item: Option<Result<MultiMatch, MatchError>>, |
2136 | ) -> Option<MultiMatch> { |
2137 | match item { |
2138 | None => None, |
2139 | Some(Ok(m: MultiMatch)) => Some(m), |
2140 | Some(Err(err: MatchError)) => panic!( |
2141 | "unexpected regex search error: {}\n\ |
2142 | to handle search errors, use try_ methods" , |
2143 | err, |
2144 | ), |
2145 | } |
2146 | } |
2147 | |