1 | /*! |
2 | Types and routines that support the search APIs of most regex engines. |
3 | |
4 | This sub-module isn't exposed directly, but rather, its contents are exported |
5 | at the crate root due to the universality of most of the types and routines in |
6 | this module. |
7 | */ |
8 | |
9 | use core::ops::{Range, RangeBounds}; |
10 | |
11 | use crate::util::{escape::DebugByte, primitives::PatternID, utf8}; |
12 | |
13 | /// The parameters for a regex search including the haystack to search. |
14 | /// |
15 | /// It turns out that regex searches have a few parameters, and in most cases, |
16 | /// those parameters have defaults that work in the vast majority of cases. |
17 | /// This `Input` type exists to make that common case seamnless while also |
18 | /// providing an avenue for changing the parameters of a search. In particular, |
19 | /// this type enables doing so without a combinatorial explosion of different |
20 | /// methods and/or superfluous parameters in the common cases. |
21 | /// |
22 | /// An `Input` permits configuring the following things: |
23 | /// |
24 | /// * Search only a substring of a haystack, while taking the broader context |
25 | /// into account for resolving look-around assertions. |
26 | /// * Indicating whether to search for all patterns in a regex, or to |
27 | /// only search for one pattern in particular. |
28 | /// * Whether to perform an anchored on unanchored search. |
29 | /// * Whether to report a match as early as possible. |
30 | /// |
31 | /// All of these parameters, except for the haystack, have sensible default |
32 | /// values. This means that the minimal search configuration is simply a call |
33 | /// to [`Input::new`] with your haystack. Setting any other parameter is |
34 | /// optional. |
35 | /// |
36 | /// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a |
37 | /// `From<H> for Input` implementation. This is useful because many of the |
38 | /// search APIs in this crate accept an `Into<Input>`. This means you can |
39 | /// provide string or byte strings to these routines directly, and they'll |
40 | /// automatically get converted into an `Input` for you. |
41 | /// |
42 | /// The lifetime parameter `'h` refers to the lifetime of the haystack. |
43 | /// |
44 | /// # Organization |
45 | /// |
46 | /// The API of `Input` is split into a few different parts: |
47 | /// |
48 | /// * A builder-like API that transforms a `Input` by value. Examples: |
49 | /// [`Input::span`] and [`Input::anchored`]. |
50 | /// * A setter API that permits mutating parameters in place. Examples: |
51 | /// [`Input::set_span`] and [`Input::set_anchored`]. |
52 | /// * A getter API that permits retrieving any of the search parameters. |
53 | /// Examples: [`Input::get_span`] and [`Input::get_anchored`]. |
54 | /// * A few convenience getter routines that don't conform to the above naming |
55 | /// pattern due to how common they are. Examples: [`Input::haystack`], |
56 | /// [`Input::start`] and [`Input::end`]. |
57 | /// * Miscellaneous predicates and other helper routines that are useful |
58 | /// in some contexts. Examples: [`Input::is_char_boundary`]. |
59 | /// |
60 | /// A `Input` exposes so much because it is meant to be used by both callers of |
61 | /// regex engines _and_ implementors of regex engines. A constraining factor is |
62 | /// that regex engines should accept a `&Input` as its lowest level API, which |
63 | /// means that implementors should only use the "getter" APIs of a `Input`. |
64 | /// |
65 | /// # Valid bounds and search termination |
66 | /// |
67 | /// An `Input` permits setting the bounds of a search via either |
68 | /// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or |
69 | /// else a panic will occur. Bounds are valid if and only if: |
70 | /// |
71 | /// * The bounds represent a valid range into the input's haystack. |
72 | /// * **or** the end bound is a valid ending bound for the haystack *and* |
73 | /// the start bound is exactly one greater than the start bound. |
74 | /// |
75 | /// In the latter case, [`Input::is_done`] will return true and indicates any |
76 | /// search receiving such an input should immediately return with no match. |
77 | /// |
78 | /// Note that while `Input` is used for reverse searches in this crate, the |
79 | /// `Input::is_done` predicate assumes a forward search. Because unsigned |
80 | /// offsets are used internally, there is no way to tell from only the offsets |
81 | /// whether a reverse search is done or not. |
82 | /// |
83 | /// # Regex engine support |
84 | /// |
85 | /// Any regex engine accepting an `Input` must support at least the following |
86 | /// things: |
87 | /// |
88 | /// * Searching a `&[u8]` for matches. |
89 | /// * Searching a substring of `&[u8]` for a match, such that any match |
90 | /// reported must appear entirely within that substring. |
91 | /// * For a forwards search, a match should never be reported when |
92 | /// [`Input::is_done`] returns true. (For reverse searches, termination should |
93 | /// be handled outside of `Input`.) |
94 | /// |
95 | /// Supporting other aspects of an `Input` are optional, but regex engines |
96 | /// should handle aspects they don't support gracefully. How this is done is |
97 | /// generally up to the regex engine. This crate generally treats unsupported |
98 | /// anchored modes as an error to report for example, but for simplicity, in |
99 | /// the meta regex engine, trying to search with an invalid pattern ID just |
100 | /// results in no match being reported. |
101 | #[derive(Clone)] |
102 | pub struct Input<'h> { |
103 | haystack: &'h [u8], |
104 | span: Span, |
105 | anchored: Anchored, |
106 | earliest: bool, |
107 | } |
108 | |
109 | impl<'h> Input<'h> { |
110 | /// Create a new search configuration for the given haystack. |
111 | #[inline ] |
112 | pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> { |
113 | Input { |
114 | haystack: haystack.as_ref(), |
115 | span: Span { start: 0, end: haystack.as_ref().len() }, |
116 | anchored: Anchored::No, |
117 | earliest: false, |
118 | } |
119 | } |
120 | |
121 | /// Set the span for this search. |
122 | /// |
123 | /// This routine does not panic if the span given is not a valid range for |
124 | /// this search's haystack. If this search is run with an invalid range, |
125 | /// then the most likely outcome is that the actual search execution will |
126 | /// panic. |
127 | /// |
128 | /// This routine is generic over how a span is provided. While |
129 | /// a [`Span`] may be given directly, one may also provide a |
130 | /// `std::ops::Range<usize>`. To provide anything supported by range |
131 | /// syntax, use the [`Input::range`] method. |
132 | /// |
133 | /// The default span is the entire haystack. |
134 | /// |
135 | /// Note that [`Input::range`] overrides this method and vice versa. |
136 | /// |
137 | /// # Panics |
138 | /// |
139 | /// This panics if the given span does not correspond to valid bounds in |
140 | /// the haystack or the termination of a search. |
141 | /// |
142 | /// # Example |
143 | /// |
144 | /// This example shows how the span of the search can impact whether a |
145 | /// match is reported or not. This is particularly relevant for look-around |
146 | /// operators, which might take things outside of the span into account |
147 | /// when determining whether they match. |
148 | /// |
149 | /// ``` |
150 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
151 | /// use regex_automata::{ |
152 | /// nfa::thompson::pikevm::PikeVM, |
153 | /// Match, Input, |
154 | /// }; |
155 | /// |
156 | /// // Look for 'at', but as a distinct word. |
157 | /// let re = PikeVM::new(r"\bat\b" )?; |
158 | /// let mut cache = re.create_cache(); |
159 | /// let mut caps = re.create_captures(); |
160 | /// |
161 | /// // Our haystack contains 'at', but not as a distinct word. |
162 | /// let haystack = "batter" ; |
163 | /// |
164 | /// // A standard search finds nothing, as expected. |
165 | /// let input = Input::new(haystack); |
166 | /// re.search(&mut cache, &input, &mut caps); |
167 | /// assert_eq!(None, caps.get_match()); |
168 | /// |
169 | /// // But if we wanted to search starting at position '1', we might |
170 | /// // slice the haystack. If we do this, it's impossible for the \b |
171 | /// // anchors to take the surrounding context into account! And thus, |
172 | /// // a match is produced. |
173 | /// let input = Input::new(&haystack[1..3]); |
174 | /// re.search(&mut cache, &input, &mut caps); |
175 | /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match()); |
176 | /// |
177 | /// // But if we specify the span of the search instead of slicing the |
178 | /// // haystack, then the regex engine can "see" outside of the span |
179 | /// // and resolve the anchors correctly. |
180 | /// let input = Input::new(haystack).span(1..3); |
181 | /// re.search(&mut cache, &input, &mut caps); |
182 | /// assert_eq!(None, caps.get_match()); |
183 | /// |
184 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
185 | /// ``` |
186 | /// |
187 | /// This may seem a little ham-fisted, but this scenario tends to come up |
188 | /// if some other regex engine found the match span and now you need to |
189 | /// re-process that span to look for capturing groups. (e.g., Run a faster |
190 | /// DFA first, find a match, then run the PikeVM on just the match span to |
191 | /// resolve capturing groups.) In order to implement that sort of logic |
192 | /// correctly, you need to set the span on the search instead of slicing |
193 | /// the haystack directly. |
194 | /// |
195 | /// The other advantage of using this routine to specify the bounds of the |
196 | /// search is that the match offsets are still reported in terms of the |
197 | /// original haystack. For example, the second search in the example above |
198 | /// reported a match at position `0`, even though `at` starts at offset |
199 | /// `1` because we sliced the haystack. |
200 | #[inline ] |
201 | pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> { |
202 | self.set_span(span); |
203 | self |
204 | } |
205 | |
206 | /// Like `Input::span`, but accepts any range instead. |
207 | /// |
208 | /// This routine does not panic if the range given is not a valid range for |
209 | /// this search's haystack. If this search is run with an invalid range, |
210 | /// then the most likely outcome is that the actual search execution will |
211 | /// panic. |
212 | /// |
213 | /// The default range is the entire haystack. |
214 | /// |
215 | /// Note that [`Input::span`] overrides this method and vice versa. |
216 | /// |
217 | /// # Panics |
218 | /// |
219 | /// This routine will panic if the given range could not be converted |
220 | /// to a valid [`Range`]. For example, this would panic when given |
221 | /// `0..=usize::MAX` since it cannot be represented using a half-open |
222 | /// interval in terms of `usize`. |
223 | /// |
224 | /// This also panics if the given range does not correspond to valid bounds |
225 | /// in the haystack or the termination of a search. |
226 | /// |
227 | /// # Example |
228 | /// |
229 | /// ``` |
230 | /// use regex_automata::Input; |
231 | /// |
232 | /// let input = Input::new("foobar" ); |
233 | /// assert_eq!(0..6, input.get_range()); |
234 | /// |
235 | /// let input = Input::new("foobar" ).range(2..=4); |
236 | /// assert_eq!(2..5, input.get_range()); |
237 | /// ``` |
238 | #[inline ] |
239 | pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> { |
240 | self.set_range(range); |
241 | self |
242 | } |
243 | |
244 | /// Sets the anchor mode of a search. |
245 | /// |
246 | /// When a search is anchored (so that's [`Anchored::Yes`] or |
247 | /// [`Anchored::Pattern`]), a match must begin at the start of a search. |
248 | /// When a search is not anchored (that's [`Anchored::No`]), regex engines |
249 | /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix |
250 | /// permits a match to appear anywhere. |
251 | /// |
252 | /// By default, the anchored mode is [`Anchored::No`]. |
253 | /// |
254 | /// **WARNING:** this is subtly different than using a `^` at the start of |
255 | /// your regex. A `^` forces a regex to match exclusively at the start of |
256 | /// a haystack, regardless of where you begin your search. In contrast, |
257 | /// anchoring a search will allow your regex to match anywhere in your |
258 | /// haystack, but the match must start at the beginning of a search. |
259 | /// |
260 | /// For example, consider the haystack `aba` and the following searches: |
261 | /// |
262 | /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba` |
263 | /// starting at position `2`. Since `^` requires the match to start at |
264 | /// the beginning of the haystack and `2 > 0`, no match is found. |
265 | /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba` |
266 | /// starting at position `2`. This reports a match at `[2, 3]` since |
267 | /// the match starts where the search started. Since there is no `^`, |
268 | /// there is no requirement for the match to start at the beginning of |
269 | /// the haystack. |
270 | /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba` |
271 | /// starting at position `1`. Since `b` corresponds to position `1` and |
272 | /// since the search is anchored, it finds no match. While the regex |
273 | /// matches at other positions, configuring the search to be anchored |
274 | /// requires that it only report a match that begins at the same offset |
275 | /// as the beginning of the search. |
276 | /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba` |
277 | /// starting at position `1`. Since the search is not anchored and |
278 | /// the regex does not start with `^`, the search executes as if there |
279 | /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it |
280 | /// reports a match at `[2, 3]`. |
281 | /// |
282 | /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`, |
283 | /// except it only reports matches for a particular pattern. |
284 | /// |
285 | /// # Example |
286 | /// |
287 | /// This demonstrates the differences between an anchored search and |
288 | /// a pattern that begins with `^` (as described in the above warning |
289 | /// message). |
290 | /// |
291 | /// ``` |
292 | /// use regex_automata::{ |
293 | /// nfa::thompson::pikevm::PikeVM, |
294 | /// Anchored, Match, Input, |
295 | /// }; |
296 | /// |
297 | /// let haystack = "aba" ; |
298 | /// |
299 | /// let re = PikeVM::new(r"^a" )?; |
300 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
301 | /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No); |
302 | /// re.search(&mut cache, &input, &mut caps); |
303 | /// // No match is found because 2 is not the beginning of the haystack, |
304 | /// // which is what ^ requires. |
305 | /// assert_eq!(None, caps.get_match()); |
306 | /// |
307 | /// let re = PikeVM::new(r"a" )?; |
308 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
309 | /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes); |
310 | /// re.search(&mut cache, &input, &mut caps); |
311 | /// // An anchored search can still match anywhere in the haystack, it just |
312 | /// // must begin at the start of the search which is '2' in this case. |
313 | /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match()); |
314 | /// |
315 | /// let re = PikeVM::new(r"a" )?; |
316 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
317 | /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes); |
318 | /// re.search(&mut cache, &input, &mut caps); |
319 | /// // No match is found since we start searching at offset 1 which |
320 | /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match |
321 | /// // is found. |
322 | /// assert_eq!(None, caps.get_match()); |
323 | /// |
324 | /// let re = PikeVM::new(r"a" )?; |
325 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
326 | /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No); |
327 | /// re.search(&mut cache, &input, &mut caps); |
328 | /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the |
329 | /// // pattern. Even though the search starts at 'b', the 'match anything' |
330 | /// // prefix allows the search to match 'a'. |
331 | /// let expected = Some(Match::must(0, 2..3)); |
332 | /// assert_eq!(expected, caps.get_match()); |
333 | /// |
334 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
335 | /// ``` |
336 | #[inline ] |
337 | pub fn anchored(mut self, mode: Anchored) -> Input<'h> { |
338 | self.set_anchored(mode); |
339 | self |
340 | } |
341 | |
342 | /// Whether to execute an "earliest" search or not. |
343 | /// |
344 | /// When running a non-overlapping search, an "earliest" search will return |
345 | /// the match location as early as possible. For example, given a pattern |
346 | /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search |
347 | /// will return `foo12345` as a match. But an "earliest" search for regex |
348 | /// engines that support "earliest" semantics will return `foo1` as a |
349 | /// match, since as soon as the first digit following `foo` is seen, it is |
350 | /// known to have found a match. |
351 | /// |
352 | /// Note that "earliest" semantics generally depend on the regex engine. |
353 | /// Different regex engines may determine there is a match at different |
354 | /// points. So there is no guarantee that "earliest" matches will always |
355 | /// return the same offsets for all regex engines. The "earliest" notion |
356 | /// is really about when the particular regex engine determines there is |
357 | /// a match rather than a consistent semantic unto itself. This is often |
358 | /// useful for implementing "did a match occur or not" predicates, but |
359 | /// sometimes the offset is useful as well. |
360 | /// |
361 | /// This is disabled by default. |
362 | /// |
363 | /// # Example |
364 | /// |
365 | /// This example shows the difference between "earliest" searching and |
366 | /// normal searching. |
367 | /// |
368 | /// ``` |
369 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; |
370 | /// |
371 | /// let re = PikeVM::new(r"foo[0-9]+" )?; |
372 | /// let mut cache = re.create_cache(); |
373 | /// let mut caps = re.create_captures(); |
374 | /// |
375 | /// // A normal search implements greediness like you expect. |
376 | /// let input = Input::new("foo12345" ); |
377 | /// re.search(&mut cache, &input, &mut caps); |
378 | /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match()); |
379 | /// |
380 | /// // When 'earliest' is enabled and the regex engine supports |
381 | /// // it, the search will bail once it knows a match has been |
382 | /// // found. |
383 | /// let input = Input::new("foo12345" ).earliest(true); |
384 | /// re.search(&mut cache, &input, &mut caps); |
385 | /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match()); |
386 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
387 | /// ``` |
388 | #[inline ] |
389 | pub fn earliest(mut self, yes: bool) -> Input<'h> { |
390 | self.set_earliest(yes); |
391 | self |
392 | } |
393 | |
394 | /// Set the span for this search configuration. |
395 | /// |
396 | /// This is like the [`Input::span`] method, except this mutates the |
397 | /// span in place. |
398 | /// |
399 | /// This routine is generic over how a span is provided. While |
400 | /// a [`Span`] may be given directly, one may also provide a |
401 | /// `std::ops::Range<usize>`. |
402 | /// |
403 | /// # Panics |
404 | /// |
405 | /// This panics if the given span does not correspond to valid bounds in |
406 | /// the haystack or the termination of a search. |
407 | /// |
408 | /// # Example |
409 | /// |
410 | /// ``` |
411 | /// use regex_automata::Input; |
412 | /// |
413 | /// let mut input = Input::new("foobar" ); |
414 | /// assert_eq!(0..6, input.get_range()); |
415 | /// input.set_span(2..4); |
416 | /// assert_eq!(2..4, input.get_range()); |
417 | /// ``` |
418 | #[inline ] |
419 | pub fn set_span<S: Into<Span>>(&mut self, span: S) { |
420 | let span = span.into(); |
421 | assert!( |
422 | span.end <= self.haystack.len() |
423 | && span.start <= span.end.wrapping_add(1), |
424 | "invalid span {:?} for haystack of length {}" , |
425 | span, |
426 | self.haystack.len(), |
427 | ); |
428 | self.span = span; |
429 | } |
430 | |
431 | /// Set the span for this search configuration given any range. |
432 | /// |
433 | /// This is like the [`Input::range`] method, except this mutates the |
434 | /// span in place. |
435 | /// |
436 | /// This routine does not panic if the range given is not a valid range for |
437 | /// this search's haystack. If this search is run with an invalid range, |
438 | /// then the most likely outcome is that the actual search execution will |
439 | /// panic. |
440 | /// |
441 | /// # Panics |
442 | /// |
443 | /// This routine will panic if the given range could not be converted |
444 | /// to a valid [`Range`]. For example, this would panic when given |
445 | /// `0..=usize::MAX` since it cannot be represented using a half-open |
446 | /// interval in terms of `usize`. |
447 | /// |
448 | /// This also panics if the given span does not correspond to valid bounds |
449 | /// in the haystack or the termination of a search. |
450 | /// |
451 | /// # Example |
452 | /// |
453 | /// ``` |
454 | /// use regex_automata::Input; |
455 | /// |
456 | /// let mut input = Input::new("foobar" ); |
457 | /// assert_eq!(0..6, input.get_range()); |
458 | /// input.set_range(2..=4); |
459 | /// assert_eq!(2..5, input.get_range()); |
460 | /// ``` |
461 | #[inline ] |
462 | pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) { |
463 | use core::ops::Bound; |
464 | |
465 | // It's a little weird to convert ranges into spans, and then spans |
466 | // back into ranges when we actually slice the haystack. Because |
467 | // of that process, we always represent everything as a half-open |
468 | // internal. Therefore, handling things like m..=n is a little awkward. |
469 | let start = match range.start_bound() { |
470 | Bound::Included(&i) => i, |
471 | // Can this case ever happen? Range syntax doesn't support it... |
472 | Bound::Excluded(&i) => i.checked_add(1).unwrap(), |
473 | Bound::Unbounded => 0, |
474 | }; |
475 | let end = match range.end_bound() { |
476 | Bound::Included(&i) => i.checked_add(1).unwrap(), |
477 | Bound::Excluded(&i) => i, |
478 | Bound::Unbounded => self.haystack().len(), |
479 | }; |
480 | self.set_span(Span { start, end }); |
481 | } |
482 | |
483 | /// Set the starting offset for the span for this search configuration. |
484 | /// |
485 | /// This is a convenience routine for only mutating the start of a span |
486 | /// without having to set the entire span. |
487 | /// |
488 | /// # Panics |
489 | /// |
490 | /// This panics if the span resulting from the new start position does not |
491 | /// correspond to valid bounds in the haystack or the termination of a |
492 | /// search. |
493 | /// |
494 | /// # Example |
495 | /// |
496 | /// ``` |
497 | /// use regex_automata::Input; |
498 | /// |
499 | /// let mut input = Input::new("foobar" ); |
500 | /// assert_eq!(0..6, input.get_range()); |
501 | /// input.set_start(5); |
502 | /// assert_eq!(5..6, input.get_range()); |
503 | /// ``` |
504 | #[inline ] |
505 | pub fn set_start(&mut self, start: usize) { |
506 | self.set_span(Span { start, ..self.get_span() }); |
507 | } |
508 | |
509 | /// Set the ending offset for the span for this search configuration. |
510 | /// |
511 | /// This is a convenience routine for only mutating the end of a span |
512 | /// without having to set the entire span. |
513 | /// |
514 | /// # Panics |
515 | /// |
516 | /// This panics if the span resulting from the new end position does not |
517 | /// correspond to valid bounds in the haystack or the termination of a |
518 | /// search. |
519 | /// |
520 | /// # Example |
521 | /// |
522 | /// ``` |
523 | /// use regex_automata::Input; |
524 | /// |
525 | /// let mut input = Input::new("foobar" ); |
526 | /// assert_eq!(0..6, input.get_range()); |
527 | /// input.set_end(5); |
528 | /// assert_eq!(0..5, input.get_range()); |
529 | /// ``` |
530 | #[inline ] |
531 | pub fn set_end(&mut self, end: usize) { |
532 | self.set_span(Span { end, ..self.get_span() }); |
533 | } |
534 | |
535 | /// Set the anchor mode of a search. |
536 | /// |
537 | /// This is like [`Input::anchored`], except it mutates the search |
538 | /// configuration in place. |
539 | /// |
540 | /// # Example |
541 | /// |
542 | /// ``` |
543 | /// use regex_automata::{Anchored, Input, PatternID}; |
544 | /// |
545 | /// let mut input = Input::new("foobar" ); |
546 | /// assert_eq!(Anchored::No, input.get_anchored()); |
547 | /// |
548 | /// let pid = PatternID::must(5); |
549 | /// input.set_anchored(Anchored::Pattern(pid)); |
550 | /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); |
551 | /// ``` |
552 | #[inline ] |
553 | pub fn set_anchored(&mut self, mode: Anchored) { |
554 | self.anchored = mode; |
555 | } |
556 | |
557 | /// Set whether the search should execute in "earliest" mode or not. |
558 | /// |
559 | /// This is like [`Input::earliest`], except it mutates the search |
560 | /// configuration in place. |
561 | /// |
562 | /// # Example |
563 | /// |
564 | /// ``` |
565 | /// use regex_automata::Input; |
566 | /// |
567 | /// let mut input = Input::new("foobar" ); |
568 | /// assert!(!input.get_earliest()); |
569 | /// input.set_earliest(true); |
570 | /// assert!(input.get_earliest()); |
571 | /// ``` |
572 | #[inline ] |
573 | pub fn set_earliest(&mut self, yes: bool) { |
574 | self.earliest = yes; |
575 | } |
576 | |
577 | /// Return a borrow of the underlying haystack as a slice of bytes. |
578 | /// |
579 | /// # Example |
580 | /// |
581 | /// ``` |
582 | /// use regex_automata::Input; |
583 | /// |
584 | /// let input = Input::new("foobar" ); |
585 | /// assert_eq!(b"foobar" , input.haystack()); |
586 | /// ``` |
587 | #[inline ] |
588 | pub fn haystack(&self) -> &[u8] { |
589 | self.haystack |
590 | } |
591 | |
592 | /// Return the start position of this search. |
593 | /// |
594 | /// This is a convenience routine for `search.get_span().start()`. |
595 | /// |
596 | /// When [`Input::is_done`] is `false`, this is guaranteed to return |
597 | /// an offset that is less than or equal to [`Input::end`]. Otherwise, |
598 | /// the offset is one greater than [`Input::end`]. |
599 | /// |
600 | /// # Example |
601 | /// |
602 | /// ``` |
603 | /// use regex_automata::Input; |
604 | /// |
605 | /// let input = Input::new("foobar" ); |
606 | /// assert_eq!(0, input.start()); |
607 | /// |
608 | /// let input = Input::new("foobar" ).span(2..4); |
609 | /// assert_eq!(2, input.start()); |
610 | /// ``` |
611 | #[inline ] |
612 | pub fn start(&self) -> usize { |
613 | self.get_span().start |
614 | } |
615 | |
616 | /// Return the end position of this search. |
617 | /// |
618 | /// This is a convenience routine for `search.get_span().end()`. |
619 | /// |
620 | /// This is guaranteed to return an offset that is a valid exclusive end |
621 | /// bound for this input's haystack. |
622 | /// |
623 | /// # Example |
624 | /// |
625 | /// ``` |
626 | /// use regex_automata::Input; |
627 | /// |
628 | /// let input = Input::new("foobar" ); |
629 | /// assert_eq!(6, input.end()); |
630 | /// |
631 | /// let input = Input::new("foobar" ).span(2..4); |
632 | /// assert_eq!(4, input.end()); |
633 | /// ``` |
634 | #[inline ] |
635 | pub fn end(&self) -> usize { |
636 | self.get_span().end |
637 | } |
638 | |
639 | /// Return the span for this search configuration. |
640 | /// |
641 | /// If one was not explicitly set, then the span corresponds to the entire |
642 | /// range of the haystack. |
643 | /// |
644 | /// When [`Input::is_done`] is `false`, the span returned is guaranteed |
645 | /// to correspond to valid bounds for this input's haystack. |
646 | /// |
647 | /// # Example |
648 | /// |
649 | /// ``` |
650 | /// use regex_automata::{Input, Span}; |
651 | /// |
652 | /// let input = Input::new("foobar" ); |
653 | /// assert_eq!(Span { start: 0, end: 6 }, input.get_span()); |
654 | /// ``` |
655 | #[inline ] |
656 | pub fn get_span(&self) -> Span { |
657 | self.span |
658 | } |
659 | |
660 | /// Return the span as a range for this search configuration. |
661 | /// |
662 | /// If one was not explicitly set, then the span corresponds to the entire |
663 | /// range of the haystack. |
664 | /// |
665 | /// When [`Input::is_done`] is `false`, the range returned is guaranteed |
666 | /// to correspond to valid bounds for this input's haystack. |
667 | /// |
668 | /// # Example |
669 | /// |
670 | /// ``` |
671 | /// use regex_automata::Input; |
672 | /// |
673 | /// let input = Input::new("foobar" ); |
674 | /// assert_eq!(0..6, input.get_range()); |
675 | /// ``` |
676 | #[inline ] |
677 | pub fn get_range(&self) -> Range<usize> { |
678 | self.get_span().range() |
679 | } |
680 | |
681 | /// Return the anchored mode for this search configuration. |
682 | /// |
683 | /// If no anchored mode was set, then it defaults to [`Anchored::No`]. |
684 | /// |
685 | /// # Example |
686 | /// |
687 | /// ``` |
688 | /// use regex_automata::{Anchored, Input, PatternID}; |
689 | /// |
690 | /// let mut input = Input::new("foobar" ); |
691 | /// assert_eq!(Anchored::No, input.get_anchored()); |
692 | /// |
693 | /// let pid = PatternID::must(5); |
694 | /// input.set_anchored(Anchored::Pattern(pid)); |
695 | /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); |
696 | /// ``` |
697 | #[inline ] |
698 | pub fn get_anchored(&self) -> Anchored { |
699 | self.anchored |
700 | } |
701 | |
702 | /// Return whether this search should execute in "earliest" mode. |
703 | /// |
704 | /// # Example |
705 | /// |
706 | /// ``` |
707 | /// use regex_automata::Input; |
708 | /// |
709 | /// let input = Input::new("foobar" ); |
710 | /// assert!(!input.get_earliest()); |
711 | /// ``` |
712 | #[inline ] |
713 | pub fn get_earliest(&self) -> bool { |
714 | self.earliest |
715 | } |
716 | |
717 | /// Return true if and only if this search can never return any other |
718 | /// matches. |
719 | /// |
720 | /// This occurs when the start position of this search is greater than the |
721 | /// end position of the search. |
722 | /// |
723 | /// # Example |
724 | /// |
725 | /// ``` |
726 | /// use regex_automata::Input; |
727 | /// |
728 | /// let mut input = Input::new("foobar" ); |
729 | /// assert!(!input.is_done()); |
730 | /// input.set_start(6); |
731 | /// assert!(!input.is_done()); |
732 | /// input.set_start(7); |
733 | /// assert!(input.is_done()); |
734 | /// ``` |
735 | #[inline ] |
736 | pub fn is_done(&self) -> bool { |
737 | self.get_span().start > self.get_span().end |
738 | } |
739 | |
740 | /// Returns true if and only if the given offset in this search's haystack |
741 | /// falls on a valid UTF-8 encoded codepoint boundary. |
742 | /// |
743 | /// If the haystack is not valid UTF-8, then the behavior of this routine |
744 | /// is unspecified. |
745 | /// |
746 | /// # Example |
747 | /// |
748 | /// This shows where codepoint boundaries do and don't exist in valid |
749 | /// UTF-8. |
750 | /// |
751 | /// ``` |
752 | /// use regex_automata::Input; |
753 | /// |
754 | /// let input = Input::new("☃" ); |
755 | /// assert!(input.is_char_boundary(0)); |
756 | /// assert!(!input.is_char_boundary(1)); |
757 | /// assert!(!input.is_char_boundary(2)); |
758 | /// assert!(input.is_char_boundary(3)); |
759 | /// assert!(!input.is_char_boundary(4)); |
760 | /// ``` |
761 | #[inline ] |
762 | pub fn is_char_boundary(&self, offset: usize) -> bool { |
763 | utf8::is_boundary(self.haystack(), offset) |
764 | } |
765 | } |
766 | |
767 | impl<'h> core::fmt::Debug for Input<'h> { |
768 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
769 | use crate::util::escape::DebugHaystack; |
770 | |
771 | f.debug_struct("Input" ) |
772 | .field("haystack" , &DebugHaystack(self.haystack())) |
773 | .field("span" , &self.span) |
774 | .field("anchored" , &self.anchored) |
775 | .field("earliest" , &self.earliest) |
776 | .finish() |
777 | } |
778 | } |
779 | |
780 | impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { |
781 | fn from(haystack: &'h H) -> Input<'h> { |
782 | Input::new(haystack) |
783 | } |
784 | } |
785 | |
786 | /// A representation of a span reported by a regex engine. |
787 | /// |
788 | /// A span corresponds to the starting and ending _byte offsets_ of a |
789 | /// contiguous region of bytes. The starting offset is inclusive while the |
790 | /// ending offset is exclusive. That is, a span is a half-open interval. |
791 | /// |
792 | /// A span is used to report the offsets of a match, but it is also used to |
793 | /// convey which region of a haystack should be searched via routines like |
794 | /// [`Input::span`]. |
795 | /// |
796 | /// This is basically equivalent to a `std::ops::Range<usize>`, except this |
797 | /// type implements `Copy` which makes it more ergonomic to use in the context |
798 | /// of this crate. Like a range, this implements `Index` for `[u8]` and `str`, |
799 | /// and `IndexMut` for `[u8]`. For convenience, this also impls `From<Range>`, |
800 | /// which means things like `Span::from(5..10)` work. |
801 | #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
802 | pub struct Span { |
803 | /// The start offset of the span, inclusive. |
804 | pub start: usize, |
805 | /// The end offset of the span, exclusive. |
806 | pub end: usize, |
807 | } |
808 | |
809 | impl Span { |
810 | /// Returns this span as a range. |
811 | #[inline ] |
812 | pub fn range(&self) -> Range<usize> { |
813 | Range::from(*self) |
814 | } |
815 | |
816 | /// Returns true when this span is empty. That is, when `start >= end`. |
817 | #[inline ] |
818 | pub fn is_empty(&self) -> bool { |
819 | self.start >= self.end |
820 | } |
821 | |
822 | /// Returns the length of this span. |
823 | /// |
824 | /// This returns `0` in precisely the cases that `is_empty` returns `true`. |
825 | #[inline ] |
826 | pub fn len(&self) -> usize { |
827 | self.end.saturating_sub(self.start) |
828 | } |
829 | |
830 | /// Returns true when the given offset is contained within this span. |
831 | /// |
832 | /// Note that an empty span contains no offsets and will always return |
833 | /// false. |
834 | #[inline ] |
835 | pub fn contains(&self, offset: usize) -> bool { |
836 | !self.is_empty() && self.start <= offset && offset <= self.end |
837 | } |
838 | |
839 | /// Returns a new span with `offset` added to this span's `start` and `end` |
840 | /// values. |
841 | #[inline ] |
842 | pub fn offset(&self, offset: usize) -> Span { |
843 | Span { start: self.start + offset, end: self.end + offset } |
844 | } |
845 | } |
846 | |
847 | impl core::fmt::Debug for Span { |
848 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
849 | write!(f, "{}..{}" , self.start, self.end) |
850 | } |
851 | } |
852 | |
853 | impl core::ops::Index<Span> for [u8] { |
854 | type Output = [u8]; |
855 | |
856 | #[inline ] |
857 | fn index(&self, index: Span) -> &[u8] { |
858 | &self[index.range()] |
859 | } |
860 | } |
861 | |
862 | impl core::ops::IndexMut<Span> for [u8] { |
863 | #[inline ] |
864 | fn index_mut(&mut self, index: Span) -> &mut [u8] { |
865 | &mut self[index.range()] |
866 | } |
867 | } |
868 | |
869 | impl core::ops::Index<Span> for str { |
870 | type Output = str; |
871 | |
872 | #[inline ] |
873 | fn index(&self, index: Span) -> &str { |
874 | &self[index.range()] |
875 | } |
876 | } |
877 | |
878 | impl From<Range<usize>> for Span { |
879 | #[inline ] |
880 | fn from(range: Range<usize>) -> Span { |
881 | Span { start: range.start, end: range.end } |
882 | } |
883 | } |
884 | |
885 | impl From<Span> for Range<usize> { |
886 | #[inline ] |
887 | fn from(span: Span) -> Range<usize> { |
888 | Range { start: span.start, end: span.end } |
889 | } |
890 | } |
891 | |
892 | impl PartialEq<Range<usize>> for Span { |
893 | #[inline ] |
894 | fn eq(&self, range: &Range<usize>) -> bool { |
895 | self.start == range.start && self.end == range.end |
896 | } |
897 | } |
898 | |
899 | impl PartialEq<Span> for Range<usize> { |
900 | #[inline ] |
901 | fn eq(&self, span: &Span) -> bool { |
902 | self.start == span.start && self.end == span.end |
903 | } |
904 | } |
905 | |
906 | /// A representation of "half" of a match reported by a DFA. |
907 | /// |
908 | /// This is called a "half" match because it only includes the end location (or |
909 | /// start location for a reverse search) of a match. This corresponds to the |
910 | /// information that a single DFA scan can report. Getting the other half of |
911 | /// the match requires a second scan with a reversed DFA. |
912 | /// |
913 | /// A half match also includes the pattern that matched. The pattern is |
914 | /// identified by an ID, which corresponds to its position (starting from `0`) |
915 | /// relative to other patterns used to construct the corresponding DFA. If only |
916 | /// a single pattern is provided to the DFA, then all matches are guaranteed to |
917 | /// have a pattern ID of `0`. |
918 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] |
919 | pub struct HalfMatch { |
920 | /// The pattern ID. |
921 | pattern: PatternID, |
922 | /// The offset of the match. |
923 | /// |
924 | /// For forward searches, the offset is exclusive. For reverse searches, |
925 | /// the offset is inclusive. |
926 | offset: usize, |
927 | } |
928 | |
929 | impl HalfMatch { |
930 | /// Create a new half match from a pattern ID and a byte offset. |
931 | #[inline ] |
932 | pub fn new(pattern: PatternID, offset: usize) -> HalfMatch { |
933 | HalfMatch { pattern, offset } |
934 | } |
935 | |
936 | /// Create a new half match from a pattern ID and a byte offset. |
937 | /// |
938 | /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a |
939 | /// [`PatternID`]. This panics if the given `usize` is not representable |
940 | /// as a `PatternID`. |
941 | #[inline ] |
942 | pub fn must(pattern: usize, offset: usize) -> HalfMatch { |
943 | HalfMatch::new(PatternID::new(pattern).unwrap(), offset) |
944 | } |
945 | |
946 | /// Returns the ID of the pattern that matched. |
947 | /// |
948 | /// The ID of a pattern is derived from the position in which it was |
949 | /// originally inserted into the corresponding DFA. The first pattern has |
950 | /// identifier `0`, and each subsequent pattern is `1`, `2` and so on. |
951 | #[inline ] |
952 | pub fn pattern(&self) -> PatternID { |
953 | self.pattern |
954 | } |
955 | |
956 | /// The position of the match. |
957 | /// |
958 | /// If this match was produced by a forward search, then the offset is |
959 | /// exclusive. If this match was produced by a reverse search, then the |
960 | /// offset is inclusive. |
961 | #[inline ] |
962 | pub fn offset(&self) -> usize { |
963 | self.offset |
964 | } |
965 | } |
966 | |
967 | /// A representation of a match reported by a regex engine. |
968 | /// |
969 | /// A match has two essential pieces of information: the [`PatternID`] that |
970 | /// matches, and the [`Span`] of the match in a haystack. |
971 | /// |
972 | /// The pattern is identified by an ID, which corresponds to its position |
973 | /// (starting from `0`) relative to other patterns used to construct the |
974 | /// corresponding regex engine. If only a single pattern is provided, then all |
975 | /// matches are guaranteed to have a pattern ID of `0`. |
976 | /// |
977 | /// Every match reported by a regex engine guarantees that its span has its |
978 | /// start offset as less than or equal to its end offset. |
979 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] |
980 | pub struct Match { |
981 | /// The pattern ID. |
982 | pattern: PatternID, |
983 | /// The underlying match span. |
984 | span: Span, |
985 | } |
986 | |
987 | impl Match { |
988 | /// Create a new match from a pattern ID and a span. |
989 | /// |
990 | /// This constructor is generic over how a span is provided. While |
991 | /// a [`Span`] may be given directly, one may also provide a |
992 | /// `std::ops::Range<usize>`. |
993 | /// |
994 | /// # Panics |
995 | /// |
996 | /// This panics if `end < start`. |
997 | /// |
998 | /// # Example |
999 | /// |
1000 | /// This shows how to create a match for the first pattern in a regex |
1001 | /// object using convenient range syntax. |
1002 | /// |
1003 | /// ``` |
1004 | /// use regex_automata::{Match, PatternID}; |
1005 | /// |
1006 | /// let m = Match::new(PatternID::ZERO, 5..10); |
1007 | /// assert_eq!(0, m.pattern().as_usize()); |
1008 | /// assert_eq!(5, m.start()); |
1009 | /// assert_eq!(10, m.end()); |
1010 | /// ``` |
1011 | #[inline ] |
1012 | pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match { |
1013 | let span: Span = span.into(); |
1014 | assert!(span.start <= span.end, "invalid match span" ); |
1015 | Match { pattern, span } |
1016 | } |
1017 | |
1018 | /// Create a new match from a pattern ID and a byte offset span. |
1019 | /// |
1020 | /// This constructor is generic over how a span is provided. While |
1021 | /// a [`Span`] may be given directly, one may also provide a |
1022 | /// `std::ops::Range<usize>`. |
1023 | /// |
1024 | /// This is like [`Match::new`], but accepts a `usize` instead of a |
1025 | /// [`PatternID`]. This panics if the given `usize` is not representable |
1026 | /// as a `PatternID`. |
1027 | /// |
1028 | /// # Panics |
1029 | /// |
1030 | /// This panics if `end < start` or if `pattern > PatternID::MAX`. |
1031 | /// |
1032 | /// # Example |
1033 | /// |
1034 | /// This shows how to create a match for the third pattern in a regex |
1035 | /// object using convenient range syntax. |
1036 | /// |
1037 | /// ``` |
1038 | /// use regex_automata::Match; |
1039 | /// |
1040 | /// let m = Match::must(3, 5..10); |
1041 | /// assert_eq!(3, m.pattern().as_usize()); |
1042 | /// assert_eq!(5, m.start()); |
1043 | /// assert_eq!(10, m.end()); |
1044 | /// ``` |
1045 | #[inline ] |
1046 | pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match { |
1047 | Match::new(PatternID::must(pattern), span) |
1048 | } |
1049 | |
1050 | /// Returns the ID of the pattern that matched. |
1051 | /// |
1052 | /// The ID of a pattern is derived from the position in which it was |
1053 | /// originally inserted into the corresponding regex engine. The first |
1054 | /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and |
1055 | /// so on. |
1056 | #[inline ] |
1057 | pub fn pattern(&self) -> PatternID { |
1058 | self.pattern |
1059 | } |
1060 | |
1061 | /// The starting position of the match. |
1062 | /// |
1063 | /// This is a convenience routine for `Match::span().start`. |
1064 | #[inline ] |
1065 | pub fn start(&self) -> usize { |
1066 | self.span().start |
1067 | } |
1068 | |
1069 | /// The ending position of the match. |
1070 | /// |
1071 | /// This is a convenience routine for `Match::span().end`. |
1072 | #[inline ] |
1073 | pub fn end(&self) -> usize { |
1074 | self.span().end |
1075 | } |
1076 | |
1077 | /// Returns the match span as a range. |
1078 | /// |
1079 | /// This is a convenience routine for `Match::span().range()`. |
1080 | #[inline ] |
1081 | pub fn range(&self) -> core::ops::Range<usize> { |
1082 | self.span().range() |
1083 | } |
1084 | |
1085 | /// Returns the span for this match. |
1086 | #[inline ] |
1087 | pub fn span(&self) -> Span { |
1088 | self.span |
1089 | } |
1090 | |
1091 | /// Returns true when the span in this match is empty. |
1092 | /// |
1093 | /// An empty match can only be returned when the regex itself can match |
1094 | /// the empty string. |
1095 | #[inline ] |
1096 | pub fn is_empty(&self) -> bool { |
1097 | self.span().is_empty() |
1098 | } |
1099 | |
1100 | /// Returns the length of this match. |
1101 | /// |
1102 | /// This returns `0` in precisely the cases that `is_empty` returns `true`. |
1103 | #[inline ] |
1104 | pub fn len(&self) -> usize { |
1105 | self.span().len() |
1106 | } |
1107 | } |
1108 | |
1109 | /// A set of `PatternID`s. |
1110 | /// |
1111 | /// A set of pattern identifiers is useful for recording which patterns have |
1112 | /// matched a particular haystack. A pattern set _only_ includes pattern |
1113 | /// identifiers. It does not include offset information. |
1114 | /// |
1115 | /// # Example |
1116 | /// |
1117 | /// This shows basic usage of a set. |
1118 | /// |
1119 | /// ``` |
1120 | /// use regex_automata::{PatternID, PatternSet}; |
1121 | /// |
1122 | /// let pid1 = PatternID::must(5); |
1123 | /// let pid2 = PatternID::must(8); |
1124 | /// // Create a new empty set. |
1125 | /// let mut set = PatternSet::new(10); |
1126 | /// // Insert pattern IDs. |
1127 | /// set.insert(pid1); |
1128 | /// set.insert(pid2); |
1129 | /// // Test membership. |
1130 | /// assert!(set.contains(pid1)); |
1131 | /// assert!(set.contains(pid2)); |
1132 | /// // Get all members. |
1133 | /// assert_eq!( |
1134 | /// vec![5, 8], |
1135 | /// set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(), |
1136 | /// ); |
1137 | /// // Clear the set. |
1138 | /// set.clear(); |
1139 | /// // Test that it is indeed empty. |
1140 | /// assert!(set.is_empty()); |
1141 | /// ``` |
1142 | #[cfg (feature = "alloc" )] |
1143 | #[derive(Clone, Debug, Eq, PartialEq)] |
1144 | pub struct PatternSet { |
1145 | /// The number of patterns set to 'true' in this set. |
1146 | len: usize, |
1147 | /// A map from PatternID to boolean of whether a pattern matches or not. |
1148 | /// |
1149 | /// This should probably be a bitset, but it's probably unlikely to matter |
1150 | /// much in practice. |
1151 | /// |
1152 | /// The main downside of this representation (and similarly for a bitset) |
1153 | /// is that iteration scales with the capacity of the set instead of |
1154 | /// the length of the set. This doesn't seem likely to be a problem in |
1155 | /// practice. |
1156 | /// |
1157 | /// Another alternative is to just use a 'SparseSet' for this. It does use |
1158 | /// more memory (quite a bit more), but that seems fine I think compared |
1159 | /// to the memory being used by the regex engine. The real hiccup with |
1160 | /// it is that it yields pattern IDs in the order they were inserted. |
1161 | /// Which is actually kind of nice, but at the time of writing, pattern |
1162 | /// IDs are yielded in ascending order in the regex crate RegexSet API. |
1163 | /// If we did change to 'SparseSet', we could provide an additional |
1164 | /// 'iter_match_order' iterator, but keep the ascending order one for |
1165 | /// compatibility. |
1166 | which: alloc::boxed::Box<[bool]>, |
1167 | } |
1168 | |
1169 | #[cfg (feature = "alloc" )] |
1170 | impl PatternSet { |
1171 | /// Create a new set of pattern identifiers with the given capacity. |
1172 | /// |
1173 | /// The given capacity typically corresponds to (at least) the number of |
1174 | /// patterns in a compiled regex object. |
1175 | /// |
1176 | /// # Panics |
1177 | /// |
1178 | /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is |
1179 | /// impossible if you use the `pattern_len()` method as defined on any of |
1180 | /// the regex engines in this crate. Namely, a regex will fail to build by |
1181 | /// returning an error if the number of patterns given to it exceeds the |
1182 | /// limit. Therefore, the number of patterns in a valid regex is always |
1183 | /// a correct capacity to provide here. |
1184 | pub fn new(capacity: usize) -> PatternSet { |
1185 | assert!( |
1186 | capacity <= PatternID::LIMIT, |
1187 | "pattern set capacity exceeds limit of {}" , |
1188 | PatternID::LIMIT, |
1189 | ); |
1190 | PatternSet { |
1191 | len: 0, |
1192 | which: alloc::vec![false; capacity].into_boxed_slice(), |
1193 | } |
1194 | } |
1195 | |
1196 | /// Clear this set such that it contains no pattern IDs. |
1197 | pub fn clear(&mut self) { |
1198 | self.len = 0; |
1199 | for matched in self.which.iter_mut() { |
1200 | *matched = false; |
1201 | } |
1202 | } |
1203 | |
1204 | /// Return true if and only if the given pattern identifier is in this set. |
1205 | pub fn contains(&self, pid: PatternID) -> bool { |
1206 | pid.as_usize() < self.capacity() && self.which[pid] |
1207 | } |
1208 | |
1209 | /// Insert the given pattern identifier into this set and return `true` if |
1210 | /// the given pattern ID was not previously in this set. |
1211 | /// |
1212 | /// If the pattern identifier is already in this set, then this is a no-op. |
1213 | /// |
1214 | /// Use [`PatternSet::try_insert`] for a fallible version of this routine. |
1215 | /// |
1216 | /// # Panics |
1217 | /// |
1218 | /// This panics if this pattern set has insufficient capacity to |
1219 | /// store the given pattern ID. |
1220 | pub fn insert(&mut self, pid: PatternID) -> bool { |
1221 | self.try_insert(pid) |
1222 | .expect("PatternSet should have sufficient capacity" ) |
1223 | } |
1224 | |
1225 | /// Insert the given pattern identifier into this set and return `true` if |
1226 | /// the given pattern ID was not previously in this set. |
1227 | /// |
1228 | /// If the pattern identifier is already in this set, then this is a no-op. |
1229 | /// |
1230 | /// # Errors |
1231 | /// |
1232 | /// This returns an error if this pattern set has insufficient capacity to |
1233 | /// store the given pattern ID. |
1234 | pub fn try_insert( |
1235 | &mut self, |
1236 | pid: PatternID, |
1237 | ) -> Result<bool, PatternSetInsertError> { |
1238 | if pid.as_usize() >= self.capacity() { |
1239 | return Err(PatternSetInsertError { |
1240 | attempted: pid, |
1241 | capacity: self.capacity(), |
1242 | }); |
1243 | } |
1244 | if self.which[pid] { |
1245 | return Ok(false); |
1246 | } |
1247 | self.len += 1; |
1248 | self.which[pid] = true; |
1249 | Ok(true) |
1250 | } |
1251 | |
1252 | /* |
1253 | // This is currently commented out because it is unused and it is unclear |
1254 | // whether it's useful or not. What's the harm in having it? When, if |
1255 | // we ever wanted to change our representation to a 'SparseSet', then |
1256 | // supporting this method would be a bit tricky. So in order to keep some |
1257 | // API evolution flexibility, we leave it out for now. |
1258 | |
1259 | /// Remove the given pattern identifier from this set. |
1260 | /// |
1261 | /// If the pattern identifier was not previously in this set, then this |
1262 | /// does not change the set and returns `false`. |
1263 | /// |
1264 | /// # Panics |
1265 | /// |
1266 | /// This panics if `pid` exceeds the capacity of this set. |
1267 | pub fn remove(&mut self, pid: PatternID) -> bool { |
1268 | if !self.which[pid] { |
1269 | return false; |
1270 | } |
1271 | self.len -= 1; |
1272 | self.which[pid] = false; |
1273 | true |
1274 | } |
1275 | */ |
1276 | |
1277 | /// Return true if and only if this set has no pattern identifiers in it. |
1278 | pub fn is_empty(&self) -> bool { |
1279 | self.len() == 0 |
1280 | } |
1281 | |
1282 | /// Return true if and only if this set has the maximum number of pattern |
1283 | /// identifiers in the set. This occurs precisely when `PatternSet::len() |
1284 | /// == PatternSet::capacity()`. |
1285 | /// |
1286 | /// This particular property is useful to test because it may allow one to |
1287 | /// stop a search earlier than you might otherwise. Namely, if a search is |
1288 | /// only reporting which patterns match a haystack and if you know all of |
1289 | /// the patterns match at a given point, then there's no new information |
1290 | /// that can be learned by continuing the search. (Because a pattern set |
1291 | /// does not keep track of offset information.) |
1292 | pub fn is_full(&self) -> bool { |
1293 | self.len() == self.capacity() |
1294 | } |
1295 | |
1296 | /// Returns the total number of pattern identifiers in this set. |
1297 | pub fn len(&self) -> usize { |
1298 | self.len |
1299 | } |
1300 | |
1301 | /// Returns the total number of pattern identifiers that may be stored |
1302 | /// in this set. |
1303 | /// |
1304 | /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`]. |
1305 | /// |
1306 | /// Typically, the capacity of a pattern set matches the number of patterns |
1307 | /// in a regex object with which you are searching. |
1308 | pub fn capacity(&self) -> usize { |
1309 | self.which.len() |
1310 | } |
1311 | |
1312 | /// Returns an iterator over all pattern identifiers in this set. |
1313 | /// |
1314 | /// The iterator yields pattern identifiers in ascending order, starting |
1315 | /// at zero. |
1316 | pub fn iter(&self) -> PatternSetIter<'_> { |
1317 | PatternSetIter { it: self.which.iter().enumerate() } |
1318 | } |
1319 | } |
1320 | |
1321 | /// An error that occurs when a `PatternID` failed to insert into a |
1322 | /// `PatternSet`. |
1323 | /// |
1324 | /// An insert fails when the given `PatternID` exceeds the configured capacity |
1325 | /// of the `PatternSet`. |
1326 | /// |
1327 | /// This error is created by the [`PatternSet::try_insert`] routine. |
1328 | #[cfg (feature = "alloc" )] |
1329 | #[derive(Clone, Debug)] |
1330 | pub struct PatternSetInsertError { |
1331 | attempted: PatternID, |
1332 | capacity: usize, |
1333 | } |
1334 | |
1335 | #[cfg (feature = "std" )] |
1336 | impl std::error::Error for PatternSetInsertError {} |
1337 | |
1338 | #[cfg (feature = "alloc" )] |
1339 | impl core::fmt::Display for PatternSetInsertError { |
1340 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
1341 | write!( |
1342 | f, |
1343 | "failed to insert pattern ID {} into pattern set \ |
1344 | with insufficiet capacity of {}" , |
1345 | self.attempted.as_usize(), |
1346 | self.capacity, |
1347 | ) |
1348 | } |
1349 | } |
1350 | |
1351 | /// An iterator over all pattern identifiers in a [`PatternSet`]. |
1352 | /// |
1353 | /// The lifetime parameter `'a` refers to the lifetime of the pattern set being |
1354 | /// iterated over. |
1355 | /// |
1356 | /// This iterator is created by the [`PatternSet::iter`] method. |
1357 | #[cfg (feature = "alloc" )] |
1358 | #[derive(Clone, Debug)] |
1359 | pub struct PatternSetIter<'a> { |
1360 | it: core::iter::Enumerate<core::slice::Iter<'a, bool>>, |
1361 | } |
1362 | |
1363 | #[cfg (feature = "alloc" )] |
1364 | impl<'a> Iterator for PatternSetIter<'a> { |
1365 | type Item = PatternID; |
1366 | |
1367 | fn next(&mut self) -> Option<PatternID> { |
1368 | while let Some((index, &yes)) = self.it.next() { |
1369 | if yes { |
1370 | // Only valid 'PatternID' values can be inserted into the set |
1371 | // and construction of the set panics if the capacity would |
1372 | // permit storing invalid pattern IDs. Thus, 'yes' is only true |
1373 | // precisely when 'index' corresponds to a valid 'PatternID'. |
1374 | return Some(PatternID::new_unchecked(index)); |
1375 | } |
1376 | } |
1377 | None |
1378 | } |
1379 | |
1380 | fn size_hint(&self) -> (usize, Option<usize>) { |
1381 | self.it.size_hint() |
1382 | } |
1383 | } |
1384 | |
1385 | #[cfg (feature = "alloc" )] |
1386 | impl<'a> DoubleEndedIterator for PatternSetIter<'a> { |
1387 | fn next_back(&mut self) -> Option<PatternID> { |
1388 | while let Some((index, &yes)) = self.it.next_back() { |
1389 | if yes { |
1390 | // Only valid 'PatternID' values can be inserted into the set |
1391 | // and construction of the set panics if the capacity would |
1392 | // permit storing invalid pattern IDs. Thus, 'yes' is only true |
1393 | // precisely when 'index' corresponds to a valid 'PatternID'. |
1394 | return Some(PatternID::new_unchecked(index)); |
1395 | } |
1396 | } |
1397 | None |
1398 | } |
1399 | } |
1400 | |
1401 | /// The type of anchored search to perform. |
1402 | /// |
1403 | /// This is *almost* a boolean option. That is, you can either do an unanchored |
1404 | /// search for any pattern in a regex, or you can do an anchored search for any |
1405 | /// pattern in a regex. |
1406 | /// |
1407 | /// A third option exists that, assuming the regex engine supports it, permits |
1408 | /// you to do an anchored search for a specific pattern. |
1409 | /// |
1410 | /// Note that there is no way to run an unanchored search for a specific |
1411 | /// pattern. If you need that, you'll need to build separate regexes for each |
1412 | /// pattern. |
1413 | /// |
1414 | /// # Errors |
1415 | /// |
1416 | /// If a regex engine does not support the anchored mode selected, then the |
1417 | /// regex engine will return an error. While any non-trivial regex engine |
1418 | /// should support at least one of the available anchored modes, there is no |
1419 | /// singular mode that is guaranteed to be universally supported. Some regex |
1420 | /// engines might only support unanchored searches (DFAs compiled without |
1421 | /// anchored starting states) and some regex engines might only support |
1422 | /// anchored searches (like the one-pass DFA). |
1423 | /// |
1424 | /// The specific error returned is a [`MatchError`] with a |
1425 | /// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the |
1426 | /// `Anchored` value given that is unsupported. |
1427 | /// |
1428 | /// Note that regex engines should report "no match" if, for example, an |
1429 | /// `Anchored::Pattern` is provided with an invalid pattern ID _but_ where |
1430 | /// anchored searches for a specific pattern are supported. This is smooths out |
1431 | /// behavior such that it's possible to guarantee that an error never occurs |
1432 | /// based on how the regex engine is configured. All regex engines in this |
1433 | /// crate report "no match" when searching for an invalid pattern ID, but where |
1434 | /// searching for a valid pattern ID is otherwise supported. |
1435 | /// |
1436 | /// # Example |
1437 | /// |
1438 | /// This example shows how to use the various `Anchored` modes to run a |
1439 | /// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) |
1440 | /// because it supports all modes unconditionally. Some regex engines, like |
1441 | /// the [`onepass::DFA`](crate::dfa::onepass::DFA) cannot support unanchored |
1442 | /// searches. |
1443 | /// |
1444 | /// ``` |
1445 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1446 | /// use regex_automata::{ |
1447 | /// nfa::thompson::pikevm::PikeVM, |
1448 | /// Anchored, Input, Match, PatternID, |
1449 | /// }; |
1450 | /// |
1451 | /// let re = PikeVM::new_many(&[ |
1452 | /// r"Mrs. \w+" , |
1453 | /// r"Miss \w+" , |
1454 | /// r"Mr. \w+" , |
1455 | /// r"Ms. \w+" , |
1456 | /// ])?; |
1457 | /// let mut cache = re.create_cache(); |
1458 | /// let hay = "Hello Mr. Springsteen!" ; |
1459 | /// |
1460 | /// // The default is to do an unanchored search. |
1461 | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); |
1462 | /// // Explicitly ask for an unanchored search. Same as above. |
1463 | /// let input = Input::new(hay).anchored(Anchored::No); |
1464 | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); |
1465 | /// |
1466 | /// // Now try an anchored search. Since the match doesn't start at the |
1467 | /// // beginning of the haystack, no match is found! |
1468 | /// let input = Input::new(hay).anchored(Anchored::Yes); |
1469 | /// assert_eq!(None, re.find(&mut cache, input)); |
1470 | /// |
1471 | /// // We can try an anchored search again, but move the location of where |
1472 | /// // we start the search. Note that the offsets reported are still in |
1473 | /// // terms of the overall haystack and not relative to where we started |
1474 | /// // the search. |
1475 | /// let input = Input::new(hay).anchored(Anchored::Yes).range(6..); |
1476 | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); |
1477 | /// |
1478 | /// // Now try an anchored search for a specific pattern. We specifically |
1479 | /// // choose a pattern that we know doesn't match to prove that the search |
1480 | /// // only looks for the pattern we provide. |
1481 | /// let input = Input::new(hay) |
1482 | /// .anchored(Anchored::Pattern(PatternID::must(1))) |
1483 | /// .range(6..); |
1484 | /// assert_eq!(None, re.find(&mut cache, input)); |
1485 | /// |
1486 | /// // But if we switch it to the pattern that we know matches, then we find |
1487 | /// // the match. |
1488 | /// let input = Input::new(hay) |
1489 | /// .anchored(Anchored::Pattern(PatternID::must(2))) |
1490 | /// .range(6..); |
1491 | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); |
1492 | /// |
1493 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1494 | /// ``` |
1495 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
1496 | pub enum Anchored { |
1497 | /// Run an unanchored search. This means a match may occur anywhere at or |
1498 | /// after the start position of the search. |
1499 | /// |
1500 | /// This search can return a match for any pattern in the regex. |
1501 | No, |
1502 | /// Run an anchored search. This means that a match must begin at the |
1503 | /// start position of the search. |
1504 | /// |
1505 | /// This search can return a match for any pattern in the regex. |
1506 | Yes, |
1507 | /// Run an anchored search for a specific pattern. This means that a match |
1508 | /// must be for the given pattern and must begin at the start position of |
1509 | /// the search. |
1510 | Pattern(PatternID), |
1511 | } |
1512 | |
1513 | impl Anchored { |
1514 | /// Returns true if and only if this anchor mode corresponds to any kind of |
1515 | /// anchored search. |
1516 | /// |
1517 | /// # Example |
1518 | /// |
1519 | /// This examples shows that both `Anchored::Yes` and `Anchored::Pattern` |
1520 | /// are considered anchored searches. |
1521 | /// |
1522 | /// ``` |
1523 | /// use regex_automata::{Anchored, PatternID}; |
1524 | /// |
1525 | /// assert!(!Anchored::No.is_anchored()); |
1526 | /// assert!(Anchored::Yes.is_anchored()); |
1527 | /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored()); |
1528 | /// ``` |
1529 | #[inline ] |
1530 | pub fn is_anchored(&self) -> bool { |
1531 | matches!(*self, Anchored::Yes | Anchored::Pattern(_)) |
1532 | } |
1533 | |
1534 | /// Returns the pattern ID associated with this configuration if it is an |
1535 | /// anchored search for a specific pattern. Otherwise `None` is returned. |
1536 | /// |
1537 | /// # Example |
1538 | /// |
1539 | /// ``` |
1540 | /// use regex_automata::{Anchored, PatternID}; |
1541 | /// |
1542 | /// assert_eq!(None, Anchored::No.pattern()); |
1543 | /// assert_eq!(None, Anchored::Yes.pattern()); |
1544 | /// |
1545 | /// let pid = PatternID::must(5); |
1546 | /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern()); |
1547 | /// ``` |
1548 | #[inline ] |
1549 | pub fn pattern(&self) -> Option<PatternID> { |
1550 | match *self { |
1551 | Anchored::Pattern(pid) => Some(pid), |
1552 | _ => None, |
1553 | } |
1554 | } |
1555 | } |
1556 | |
1557 | /// The kind of match semantics to use for a regex pattern. |
1558 | /// |
1559 | /// The default match kind is `LeftmostFirst`, and this corresponds to the |
1560 | /// match semantics used by most backtracking engines, such as Perl. |
1561 | /// |
1562 | /// # Leftmost first or "preference order" match semantics |
1563 | /// |
1564 | /// Leftmost-first semantics determine which match to report when there are |
1565 | /// multiple paths through a regex that match at the same position. The tie is |
1566 | /// essentially broken by how a backtracker would behave. For example, consider |
1567 | /// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this |
1568 | /// case, both the `foofoo` and `foo` branches match at position `0`. So should |
1569 | /// the end of the match be `3` or `6`? |
1570 | /// |
1571 | /// A backtracker will conceptually work by trying `foofoofoo` and failing. |
1572 | /// Then it will try `foofoo`, find the match and stop there. Thus, the |
1573 | /// leftmost-first match position is `6`. This is called "leftmost-first" or |
1574 | /// "preference order" because the order of the branches as written in the |
1575 | /// regex pattern is what determines how to break the tie. |
1576 | /// |
1577 | /// (Note that leftmost-longest match semantics, which break ties by always |
1578 | /// taking the longest matching string, are not currently supported by this |
1579 | /// crate. These match semantics tend to be found in POSIX regex engines.) |
1580 | /// |
1581 | /// This example shows how leftmost-first semantics work, and how it even |
1582 | /// applies to multi-pattern regexes: |
1583 | /// |
1584 | /// ``` |
1585 | /// use regex_automata::{ |
1586 | /// nfa::thompson::pikevm::PikeVM, |
1587 | /// Match, |
1588 | /// }; |
1589 | /// |
1590 | /// let re = PikeVM::new_many(&[ |
1591 | /// r"foofoofoo" , |
1592 | /// r"foofoo" , |
1593 | /// r"foo" , |
1594 | /// ])?; |
1595 | /// let mut cache = re.create_cache(); |
1596 | /// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo" ).collect(); |
1597 | /// let expected = vec![Match::must(1, 0..6)]; |
1598 | /// assert_eq!(expected, got); |
1599 | /// |
1600 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1601 | /// ``` |
1602 | /// |
1603 | /// # All matches |
1604 | /// |
1605 | /// The `All` match semantics report any and all matches, and generally will |
1606 | /// attempt to match as much as possible. It doesn't respect any sort of match |
1607 | /// priority at all, so things like non-greedy matching don't work in this |
1608 | /// mode. |
1609 | /// |
1610 | /// The fact that non-greedy matching doesn't work generally makes most forms |
1611 | /// of unanchored non-overlapping searches have unintuitive behavior. Namely, |
1612 | /// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the |
1613 | /// beginning of the pattern, which is specifically non-greedy. Since it will |
1614 | /// be treated as greedy in `All` match semantics, this generally means that |
1615 | /// it will first attempt to consume all of the haystack and is likely to wind |
1616 | /// up skipping matches. |
1617 | /// |
1618 | /// Generally speaking, `All` should only be used in two circumstances: |
1619 | /// |
1620 | /// * When running an anchored search and there is a desire to match as much as |
1621 | /// possible. For example, when building a reverse regex matcher to find the |
1622 | /// start of a match after finding the end. In this case, the reverse search |
1623 | /// is anchored to the end of the match found by the forward search. |
1624 | /// * When running overlapping searches. Since `All` encodes all possible |
1625 | /// matches, this is generally what you want for an overlapping search. If you |
1626 | /// try to use leftmost-first in an overlapping search, it is likely to produce |
1627 | /// counter-intuitive results since leftmost-first specifically excludes some |
1628 | /// matches from its underlying finite state machine. |
1629 | /// |
1630 | /// This example demonstrates the counter-intuitive behavior of `All` semantics |
1631 | /// when using a standard leftmost unanchored search: |
1632 | /// |
1633 | /// ``` |
1634 | /// use regex_automata::{ |
1635 | /// nfa::thompson::pikevm::PikeVM, |
1636 | /// Match, MatchKind, |
1637 | /// }; |
1638 | /// |
1639 | /// let re = PikeVM::builder() |
1640 | /// .configure(PikeVM::config().match_kind(MatchKind::All)) |
1641 | /// .build("foo" )?; |
1642 | /// let hay = "first foo second foo wat" ; |
1643 | /// let mut cache = re.create_cache(); |
1644 | /// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect(); |
1645 | /// // Notice that it completely skips the first 'foo'! |
1646 | /// let expected = vec![Match::must(0, 17..20)]; |
1647 | /// assert_eq!(expected, got); |
1648 | /// |
1649 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1650 | /// ``` |
1651 | /// |
1652 | /// This second example shows how `All` semantics are useful for an overlapping |
1653 | /// search. Note that we use lower level lazy DFA APIs here since the NFA |
1654 | /// engines only currently support a very limited form of overlapping search. |
1655 | /// |
1656 | /// ``` |
1657 | /// use regex_automata::{ |
1658 | /// hybrid::dfa::{DFA, OverlappingState}, |
1659 | /// HalfMatch, Input, MatchKind, |
1660 | /// }; |
1661 | /// |
1662 | /// let re = DFA::builder() |
1663 | /// // If we didn't set 'All' semantics here, then the regex would only |
1664 | /// // match 'foo' at offset 3 and nothing else. Why? Because the state |
1665 | /// // machine implements preference order and knows that the 'foofoo' and |
1666 | /// // 'foofoofoo' branches can never match since 'foo' will always match |
1667 | /// // when they match and take priority. |
1668 | /// .configure(DFA::config().match_kind(MatchKind::All)) |
1669 | /// .build(r"foo|foofoo|foofoofoo" )?; |
1670 | /// let mut cache = re.create_cache(); |
1671 | /// let mut state = OverlappingState::start(); |
1672 | /// let input = Input::new("foofoofoo" ); |
1673 | /// let mut got = vec![]; |
1674 | /// loop { |
1675 | /// re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?; |
1676 | /// let m = match state.get_match() { |
1677 | /// None => break, |
1678 | /// Some(m) => m, |
1679 | /// }; |
1680 | /// got.push(m); |
1681 | /// } |
1682 | /// let expected = vec![ |
1683 | /// HalfMatch::must(0, 3), |
1684 | /// HalfMatch::must(0, 6), |
1685 | /// HalfMatch::must(0, 9), |
1686 | /// ]; |
1687 | /// assert_eq!(expected, got); |
1688 | /// |
1689 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1690 | /// ``` |
1691 | #[non_exhaustive ] |
1692 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
1693 | pub enum MatchKind { |
1694 | /// Report all possible matches. |
1695 | All, |
1696 | /// Report only the leftmost matches. When multiple leftmost matches exist, |
1697 | /// report the match corresponding to the part of the regex that appears |
1698 | /// first in the syntax. |
1699 | LeftmostFirst, |
1700 | // There is prior art in RE2 that shows that we should be able to add |
1701 | // LeftmostLongest too. The tricky part of it is supporting ungreedy |
1702 | // repetitions. Instead of treating all NFA states as having equivalent |
1703 | // priority (as in 'All') or treating all NFA states as having distinct |
1704 | // priority based on order (as in 'LeftmostFirst'), we instead group NFA |
1705 | // states into sets, and treat members of each set as having equivalent |
1706 | // priority, but having greater priority than all following members |
1707 | // of different sets. |
1708 | // |
1709 | // However, it's not clear whether it's really worth adding this. After |
1710 | // all, leftmost-longest can be emulated when using literals by using |
1711 | // leftmost-first and sorting the literals by length in descending order. |
1712 | // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will |
1713 | // always match `a` in `ab` when using leftmost-first, but leftmost-longest |
1714 | // would match `ab`. |
1715 | } |
1716 | |
1717 | impl MatchKind { |
1718 | #[cfg (feature = "alloc" )] |
1719 | pub(crate) fn continue_past_first_match(&self) -> bool { |
1720 | *self == MatchKind::All |
1721 | } |
1722 | } |
1723 | |
1724 | impl Default for MatchKind { |
1725 | fn default() -> MatchKind { |
1726 | MatchKind::LeftmostFirst |
1727 | } |
1728 | } |
1729 | |
1730 | /// An error indicating that a search stopped before reporting whether a |
1731 | /// match exists or not. |
1732 | /// |
1733 | /// To be very clear, this error type implies that one cannot assume that no |
1734 | /// matches occur, since the search stopped before completing. That is, if |
1735 | /// you're looking for information about where a search determined that no |
1736 | /// match can occur, then this error type does *not* give you that. (Indeed, at |
1737 | /// the time of writing, if you need such a thing, you have to write your own |
1738 | /// search routine.) |
1739 | /// |
1740 | /// Normally, when one searches for something, the response is either an |
1741 | /// affirmative "it was found at this location" or a negative "not found at |
1742 | /// all." However, in some cases, a regex engine can be configured to stop its |
1743 | /// search before concluding whether a match exists or not. When this happens, |
1744 | /// it may be important for the caller to know why the regex engine gave up and |
1745 | /// where in the input it gave up at. This error type exposes the 'why' and the |
1746 | /// 'where.' |
1747 | /// |
1748 | /// For example, the DFAs provided by this library generally cannot correctly |
1749 | /// implement Unicode word boundaries. Instead, they provide an option to |
1750 | /// eagerly support them on ASCII text (since Unicode word boundaries are |
1751 | /// equivalent to ASCII word boundaries when searching ASCII text), but will |
1752 | /// "give up" if a non-ASCII byte is seen. In such cases, one is usually |
1753 | /// required to either report the failure to the caller (unergonomic) or |
1754 | /// otherwise fall back to some other regex engine (ergonomic, but potentially |
1755 | /// costly). |
1756 | /// |
1757 | /// More generally, some regex engines offer the ability for callers to specify |
1758 | /// certain bytes that will trigger the regex engine to automatically quit if |
1759 | /// they are seen. |
1760 | /// |
1761 | /// Still yet, there may be other reasons for a failed match. For example, |
1762 | /// the hybrid DFA provided by this crate can be configured to give up if it |
1763 | /// believes that it is not efficient. This in turn permits callers to choose a |
1764 | /// different regex engine. |
1765 | /// |
1766 | /// (Note that DFAs are configured by default to never quit or give up in this |
1767 | /// fashion. For example, by default, a DFA will fail to build if the regex |
1768 | /// pattern contains a Unicode word boundary. One needs to opt into the "quit" |
1769 | /// behavior via options, like |
1770 | /// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).) |
1771 | /// |
1772 | /// There are a couple other ways a search |
1773 | /// can fail. For example, when using the |
1774 | /// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker) |
1775 | /// with a haystack that is too long, or trying to run an unanchored search |
1776 | /// with a [one-pass DFA](crate::dfa::onepass). |
1777 | #[derive(Clone, Debug, Eq, PartialEq)] |
1778 | pub struct MatchError( |
1779 | #[cfg (feature = "alloc" )] alloc::boxed::Box<MatchErrorKind>, |
1780 | #[cfg (not(feature = "alloc" ))] MatchErrorKind, |
1781 | ); |
1782 | |
1783 | impl MatchError { |
1784 | /// Create a new error value with the given kind. |
1785 | /// |
1786 | /// This is a more verbose version of the kind-specific constructors, |
1787 | /// e.g., `MatchError::quit`. |
1788 | pub fn new(kind: MatchErrorKind) -> MatchError { |
1789 | #[cfg (feature = "alloc" )] |
1790 | { |
1791 | MatchError(alloc::boxed::Box::new(kind)) |
1792 | } |
1793 | #[cfg (not(feature = "alloc" ))] |
1794 | { |
1795 | MatchError(kind) |
1796 | } |
1797 | } |
1798 | |
1799 | /// Returns a reference to the underlying error kind. |
1800 | pub fn kind(&self) -> &MatchErrorKind { |
1801 | &self.0 |
1802 | } |
1803 | |
1804 | /// Create a new "quit" error. The given `byte` corresponds to the value |
1805 | /// that tripped a search's quit condition, and `offset` corresponds to the |
1806 | /// location in the haystack at which the search quit. |
1807 | /// |
1808 | /// This is the same as calling `MatchError::new` with a |
1809 | /// [`MatchErrorKind::Quit`] kind. |
1810 | pub fn quit(byte: u8, offset: usize) -> MatchError { |
1811 | MatchError::new(MatchErrorKind::Quit { byte, offset }) |
1812 | } |
1813 | |
1814 | /// Create a new "gave up" error. The given `offset` corresponds to the |
1815 | /// location in the haystack at which the search gave up. |
1816 | /// |
1817 | /// This is the same as calling `MatchError::new` with a |
1818 | /// [`MatchErrorKind::GaveUp`] kind. |
1819 | pub fn gave_up(offset: usize) -> MatchError { |
1820 | MatchError::new(MatchErrorKind::GaveUp { offset }) |
1821 | } |
1822 | |
1823 | /// Create a new "haystack too long" error. The given `len` corresponds to |
1824 | /// the length of the haystack that was problematic. |
1825 | /// |
1826 | /// This is the same as calling `MatchError::new` with a |
1827 | /// [`MatchErrorKind::HaystackTooLong`] kind. |
1828 | pub fn haystack_too_long(len: usize) -> MatchError { |
1829 | MatchError::new(MatchErrorKind::HaystackTooLong { len }) |
1830 | } |
1831 | |
1832 | /// Create a new "unsupported anchored" error. This occurs when the caller |
1833 | /// requests a search with an anchor mode that is not supported by the |
1834 | /// regex engine. |
1835 | /// |
1836 | /// This is the same as calling `MatchError::new` with a |
1837 | /// [`MatchErrorKind::UnsupportedAnchored`] kind. |
1838 | pub fn unsupported_anchored(mode: Anchored) -> MatchError { |
1839 | MatchError::new(MatchErrorKind::UnsupportedAnchored { mode }) |
1840 | } |
1841 | } |
1842 | |
1843 | /// The underlying kind of a [`MatchError`]. |
1844 | /// |
1845 | /// This is a **non-exhaustive** enum. That means new variants may be added in |
1846 | /// a semver-compatible release. |
1847 | #[non_exhaustive ] |
1848 | #[derive(Clone, Debug, Eq, PartialEq)] |
1849 | pub enum MatchErrorKind { |
1850 | /// The search saw a "quit" byte at which it was instructed to stop |
1851 | /// searching. |
1852 | Quit { |
1853 | /// The "quit" byte that was observed that caused the search to stop. |
1854 | byte: u8, |
1855 | /// The offset at which the quit byte was observed. |
1856 | offset: usize, |
1857 | }, |
1858 | /// The search, based on heuristics, determined that it would be better |
1859 | /// to stop, typically to provide the caller an opportunity to use an |
1860 | /// alternative regex engine. |
1861 | /// |
1862 | /// Currently, the only way for this to occur is via the lazy DFA and |
1863 | /// only when it is configured to do so (it will not return this error by |
1864 | /// default). |
1865 | GaveUp { |
1866 | /// The offset at which the search stopped. This corresponds to the |
1867 | /// position immediately following the last byte scanned. |
1868 | offset: usize, |
1869 | }, |
1870 | /// This error occurs if the haystack given to the regex engine was too |
1871 | /// long to be searched. This occurs, for example, with regex engines |
1872 | /// like the bounded backtracker that have a configurable fixed amount of |
1873 | /// capacity that is tied to the length of the haystack. Anything beyond |
1874 | /// that configured limit will result in an error at search time. |
1875 | HaystackTooLong { |
1876 | /// The length of the haystack that exceeded the limit. |
1877 | len: usize, |
1878 | }, |
1879 | /// An error indicating that a particular type of anchored search was |
1880 | /// requested, but that the regex engine does not support it. |
1881 | /// |
1882 | /// Note that this error should not be returned by a regex engine simply |
1883 | /// because the pattern ID is invalid (i.e., equal to or exceeds the number |
1884 | /// of patterns in the regex). In that case, the regex engine should report |
1885 | /// a non-match. |
1886 | UnsupportedAnchored { |
1887 | /// The anchored mode given that is unsupported. |
1888 | mode: Anchored, |
1889 | }, |
1890 | } |
1891 | |
1892 | #[cfg (feature = "std" )] |
1893 | impl std::error::Error for MatchError {} |
1894 | |
1895 | impl core::fmt::Display for MatchError { |
1896 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
1897 | match *self.kind() { |
1898 | MatchErrorKind::Quit { byte, offset } => write!( |
1899 | f, |
1900 | "quit search after observing byte {:?} at offset {}" , |
1901 | DebugByte(byte), |
1902 | offset, |
1903 | ), |
1904 | MatchErrorKind::GaveUp { offset } => { |
1905 | write!(f, "gave up searching at offset {}" , offset) |
1906 | } |
1907 | MatchErrorKind::HaystackTooLong { len } => { |
1908 | write!(f, "haystack of length {} is too long" , len) |
1909 | } |
1910 | MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => { |
1911 | write!(f, "anchored searches are not supported or enabled" ) |
1912 | } |
1913 | MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => { |
1914 | write!(f, "unanchored searches are not supported or enabled" ) |
1915 | } |
1916 | MatchErrorKind::UnsupportedAnchored { |
1917 | mode: Anchored::Pattern(pid), |
1918 | } => { |
1919 | write!( |
1920 | f, |
1921 | "anchored searches for a specific pattern ({}) are \ |
1922 | not supported or enabled" , |
1923 | pid.as_usize(), |
1924 | ) |
1925 | } |
1926 | } |
1927 | } |
1928 | } |
1929 | |
1930 | #[cfg (test)] |
1931 | mod tests { |
1932 | use super::*; |
1933 | |
1934 | // We test that our 'MatchError' type is the size we expect. This isn't an |
1935 | // API guarantee, but if the size increases, we really want to make sure we |
1936 | // decide to do that intentionally. So this should be a speed bump. And in |
1937 | // general, we should not increase the size without a very good reason. |
1938 | // |
1939 | // Why? Because low level search APIs return Result<.., MatchError>. When |
1940 | // MatchError gets bigger, so to does the Result type. |
1941 | // |
1942 | // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes |
1943 | // the importance of keeping a small error type. But without 'alloc', we |
1944 | // still want things to be small. |
1945 | #[test] |
1946 | fn match_error_size() { |
1947 | let expected_size = if cfg!(feature = "alloc" ) { |
1948 | core::mem::size_of::<usize>() |
1949 | } else { |
1950 | 2 * core::mem::size_of::<usize>() |
1951 | }; |
1952 | assert_eq!(expected_size, core::mem::size_of::<MatchError>()); |
1953 | } |
1954 | |
1955 | // Same as above, but for the underlying match error kind. |
1956 | #[cfg (target_pointer_width = "64" )] |
1957 | #[test] |
1958 | fn match_error_kind_size() { |
1959 | let expected_size = 2 * core::mem::size_of::<usize>(); |
1960 | assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); |
1961 | } |
1962 | |
1963 | #[cfg (target_pointer_width = "32" )] |
1964 | #[test] |
1965 | fn match_error_kind_size() { |
1966 | let expected_size = 3 * core::mem::size_of::<usize>(); |
1967 | assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); |
1968 | } |
1969 | } |
1970 | |