1 | /*! |
2 | Types and routines that support the search APIs of most regex engines. |
3 | |
4 | This sub-module isn't exposed directly, but rather, its contents are exported |
5 | at the crate root due to the universality of most of the types and routines in |
6 | this module. |
7 | */ |
8 | |
9 | use core::ops::{Range, RangeBounds}; |
10 | |
11 | use crate::util::{escape::DebugByte, primitives::PatternID, utf8}; |
12 | |
13 | /// The parameters for a regex search including the haystack to search. |
14 | /// |
15 | /// It turns out that regex searches have a few parameters, and in most cases, |
16 | /// those parameters have defaults that work in the vast majority of cases. |
17 | /// This `Input` type exists to make that common case seamnless while also |
18 | /// providing an avenue for changing the parameters of a search. In particular, |
19 | /// this type enables doing so without a combinatorial explosion of different |
20 | /// methods and/or superfluous parameters in the common cases. |
21 | /// |
22 | /// An `Input` permits configuring the following things: |
23 | /// |
24 | /// * Search only a substring of a haystack, while taking the broader context |
25 | /// into account for resolving look-around assertions. |
26 | /// * Indicating whether to search for all patterns in a regex, or to |
27 | /// only search for one pattern in particular. |
28 | /// * Whether to perform an anchored on unanchored search. |
29 | /// * Whether to report a match as early as possible. |
30 | /// |
31 | /// All of these parameters, except for the haystack, have sensible default |
32 | /// values. This means that the minimal search configuration is simply a call |
33 | /// to [`Input::new`] with your haystack. Setting any other parameter is |
34 | /// optional. |
35 | /// |
36 | /// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a |
37 | /// `From<H> for Input` implementation. This is useful because many of the |
38 | /// search APIs in this crate accept an `Into<Input>`. This means you can |
39 | /// provide string or byte strings to these routines directly, and they'll |
40 | /// automatically get converted into an `Input` for you. |
41 | /// |
42 | /// The lifetime parameter `'h` refers to the lifetime of the haystack. |
43 | /// |
44 | /// # Organization |
45 | /// |
46 | /// The API of `Input` is split into a few different parts: |
47 | /// |
48 | /// * A builder-like API that transforms a `Input` by value. Examples: |
49 | /// [`Input::span`] and [`Input::anchored`]. |
50 | /// * A setter API that permits mutating parameters in place. Examples: |
51 | /// [`Input::set_span`] and [`Input::set_anchored`]. |
52 | /// * A getter API that permits retrieving any of the search parameters. |
53 | /// Examples: [`Input::get_span`] and [`Input::get_anchored`]. |
54 | /// * A few convenience getter routines that don't conform to the above naming |
55 | /// pattern due to how common they are. Examples: [`Input::haystack`], |
56 | /// [`Input::start`] and [`Input::end`]. |
57 | /// * Miscellaneous predicates and other helper routines that are useful |
58 | /// in some contexts. Examples: [`Input::is_char_boundary`]. |
59 | /// |
60 | /// A `Input` exposes so much because it is meant to be used by both callers of |
61 | /// regex engines _and_ implementors of regex engines. A constraining factor is |
62 | /// that regex engines should accept a `&Input` as its lowest level API, which |
63 | /// means that implementors should only use the "getter" APIs of a `Input`. |
64 | /// |
65 | /// # Valid bounds and search termination |
66 | /// |
67 | /// An `Input` permits setting the bounds of a search via either |
68 | /// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or |
69 | /// else a panic will occur. Bounds are valid if and only if: |
70 | /// |
71 | /// * The bounds represent a valid range into the input's haystack. |
72 | /// * **or** the end bound is a valid ending bound for the haystack *and* |
73 | /// the start bound is exactly one greater than the start bound. |
74 | /// |
75 | /// In the latter case, [`Input::is_done`] will return true and indicates any |
76 | /// search receiving such an input should immediately return with no match. |
77 | /// |
78 | /// Note that while `Input` is used for reverse searches in this crate, the |
79 | /// `Input::is_done` predicate assumes a forward search. Because unsigned |
80 | /// offsets are used internally, there is no way to tell from only the offsets |
81 | /// whether a reverse search is done or not. |
82 | /// |
83 | /// # Regex engine support |
84 | /// |
85 | /// Any regex engine accepting an `Input` must support at least the following |
86 | /// things: |
87 | /// |
88 | /// * Searching a `&[u8]` for matches. |
89 | /// * Searching a substring of `&[u8]` for a match, such that any match |
90 | /// reported must appear entirely within that substring. |
91 | /// * For a forwards search, a match should never be reported when |
92 | /// [`Input::is_done`] returns true. (For reverse searches, termination should |
93 | /// be handled outside of `Input`.) |
94 | /// |
95 | /// Supporting other aspects of an `Input` are optional, but regex engines |
96 | /// should handle aspects they don't support gracefully. How this is done is |
97 | /// generally up to the regex engine. This crate generally treats unsupported |
98 | /// anchored modes as an error to report for example, but for simplicity, in |
99 | /// the meta regex engine, trying to search with an invalid pattern ID just |
100 | /// results in no match being reported. |
101 | #[derive (Clone)] |
102 | pub struct Input<'h> { |
103 | haystack: &'h [u8], |
104 | span: Span, |
105 | anchored: Anchored, |
106 | earliest: bool, |
107 | } |
108 | |
109 | impl<'h> Input<'h> { |
110 | /// Create a new search configuration for the given haystack. |
111 | #[inline ] |
112 | pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> { |
113 | // Perform only one call to `haystack.as_ref()` to protect from incorrect |
114 | // implementations that return different values from multiple calls. |
115 | // This is important because there's code that relies on `span` not being |
116 | // out of bounds with respect to the stored `haystack`. |
117 | let haystack = haystack.as_ref(); |
118 | Input { |
119 | haystack, |
120 | span: Span { start: 0, end: haystack.len() }, |
121 | anchored: Anchored::No, |
122 | earliest: false, |
123 | } |
124 | } |
125 | |
126 | /// Set the span for this search. |
127 | /// |
128 | /// This routine does not panic if the span given is not a valid range for |
129 | /// this search's haystack. If this search is run with an invalid range, |
130 | /// then the most likely outcome is that the actual search execution will |
131 | /// panic. |
132 | /// |
133 | /// This routine is generic over how a span is provided. While |
134 | /// a [`Span`] may be given directly, one may also provide a |
135 | /// `std::ops::Range<usize>`. To provide anything supported by range |
136 | /// syntax, use the [`Input::range`] method. |
137 | /// |
138 | /// The default span is the entire haystack. |
139 | /// |
140 | /// Note that [`Input::range`] overrides this method and vice versa. |
141 | /// |
142 | /// # Panics |
143 | /// |
144 | /// This panics if the given span does not correspond to valid bounds in |
145 | /// the haystack or the termination of a search. |
146 | /// |
147 | /// # Example |
148 | /// |
149 | /// This example shows how the span of the search can impact whether a |
150 | /// match is reported or not. This is particularly relevant for look-around |
151 | /// operators, which might take things outside of the span into account |
152 | /// when determining whether they match. |
153 | /// |
154 | /// ``` |
155 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
156 | /// use regex_automata::{ |
157 | /// nfa::thompson::pikevm::PikeVM, |
158 | /// Match, Input, |
159 | /// }; |
160 | /// |
161 | /// // Look for 'at', but as a distinct word. |
162 | /// let re = PikeVM::new(r"\bat\b" )?; |
163 | /// let mut cache = re.create_cache(); |
164 | /// let mut caps = re.create_captures(); |
165 | /// |
166 | /// // Our haystack contains 'at', but not as a distinct word. |
167 | /// let haystack = "batter" ; |
168 | /// |
169 | /// // A standard search finds nothing, as expected. |
170 | /// let input = Input::new(haystack); |
171 | /// re.search(&mut cache, &input, &mut caps); |
172 | /// assert_eq!(None, caps.get_match()); |
173 | /// |
174 | /// // But if we wanted to search starting at position '1', we might |
175 | /// // slice the haystack. If we do this, it's impossible for the \b |
176 | /// // anchors to take the surrounding context into account! And thus, |
177 | /// // a match is produced. |
178 | /// let input = Input::new(&haystack[1..3]); |
179 | /// re.search(&mut cache, &input, &mut caps); |
180 | /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match()); |
181 | /// |
182 | /// // But if we specify the span of the search instead of slicing the |
183 | /// // haystack, then the regex engine can "see" outside of the span |
184 | /// // and resolve the anchors correctly. |
185 | /// let input = Input::new(haystack).span(1..3); |
186 | /// re.search(&mut cache, &input, &mut caps); |
187 | /// assert_eq!(None, caps.get_match()); |
188 | /// |
189 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
190 | /// ``` |
191 | /// |
192 | /// This may seem a little ham-fisted, but this scenario tends to come up |
193 | /// if some other regex engine found the match span and now you need to |
194 | /// re-process that span to look for capturing groups. (e.g., Run a faster |
195 | /// DFA first, find a match, then run the PikeVM on just the match span to |
196 | /// resolve capturing groups.) In order to implement that sort of logic |
197 | /// correctly, you need to set the span on the search instead of slicing |
198 | /// the haystack directly. |
199 | /// |
200 | /// The other advantage of using this routine to specify the bounds of the |
201 | /// search is that the match offsets are still reported in terms of the |
202 | /// original haystack. For example, the second search in the example above |
203 | /// reported a match at position `0`, even though `at` starts at offset |
204 | /// `1` because we sliced the haystack. |
205 | #[inline ] |
206 | pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> { |
207 | self.set_span(span); |
208 | self |
209 | } |
210 | |
211 | /// Like `Input::span`, but accepts any range instead. |
212 | /// |
213 | /// This routine does not panic if the range given is not a valid range for |
214 | /// this search's haystack. If this search is run with an invalid range, |
215 | /// then the most likely outcome is that the actual search execution will |
216 | /// panic. |
217 | /// |
218 | /// The default range is the entire haystack. |
219 | /// |
220 | /// Note that [`Input::span`] overrides this method and vice versa. |
221 | /// |
222 | /// # Panics |
223 | /// |
224 | /// This routine will panic if the given range could not be converted |
225 | /// to a valid [`Range`]. For example, this would panic when given |
226 | /// `0..=usize::MAX` since it cannot be represented using a half-open |
227 | /// interval in terms of `usize`. |
228 | /// |
229 | /// This also panics if the given range does not correspond to valid bounds |
230 | /// in the haystack or the termination of a search. |
231 | /// |
232 | /// # Example |
233 | /// |
234 | /// ``` |
235 | /// use regex_automata::Input; |
236 | /// |
237 | /// let input = Input::new("foobar" ); |
238 | /// assert_eq!(0..6, input.get_range()); |
239 | /// |
240 | /// let input = Input::new("foobar" ).range(2..=4); |
241 | /// assert_eq!(2..5, input.get_range()); |
242 | /// ``` |
243 | #[inline ] |
244 | pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> { |
245 | self.set_range(range); |
246 | self |
247 | } |
248 | |
249 | /// Sets the anchor mode of a search. |
250 | /// |
251 | /// When a search is anchored (so that's [`Anchored::Yes`] or |
252 | /// [`Anchored::Pattern`]), a match must begin at the start of a search. |
253 | /// When a search is not anchored (that's [`Anchored::No`]), regex engines |
254 | /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix |
255 | /// permits a match to appear anywhere. |
256 | /// |
257 | /// By default, the anchored mode is [`Anchored::No`]. |
258 | /// |
259 | /// **WARNING:** this is subtly different than using a `^` at the start of |
260 | /// your regex. A `^` forces a regex to match exclusively at the start of |
261 | /// a haystack, regardless of where you begin your search. In contrast, |
262 | /// anchoring a search will allow your regex to match anywhere in your |
263 | /// haystack, but the match must start at the beginning of a search. |
264 | /// |
265 | /// For example, consider the haystack `aba` and the following searches: |
266 | /// |
267 | /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba` |
268 | /// starting at position `2`. Since `^` requires the match to start at |
269 | /// the beginning of the haystack and `2 > 0`, no match is found. |
270 | /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba` |
271 | /// starting at position `2`. This reports a match at `[2, 3]` since |
272 | /// the match starts where the search started. Since there is no `^`, |
273 | /// there is no requirement for the match to start at the beginning of |
274 | /// the haystack. |
275 | /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba` |
276 | /// starting at position `1`. Since `b` corresponds to position `1` and |
277 | /// since the search is anchored, it finds no match. While the regex |
278 | /// matches at other positions, configuring the search to be anchored |
279 | /// requires that it only report a match that begins at the same offset |
280 | /// as the beginning of the search. |
281 | /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba` |
282 | /// starting at position `1`. Since the search is not anchored and |
283 | /// the regex does not start with `^`, the search executes as if there |
284 | /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it |
285 | /// reports a match at `[2, 3]`. |
286 | /// |
287 | /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`, |
288 | /// except it only reports matches for a particular pattern. |
289 | /// |
290 | /// # Example |
291 | /// |
292 | /// This demonstrates the differences between an anchored search and |
293 | /// a pattern that begins with `^` (as described in the above warning |
294 | /// message). |
295 | /// |
296 | /// ``` |
297 | /// use regex_automata::{ |
298 | /// nfa::thompson::pikevm::PikeVM, |
299 | /// Anchored, Match, Input, |
300 | /// }; |
301 | /// |
302 | /// let haystack = "aba" ; |
303 | /// |
304 | /// let re = PikeVM::new(r"^a" )?; |
305 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
306 | /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No); |
307 | /// re.search(&mut cache, &input, &mut caps); |
308 | /// // No match is found because 2 is not the beginning of the haystack, |
309 | /// // which is what ^ requires. |
310 | /// assert_eq!(None, caps.get_match()); |
311 | /// |
312 | /// let re = PikeVM::new(r"a" )?; |
313 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
314 | /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes); |
315 | /// re.search(&mut cache, &input, &mut caps); |
316 | /// // An anchored search can still match anywhere in the haystack, it just |
317 | /// // must begin at the start of the search which is '2' in this case. |
318 | /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match()); |
319 | /// |
320 | /// let re = PikeVM::new(r"a" )?; |
321 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
322 | /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes); |
323 | /// re.search(&mut cache, &input, &mut caps); |
324 | /// // No match is found since we start searching at offset 1 which |
325 | /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match |
326 | /// // is found. |
327 | /// assert_eq!(None, caps.get_match()); |
328 | /// |
329 | /// let re = PikeVM::new(r"a" )?; |
330 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
331 | /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No); |
332 | /// re.search(&mut cache, &input, &mut caps); |
333 | /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the |
334 | /// // pattern. Even though the search starts at 'b', the 'match anything' |
335 | /// // prefix allows the search to match 'a'. |
336 | /// let expected = Some(Match::must(0, 2..3)); |
337 | /// assert_eq!(expected, caps.get_match()); |
338 | /// |
339 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
340 | /// ``` |
341 | #[inline ] |
342 | pub fn anchored(mut self, mode: Anchored) -> Input<'h> { |
343 | self.set_anchored(mode); |
344 | self |
345 | } |
346 | |
347 | /// Whether to execute an "earliest" search or not. |
348 | /// |
349 | /// When running a non-overlapping search, an "earliest" search will return |
350 | /// the match location as early as possible. For example, given a pattern |
351 | /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search |
352 | /// will return `foo12345` as a match. But an "earliest" search for regex |
353 | /// engines that support "earliest" semantics will return `foo1` as a |
354 | /// match, since as soon as the first digit following `foo` is seen, it is |
355 | /// known to have found a match. |
356 | /// |
357 | /// Note that "earliest" semantics generally depend on the regex engine. |
358 | /// Different regex engines may determine there is a match at different |
359 | /// points. So there is no guarantee that "earliest" matches will always |
360 | /// return the same offsets for all regex engines. The "earliest" notion |
361 | /// is really about when the particular regex engine determines there is |
362 | /// a match rather than a consistent semantic unto itself. This is often |
363 | /// useful for implementing "did a match occur or not" predicates, but |
364 | /// sometimes the offset is useful as well. |
365 | /// |
366 | /// This is disabled by default. |
367 | /// |
368 | /// # Example |
369 | /// |
370 | /// This example shows the difference between "earliest" searching and |
371 | /// normal searching. |
372 | /// |
373 | /// ``` |
374 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; |
375 | /// |
376 | /// let re = PikeVM::new(r"foo[0-9]+" )?; |
377 | /// let mut cache = re.create_cache(); |
378 | /// let mut caps = re.create_captures(); |
379 | /// |
380 | /// // A normal search implements greediness like you expect. |
381 | /// let input = Input::new("foo12345" ); |
382 | /// re.search(&mut cache, &input, &mut caps); |
383 | /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match()); |
384 | /// |
385 | /// // When 'earliest' is enabled and the regex engine supports |
386 | /// // it, the search will bail once it knows a match has been |
387 | /// // found. |
388 | /// let input = Input::new("foo12345" ).earliest(true); |
389 | /// re.search(&mut cache, &input, &mut caps); |
390 | /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match()); |
391 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
392 | /// ``` |
393 | #[inline ] |
394 | pub fn earliest(mut self, yes: bool) -> Input<'h> { |
395 | self.set_earliest(yes); |
396 | self |
397 | } |
398 | |
399 | /// Set the span for this search configuration. |
400 | /// |
401 | /// This is like the [`Input::span`] method, except this mutates the |
402 | /// span in place. |
403 | /// |
404 | /// This routine is generic over how a span is provided. While |
405 | /// a [`Span`] may be given directly, one may also provide a |
406 | /// `std::ops::Range<usize>`. |
407 | /// |
408 | /// # Panics |
409 | /// |
410 | /// This panics if the given span does not correspond to valid bounds in |
411 | /// the haystack or the termination of a search. |
412 | /// |
413 | /// # Example |
414 | /// |
415 | /// ``` |
416 | /// use regex_automata::Input; |
417 | /// |
418 | /// let mut input = Input::new("foobar" ); |
419 | /// assert_eq!(0..6, input.get_range()); |
420 | /// input.set_span(2..4); |
421 | /// assert_eq!(2..4, input.get_range()); |
422 | /// ``` |
423 | #[inline ] |
424 | pub fn set_span<S: Into<Span>>(&mut self, span: S) { |
425 | let span = span.into(); |
426 | assert!( |
427 | span.end <= self.haystack.len() |
428 | && span.start <= span.end.wrapping_add(1), |
429 | "invalid span {:?} for haystack of length {}" , |
430 | span, |
431 | self.haystack.len(), |
432 | ); |
433 | self.span = span; |
434 | } |
435 | |
436 | /// Set the span for this search configuration given any range. |
437 | /// |
438 | /// This is like the [`Input::range`] method, except this mutates the |
439 | /// span in place. |
440 | /// |
441 | /// This routine does not panic if the range given is not a valid range for |
442 | /// this search's haystack. If this search is run with an invalid range, |
443 | /// then the most likely outcome is that the actual search execution will |
444 | /// panic. |
445 | /// |
446 | /// # Panics |
447 | /// |
448 | /// This routine will panic if the given range could not be converted |
449 | /// to a valid [`Range`]. For example, this would panic when given |
450 | /// `0..=usize::MAX` since it cannot be represented using a half-open |
451 | /// interval in terms of `usize`. |
452 | /// |
453 | /// This also panics if the given span does not correspond to valid bounds |
454 | /// in the haystack or the termination of a search. |
455 | /// |
456 | /// # Example |
457 | /// |
458 | /// ``` |
459 | /// use regex_automata::Input; |
460 | /// |
461 | /// let mut input = Input::new("foobar" ); |
462 | /// assert_eq!(0..6, input.get_range()); |
463 | /// input.set_range(2..=4); |
464 | /// assert_eq!(2..5, input.get_range()); |
465 | /// ``` |
466 | #[inline ] |
467 | pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) { |
468 | use core::ops::Bound; |
469 | |
470 | // It's a little weird to convert ranges into spans, and then spans |
471 | // back into ranges when we actually slice the haystack. Because |
472 | // of that process, we always represent everything as a half-open |
473 | // internal. Therefore, handling things like m..=n is a little awkward. |
474 | let start = match range.start_bound() { |
475 | Bound::Included(&i) => i, |
476 | // Can this case ever happen? Range syntax doesn't support it... |
477 | Bound::Excluded(&i) => i.checked_add(1).unwrap(), |
478 | Bound::Unbounded => 0, |
479 | }; |
480 | let end = match range.end_bound() { |
481 | Bound::Included(&i) => i.checked_add(1).unwrap(), |
482 | Bound::Excluded(&i) => i, |
483 | Bound::Unbounded => self.haystack().len(), |
484 | }; |
485 | self.set_span(Span { start, end }); |
486 | } |
487 | |
488 | /// Set the starting offset for the span for this search configuration. |
489 | /// |
490 | /// This is a convenience routine for only mutating the start of a span |
491 | /// without having to set the entire span. |
492 | /// |
493 | /// # Panics |
494 | /// |
495 | /// This panics if the span resulting from the new start position does not |
496 | /// correspond to valid bounds in the haystack or the termination of a |
497 | /// search. |
498 | /// |
499 | /// # Example |
500 | /// |
501 | /// ``` |
502 | /// use regex_automata::Input; |
503 | /// |
504 | /// let mut input = Input::new("foobar" ); |
505 | /// assert_eq!(0..6, input.get_range()); |
506 | /// input.set_start(5); |
507 | /// assert_eq!(5..6, input.get_range()); |
508 | /// ``` |
509 | #[inline ] |
510 | pub fn set_start(&mut self, start: usize) { |
511 | self.set_span(Span { start, ..self.get_span() }); |
512 | } |
513 | |
514 | /// Set the ending offset for the span for this search configuration. |
515 | /// |
516 | /// This is a convenience routine for only mutating the end of a span |
517 | /// without having to set the entire span. |
518 | /// |
519 | /// # Panics |
520 | /// |
521 | /// This panics if the span resulting from the new end position does not |
522 | /// correspond to valid bounds in the haystack or the termination of a |
523 | /// search. |
524 | /// |
525 | /// # Example |
526 | /// |
527 | /// ``` |
528 | /// use regex_automata::Input; |
529 | /// |
530 | /// let mut input = Input::new("foobar" ); |
531 | /// assert_eq!(0..6, input.get_range()); |
532 | /// input.set_end(5); |
533 | /// assert_eq!(0..5, input.get_range()); |
534 | /// ``` |
535 | #[inline ] |
536 | pub fn set_end(&mut self, end: usize) { |
537 | self.set_span(Span { end, ..self.get_span() }); |
538 | } |
539 | |
540 | /// Set the anchor mode of a search. |
541 | /// |
542 | /// This is like [`Input::anchored`], except it mutates the search |
543 | /// configuration in place. |
544 | /// |
545 | /// # Example |
546 | /// |
547 | /// ``` |
548 | /// use regex_automata::{Anchored, Input, PatternID}; |
549 | /// |
550 | /// let mut input = Input::new("foobar" ); |
551 | /// assert_eq!(Anchored::No, input.get_anchored()); |
552 | /// |
553 | /// let pid = PatternID::must(5); |
554 | /// input.set_anchored(Anchored::Pattern(pid)); |
555 | /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); |
556 | /// ``` |
557 | #[inline ] |
558 | pub fn set_anchored(&mut self, mode: Anchored) { |
559 | self.anchored = mode; |
560 | } |
561 | |
562 | /// Set whether the search should execute in "earliest" mode or not. |
563 | /// |
564 | /// This is like [`Input::earliest`], except it mutates the search |
565 | /// configuration in place. |
566 | /// |
567 | /// # Example |
568 | /// |
569 | /// ``` |
570 | /// use regex_automata::Input; |
571 | /// |
572 | /// let mut input = Input::new("foobar" ); |
573 | /// assert!(!input.get_earliest()); |
574 | /// input.set_earliest(true); |
575 | /// assert!(input.get_earliest()); |
576 | /// ``` |
577 | #[inline ] |
578 | pub fn set_earliest(&mut self, yes: bool) { |
579 | self.earliest = yes; |
580 | } |
581 | |
582 | /// Return a borrow of the underlying haystack as a slice of bytes. |
583 | /// |
584 | /// # Example |
585 | /// |
586 | /// ``` |
587 | /// use regex_automata::Input; |
588 | /// |
589 | /// let input = Input::new("foobar" ); |
590 | /// assert_eq!(b"foobar" , input.haystack()); |
591 | /// ``` |
592 | #[inline ] |
593 | pub fn haystack(&self) -> &[u8] { |
594 | self.haystack |
595 | } |
596 | |
597 | /// Return the start position of this search. |
598 | /// |
599 | /// This is a convenience routine for `search.get_span().start()`. |
600 | /// |
601 | /// When [`Input::is_done`] is `false`, this is guaranteed to return |
602 | /// an offset that is less than or equal to [`Input::end`]. Otherwise, |
603 | /// the offset is one greater than [`Input::end`]. |
604 | /// |
605 | /// # Example |
606 | /// |
607 | /// ``` |
608 | /// use regex_automata::Input; |
609 | /// |
610 | /// let input = Input::new("foobar" ); |
611 | /// assert_eq!(0, input.start()); |
612 | /// |
613 | /// let input = Input::new("foobar" ).span(2..4); |
614 | /// assert_eq!(2, input.start()); |
615 | /// ``` |
616 | #[inline ] |
617 | pub fn start(&self) -> usize { |
618 | self.get_span().start |
619 | } |
620 | |
621 | /// Return the end position of this search. |
622 | /// |
623 | /// This is a convenience routine for `search.get_span().end()`. |
624 | /// |
625 | /// This is guaranteed to return an offset that is a valid exclusive end |
626 | /// bound for this input's haystack. |
627 | /// |
628 | /// # Example |
629 | /// |
630 | /// ``` |
631 | /// use regex_automata::Input; |
632 | /// |
633 | /// let input = Input::new("foobar" ); |
634 | /// assert_eq!(6, input.end()); |
635 | /// |
636 | /// let input = Input::new("foobar" ).span(2..4); |
637 | /// assert_eq!(4, input.end()); |
638 | /// ``` |
639 | #[inline ] |
640 | pub fn end(&self) -> usize { |
641 | self.get_span().end |
642 | } |
643 | |
644 | /// Return the span for this search configuration. |
645 | /// |
646 | /// If one was not explicitly set, then the span corresponds to the entire |
647 | /// range of the haystack. |
648 | /// |
649 | /// When [`Input::is_done`] is `false`, the span returned is guaranteed |
650 | /// to correspond to valid bounds for this input's haystack. |
651 | /// |
652 | /// # Example |
653 | /// |
654 | /// ``` |
655 | /// use regex_automata::{Input, Span}; |
656 | /// |
657 | /// let input = Input::new("foobar" ); |
658 | /// assert_eq!(Span { start: 0, end: 6 }, input.get_span()); |
659 | /// ``` |
660 | #[inline ] |
661 | pub fn get_span(&self) -> Span { |
662 | self.span |
663 | } |
664 | |
665 | /// Return the span as a range for this search configuration. |
666 | /// |
667 | /// If one was not explicitly set, then the span corresponds to the entire |
668 | /// range of the haystack. |
669 | /// |
670 | /// When [`Input::is_done`] is `false`, the range returned is guaranteed |
671 | /// to correspond to valid bounds for this input's haystack. |
672 | /// |
673 | /// # Example |
674 | /// |
675 | /// ``` |
676 | /// use regex_automata::Input; |
677 | /// |
678 | /// let input = Input::new("foobar" ); |
679 | /// assert_eq!(0..6, input.get_range()); |
680 | /// ``` |
681 | #[inline ] |
682 | pub fn get_range(&self) -> Range<usize> { |
683 | self.get_span().range() |
684 | } |
685 | |
686 | /// Return the anchored mode for this search configuration. |
687 | /// |
688 | /// If no anchored mode was set, then it defaults to [`Anchored::No`]. |
689 | /// |
690 | /// # Example |
691 | /// |
692 | /// ``` |
693 | /// use regex_automata::{Anchored, Input, PatternID}; |
694 | /// |
695 | /// let mut input = Input::new("foobar" ); |
696 | /// assert_eq!(Anchored::No, input.get_anchored()); |
697 | /// |
698 | /// let pid = PatternID::must(5); |
699 | /// input.set_anchored(Anchored::Pattern(pid)); |
700 | /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); |
701 | /// ``` |
702 | #[inline ] |
703 | pub fn get_anchored(&self) -> Anchored { |
704 | self.anchored |
705 | } |
706 | |
707 | /// Return whether this search should execute in "earliest" mode. |
708 | /// |
709 | /// # Example |
710 | /// |
711 | /// ``` |
712 | /// use regex_automata::Input; |
713 | /// |
714 | /// let input = Input::new("foobar" ); |
715 | /// assert!(!input.get_earliest()); |
716 | /// ``` |
717 | #[inline ] |
718 | pub fn get_earliest(&self) -> bool { |
719 | self.earliest |
720 | } |
721 | |
722 | /// Return true if and only if this search can never return any other |
723 | /// matches. |
724 | /// |
725 | /// This occurs when the start position of this search is greater than the |
726 | /// end position of the search. |
727 | /// |
728 | /// # Example |
729 | /// |
730 | /// ``` |
731 | /// use regex_automata::Input; |
732 | /// |
733 | /// let mut input = Input::new("foobar" ); |
734 | /// assert!(!input.is_done()); |
735 | /// input.set_start(6); |
736 | /// assert!(!input.is_done()); |
737 | /// input.set_start(7); |
738 | /// assert!(input.is_done()); |
739 | /// ``` |
740 | #[inline ] |
741 | pub fn is_done(&self) -> bool { |
742 | self.get_span().start > self.get_span().end |
743 | } |
744 | |
745 | /// Returns true if and only if the given offset in this search's haystack |
746 | /// falls on a valid UTF-8 encoded codepoint boundary. |
747 | /// |
748 | /// If the haystack is not valid UTF-8, then the behavior of this routine |
749 | /// is unspecified. |
750 | /// |
751 | /// # Example |
752 | /// |
753 | /// This shows where codepoint boundaries do and don't exist in valid |
754 | /// UTF-8. |
755 | /// |
756 | /// ``` |
757 | /// use regex_automata::Input; |
758 | /// |
759 | /// let input = Input::new("☃" ); |
760 | /// assert!(input.is_char_boundary(0)); |
761 | /// assert!(!input.is_char_boundary(1)); |
762 | /// assert!(!input.is_char_boundary(2)); |
763 | /// assert!(input.is_char_boundary(3)); |
764 | /// assert!(!input.is_char_boundary(4)); |
765 | /// ``` |
766 | #[inline ] |
767 | pub fn is_char_boundary(&self, offset: usize) -> bool { |
768 | utf8::is_boundary(self.haystack(), offset) |
769 | } |
770 | } |
771 | |
772 | impl<'h> core::fmt::Debug for Input<'h> { |
773 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
774 | use crate::util::escape::DebugHaystack; |
775 | |
776 | f&mut DebugStruct<'_, '_>.debug_struct("Input" ) |
777 | .field("haystack" , &DebugHaystack(self.haystack())) |
778 | .field("span" , &self.span) |
779 | .field("anchored" , &self.anchored) |
780 | .field(name:"earliest" , &self.earliest) |
781 | .finish() |
782 | } |
783 | } |
784 | |
785 | impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { |
786 | fn from(haystack: &'h H) -> Input<'h> { |
787 | Input::new(haystack) |
788 | } |
789 | } |
790 | |
791 | /// A representation of a span reported by a regex engine. |
792 | /// |
793 | /// A span corresponds to the starting and ending _byte offsets_ of a |
794 | /// contiguous region of bytes. The starting offset is inclusive while the |
795 | /// ending offset is exclusive. That is, a span is a half-open interval. |
796 | /// |
797 | /// A span is used to report the offsets of a match, but it is also used to |
798 | /// convey which region of a haystack should be searched via routines like |
799 | /// [`Input::span`]. |
800 | /// |
801 | /// This is basically equivalent to a `std::ops::Range<usize>`, except this |
802 | /// type implements `Copy` which makes it more ergonomic to use in the context |
803 | /// of this crate. Like a range, this implements `Index` for `[u8]` and `str`, |
804 | /// and `IndexMut` for `[u8]`. For convenience, this also impls `From<Range>`, |
805 | /// which means things like `Span::from(5..10)` work. |
806 | #[derive (Clone, Copy, Eq, Hash, PartialEq)] |
807 | pub struct Span { |
808 | /// The start offset of the span, inclusive. |
809 | pub start: usize, |
810 | /// The end offset of the span, exclusive. |
811 | pub end: usize, |
812 | } |
813 | |
814 | impl Span { |
815 | /// Returns this span as a range. |
816 | #[inline ] |
817 | pub fn range(&self) -> Range<usize> { |
818 | Range::from(*self) |
819 | } |
820 | |
821 | /// Returns true when this span is empty. That is, when `start >= end`. |
822 | #[inline ] |
823 | pub fn is_empty(&self) -> bool { |
824 | self.start >= self.end |
825 | } |
826 | |
827 | /// Returns the length of this span. |
828 | /// |
829 | /// This returns `0` in precisely the cases that `is_empty` returns `true`. |
830 | #[inline ] |
831 | pub fn len(&self) -> usize { |
832 | self.end.saturating_sub(self.start) |
833 | } |
834 | |
835 | /// Returns true when the given offset is contained within this span. |
836 | /// |
837 | /// Note that an empty span contains no offsets and will always return |
838 | /// false. |
839 | #[inline ] |
840 | pub fn contains(&self, offset: usize) -> bool { |
841 | !self.is_empty() && self.start <= offset && offset <= self.end |
842 | } |
843 | |
844 | /// Returns a new span with `offset` added to this span's `start` and `end` |
845 | /// values. |
846 | #[inline ] |
847 | pub fn offset(&self, offset: usize) -> Span { |
848 | Span { start: self.start + offset, end: self.end + offset } |
849 | } |
850 | } |
851 | |
852 | impl core::fmt::Debug for Span { |
853 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
854 | write!(f, " {}.. {}" , self.start, self.end) |
855 | } |
856 | } |
857 | |
858 | impl core::ops::Index<Span> for [u8] { |
859 | type Output = [u8]; |
860 | |
861 | #[inline ] |
862 | fn index(&self, index: Span) -> &[u8] { |
863 | &self[index.range()] |
864 | } |
865 | } |
866 | |
867 | impl core::ops::IndexMut<Span> for [u8] { |
868 | #[inline ] |
869 | fn index_mut(&mut self, index: Span) -> &mut [u8] { |
870 | &mut self[index.range()] |
871 | } |
872 | } |
873 | |
874 | impl core::ops::Index<Span> for str { |
875 | type Output = str; |
876 | |
877 | #[inline ] |
878 | fn index(&self, index: Span) -> &str { |
879 | &self[index.range()] |
880 | } |
881 | } |
882 | |
883 | impl From<Range<usize>> for Span { |
884 | #[inline ] |
885 | fn from(range: Range<usize>) -> Span { |
886 | Span { start: range.start, end: range.end } |
887 | } |
888 | } |
889 | |
890 | impl From<Span> for Range<usize> { |
891 | #[inline ] |
892 | fn from(span: Span) -> Range<usize> { |
893 | Range { start: span.start, end: span.end } |
894 | } |
895 | } |
896 | |
897 | impl PartialEq<Range<usize>> for Span { |
898 | #[inline ] |
899 | fn eq(&self, range: &Range<usize>) -> bool { |
900 | self.start == range.start && self.end == range.end |
901 | } |
902 | } |
903 | |
904 | impl PartialEq<Span> for Range<usize> { |
905 | #[inline ] |
906 | fn eq(&self, span: &Span) -> bool { |
907 | self.start == span.start && self.end == span.end |
908 | } |
909 | } |
910 | |
911 | /// A representation of "half" of a match reported by a DFA. |
912 | /// |
913 | /// This is called a "half" match because it only includes the end location (or |
914 | /// start location for a reverse search) of a match. This corresponds to the |
915 | /// information that a single DFA scan can report. Getting the other half of |
916 | /// the match requires a second scan with a reversed DFA. |
917 | /// |
918 | /// A half match also includes the pattern that matched. The pattern is |
919 | /// identified by an ID, which corresponds to its position (starting from `0`) |
920 | /// relative to other patterns used to construct the corresponding DFA. If only |
921 | /// a single pattern is provided to the DFA, then all matches are guaranteed to |
922 | /// have a pattern ID of `0`. |
923 | #[derive (Clone, Copy, Debug, Eq, Hash, PartialEq)] |
924 | pub struct HalfMatch { |
925 | /// The pattern ID. |
926 | pattern: PatternID, |
927 | /// The offset of the match. |
928 | /// |
929 | /// For forward searches, the offset is exclusive. For reverse searches, |
930 | /// the offset is inclusive. |
931 | offset: usize, |
932 | } |
933 | |
934 | impl HalfMatch { |
935 | /// Create a new half match from a pattern ID and a byte offset. |
936 | #[inline ] |
937 | pub fn new(pattern: PatternID, offset: usize) -> HalfMatch { |
938 | HalfMatch { pattern, offset } |
939 | } |
940 | |
941 | /// Create a new half match from a pattern ID and a byte offset. |
942 | /// |
943 | /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a |
944 | /// [`PatternID`]. This panics if the given `usize` is not representable |
945 | /// as a `PatternID`. |
946 | #[inline ] |
947 | pub fn must(pattern: usize, offset: usize) -> HalfMatch { |
948 | HalfMatch::new(PatternID::new(pattern).unwrap(), offset) |
949 | } |
950 | |
951 | /// Returns the ID of the pattern that matched. |
952 | /// |
953 | /// The ID of a pattern is derived from the position in which it was |
954 | /// originally inserted into the corresponding DFA. The first pattern has |
955 | /// identifier `0`, and each subsequent pattern is `1`, `2` and so on. |
956 | #[inline ] |
957 | pub fn pattern(&self) -> PatternID { |
958 | self.pattern |
959 | } |
960 | |
961 | /// The position of the match. |
962 | /// |
963 | /// If this match was produced by a forward search, then the offset is |
964 | /// exclusive. If this match was produced by a reverse search, then the |
965 | /// offset is inclusive. |
966 | #[inline ] |
967 | pub fn offset(&self) -> usize { |
968 | self.offset |
969 | } |
970 | } |
971 | |
972 | /// A representation of a match reported by a regex engine. |
973 | /// |
974 | /// A match has two essential pieces of information: the [`PatternID`] that |
975 | /// matches, and the [`Span`] of the match in a haystack. |
976 | /// |
977 | /// The pattern is identified by an ID, which corresponds to its position |
978 | /// (starting from `0`) relative to other patterns used to construct the |
979 | /// corresponding regex engine. If only a single pattern is provided, then all |
980 | /// matches are guaranteed to have a pattern ID of `0`. |
981 | /// |
982 | /// Every match reported by a regex engine guarantees that its span has its |
983 | /// start offset as less than or equal to its end offset. |
984 | #[derive (Clone, Copy, Debug, Eq, Hash, PartialEq)] |
985 | pub struct Match { |
986 | /// The pattern ID. |
987 | pattern: PatternID, |
988 | /// The underlying match span. |
989 | span: Span, |
990 | } |
991 | |
992 | impl Match { |
993 | /// Create a new match from a pattern ID and a span. |
994 | /// |
995 | /// This constructor is generic over how a span is provided. While |
996 | /// a [`Span`] may be given directly, one may also provide a |
997 | /// `std::ops::Range<usize>`. |
998 | /// |
999 | /// # Panics |
1000 | /// |
1001 | /// This panics if `end < start`. |
1002 | /// |
1003 | /// # Example |
1004 | /// |
1005 | /// This shows how to create a match for the first pattern in a regex |
1006 | /// object using convenient range syntax. |
1007 | /// |
1008 | /// ``` |
1009 | /// use regex_automata::{Match, PatternID}; |
1010 | /// |
1011 | /// let m = Match::new(PatternID::ZERO, 5..10); |
1012 | /// assert_eq!(0, m.pattern().as_usize()); |
1013 | /// assert_eq!(5, m.start()); |
1014 | /// assert_eq!(10, m.end()); |
1015 | /// ``` |
1016 | #[inline ] |
1017 | pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match { |
1018 | let span: Span = span.into(); |
1019 | assert!(span.start <= span.end, "invalid match span" ); |
1020 | Match { pattern, span } |
1021 | } |
1022 | |
1023 | /// Create a new match from a pattern ID and a byte offset span. |
1024 | /// |
1025 | /// This constructor is generic over how a span is provided. While |
1026 | /// a [`Span`] may be given directly, one may also provide a |
1027 | /// `std::ops::Range<usize>`. |
1028 | /// |
1029 | /// This is like [`Match::new`], but accepts a `usize` instead of a |
1030 | /// [`PatternID`]. This panics if the given `usize` is not representable |
1031 | /// as a `PatternID`. |
1032 | /// |
1033 | /// # Panics |
1034 | /// |
1035 | /// This panics if `end < start` or if `pattern > PatternID::MAX`. |
1036 | /// |
1037 | /// # Example |
1038 | /// |
1039 | /// This shows how to create a match for the third pattern in a regex |
1040 | /// object using convenient range syntax. |
1041 | /// |
1042 | /// ``` |
1043 | /// use regex_automata::Match; |
1044 | /// |
1045 | /// let m = Match::must(3, 5..10); |
1046 | /// assert_eq!(3, m.pattern().as_usize()); |
1047 | /// assert_eq!(5, m.start()); |
1048 | /// assert_eq!(10, m.end()); |
1049 | /// ``` |
1050 | #[inline ] |
1051 | pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match { |
1052 | Match::new(PatternID::must(pattern), span) |
1053 | } |
1054 | |
1055 | /// Returns the ID of the pattern that matched. |
1056 | /// |
1057 | /// The ID of a pattern is derived from the position in which it was |
1058 | /// originally inserted into the corresponding regex engine. The first |
1059 | /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and |
1060 | /// so on. |
1061 | #[inline ] |
1062 | pub fn pattern(&self) -> PatternID { |
1063 | self.pattern |
1064 | } |
1065 | |
1066 | /// The starting position of the match. |
1067 | /// |
1068 | /// This is a convenience routine for `Match::span().start`. |
1069 | #[inline ] |
1070 | pub fn start(&self) -> usize { |
1071 | self.span().start |
1072 | } |
1073 | |
1074 | /// The ending position of the match. |
1075 | /// |
1076 | /// This is a convenience routine for `Match::span().end`. |
1077 | #[inline ] |
1078 | pub fn end(&self) -> usize { |
1079 | self.span().end |
1080 | } |
1081 | |
1082 | /// Returns the match span as a range. |
1083 | /// |
1084 | /// This is a convenience routine for `Match::span().range()`. |
1085 | #[inline ] |
1086 | pub fn range(&self) -> core::ops::Range<usize> { |
1087 | self.span().range() |
1088 | } |
1089 | |
1090 | /// Returns the span for this match. |
1091 | #[inline ] |
1092 | pub fn span(&self) -> Span { |
1093 | self.span |
1094 | } |
1095 | |
1096 | /// Returns true when the span in this match is empty. |
1097 | /// |
1098 | /// An empty match can only be returned when the regex itself can match |
1099 | /// the empty string. |
1100 | #[inline ] |
1101 | pub fn is_empty(&self) -> bool { |
1102 | self.span().is_empty() |
1103 | } |
1104 | |
1105 | /// Returns the length of this match. |
1106 | /// |
1107 | /// This returns `0` in precisely the cases that `is_empty` returns `true`. |
1108 | #[inline ] |
1109 | pub fn len(&self) -> usize { |
1110 | self.span().len() |
1111 | } |
1112 | } |
1113 | |
1114 | /// A set of `PatternID`s. |
1115 | /// |
1116 | /// A set of pattern identifiers is useful for recording which patterns have |
1117 | /// matched a particular haystack. A pattern set _only_ includes pattern |
1118 | /// identifiers. It does not include offset information. |
1119 | /// |
1120 | /// # Example |
1121 | /// |
1122 | /// This shows basic usage of a set. |
1123 | /// |
1124 | /// ``` |
1125 | /// use regex_automata::{PatternID, PatternSet}; |
1126 | /// |
1127 | /// let pid1 = PatternID::must(5); |
1128 | /// let pid2 = PatternID::must(8); |
1129 | /// // Create a new empty set. |
1130 | /// let mut set = PatternSet::new(10); |
1131 | /// // Insert pattern IDs. |
1132 | /// set.insert(pid1); |
1133 | /// set.insert(pid2); |
1134 | /// // Test membership. |
1135 | /// assert!(set.contains(pid1)); |
1136 | /// assert!(set.contains(pid2)); |
1137 | /// // Get all members. |
1138 | /// assert_eq!( |
1139 | /// vec![5, 8], |
1140 | /// set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(), |
1141 | /// ); |
1142 | /// // Clear the set. |
1143 | /// set.clear(); |
1144 | /// // Test that it is indeed empty. |
1145 | /// assert!(set.is_empty()); |
1146 | /// ``` |
1147 | #[cfg (feature = "alloc" )] |
1148 | #[derive (Clone, Debug, Eq, PartialEq)] |
1149 | pub struct PatternSet { |
1150 | /// The number of patterns set to 'true' in this set. |
1151 | len: usize, |
1152 | /// A map from PatternID to boolean of whether a pattern matches or not. |
1153 | /// |
1154 | /// This should probably be a bitset, but it's probably unlikely to matter |
1155 | /// much in practice. |
1156 | /// |
1157 | /// The main downside of this representation (and similarly for a bitset) |
1158 | /// is that iteration scales with the capacity of the set instead of |
1159 | /// the length of the set. This doesn't seem likely to be a problem in |
1160 | /// practice. |
1161 | /// |
1162 | /// Another alternative is to just use a 'SparseSet' for this. It does use |
1163 | /// more memory (quite a bit more), but that seems fine I think compared |
1164 | /// to the memory being used by the regex engine. The real hiccup with |
1165 | /// it is that it yields pattern IDs in the order they were inserted. |
1166 | /// Which is actually kind of nice, but at the time of writing, pattern |
1167 | /// IDs are yielded in ascending order in the regex crate RegexSet API. |
1168 | /// If we did change to 'SparseSet', we could provide an additional |
1169 | /// 'iter_match_order' iterator, but keep the ascending order one for |
1170 | /// compatibility. |
1171 | which: alloc::boxed::Box<[bool]>, |
1172 | } |
1173 | |
1174 | #[cfg (feature = "alloc" )] |
1175 | impl PatternSet { |
1176 | /// Create a new set of pattern identifiers with the given capacity. |
1177 | /// |
1178 | /// The given capacity typically corresponds to (at least) the number of |
1179 | /// patterns in a compiled regex object. |
1180 | /// |
1181 | /// # Panics |
1182 | /// |
1183 | /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is |
1184 | /// impossible if you use the `pattern_len()` method as defined on any of |
1185 | /// the regex engines in this crate. Namely, a regex will fail to build by |
1186 | /// returning an error if the number of patterns given to it exceeds the |
1187 | /// limit. Therefore, the number of patterns in a valid regex is always |
1188 | /// a correct capacity to provide here. |
1189 | pub fn new(capacity: usize) -> PatternSet { |
1190 | assert!( |
1191 | capacity <= PatternID::LIMIT, |
1192 | "pattern set capacity exceeds limit of {}" , |
1193 | PatternID::LIMIT, |
1194 | ); |
1195 | PatternSet { |
1196 | len: 0, |
1197 | which: alloc::vec![false; capacity].into_boxed_slice(), |
1198 | } |
1199 | } |
1200 | |
1201 | /// Clear this set such that it contains no pattern IDs. |
1202 | pub fn clear(&mut self) { |
1203 | self.len = 0; |
1204 | for matched in self.which.iter_mut() { |
1205 | *matched = false; |
1206 | } |
1207 | } |
1208 | |
1209 | /// Return true if and only if the given pattern identifier is in this set. |
1210 | pub fn contains(&self, pid: PatternID) -> bool { |
1211 | pid.as_usize() < self.capacity() && self.which[pid] |
1212 | } |
1213 | |
1214 | /// Insert the given pattern identifier into this set and return `true` if |
1215 | /// the given pattern ID was not previously in this set. |
1216 | /// |
1217 | /// If the pattern identifier is already in this set, then this is a no-op. |
1218 | /// |
1219 | /// Use [`PatternSet::try_insert`] for a fallible version of this routine. |
1220 | /// |
1221 | /// # Panics |
1222 | /// |
1223 | /// This panics if this pattern set has insufficient capacity to |
1224 | /// store the given pattern ID. |
1225 | pub fn insert(&mut self, pid: PatternID) -> bool { |
1226 | self.try_insert(pid) |
1227 | .expect("PatternSet should have sufficient capacity" ) |
1228 | } |
1229 | |
1230 | /// Insert the given pattern identifier into this set and return `true` if |
1231 | /// the given pattern ID was not previously in this set. |
1232 | /// |
1233 | /// If the pattern identifier is already in this set, then this is a no-op. |
1234 | /// |
1235 | /// # Errors |
1236 | /// |
1237 | /// This returns an error if this pattern set has insufficient capacity to |
1238 | /// store the given pattern ID. |
1239 | pub fn try_insert( |
1240 | &mut self, |
1241 | pid: PatternID, |
1242 | ) -> Result<bool, PatternSetInsertError> { |
1243 | if pid.as_usize() >= self.capacity() { |
1244 | return Err(PatternSetInsertError { |
1245 | attempted: pid, |
1246 | capacity: self.capacity(), |
1247 | }); |
1248 | } |
1249 | if self.which[pid] { |
1250 | return Ok(false); |
1251 | } |
1252 | self.len += 1; |
1253 | self.which[pid] = true; |
1254 | Ok(true) |
1255 | } |
1256 | |
1257 | /* |
1258 | // This is currently commented out because it is unused and it is unclear |
1259 | // whether it's useful or not. What's the harm in having it? When, if |
1260 | // we ever wanted to change our representation to a 'SparseSet', then |
1261 | // supporting this method would be a bit tricky. So in order to keep some |
1262 | // API evolution flexibility, we leave it out for now. |
1263 | |
1264 | /// Remove the given pattern identifier from this set. |
1265 | /// |
1266 | /// If the pattern identifier was not previously in this set, then this |
1267 | /// does not change the set and returns `false`. |
1268 | /// |
1269 | /// # Panics |
1270 | /// |
1271 | /// This panics if `pid` exceeds the capacity of this set. |
1272 | pub fn remove(&mut self, pid: PatternID) -> bool { |
1273 | if !self.which[pid] { |
1274 | return false; |
1275 | } |
1276 | self.len -= 1; |
1277 | self.which[pid] = false; |
1278 | true |
1279 | } |
1280 | */ |
1281 | |
1282 | /// Return true if and only if this set has no pattern identifiers in it. |
1283 | pub fn is_empty(&self) -> bool { |
1284 | self.len() == 0 |
1285 | } |
1286 | |
1287 | /// Return true if and only if this set has the maximum number of pattern |
1288 | /// identifiers in the set. This occurs precisely when `PatternSet::len() |
1289 | /// == PatternSet::capacity()`. |
1290 | /// |
1291 | /// This particular property is useful to test because it may allow one to |
1292 | /// stop a search earlier than you might otherwise. Namely, if a search is |
1293 | /// only reporting which patterns match a haystack and if you know all of |
1294 | /// the patterns match at a given point, then there's no new information |
1295 | /// that can be learned by continuing the search. (Because a pattern set |
1296 | /// does not keep track of offset information.) |
1297 | pub fn is_full(&self) -> bool { |
1298 | self.len() == self.capacity() |
1299 | } |
1300 | |
1301 | /// Returns the total number of pattern identifiers in this set. |
1302 | pub fn len(&self) -> usize { |
1303 | self.len |
1304 | } |
1305 | |
1306 | /// Returns the total number of pattern identifiers that may be stored |
1307 | /// in this set. |
1308 | /// |
1309 | /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`]. |
1310 | /// |
1311 | /// Typically, the capacity of a pattern set matches the number of patterns |
1312 | /// in a regex object with which you are searching. |
1313 | pub fn capacity(&self) -> usize { |
1314 | self.which.len() |
1315 | } |
1316 | |
1317 | /// Returns an iterator over all pattern identifiers in this set. |
1318 | /// |
1319 | /// The iterator yields pattern identifiers in ascending order, starting |
1320 | /// at zero. |
1321 | pub fn iter(&self) -> PatternSetIter<'_> { |
1322 | PatternSetIter { it: self.which.iter().enumerate() } |
1323 | } |
1324 | } |
1325 | |
1326 | /// An error that occurs when a `PatternID` failed to insert into a |
1327 | /// `PatternSet`. |
1328 | /// |
1329 | /// An insert fails when the given `PatternID` exceeds the configured capacity |
1330 | /// of the `PatternSet`. |
1331 | /// |
1332 | /// This error is created by the [`PatternSet::try_insert`] routine. |
1333 | #[cfg (feature = "alloc" )] |
1334 | #[derive (Clone, Debug)] |
1335 | pub struct PatternSetInsertError { |
1336 | attempted: PatternID, |
1337 | capacity: usize, |
1338 | } |
1339 | |
1340 | #[cfg (feature = "std" )] |
1341 | impl std::error::Error for PatternSetInsertError {} |
1342 | |
1343 | #[cfg (feature = "alloc" )] |
1344 | impl core::fmt::Display for PatternSetInsertError { |
1345 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
1346 | write!( |
1347 | f, |
1348 | "failed to insert pattern ID {} into pattern set \ |
1349 | with insufficiet capacity of {}" , |
1350 | self.attempted.as_usize(), |
1351 | self.capacity, |
1352 | ) |
1353 | } |
1354 | } |
1355 | |
1356 | /// An iterator over all pattern identifiers in a [`PatternSet`]. |
1357 | /// |
1358 | /// The lifetime parameter `'a` refers to the lifetime of the pattern set being |
1359 | /// iterated over. |
1360 | /// |
1361 | /// This iterator is created by the [`PatternSet::iter`] method. |
1362 | #[cfg (feature = "alloc" )] |
1363 | #[derive (Clone, Debug)] |
1364 | pub struct PatternSetIter<'a> { |
1365 | it: core::iter::Enumerate<core::slice::Iter<'a, bool>>, |
1366 | } |
1367 | |
1368 | #[cfg (feature = "alloc" )] |
1369 | impl<'a> Iterator for PatternSetIter<'a> { |
1370 | type Item = PatternID; |
1371 | |
1372 | fn next(&mut self) -> Option<PatternID> { |
1373 | while let Some((index: usize, &yes: bool)) = self.it.next() { |
1374 | if yes { |
1375 | // Only valid 'PatternID' values can be inserted into the set |
1376 | // and construction of the set panics if the capacity would |
1377 | // permit storing invalid pattern IDs. Thus, 'yes' is only true |
1378 | // precisely when 'index' corresponds to a valid 'PatternID'. |
1379 | return Some(PatternID::new_unchecked(index)); |
1380 | } |
1381 | } |
1382 | None |
1383 | } |
1384 | |
1385 | fn size_hint(&self) -> (usize, Option<usize>) { |
1386 | self.it.size_hint() |
1387 | } |
1388 | } |
1389 | |
1390 | #[cfg (feature = "alloc" )] |
1391 | impl<'a> DoubleEndedIterator for PatternSetIter<'a> { |
1392 | fn next_back(&mut self) -> Option<PatternID> { |
1393 | while let Some((index: usize, &yes: bool)) = self.it.next_back() { |
1394 | if yes { |
1395 | // Only valid 'PatternID' values can be inserted into the set |
1396 | // and construction of the set panics if the capacity would |
1397 | // permit storing invalid pattern IDs. Thus, 'yes' is only true |
1398 | // precisely when 'index' corresponds to a valid 'PatternID'. |
1399 | return Some(PatternID::new_unchecked(index)); |
1400 | } |
1401 | } |
1402 | None |
1403 | } |
1404 | } |
1405 | |
1406 | /// The type of anchored search to perform. |
1407 | /// |
1408 | /// This is *almost* a boolean option. That is, you can either do an unanchored |
1409 | /// search for any pattern in a regex, or you can do an anchored search for any |
1410 | /// pattern in a regex. |
1411 | /// |
1412 | /// A third option exists that, assuming the regex engine supports it, permits |
1413 | /// you to do an anchored search for a specific pattern. |
1414 | /// |
1415 | /// Note that there is no way to run an unanchored search for a specific |
1416 | /// pattern. If you need that, you'll need to build separate regexes for each |
1417 | /// pattern. |
1418 | /// |
1419 | /// # Errors |
1420 | /// |
1421 | /// If a regex engine does not support the anchored mode selected, then the |
1422 | /// regex engine will return an error. While any non-trivial regex engine |
1423 | /// should support at least one of the available anchored modes, there is no |
1424 | /// singular mode that is guaranteed to be universally supported. Some regex |
1425 | /// engines might only support unanchored searches (DFAs compiled without |
1426 | /// anchored starting states) and some regex engines might only support |
1427 | /// anchored searches (like the one-pass DFA). |
1428 | /// |
1429 | /// The specific error returned is a [`MatchError`] with a |
1430 | /// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the |
1431 | /// `Anchored` value given that is unsupported. |
1432 | /// |
1433 | /// Note that regex engines should report "no match" if, for example, an |
1434 | /// `Anchored::Pattern` is provided with an invalid pattern ID _but_ where |
1435 | /// anchored searches for a specific pattern are supported. This is smooths out |
1436 | /// behavior such that it's possible to guarantee that an error never occurs |
1437 | /// based on how the regex engine is configured. All regex engines in this |
1438 | /// crate report "no match" when searching for an invalid pattern ID, but where |
1439 | /// searching for a valid pattern ID is otherwise supported. |
1440 | /// |
1441 | /// # Example |
1442 | /// |
1443 | /// This example shows how to use the various `Anchored` modes to run a |
1444 | /// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) |
1445 | /// because it supports all modes unconditionally. Some regex engines, like |
1446 | /// the [`onepass::DFA`](crate::dfa::onepass::DFA) cannot support unanchored |
1447 | /// searches. |
1448 | /// |
1449 | /// ``` |
1450 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1451 | /// use regex_automata::{ |
1452 | /// nfa::thompson::pikevm::PikeVM, |
1453 | /// Anchored, Input, Match, PatternID, |
1454 | /// }; |
1455 | /// |
1456 | /// let re = PikeVM::new_many(&[ |
1457 | /// r"Mrs. \w+" , |
1458 | /// r"Miss \w+" , |
1459 | /// r"Mr. \w+" , |
1460 | /// r"Ms. \w+" , |
1461 | /// ])?; |
1462 | /// let mut cache = re.create_cache(); |
1463 | /// let hay = "Hello Mr. Springsteen!" ; |
1464 | /// |
1465 | /// // The default is to do an unanchored search. |
1466 | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); |
1467 | /// // Explicitly ask for an unanchored search. Same as above. |
1468 | /// let input = Input::new(hay).anchored(Anchored::No); |
1469 | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); |
1470 | /// |
1471 | /// // Now try an anchored search. Since the match doesn't start at the |
1472 | /// // beginning of the haystack, no match is found! |
1473 | /// let input = Input::new(hay).anchored(Anchored::Yes); |
1474 | /// assert_eq!(None, re.find(&mut cache, input)); |
1475 | /// |
1476 | /// // We can try an anchored search again, but move the location of where |
1477 | /// // we start the search. Note that the offsets reported are still in |
1478 | /// // terms of the overall haystack and not relative to where we started |
1479 | /// // the search. |
1480 | /// let input = Input::new(hay).anchored(Anchored::Yes).range(6..); |
1481 | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); |
1482 | /// |
1483 | /// // Now try an anchored search for a specific pattern. We specifically |
1484 | /// // choose a pattern that we know doesn't match to prove that the search |
1485 | /// // only looks for the pattern we provide. |
1486 | /// let input = Input::new(hay) |
1487 | /// .anchored(Anchored::Pattern(PatternID::must(1))) |
1488 | /// .range(6..); |
1489 | /// assert_eq!(None, re.find(&mut cache, input)); |
1490 | /// |
1491 | /// // But if we switch it to the pattern that we know matches, then we find |
1492 | /// // the match. |
1493 | /// let input = Input::new(hay) |
1494 | /// .anchored(Anchored::Pattern(PatternID::must(2))) |
1495 | /// .range(6..); |
1496 | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); |
1497 | /// |
1498 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1499 | /// ``` |
1500 | #[derive (Clone, Copy, Debug, Eq, PartialEq)] |
1501 | pub enum Anchored { |
1502 | /// Run an unanchored search. This means a match may occur anywhere at or |
1503 | /// after the start position of the search. |
1504 | /// |
1505 | /// This search can return a match for any pattern in the regex. |
1506 | No, |
1507 | /// Run an anchored search. This means that a match must begin at the |
1508 | /// start position of the search. |
1509 | /// |
1510 | /// This search can return a match for any pattern in the regex. |
1511 | Yes, |
1512 | /// Run an anchored search for a specific pattern. This means that a match |
1513 | /// must be for the given pattern and must begin at the start position of |
1514 | /// the search. |
1515 | Pattern(PatternID), |
1516 | } |
1517 | |
1518 | impl Anchored { |
1519 | /// Returns true if and only if this anchor mode corresponds to any kind of |
1520 | /// anchored search. |
1521 | /// |
1522 | /// # Example |
1523 | /// |
1524 | /// This examples shows that both `Anchored::Yes` and `Anchored::Pattern` |
1525 | /// are considered anchored searches. |
1526 | /// |
1527 | /// ``` |
1528 | /// use regex_automata::{Anchored, PatternID}; |
1529 | /// |
1530 | /// assert!(!Anchored::No.is_anchored()); |
1531 | /// assert!(Anchored::Yes.is_anchored()); |
1532 | /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored()); |
1533 | /// ``` |
1534 | #[inline ] |
1535 | pub fn is_anchored(&self) -> bool { |
1536 | matches!(*self, Anchored::Yes | Anchored::Pattern(_)) |
1537 | } |
1538 | |
1539 | /// Returns the pattern ID associated with this configuration if it is an |
1540 | /// anchored search for a specific pattern. Otherwise `None` is returned. |
1541 | /// |
1542 | /// # Example |
1543 | /// |
1544 | /// ``` |
1545 | /// use regex_automata::{Anchored, PatternID}; |
1546 | /// |
1547 | /// assert_eq!(None, Anchored::No.pattern()); |
1548 | /// assert_eq!(None, Anchored::Yes.pattern()); |
1549 | /// |
1550 | /// let pid = PatternID::must(5); |
1551 | /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern()); |
1552 | /// ``` |
1553 | #[inline ] |
1554 | pub fn pattern(&self) -> Option<PatternID> { |
1555 | match *self { |
1556 | Anchored::Pattern(pid) => Some(pid), |
1557 | _ => None, |
1558 | } |
1559 | } |
1560 | } |
1561 | |
1562 | /// The kind of match semantics to use for a regex pattern. |
1563 | /// |
1564 | /// The default match kind is `LeftmostFirst`, and this corresponds to the |
1565 | /// match semantics used by most backtracking engines, such as Perl. |
1566 | /// |
1567 | /// # Leftmost first or "preference order" match semantics |
1568 | /// |
1569 | /// Leftmost-first semantics determine which match to report when there are |
1570 | /// multiple paths through a regex that match at the same position. The tie is |
1571 | /// essentially broken by how a backtracker would behave. For example, consider |
1572 | /// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this |
1573 | /// case, both the `foofoo` and `foo` branches match at position `0`. So should |
1574 | /// the end of the match be `3` or `6`? |
1575 | /// |
1576 | /// A backtracker will conceptually work by trying `foofoofoo` and failing. |
1577 | /// Then it will try `foofoo`, find the match and stop there. Thus, the |
1578 | /// leftmost-first match position is `6`. This is called "leftmost-first" or |
1579 | /// "preference order" because the order of the branches as written in the |
1580 | /// regex pattern is what determines how to break the tie. |
1581 | /// |
1582 | /// (Note that leftmost-longest match semantics, which break ties by always |
1583 | /// taking the longest matching string, are not currently supported by this |
1584 | /// crate. These match semantics tend to be found in POSIX regex engines.) |
1585 | /// |
1586 | /// This example shows how leftmost-first semantics work, and how it even |
1587 | /// applies to multi-pattern regexes: |
1588 | /// |
1589 | /// ``` |
1590 | /// use regex_automata::{ |
1591 | /// nfa::thompson::pikevm::PikeVM, |
1592 | /// Match, |
1593 | /// }; |
1594 | /// |
1595 | /// let re = PikeVM::new_many(&[ |
1596 | /// r"foofoofoo" , |
1597 | /// r"foofoo" , |
1598 | /// r"foo" , |
1599 | /// ])?; |
1600 | /// let mut cache = re.create_cache(); |
1601 | /// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo" ).collect(); |
1602 | /// let expected = vec![Match::must(1, 0..6)]; |
1603 | /// assert_eq!(expected, got); |
1604 | /// |
1605 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1606 | /// ``` |
1607 | /// |
1608 | /// # All matches |
1609 | /// |
1610 | /// The `All` match semantics report any and all matches, and generally will |
1611 | /// attempt to match as much as possible. It doesn't respect any sort of match |
1612 | /// priority at all, so things like non-greedy matching don't work in this |
1613 | /// mode. |
1614 | /// |
1615 | /// The fact that non-greedy matching doesn't work generally makes most forms |
1616 | /// of unanchored non-overlapping searches have unintuitive behavior. Namely, |
1617 | /// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the |
1618 | /// beginning of the pattern, which is specifically non-greedy. Since it will |
1619 | /// be treated as greedy in `All` match semantics, this generally means that |
1620 | /// it will first attempt to consume all of the haystack and is likely to wind |
1621 | /// up skipping matches. |
1622 | /// |
1623 | /// Generally speaking, `All` should only be used in two circumstances: |
1624 | /// |
1625 | /// * When running an anchored search and there is a desire to match as much as |
1626 | /// possible. For example, when building a reverse regex matcher to find the |
1627 | /// start of a match after finding the end. In this case, the reverse search |
1628 | /// is anchored to the end of the match found by the forward search. |
1629 | /// * When running overlapping searches. Since `All` encodes all possible |
1630 | /// matches, this is generally what you want for an overlapping search. If you |
1631 | /// try to use leftmost-first in an overlapping search, it is likely to produce |
1632 | /// counter-intuitive results since leftmost-first specifically excludes some |
1633 | /// matches from its underlying finite state machine. |
1634 | /// |
1635 | /// This example demonstrates the counter-intuitive behavior of `All` semantics |
1636 | /// when using a standard leftmost unanchored search: |
1637 | /// |
1638 | /// ``` |
1639 | /// use regex_automata::{ |
1640 | /// nfa::thompson::pikevm::PikeVM, |
1641 | /// Match, MatchKind, |
1642 | /// }; |
1643 | /// |
1644 | /// let re = PikeVM::builder() |
1645 | /// .configure(PikeVM::config().match_kind(MatchKind::All)) |
1646 | /// .build("foo" )?; |
1647 | /// let hay = "first foo second foo wat" ; |
1648 | /// let mut cache = re.create_cache(); |
1649 | /// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect(); |
1650 | /// // Notice that it completely skips the first 'foo'! |
1651 | /// let expected = vec![Match::must(0, 17..20)]; |
1652 | /// assert_eq!(expected, got); |
1653 | /// |
1654 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1655 | /// ``` |
1656 | /// |
1657 | /// This second example shows how `All` semantics are useful for an overlapping |
1658 | /// search. Note that we use lower level lazy DFA APIs here since the NFA |
1659 | /// engines only currently support a very limited form of overlapping search. |
1660 | /// |
1661 | /// ``` |
1662 | /// use regex_automata::{ |
1663 | /// hybrid::dfa::{DFA, OverlappingState}, |
1664 | /// HalfMatch, Input, MatchKind, |
1665 | /// }; |
1666 | /// |
1667 | /// let re = DFA::builder() |
1668 | /// // If we didn't set 'All' semantics here, then the regex would only |
1669 | /// // match 'foo' at offset 3 and nothing else. Why? Because the state |
1670 | /// // machine implements preference order and knows that the 'foofoo' and |
1671 | /// // 'foofoofoo' branches can never match since 'foo' will always match |
1672 | /// // when they match and take priority. |
1673 | /// .configure(DFA::config().match_kind(MatchKind::All)) |
1674 | /// .build(r"foo|foofoo|foofoofoo" )?; |
1675 | /// let mut cache = re.create_cache(); |
1676 | /// let mut state = OverlappingState::start(); |
1677 | /// let input = Input::new("foofoofoo" ); |
1678 | /// let mut got = vec![]; |
1679 | /// loop { |
1680 | /// re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?; |
1681 | /// let m = match state.get_match() { |
1682 | /// None => break, |
1683 | /// Some(m) => m, |
1684 | /// }; |
1685 | /// got.push(m); |
1686 | /// } |
1687 | /// let expected = vec![ |
1688 | /// HalfMatch::must(0, 3), |
1689 | /// HalfMatch::must(0, 6), |
1690 | /// HalfMatch::must(0, 9), |
1691 | /// ]; |
1692 | /// assert_eq!(expected, got); |
1693 | /// |
1694 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1695 | /// ``` |
1696 | #[non_exhaustive ] |
1697 | #[derive (Clone, Copy, Debug, Eq, PartialEq)] |
1698 | pub enum MatchKind { |
1699 | /// Report all possible matches. |
1700 | All, |
1701 | /// Report only the leftmost matches. When multiple leftmost matches exist, |
1702 | /// report the match corresponding to the part of the regex that appears |
1703 | /// first in the syntax. |
1704 | LeftmostFirst, |
1705 | // There is prior art in RE2 that shows that we should be able to add |
1706 | // LeftmostLongest too. The tricky part of it is supporting ungreedy |
1707 | // repetitions. Instead of treating all NFA states as having equivalent |
1708 | // priority (as in 'All') or treating all NFA states as having distinct |
1709 | // priority based on order (as in 'LeftmostFirst'), we instead group NFA |
1710 | // states into sets, and treat members of each set as having equivalent |
1711 | // priority, but having greater priority than all following members |
1712 | // of different sets. |
1713 | // |
1714 | // However, it's not clear whether it's really worth adding this. After |
1715 | // all, leftmost-longest can be emulated when using literals by using |
1716 | // leftmost-first and sorting the literals by length in descending order. |
1717 | // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will |
1718 | // always match `a` in `ab` when using leftmost-first, but leftmost-longest |
1719 | // would match `ab`. |
1720 | } |
1721 | |
1722 | impl MatchKind { |
1723 | #[cfg (feature = "alloc" )] |
1724 | pub(crate) fn continue_past_first_match(&self) -> bool { |
1725 | *self == MatchKind::All |
1726 | } |
1727 | } |
1728 | |
1729 | impl Default for MatchKind { |
1730 | fn default() -> MatchKind { |
1731 | MatchKind::LeftmostFirst |
1732 | } |
1733 | } |
1734 | |
1735 | /// An error indicating that a search stopped before reporting whether a |
1736 | /// match exists or not. |
1737 | /// |
1738 | /// To be very clear, this error type implies that one cannot assume that no |
1739 | /// matches occur, since the search stopped before completing. That is, if |
1740 | /// you're looking for information about where a search determined that no |
1741 | /// match can occur, then this error type does *not* give you that. (Indeed, at |
1742 | /// the time of writing, if you need such a thing, you have to write your own |
1743 | /// search routine.) |
1744 | /// |
1745 | /// Normally, when one searches for something, the response is either an |
1746 | /// affirmative "it was found at this location" or a negative "not found at |
1747 | /// all." However, in some cases, a regex engine can be configured to stop its |
1748 | /// search before concluding whether a match exists or not. When this happens, |
1749 | /// it may be important for the caller to know why the regex engine gave up and |
1750 | /// where in the input it gave up at. This error type exposes the 'why' and the |
1751 | /// 'where.' |
1752 | /// |
1753 | /// For example, the DFAs provided by this library generally cannot correctly |
1754 | /// implement Unicode word boundaries. Instead, they provide an option to |
1755 | /// eagerly support them on ASCII text (since Unicode word boundaries are |
1756 | /// equivalent to ASCII word boundaries when searching ASCII text), but will |
1757 | /// "give up" if a non-ASCII byte is seen. In such cases, one is usually |
1758 | /// required to either report the failure to the caller (unergonomic) or |
1759 | /// otherwise fall back to some other regex engine (ergonomic, but potentially |
1760 | /// costly). |
1761 | /// |
1762 | /// More generally, some regex engines offer the ability for callers to specify |
1763 | /// certain bytes that will trigger the regex engine to automatically quit if |
1764 | /// they are seen. |
1765 | /// |
1766 | /// Still yet, there may be other reasons for a failed match. For example, |
1767 | /// the hybrid DFA provided by this crate can be configured to give up if it |
1768 | /// believes that it is not efficient. This in turn permits callers to choose a |
1769 | /// different regex engine. |
1770 | /// |
1771 | /// (Note that DFAs are configured by default to never quit or give up in this |
1772 | /// fashion. For example, by default, a DFA will fail to build if the regex |
1773 | /// pattern contains a Unicode word boundary. One needs to opt into the "quit" |
1774 | /// behavior via options, like |
1775 | /// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).) |
1776 | /// |
1777 | /// There are a couple other ways a search |
1778 | /// can fail. For example, when using the |
1779 | /// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker) |
1780 | /// with a haystack that is too long, or trying to run an unanchored search |
1781 | /// with a [one-pass DFA](crate::dfa::onepass). |
1782 | #[derive (Clone, Debug, Eq, PartialEq)] |
1783 | pub struct MatchError( |
1784 | #[cfg (feature = "alloc" )] alloc::boxed::Box<MatchErrorKind>, |
1785 | #[cfg (not(feature = "alloc" ))] MatchErrorKind, |
1786 | ); |
1787 | |
1788 | impl MatchError { |
1789 | /// Create a new error value with the given kind. |
1790 | /// |
1791 | /// This is a more verbose version of the kind-specific constructors, |
1792 | /// e.g., `MatchError::quit`. |
1793 | pub fn new(kind: MatchErrorKind) -> MatchError { |
1794 | #[cfg (feature = "alloc" )] |
1795 | { |
1796 | MatchError(alloc::boxed::Box::new(kind)) |
1797 | } |
1798 | #[cfg (not(feature = "alloc" ))] |
1799 | { |
1800 | MatchError(kind) |
1801 | } |
1802 | } |
1803 | |
1804 | /// Returns a reference to the underlying error kind. |
1805 | pub fn kind(&self) -> &MatchErrorKind { |
1806 | &self.0 |
1807 | } |
1808 | |
1809 | /// Create a new "quit" error. The given `byte` corresponds to the value |
1810 | /// that tripped a search's quit condition, and `offset` corresponds to the |
1811 | /// location in the haystack at which the search quit. |
1812 | /// |
1813 | /// This is the same as calling `MatchError::new` with a |
1814 | /// [`MatchErrorKind::Quit`] kind. |
1815 | pub fn quit(byte: u8, offset: usize) -> MatchError { |
1816 | MatchError::new(MatchErrorKind::Quit { byte, offset }) |
1817 | } |
1818 | |
1819 | /// Create a new "gave up" error. The given `offset` corresponds to the |
1820 | /// location in the haystack at which the search gave up. |
1821 | /// |
1822 | /// This is the same as calling `MatchError::new` with a |
1823 | /// [`MatchErrorKind::GaveUp`] kind. |
1824 | pub fn gave_up(offset: usize) -> MatchError { |
1825 | MatchError::new(MatchErrorKind::GaveUp { offset }) |
1826 | } |
1827 | |
1828 | /// Create a new "haystack too long" error. The given `len` corresponds to |
1829 | /// the length of the haystack that was problematic. |
1830 | /// |
1831 | /// This is the same as calling `MatchError::new` with a |
1832 | /// [`MatchErrorKind::HaystackTooLong`] kind. |
1833 | pub fn haystack_too_long(len: usize) -> MatchError { |
1834 | MatchError::new(MatchErrorKind::HaystackTooLong { len }) |
1835 | } |
1836 | |
1837 | /// Create a new "unsupported anchored" error. This occurs when the caller |
1838 | /// requests a search with an anchor mode that is not supported by the |
1839 | /// regex engine. |
1840 | /// |
1841 | /// This is the same as calling `MatchError::new` with a |
1842 | /// [`MatchErrorKind::UnsupportedAnchored`] kind. |
1843 | pub fn unsupported_anchored(mode: Anchored) -> MatchError { |
1844 | MatchError::new(MatchErrorKind::UnsupportedAnchored { mode }) |
1845 | } |
1846 | } |
1847 | |
1848 | /// The underlying kind of a [`MatchError`]. |
1849 | /// |
1850 | /// This is a **non-exhaustive** enum. That means new variants may be added in |
1851 | /// a semver-compatible release. |
1852 | #[non_exhaustive ] |
1853 | #[derive (Clone, Debug, Eq, PartialEq)] |
1854 | pub enum MatchErrorKind { |
1855 | /// The search saw a "quit" byte at which it was instructed to stop |
1856 | /// searching. |
1857 | Quit { |
1858 | /// The "quit" byte that was observed that caused the search to stop. |
1859 | byte: u8, |
1860 | /// The offset at which the quit byte was observed. |
1861 | offset: usize, |
1862 | }, |
1863 | /// The search, based on heuristics, determined that it would be better |
1864 | /// to stop, typically to provide the caller an opportunity to use an |
1865 | /// alternative regex engine. |
1866 | /// |
1867 | /// Currently, the only way for this to occur is via the lazy DFA and |
1868 | /// only when it is configured to do so (it will not return this error by |
1869 | /// default). |
1870 | GaveUp { |
1871 | /// The offset at which the search stopped. This corresponds to the |
1872 | /// position immediately following the last byte scanned. |
1873 | offset: usize, |
1874 | }, |
1875 | /// This error occurs if the haystack given to the regex engine was too |
1876 | /// long to be searched. This occurs, for example, with regex engines |
1877 | /// like the bounded backtracker that have a configurable fixed amount of |
1878 | /// capacity that is tied to the length of the haystack. Anything beyond |
1879 | /// that configured limit will result in an error at search time. |
1880 | HaystackTooLong { |
1881 | /// The length of the haystack that exceeded the limit. |
1882 | len: usize, |
1883 | }, |
1884 | /// An error indicating that a particular type of anchored search was |
1885 | /// requested, but that the regex engine does not support it. |
1886 | /// |
1887 | /// Note that this error should not be returned by a regex engine simply |
1888 | /// because the pattern ID is invalid (i.e., equal to or exceeds the number |
1889 | /// of patterns in the regex). In that case, the regex engine should report |
1890 | /// a non-match. |
1891 | UnsupportedAnchored { |
1892 | /// The anchored mode given that is unsupported. |
1893 | mode: Anchored, |
1894 | }, |
1895 | } |
1896 | |
1897 | #[cfg (feature = "std" )] |
1898 | impl std::error::Error for MatchError {} |
1899 | |
1900 | impl core::fmt::Display for MatchError { |
1901 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
1902 | match *self.kind() { |
1903 | MatchErrorKind::Quit { byte, offset } => write!( |
1904 | f, |
1905 | "quit search after observing byte {:?} at offset {}" , |
1906 | DebugByte(byte), |
1907 | offset, |
1908 | ), |
1909 | MatchErrorKind::GaveUp { offset } => { |
1910 | write!(f, "gave up searching at offset {}" , offset) |
1911 | } |
1912 | MatchErrorKind::HaystackTooLong { len } => { |
1913 | write!(f, "haystack of length {} is too long" , len) |
1914 | } |
1915 | MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => { |
1916 | write!(f, "anchored searches are not supported or enabled" ) |
1917 | } |
1918 | MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => { |
1919 | write!(f, "unanchored searches are not supported or enabled" ) |
1920 | } |
1921 | MatchErrorKind::UnsupportedAnchored { |
1922 | mode: Anchored::Pattern(pid), |
1923 | } => { |
1924 | write!( |
1925 | f, |
1926 | "anchored searches for a specific pattern ( {}) are \ |
1927 | not supported or enabled" , |
1928 | pid.as_usize(), |
1929 | ) |
1930 | } |
1931 | } |
1932 | } |
1933 | } |
1934 | |
1935 | #[cfg (test)] |
1936 | mod tests { |
1937 | use super::*; |
1938 | |
1939 | // We test that our 'MatchError' type is the size we expect. This isn't an |
1940 | // API guarantee, but if the size increases, we really want to make sure we |
1941 | // decide to do that intentionally. So this should be a speed bump. And in |
1942 | // general, we should not increase the size without a very good reason. |
1943 | // |
1944 | // Why? Because low level search APIs return Result<.., MatchError>. When |
1945 | // MatchError gets bigger, so to does the Result type. |
1946 | // |
1947 | // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes |
1948 | // the importance of keeping a small error type. But without 'alloc', we |
1949 | // still want things to be small. |
1950 | #[test ] |
1951 | fn match_error_size() { |
1952 | let expected_size = if cfg!(feature = "alloc" ) { |
1953 | core::mem::size_of::<usize>() |
1954 | } else { |
1955 | 2 * core::mem::size_of::<usize>() |
1956 | }; |
1957 | assert_eq!(expected_size, core::mem::size_of::<MatchError>()); |
1958 | } |
1959 | |
1960 | // Same as above, but for the underlying match error kind. |
1961 | #[cfg (target_pointer_width = "64" )] |
1962 | #[test ] |
1963 | fn match_error_kind_size() { |
1964 | let expected_size = 2 * core::mem::size_of::<usize>(); |
1965 | assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); |
1966 | } |
1967 | |
1968 | #[cfg (target_pointer_width = "32" )] |
1969 | #[test ] |
1970 | fn match_error_kind_size() { |
1971 | let expected_size = 3 * core::mem::size_of::<usize>(); |
1972 | assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); |
1973 | } |
1974 | |
1975 | #[test ] |
1976 | fn incorrect_asref_guard() { |
1977 | struct Bad(std::cell::Cell<bool>); |
1978 | |
1979 | impl AsRef<[u8]> for Bad { |
1980 | fn as_ref(&self) -> &[u8] { |
1981 | if self.0.replace(false) { |
1982 | &[] |
1983 | } else { |
1984 | &[0; 1000] |
1985 | } |
1986 | } |
1987 | } |
1988 | |
1989 | let bad = Bad(std::cell::Cell::new(true)); |
1990 | let input = Input::new(&bad); |
1991 | assert!(input.end() <= input.haystack().len()); |
1992 | } |
1993 | } |
1994 | |