1/*!
2Types and routines that support the search APIs of most regex engines.
3
4This sub-module isn't exposed directly, but rather, its contents are exported
5at the crate root due to the universality of most of the types and routines in
6this module.
7*/
8
9use core::ops::{Range, RangeBounds};
10
11use crate::util::{escape::DebugByte, primitives::PatternID, utf8};
12
13/// The parameters for a regex search including the haystack to search.
14///
15/// It turns out that regex searches have a few parameters, and in most cases,
16/// those parameters have defaults that work in the vast majority of cases.
17/// This `Input` type exists to make that common case seamnless while also
18/// providing an avenue for changing the parameters of a search. In particular,
19/// this type enables doing so without a combinatorial explosion of different
20/// methods and/or superfluous parameters in the common cases.
21///
22/// An `Input` permits configuring the following things:
23///
24/// * Search only a substring of a haystack, while taking the broader context
25/// into account for resolving look-around assertions.
26/// * Indicating whether to search for all patterns in a regex, or to
27/// only search for one pattern in particular.
28/// * Whether to perform an anchored on unanchored search.
29/// * Whether to report a match as early as possible.
30///
31/// All of these parameters, except for the haystack, have sensible default
32/// values. This means that the minimal search configuration is simply a call
33/// to [`Input::new`] with your haystack. Setting any other parameter is
34/// optional.
35///
36/// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a
37/// `From<H> for Input` implementation. This is useful because many of the
38/// search APIs in this crate accept an `Into<Input>`. This means you can
39/// provide string or byte strings to these routines directly, and they'll
40/// automatically get converted into an `Input` for you.
41///
42/// The lifetime parameter `'h` refers to the lifetime of the haystack.
43///
44/// # Organization
45///
46/// The API of `Input` is split into a few different parts:
47///
48/// * A builder-like API that transforms a `Input` by value. Examples:
49/// [`Input::span`] and [`Input::anchored`].
50/// * A setter API that permits mutating parameters in place. Examples:
51/// [`Input::set_span`] and [`Input::set_anchored`].
52/// * A getter API that permits retrieving any of the search parameters.
53/// Examples: [`Input::get_span`] and [`Input::get_anchored`].
54/// * A few convenience getter routines that don't conform to the above naming
55/// pattern due to how common they are. Examples: [`Input::haystack`],
56/// [`Input::start`] and [`Input::end`].
57/// * Miscellaneous predicates and other helper routines that are useful
58/// in some contexts. Examples: [`Input::is_char_boundary`].
59///
60/// A `Input` exposes so much because it is meant to be used by both callers of
61/// regex engines _and_ implementors of regex engines. A constraining factor is
62/// that regex engines should accept a `&Input` as its lowest level API, which
63/// means that implementors should only use the "getter" APIs of a `Input`.
64///
65/// # Valid bounds and search termination
66///
67/// An `Input` permits setting the bounds of a search via either
68/// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or
69/// else a panic will occur. Bounds are valid if and only if:
70///
71/// * The bounds represent a valid range into the input's haystack.
72/// * **or** the end bound is a valid ending bound for the haystack *and*
73/// the start bound is exactly one greater than the start bound.
74///
75/// In the latter case, [`Input::is_done`] will return true and indicates any
76/// search receiving such an input should immediately return with no match.
77///
78/// Note that while `Input` is used for reverse searches in this crate, the
79/// `Input::is_done` predicate assumes a forward search. Because unsigned
80/// offsets are used internally, there is no way to tell from only the offsets
81/// whether a reverse search is done or not.
82///
83/// # Regex engine support
84///
85/// Any regex engine accepting an `Input` must support at least the following
86/// things:
87///
88/// * Searching a `&[u8]` for matches.
89/// * Searching a substring of `&[u8]` for a match, such that any match
90/// reported must appear entirely within that substring.
91/// * For a forwards search, a match should never be reported when
92/// [`Input::is_done`] returns true. (For reverse searches, termination should
93/// be handled outside of `Input`.)
94///
95/// Supporting other aspects of an `Input` are optional, but regex engines
96/// should handle aspects they don't support gracefully. How this is done is
97/// generally up to the regex engine. This crate generally treats unsupported
98/// anchored modes as an error to report for example, but for simplicity, in
99/// the meta regex engine, trying to search with an invalid pattern ID just
100/// results in no match being reported.
101#[derive(Clone)]
102pub struct Input<'h> {
103 haystack: &'h [u8],
104 span: Span,
105 anchored: Anchored,
106 earliest: bool,
107}
108
109impl<'h> Input<'h> {
110 /// Create a new search configuration for the given haystack.
111 #[inline]
112 pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> {
113 // Perform only one call to `haystack.as_ref()` to protect from incorrect
114 // implementations that return different values from multiple calls.
115 // This is important because there's code that relies on `span` not being
116 // out of bounds with respect to the stored `haystack`.
117 let haystack = haystack.as_ref();
118 Input {
119 haystack,
120 span: Span { start: 0, end: haystack.len() },
121 anchored: Anchored::No,
122 earliest: false,
123 }
124 }
125
126 /// Set the span for this search.
127 ///
128 /// This routine does not panic if the span given is not a valid range for
129 /// this search's haystack. If this search is run with an invalid range,
130 /// then the most likely outcome is that the actual search execution will
131 /// panic.
132 ///
133 /// This routine is generic over how a span is provided. While
134 /// a [`Span`] may be given directly, one may also provide a
135 /// `std::ops::Range<usize>`. To provide anything supported by range
136 /// syntax, use the [`Input::range`] method.
137 ///
138 /// The default span is the entire haystack.
139 ///
140 /// Note that [`Input::range`] overrides this method and vice versa.
141 ///
142 /// # Panics
143 ///
144 /// This panics if the given span does not correspond to valid bounds in
145 /// the haystack or the termination of a search.
146 ///
147 /// # Example
148 ///
149 /// This example shows how the span of the search can impact whether a
150 /// match is reported or not. This is particularly relevant for look-around
151 /// operators, which might take things outside of the span into account
152 /// when determining whether they match.
153 ///
154 /// ```
155 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
156 /// use regex_automata::{
157 /// nfa::thompson::pikevm::PikeVM,
158 /// Match, Input,
159 /// };
160 ///
161 /// // Look for 'at', but as a distinct word.
162 /// let re = PikeVM::new(r"\bat\b")?;
163 /// let mut cache = re.create_cache();
164 /// let mut caps = re.create_captures();
165 ///
166 /// // Our haystack contains 'at', but not as a distinct word.
167 /// let haystack = "batter";
168 ///
169 /// // A standard search finds nothing, as expected.
170 /// let input = Input::new(haystack);
171 /// re.search(&mut cache, &input, &mut caps);
172 /// assert_eq!(None, caps.get_match());
173 ///
174 /// // But if we wanted to search starting at position '1', we might
175 /// // slice the haystack. If we do this, it's impossible for the \b
176 /// // anchors to take the surrounding context into account! And thus,
177 /// // a match is produced.
178 /// let input = Input::new(&haystack[1..3]);
179 /// re.search(&mut cache, &input, &mut caps);
180 /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match());
181 ///
182 /// // But if we specify the span of the search instead of slicing the
183 /// // haystack, then the regex engine can "see" outside of the span
184 /// // and resolve the anchors correctly.
185 /// let input = Input::new(haystack).span(1..3);
186 /// re.search(&mut cache, &input, &mut caps);
187 /// assert_eq!(None, caps.get_match());
188 ///
189 /// # Ok::<(), Box<dyn std::error::Error>>(())
190 /// ```
191 ///
192 /// This may seem a little ham-fisted, but this scenario tends to come up
193 /// if some other regex engine found the match span and now you need to
194 /// re-process that span to look for capturing groups. (e.g., Run a faster
195 /// DFA first, find a match, then run the PikeVM on just the match span to
196 /// resolve capturing groups.) In order to implement that sort of logic
197 /// correctly, you need to set the span on the search instead of slicing
198 /// the haystack directly.
199 ///
200 /// The other advantage of using this routine to specify the bounds of the
201 /// search is that the match offsets are still reported in terms of the
202 /// original haystack. For example, the second search in the example above
203 /// reported a match at position `0`, even though `at` starts at offset
204 /// `1` because we sliced the haystack.
205 #[inline]
206 pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> {
207 self.set_span(span);
208 self
209 }
210
211 /// Like `Input::span`, but accepts any range instead.
212 ///
213 /// This routine does not panic if the range given is not a valid range for
214 /// this search's haystack. If this search is run with an invalid range,
215 /// then the most likely outcome is that the actual search execution will
216 /// panic.
217 ///
218 /// The default range is the entire haystack.
219 ///
220 /// Note that [`Input::span`] overrides this method and vice versa.
221 ///
222 /// # Panics
223 ///
224 /// This routine will panic if the given range could not be converted
225 /// to a valid [`Range`]. For example, this would panic when given
226 /// `0..=usize::MAX` since it cannot be represented using a half-open
227 /// interval in terms of `usize`.
228 ///
229 /// This also panics if the given range does not correspond to valid bounds
230 /// in the haystack or the termination of a search.
231 ///
232 /// # Example
233 ///
234 /// ```
235 /// use regex_automata::Input;
236 ///
237 /// let input = Input::new("foobar");
238 /// assert_eq!(0..6, input.get_range());
239 ///
240 /// let input = Input::new("foobar").range(2..=4);
241 /// assert_eq!(2..5, input.get_range());
242 /// ```
243 #[inline]
244 pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> {
245 self.set_range(range);
246 self
247 }
248
249 /// Sets the anchor mode of a search.
250 ///
251 /// When a search is anchored (so that's [`Anchored::Yes`] or
252 /// [`Anchored::Pattern`]), a match must begin at the start of a search.
253 /// When a search is not anchored (that's [`Anchored::No`]), regex engines
254 /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix
255 /// permits a match to appear anywhere.
256 ///
257 /// By default, the anchored mode is [`Anchored::No`].
258 ///
259 /// **WARNING:** this is subtly different than using a `^` at the start of
260 /// your regex. A `^` forces a regex to match exclusively at the start of
261 /// a haystack, regardless of where you begin your search. In contrast,
262 /// anchoring a search will allow your regex to match anywhere in your
263 /// haystack, but the match must start at the beginning of a search.
264 ///
265 /// For example, consider the haystack `aba` and the following searches:
266 ///
267 /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba`
268 /// starting at position `2`. Since `^` requires the match to start at
269 /// the beginning of the haystack and `2 > 0`, no match is found.
270 /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
271 /// starting at position `2`. This reports a match at `[2, 3]` since
272 /// the match starts where the search started. Since there is no `^`,
273 /// there is no requirement for the match to start at the beginning of
274 /// the haystack.
275 /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
276 /// starting at position `1`. Since `b` corresponds to position `1` and
277 /// since the search is anchored, it finds no match. While the regex
278 /// matches at other positions, configuring the search to be anchored
279 /// requires that it only report a match that begins at the same offset
280 /// as the beginning of the search.
281 /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba`
282 /// starting at position `1`. Since the search is not anchored and
283 /// the regex does not start with `^`, the search executes as if there
284 /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it
285 /// reports a match at `[2, 3]`.
286 ///
287 /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`,
288 /// except it only reports matches for a particular pattern.
289 ///
290 /// # Example
291 ///
292 /// This demonstrates the differences between an anchored search and
293 /// a pattern that begins with `^` (as described in the above warning
294 /// message).
295 ///
296 /// ```
297 /// use regex_automata::{
298 /// nfa::thompson::pikevm::PikeVM,
299 /// Anchored, Match, Input,
300 /// };
301 ///
302 /// let haystack = "aba";
303 ///
304 /// let re = PikeVM::new(r"^a")?;
305 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
306 /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No);
307 /// re.search(&mut cache, &input, &mut caps);
308 /// // No match is found because 2 is not the beginning of the haystack,
309 /// // which is what ^ requires.
310 /// assert_eq!(None, caps.get_match());
311 ///
312 /// let re = PikeVM::new(r"a")?;
313 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
314 /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes);
315 /// re.search(&mut cache, &input, &mut caps);
316 /// // An anchored search can still match anywhere in the haystack, it just
317 /// // must begin at the start of the search which is '2' in this case.
318 /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match());
319 ///
320 /// let re = PikeVM::new(r"a")?;
321 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
322 /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes);
323 /// re.search(&mut cache, &input, &mut caps);
324 /// // No match is found since we start searching at offset 1 which
325 /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
326 /// // is found.
327 /// assert_eq!(None, caps.get_match());
328 ///
329 /// let re = PikeVM::new(r"a")?;
330 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
331 /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No);
332 /// re.search(&mut cache, &input, &mut caps);
333 /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the
334 /// // pattern. Even though the search starts at 'b', the 'match anything'
335 /// // prefix allows the search to match 'a'.
336 /// let expected = Some(Match::must(0, 2..3));
337 /// assert_eq!(expected, caps.get_match());
338 ///
339 /// # Ok::<(), Box<dyn std::error::Error>>(())
340 /// ```
341 #[inline]
342 pub fn anchored(mut self, mode: Anchored) -> Input<'h> {
343 self.set_anchored(mode);
344 self
345 }
346
347 /// Whether to execute an "earliest" search or not.
348 ///
349 /// When running a non-overlapping search, an "earliest" search will return
350 /// the match location as early as possible. For example, given a pattern
351 /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search
352 /// will return `foo12345` as a match. But an "earliest" search for regex
353 /// engines that support "earliest" semantics will return `foo1` as a
354 /// match, since as soon as the first digit following `foo` is seen, it is
355 /// known to have found a match.
356 ///
357 /// Note that "earliest" semantics generally depend on the regex engine.
358 /// Different regex engines may determine there is a match at different
359 /// points. So there is no guarantee that "earliest" matches will always
360 /// return the same offsets for all regex engines. The "earliest" notion
361 /// is really about when the particular regex engine determines there is
362 /// a match rather than a consistent semantic unto itself. This is often
363 /// useful for implementing "did a match occur or not" predicates, but
364 /// sometimes the offset is useful as well.
365 ///
366 /// This is disabled by default.
367 ///
368 /// # Example
369 ///
370 /// This example shows the difference between "earliest" searching and
371 /// normal searching.
372 ///
373 /// ```
374 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
375 ///
376 /// let re = PikeVM::new(r"foo[0-9]+")?;
377 /// let mut cache = re.create_cache();
378 /// let mut caps = re.create_captures();
379 ///
380 /// // A normal search implements greediness like you expect.
381 /// let input = Input::new("foo12345");
382 /// re.search(&mut cache, &input, &mut caps);
383 /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match());
384 ///
385 /// // When 'earliest' is enabled and the regex engine supports
386 /// // it, the search will bail once it knows a match has been
387 /// // found.
388 /// let input = Input::new("foo12345").earliest(true);
389 /// re.search(&mut cache, &input, &mut caps);
390 /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match());
391 /// # Ok::<(), Box<dyn std::error::Error>>(())
392 /// ```
393 #[inline]
394 pub fn earliest(mut self, yes: bool) -> Input<'h> {
395 self.set_earliest(yes);
396 self
397 }
398
399 /// Set the span for this search configuration.
400 ///
401 /// This is like the [`Input::span`] method, except this mutates the
402 /// span in place.
403 ///
404 /// This routine is generic over how a span is provided. While
405 /// a [`Span`] may be given directly, one may also provide a
406 /// `std::ops::Range<usize>`.
407 ///
408 /// # Panics
409 ///
410 /// This panics if the given span does not correspond to valid bounds in
411 /// the haystack or the termination of a search.
412 ///
413 /// # Example
414 ///
415 /// ```
416 /// use regex_automata::Input;
417 ///
418 /// let mut input = Input::new("foobar");
419 /// assert_eq!(0..6, input.get_range());
420 /// input.set_span(2..4);
421 /// assert_eq!(2..4, input.get_range());
422 /// ```
423 #[inline]
424 pub fn set_span<S: Into<Span>>(&mut self, span: S) {
425 let span = span.into();
426 assert!(
427 span.end <= self.haystack.len()
428 && span.start <= span.end.wrapping_add(1),
429 "invalid span {:?} for haystack of length {}",
430 span,
431 self.haystack.len(),
432 );
433 self.span = span;
434 }
435
436 /// Set the span for this search configuration given any range.
437 ///
438 /// This is like the [`Input::range`] method, except this mutates the
439 /// span in place.
440 ///
441 /// This routine does not panic if the range given is not a valid range for
442 /// this search's haystack. If this search is run with an invalid range,
443 /// then the most likely outcome is that the actual search execution will
444 /// panic.
445 ///
446 /// # Panics
447 ///
448 /// This routine will panic if the given range could not be converted
449 /// to a valid [`Range`]. For example, this would panic when given
450 /// `0..=usize::MAX` since it cannot be represented using a half-open
451 /// interval in terms of `usize`.
452 ///
453 /// This also panics if the given span does not correspond to valid bounds
454 /// in the haystack or the termination of a search.
455 ///
456 /// # Example
457 ///
458 /// ```
459 /// use regex_automata::Input;
460 ///
461 /// let mut input = Input::new("foobar");
462 /// assert_eq!(0..6, input.get_range());
463 /// input.set_range(2..=4);
464 /// assert_eq!(2..5, input.get_range());
465 /// ```
466 #[inline]
467 pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) {
468 use core::ops::Bound;
469
470 // It's a little weird to convert ranges into spans, and then spans
471 // back into ranges when we actually slice the haystack. Because
472 // of that process, we always represent everything as a half-open
473 // internal. Therefore, handling things like m..=n is a little awkward.
474 let start = match range.start_bound() {
475 Bound::Included(&i) => i,
476 // Can this case ever happen? Range syntax doesn't support it...
477 Bound::Excluded(&i) => i.checked_add(1).unwrap(),
478 Bound::Unbounded => 0,
479 };
480 let end = match range.end_bound() {
481 Bound::Included(&i) => i.checked_add(1).unwrap(),
482 Bound::Excluded(&i) => i,
483 Bound::Unbounded => self.haystack().len(),
484 };
485 self.set_span(Span { start, end });
486 }
487
488 /// Set the starting offset for the span for this search configuration.
489 ///
490 /// This is a convenience routine for only mutating the start of a span
491 /// without having to set the entire span.
492 ///
493 /// # Panics
494 ///
495 /// This panics if the span resulting from the new start position does not
496 /// correspond to valid bounds in the haystack or the termination of a
497 /// search.
498 ///
499 /// # Example
500 ///
501 /// ```
502 /// use regex_automata::Input;
503 ///
504 /// let mut input = Input::new("foobar");
505 /// assert_eq!(0..6, input.get_range());
506 /// input.set_start(5);
507 /// assert_eq!(5..6, input.get_range());
508 /// ```
509 #[inline]
510 pub fn set_start(&mut self, start: usize) {
511 self.set_span(Span { start, ..self.get_span() });
512 }
513
514 /// Set the ending offset for the span for this search configuration.
515 ///
516 /// This is a convenience routine for only mutating the end of a span
517 /// without having to set the entire span.
518 ///
519 /// # Panics
520 ///
521 /// This panics if the span resulting from the new end position does not
522 /// correspond to valid bounds in the haystack or the termination of a
523 /// search.
524 ///
525 /// # Example
526 ///
527 /// ```
528 /// use regex_automata::Input;
529 ///
530 /// let mut input = Input::new("foobar");
531 /// assert_eq!(0..6, input.get_range());
532 /// input.set_end(5);
533 /// assert_eq!(0..5, input.get_range());
534 /// ```
535 #[inline]
536 pub fn set_end(&mut self, end: usize) {
537 self.set_span(Span { end, ..self.get_span() });
538 }
539
540 /// Set the anchor mode of a search.
541 ///
542 /// This is like [`Input::anchored`], except it mutates the search
543 /// configuration in place.
544 ///
545 /// # Example
546 ///
547 /// ```
548 /// use regex_automata::{Anchored, Input, PatternID};
549 ///
550 /// let mut input = Input::new("foobar");
551 /// assert_eq!(Anchored::No, input.get_anchored());
552 ///
553 /// let pid = PatternID::must(5);
554 /// input.set_anchored(Anchored::Pattern(pid));
555 /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
556 /// ```
557 #[inline]
558 pub fn set_anchored(&mut self, mode: Anchored) {
559 self.anchored = mode;
560 }
561
562 /// Set whether the search should execute in "earliest" mode or not.
563 ///
564 /// This is like [`Input::earliest`], except it mutates the search
565 /// configuration in place.
566 ///
567 /// # Example
568 ///
569 /// ```
570 /// use regex_automata::Input;
571 ///
572 /// let mut input = Input::new("foobar");
573 /// assert!(!input.get_earliest());
574 /// input.set_earliest(true);
575 /// assert!(input.get_earliest());
576 /// ```
577 #[inline]
578 pub fn set_earliest(&mut self, yes: bool) {
579 self.earliest = yes;
580 }
581
582 /// Return a borrow of the underlying haystack as a slice of bytes.
583 ///
584 /// # Example
585 ///
586 /// ```
587 /// use regex_automata::Input;
588 ///
589 /// let input = Input::new("foobar");
590 /// assert_eq!(b"foobar", input.haystack());
591 /// ```
592 #[inline]
593 pub fn haystack(&self) -> &[u8] {
594 self.haystack
595 }
596
597 /// Return the start position of this search.
598 ///
599 /// This is a convenience routine for `search.get_span().start()`.
600 ///
601 /// When [`Input::is_done`] is `false`, this is guaranteed to return
602 /// an offset that is less than or equal to [`Input::end`]. Otherwise,
603 /// the offset is one greater than [`Input::end`].
604 ///
605 /// # Example
606 ///
607 /// ```
608 /// use regex_automata::Input;
609 ///
610 /// let input = Input::new("foobar");
611 /// assert_eq!(0, input.start());
612 ///
613 /// let input = Input::new("foobar").span(2..4);
614 /// assert_eq!(2, input.start());
615 /// ```
616 #[inline]
617 pub fn start(&self) -> usize {
618 self.get_span().start
619 }
620
621 /// Return the end position of this search.
622 ///
623 /// This is a convenience routine for `search.get_span().end()`.
624 ///
625 /// This is guaranteed to return an offset that is a valid exclusive end
626 /// bound for this input's haystack.
627 ///
628 /// # Example
629 ///
630 /// ```
631 /// use regex_automata::Input;
632 ///
633 /// let input = Input::new("foobar");
634 /// assert_eq!(6, input.end());
635 ///
636 /// let input = Input::new("foobar").span(2..4);
637 /// assert_eq!(4, input.end());
638 /// ```
639 #[inline]
640 pub fn end(&self) -> usize {
641 self.get_span().end
642 }
643
644 /// Return the span for this search configuration.
645 ///
646 /// If one was not explicitly set, then the span corresponds to the entire
647 /// range of the haystack.
648 ///
649 /// When [`Input::is_done`] is `false`, the span returned is guaranteed
650 /// to correspond to valid bounds for this input's haystack.
651 ///
652 /// # Example
653 ///
654 /// ```
655 /// use regex_automata::{Input, Span};
656 ///
657 /// let input = Input::new("foobar");
658 /// assert_eq!(Span { start: 0, end: 6 }, input.get_span());
659 /// ```
660 #[inline]
661 pub fn get_span(&self) -> Span {
662 self.span
663 }
664
665 /// Return the span as a range for this search configuration.
666 ///
667 /// If one was not explicitly set, then the span corresponds to the entire
668 /// range of the haystack.
669 ///
670 /// When [`Input::is_done`] is `false`, the range returned is guaranteed
671 /// to correspond to valid bounds for this input's haystack.
672 ///
673 /// # Example
674 ///
675 /// ```
676 /// use regex_automata::Input;
677 ///
678 /// let input = Input::new("foobar");
679 /// assert_eq!(0..6, input.get_range());
680 /// ```
681 #[inline]
682 pub fn get_range(&self) -> Range<usize> {
683 self.get_span().range()
684 }
685
686 /// Return the anchored mode for this search configuration.
687 ///
688 /// If no anchored mode was set, then it defaults to [`Anchored::No`].
689 ///
690 /// # Example
691 ///
692 /// ```
693 /// use regex_automata::{Anchored, Input, PatternID};
694 ///
695 /// let mut input = Input::new("foobar");
696 /// assert_eq!(Anchored::No, input.get_anchored());
697 ///
698 /// let pid = PatternID::must(5);
699 /// input.set_anchored(Anchored::Pattern(pid));
700 /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
701 /// ```
702 #[inline]
703 pub fn get_anchored(&self) -> Anchored {
704 self.anchored
705 }
706
707 /// Return whether this search should execute in "earliest" mode.
708 ///
709 /// # Example
710 ///
711 /// ```
712 /// use regex_automata::Input;
713 ///
714 /// let input = Input::new("foobar");
715 /// assert!(!input.get_earliest());
716 /// ```
717 #[inline]
718 pub fn get_earliest(&self) -> bool {
719 self.earliest
720 }
721
722 /// Return true if and only if this search can never return any other
723 /// matches.
724 ///
725 /// This occurs when the start position of this search is greater than the
726 /// end position of the search.
727 ///
728 /// # Example
729 ///
730 /// ```
731 /// use regex_automata::Input;
732 ///
733 /// let mut input = Input::new("foobar");
734 /// assert!(!input.is_done());
735 /// input.set_start(6);
736 /// assert!(!input.is_done());
737 /// input.set_start(7);
738 /// assert!(input.is_done());
739 /// ```
740 #[inline]
741 pub fn is_done(&self) -> bool {
742 self.get_span().start > self.get_span().end
743 }
744
745 /// Returns true if and only if the given offset in this search's haystack
746 /// falls on a valid UTF-8 encoded codepoint boundary.
747 ///
748 /// If the haystack is not valid UTF-8, then the behavior of this routine
749 /// is unspecified.
750 ///
751 /// # Example
752 ///
753 /// This shows where codepoint boundaries do and don't exist in valid
754 /// UTF-8.
755 ///
756 /// ```
757 /// use regex_automata::Input;
758 ///
759 /// let input = Input::new("☃");
760 /// assert!(input.is_char_boundary(0));
761 /// assert!(!input.is_char_boundary(1));
762 /// assert!(!input.is_char_boundary(2));
763 /// assert!(input.is_char_boundary(3));
764 /// assert!(!input.is_char_boundary(4));
765 /// ```
766 #[inline]
767 pub fn is_char_boundary(&self, offset: usize) -> bool {
768 utf8::is_boundary(self.haystack(), offset)
769 }
770}
771
772impl<'h> core::fmt::Debug for Input<'h> {
773 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
774 use crate::util::escape::DebugHaystack;
775
776 f&mut DebugStruct<'_, '_>.debug_struct("Input")
777 .field("haystack", &DebugHaystack(self.haystack()))
778 .field("span", &self.span)
779 .field("anchored", &self.anchored)
780 .field(name:"earliest", &self.earliest)
781 .finish()
782 }
783}
784
785impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> {
786 fn from(haystack: &'h H) -> Input<'h> {
787 Input::new(haystack)
788 }
789}
790
791/// A representation of a span reported by a regex engine.
792///
793/// A span corresponds to the starting and ending _byte offsets_ of a
794/// contiguous region of bytes. The starting offset is inclusive while the
795/// ending offset is exclusive. That is, a span is a half-open interval.
796///
797/// A span is used to report the offsets of a match, but it is also used to
798/// convey which region of a haystack should be searched via routines like
799/// [`Input::span`].
800///
801/// This is basically equivalent to a `std::ops::Range<usize>`, except this
802/// type implements `Copy` which makes it more ergonomic to use in the context
803/// of this crate. Like a range, this implements `Index` for `[u8]` and `str`,
804/// and `IndexMut` for `[u8]`. For convenience, this also impls `From<Range>`,
805/// which means things like `Span::from(5..10)` work.
806#[derive(Clone, Copy, Eq, Hash, PartialEq)]
807pub struct Span {
808 /// The start offset of the span, inclusive.
809 pub start: usize,
810 /// The end offset of the span, exclusive.
811 pub end: usize,
812}
813
814impl Span {
815 /// Returns this span as a range.
816 #[inline]
817 pub fn range(&self) -> Range<usize> {
818 Range::from(*self)
819 }
820
821 /// Returns true when this span is empty. That is, when `start >= end`.
822 #[inline]
823 pub fn is_empty(&self) -> bool {
824 self.start >= self.end
825 }
826
827 /// Returns the length of this span.
828 ///
829 /// This returns `0` in precisely the cases that `is_empty` returns `true`.
830 #[inline]
831 pub fn len(&self) -> usize {
832 self.end.saturating_sub(self.start)
833 }
834
835 /// Returns true when the given offset is contained within this span.
836 ///
837 /// Note that an empty span contains no offsets and will always return
838 /// false.
839 #[inline]
840 pub fn contains(&self, offset: usize) -> bool {
841 !self.is_empty() && self.start <= offset && offset <= self.end
842 }
843
844 /// Returns a new span with `offset` added to this span's `start` and `end`
845 /// values.
846 #[inline]
847 pub fn offset(&self, offset: usize) -> Span {
848 Span { start: self.start + offset, end: self.end + offset }
849 }
850}
851
852impl core::fmt::Debug for Span {
853 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
854 write!(f, "{}..{}", self.start, self.end)
855 }
856}
857
858impl core::ops::Index<Span> for [u8] {
859 type Output = [u8];
860
861 #[inline]
862 fn index(&self, index: Span) -> &[u8] {
863 &self[index.range()]
864 }
865}
866
867impl core::ops::IndexMut<Span> for [u8] {
868 #[inline]
869 fn index_mut(&mut self, index: Span) -> &mut [u8] {
870 &mut self[index.range()]
871 }
872}
873
874impl core::ops::Index<Span> for str {
875 type Output = str;
876
877 #[inline]
878 fn index(&self, index: Span) -> &str {
879 &self[index.range()]
880 }
881}
882
883impl From<Range<usize>> for Span {
884 #[inline]
885 fn from(range: Range<usize>) -> Span {
886 Span { start: range.start, end: range.end }
887 }
888}
889
890impl From<Span> for Range<usize> {
891 #[inline]
892 fn from(span: Span) -> Range<usize> {
893 Range { start: span.start, end: span.end }
894 }
895}
896
897impl PartialEq<Range<usize>> for Span {
898 #[inline]
899 fn eq(&self, range: &Range<usize>) -> bool {
900 self.start == range.start && self.end == range.end
901 }
902}
903
904impl PartialEq<Span> for Range<usize> {
905 #[inline]
906 fn eq(&self, span: &Span) -> bool {
907 self.start == span.start && self.end == span.end
908 }
909}
910
911/// A representation of "half" of a match reported by a DFA.
912///
913/// This is called a "half" match because it only includes the end location (or
914/// start location for a reverse search) of a match. This corresponds to the
915/// information that a single DFA scan can report. Getting the other half of
916/// the match requires a second scan with a reversed DFA.
917///
918/// A half match also includes the pattern that matched. The pattern is
919/// identified by an ID, which corresponds to its position (starting from `0`)
920/// relative to other patterns used to construct the corresponding DFA. If only
921/// a single pattern is provided to the DFA, then all matches are guaranteed to
922/// have a pattern ID of `0`.
923#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
924pub struct HalfMatch {
925 /// The pattern ID.
926 pattern: PatternID,
927 /// The offset of the match.
928 ///
929 /// For forward searches, the offset is exclusive. For reverse searches,
930 /// the offset is inclusive.
931 offset: usize,
932}
933
934impl HalfMatch {
935 /// Create a new half match from a pattern ID and a byte offset.
936 #[inline]
937 pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
938 HalfMatch { pattern, offset }
939 }
940
941 /// Create a new half match from a pattern ID and a byte offset.
942 ///
943 /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
944 /// [`PatternID`]. This panics if the given `usize` is not representable
945 /// as a `PatternID`.
946 #[inline]
947 pub fn must(pattern: usize, offset: usize) -> HalfMatch {
948 HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
949 }
950
951 /// Returns the ID of the pattern that matched.
952 ///
953 /// The ID of a pattern is derived from the position in which it was
954 /// originally inserted into the corresponding DFA. The first pattern has
955 /// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
956 #[inline]
957 pub fn pattern(&self) -> PatternID {
958 self.pattern
959 }
960
961 /// The position of the match.
962 ///
963 /// If this match was produced by a forward search, then the offset is
964 /// exclusive. If this match was produced by a reverse search, then the
965 /// offset is inclusive.
966 #[inline]
967 pub fn offset(&self) -> usize {
968 self.offset
969 }
970}
971
972/// A representation of a match reported by a regex engine.
973///
974/// A match has two essential pieces of information: the [`PatternID`] that
975/// matches, and the [`Span`] of the match in a haystack.
976///
977/// The pattern is identified by an ID, which corresponds to its position
978/// (starting from `0`) relative to other patterns used to construct the
979/// corresponding regex engine. If only a single pattern is provided, then all
980/// matches are guaranteed to have a pattern ID of `0`.
981///
982/// Every match reported by a regex engine guarantees that its span has its
983/// start offset as less than or equal to its end offset.
984#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
985pub struct Match {
986 /// The pattern ID.
987 pattern: PatternID,
988 /// The underlying match span.
989 span: Span,
990}
991
992impl Match {
993 /// Create a new match from a pattern ID and a span.
994 ///
995 /// This constructor is generic over how a span is provided. While
996 /// a [`Span`] may be given directly, one may also provide a
997 /// `std::ops::Range<usize>`.
998 ///
999 /// # Panics
1000 ///
1001 /// This panics if `end < start`.
1002 ///
1003 /// # Example
1004 ///
1005 /// This shows how to create a match for the first pattern in a regex
1006 /// object using convenient range syntax.
1007 ///
1008 /// ```
1009 /// use regex_automata::{Match, PatternID};
1010 ///
1011 /// let m = Match::new(PatternID::ZERO, 5..10);
1012 /// assert_eq!(0, m.pattern().as_usize());
1013 /// assert_eq!(5, m.start());
1014 /// assert_eq!(10, m.end());
1015 /// ```
1016 #[inline]
1017 pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match {
1018 let span: Span = span.into();
1019 assert!(span.start <= span.end, "invalid match span");
1020 Match { pattern, span }
1021 }
1022
1023 /// Create a new match from a pattern ID and a byte offset span.
1024 ///
1025 /// This constructor is generic over how a span is provided. While
1026 /// a [`Span`] may be given directly, one may also provide a
1027 /// `std::ops::Range<usize>`.
1028 ///
1029 /// This is like [`Match::new`], but accepts a `usize` instead of a
1030 /// [`PatternID`]. This panics if the given `usize` is not representable
1031 /// as a `PatternID`.
1032 ///
1033 /// # Panics
1034 ///
1035 /// This panics if `end < start` or if `pattern > PatternID::MAX`.
1036 ///
1037 /// # Example
1038 ///
1039 /// This shows how to create a match for the third pattern in a regex
1040 /// object using convenient range syntax.
1041 ///
1042 /// ```
1043 /// use regex_automata::Match;
1044 ///
1045 /// let m = Match::must(3, 5..10);
1046 /// assert_eq!(3, m.pattern().as_usize());
1047 /// assert_eq!(5, m.start());
1048 /// assert_eq!(10, m.end());
1049 /// ```
1050 #[inline]
1051 pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match {
1052 Match::new(PatternID::must(pattern), span)
1053 }
1054
1055 /// Returns the ID of the pattern that matched.
1056 ///
1057 /// The ID of a pattern is derived from the position in which it was
1058 /// originally inserted into the corresponding regex engine. The first
1059 /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
1060 /// so on.
1061 #[inline]
1062 pub fn pattern(&self) -> PatternID {
1063 self.pattern
1064 }
1065
1066 /// The starting position of the match.
1067 ///
1068 /// This is a convenience routine for `Match::span().start`.
1069 #[inline]
1070 pub fn start(&self) -> usize {
1071 self.span().start
1072 }
1073
1074 /// The ending position of the match.
1075 ///
1076 /// This is a convenience routine for `Match::span().end`.
1077 #[inline]
1078 pub fn end(&self) -> usize {
1079 self.span().end
1080 }
1081
1082 /// Returns the match span as a range.
1083 ///
1084 /// This is a convenience routine for `Match::span().range()`.
1085 #[inline]
1086 pub fn range(&self) -> core::ops::Range<usize> {
1087 self.span().range()
1088 }
1089
1090 /// Returns the span for this match.
1091 #[inline]
1092 pub fn span(&self) -> Span {
1093 self.span
1094 }
1095
1096 /// Returns true when the span in this match is empty.
1097 ///
1098 /// An empty match can only be returned when the regex itself can match
1099 /// the empty string.
1100 #[inline]
1101 pub fn is_empty(&self) -> bool {
1102 self.span().is_empty()
1103 }
1104
1105 /// Returns the length of this match.
1106 ///
1107 /// This returns `0` in precisely the cases that `is_empty` returns `true`.
1108 #[inline]
1109 pub fn len(&self) -> usize {
1110 self.span().len()
1111 }
1112}
1113
1114/// A set of `PatternID`s.
1115///
1116/// A set of pattern identifiers is useful for recording which patterns have
1117/// matched a particular haystack. A pattern set _only_ includes pattern
1118/// identifiers. It does not include offset information.
1119///
1120/// # Example
1121///
1122/// This shows basic usage of a set.
1123///
1124/// ```
1125/// use regex_automata::{PatternID, PatternSet};
1126///
1127/// let pid1 = PatternID::must(5);
1128/// let pid2 = PatternID::must(8);
1129/// // Create a new empty set.
1130/// let mut set = PatternSet::new(10);
1131/// // Insert pattern IDs.
1132/// set.insert(pid1);
1133/// set.insert(pid2);
1134/// // Test membership.
1135/// assert!(set.contains(pid1));
1136/// assert!(set.contains(pid2));
1137/// // Get all members.
1138/// assert_eq!(
1139/// vec![5, 8],
1140/// set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(),
1141/// );
1142/// // Clear the set.
1143/// set.clear();
1144/// // Test that it is indeed empty.
1145/// assert!(set.is_empty());
1146/// ```
1147#[cfg(feature = "alloc")]
1148#[derive(Clone, Debug, Eq, PartialEq)]
1149pub struct PatternSet {
1150 /// The number of patterns set to 'true' in this set.
1151 len: usize,
1152 /// A map from PatternID to boolean of whether a pattern matches or not.
1153 ///
1154 /// This should probably be a bitset, but it's probably unlikely to matter
1155 /// much in practice.
1156 ///
1157 /// The main downside of this representation (and similarly for a bitset)
1158 /// is that iteration scales with the capacity of the set instead of
1159 /// the length of the set. This doesn't seem likely to be a problem in
1160 /// practice.
1161 ///
1162 /// Another alternative is to just use a 'SparseSet' for this. It does use
1163 /// more memory (quite a bit more), but that seems fine I think compared
1164 /// to the memory being used by the regex engine. The real hiccup with
1165 /// it is that it yields pattern IDs in the order they were inserted.
1166 /// Which is actually kind of nice, but at the time of writing, pattern
1167 /// IDs are yielded in ascending order in the regex crate RegexSet API.
1168 /// If we did change to 'SparseSet', we could provide an additional
1169 /// 'iter_match_order' iterator, but keep the ascending order one for
1170 /// compatibility.
1171 which: alloc::boxed::Box<[bool]>,
1172}
1173
1174#[cfg(feature = "alloc")]
1175impl PatternSet {
1176 /// Create a new set of pattern identifiers with the given capacity.
1177 ///
1178 /// The given capacity typically corresponds to (at least) the number of
1179 /// patterns in a compiled regex object.
1180 ///
1181 /// # Panics
1182 ///
1183 /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is
1184 /// impossible if you use the `pattern_len()` method as defined on any of
1185 /// the regex engines in this crate. Namely, a regex will fail to build by
1186 /// returning an error if the number of patterns given to it exceeds the
1187 /// limit. Therefore, the number of patterns in a valid regex is always
1188 /// a correct capacity to provide here.
1189 pub fn new(capacity: usize) -> PatternSet {
1190 assert!(
1191 capacity <= PatternID::LIMIT,
1192 "pattern set capacity exceeds limit of {}",
1193 PatternID::LIMIT,
1194 );
1195 PatternSet {
1196 len: 0,
1197 which: alloc::vec![false; capacity].into_boxed_slice(),
1198 }
1199 }
1200
1201 /// Clear this set such that it contains no pattern IDs.
1202 pub fn clear(&mut self) {
1203 self.len = 0;
1204 for matched in self.which.iter_mut() {
1205 *matched = false;
1206 }
1207 }
1208
1209 /// Return true if and only if the given pattern identifier is in this set.
1210 pub fn contains(&self, pid: PatternID) -> bool {
1211 pid.as_usize() < self.capacity() && self.which[pid]
1212 }
1213
1214 /// Insert the given pattern identifier into this set and return `true` if
1215 /// the given pattern ID was not previously in this set.
1216 ///
1217 /// If the pattern identifier is already in this set, then this is a no-op.
1218 ///
1219 /// Use [`PatternSet::try_insert`] for a fallible version of this routine.
1220 ///
1221 /// # Panics
1222 ///
1223 /// This panics if this pattern set has insufficient capacity to
1224 /// store the given pattern ID.
1225 pub fn insert(&mut self, pid: PatternID) -> bool {
1226 self.try_insert(pid)
1227 .expect("PatternSet should have sufficient capacity")
1228 }
1229
1230 /// Insert the given pattern identifier into this set and return `true` if
1231 /// the given pattern ID was not previously in this set.
1232 ///
1233 /// If the pattern identifier is already in this set, then this is a no-op.
1234 ///
1235 /// # Errors
1236 ///
1237 /// This returns an error if this pattern set has insufficient capacity to
1238 /// store the given pattern ID.
1239 pub fn try_insert(
1240 &mut self,
1241 pid: PatternID,
1242 ) -> Result<bool, PatternSetInsertError> {
1243 if pid.as_usize() >= self.capacity() {
1244 return Err(PatternSetInsertError {
1245 attempted: pid,
1246 capacity: self.capacity(),
1247 });
1248 }
1249 if self.which[pid] {
1250 return Ok(false);
1251 }
1252 self.len += 1;
1253 self.which[pid] = true;
1254 Ok(true)
1255 }
1256
1257 /*
1258 // This is currently commented out because it is unused and it is unclear
1259 // whether it's useful or not. What's the harm in having it? When, if
1260 // we ever wanted to change our representation to a 'SparseSet', then
1261 // supporting this method would be a bit tricky. So in order to keep some
1262 // API evolution flexibility, we leave it out for now.
1263
1264 /// Remove the given pattern identifier from this set.
1265 ///
1266 /// If the pattern identifier was not previously in this set, then this
1267 /// does not change the set and returns `false`.
1268 ///
1269 /// # Panics
1270 ///
1271 /// This panics if `pid` exceeds the capacity of this set.
1272 pub fn remove(&mut self, pid: PatternID) -> bool {
1273 if !self.which[pid] {
1274 return false;
1275 }
1276 self.len -= 1;
1277 self.which[pid] = false;
1278 true
1279 }
1280 */
1281
1282 /// Return true if and only if this set has no pattern identifiers in it.
1283 pub fn is_empty(&self) -> bool {
1284 self.len() == 0
1285 }
1286
1287 /// Return true if and only if this set has the maximum number of pattern
1288 /// identifiers in the set. This occurs precisely when `PatternSet::len()
1289 /// == PatternSet::capacity()`.
1290 ///
1291 /// This particular property is useful to test because it may allow one to
1292 /// stop a search earlier than you might otherwise. Namely, if a search is
1293 /// only reporting which patterns match a haystack and if you know all of
1294 /// the patterns match at a given point, then there's no new information
1295 /// that can be learned by continuing the search. (Because a pattern set
1296 /// does not keep track of offset information.)
1297 pub fn is_full(&self) -> bool {
1298 self.len() == self.capacity()
1299 }
1300
1301 /// Returns the total number of pattern identifiers in this set.
1302 pub fn len(&self) -> usize {
1303 self.len
1304 }
1305
1306 /// Returns the total number of pattern identifiers that may be stored
1307 /// in this set.
1308 ///
1309 /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`].
1310 ///
1311 /// Typically, the capacity of a pattern set matches the number of patterns
1312 /// in a regex object with which you are searching.
1313 pub fn capacity(&self) -> usize {
1314 self.which.len()
1315 }
1316
1317 /// Returns an iterator over all pattern identifiers in this set.
1318 ///
1319 /// The iterator yields pattern identifiers in ascending order, starting
1320 /// at zero.
1321 pub fn iter(&self) -> PatternSetIter<'_> {
1322 PatternSetIter { it: self.which.iter().enumerate() }
1323 }
1324}
1325
1326/// An error that occurs when a `PatternID` failed to insert into a
1327/// `PatternSet`.
1328///
1329/// An insert fails when the given `PatternID` exceeds the configured capacity
1330/// of the `PatternSet`.
1331///
1332/// This error is created by the [`PatternSet::try_insert`] routine.
1333#[cfg(feature = "alloc")]
1334#[derive(Clone, Debug)]
1335pub struct PatternSetInsertError {
1336 attempted: PatternID,
1337 capacity: usize,
1338}
1339
1340#[cfg(feature = "std")]
1341impl std::error::Error for PatternSetInsertError {}
1342
1343#[cfg(feature = "alloc")]
1344impl core::fmt::Display for PatternSetInsertError {
1345 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1346 write!(
1347 f,
1348 "failed to insert pattern ID {} into pattern set \
1349 with insufficiet capacity of {}",
1350 self.attempted.as_usize(),
1351 self.capacity,
1352 )
1353 }
1354}
1355
1356/// An iterator over all pattern identifiers in a [`PatternSet`].
1357///
1358/// The lifetime parameter `'a` refers to the lifetime of the pattern set being
1359/// iterated over.
1360///
1361/// This iterator is created by the [`PatternSet::iter`] method.
1362#[cfg(feature = "alloc")]
1363#[derive(Clone, Debug)]
1364pub struct PatternSetIter<'a> {
1365 it: core::iter::Enumerate<core::slice::Iter<'a, bool>>,
1366}
1367
1368#[cfg(feature = "alloc")]
1369impl<'a> Iterator for PatternSetIter<'a> {
1370 type Item = PatternID;
1371
1372 fn next(&mut self) -> Option<PatternID> {
1373 while let Some((index: usize, &yes: bool)) = self.it.next() {
1374 if yes {
1375 // Only valid 'PatternID' values can be inserted into the set
1376 // and construction of the set panics if the capacity would
1377 // permit storing invalid pattern IDs. Thus, 'yes' is only true
1378 // precisely when 'index' corresponds to a valid 'PatternID'.
1379 return Some(PatternID::new_unchecked(index));
1380 }
1381 }
1382 None
1383 }
1384
1385 fn size_hint(&self) -> (usize, Option<usize>) {
1386 self.it.size_hint()
1387 }
1388}
1389
1390#[cfg(feature = "alloc")]
1391impl<'a> DoubleEndedIterator for PatternSetIter<'a> {
1392 fn next_back(&mut self) -> Option<PatternID> {
1393 while let Some((index: usize, &yes: bool)) = self.it.next_back() {
1394 if yes {
1395 // Only valid 'PatternID' values can be inserted into the set
1396 // and construction of the set panics if the capacity would
1397 // permit storing invalid pattern IDs. Thus, 'yes' is only true
1398 // precisely when 'index' corresponds to a valid 'PatternID'.
1399 return Some(PatternID::new_unchecked(index));
1400 }
1401 }
1402 None
1403 }
1404}
1405
1406/// The type of anchored search to perform.
1407///
1408/// This is *almost* a boolean option. That is, you can either do an unanchored
1409/// search for any pattern in a regex, or you can do an anchored search for any
1410/// pattern in a regex.
1411///
1412/// A third option exists that, assuming the regex engine supports it, permits
1413/// you to do an anchored search for a specific pattern.
1414///
1415/// Note that there is no way to run an unanchored search for a specific
1416/// pattern. If you need that, you'll need to build separate regexes for each
1417/// pattern.
1418///
1419/// # Errors
1420///
1421/// If a regex engine does not support the anchored mode selected, then the
1422/// regex engine will return an error. While any non-trivial regex engine
1423/// should support at least one of the available anchored modes, there is no
1424/// singular mode that is guaranteed to be universally supported. Some regex
1425/// engines might only support unanchored searches (DFAs compiled without
1426/// anchored starting states) and some regex engines might only support
1427/// anchored searches (like the one-pass DFA).
1428///
1429/// The specific error returned is a [`MatchError`] with a
1430/// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the
1431/// `Anchored` value given that is unsupported.
1432///
1433/// Note that regex engines should report "no match" if, for example, an
1434/// `Anchored::Pattern` is provided with an invalid pattern ID _but_ where
1435/// anchored searches for a specific pattern are supported. This is smooths out
1436/// behavior such that it's possible to guarantee that an error never occurs
1437/// based on how the regex engine is configured. All regex engines in this
1438/// crate report "no match" when searching for an invalid pattern ID, but where
1439/// searching for a valid pattern ID is otherwise supported.
1440///
1441/// # Example
1442///
1443/// This example shows how to use the various `Anchored` modes to run a
1444/// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)
1445/// because it supports all modes unconditionally. Some regex engines, like
1446/// the [`onepass::DFA`](crate::dfa::onepass::DFA) cannot support unanchored
1447/// searches.
1448///
1449/// ```
1450/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1451/// use regex_automata::{
1452/// nfa::thompson::pikevm::PikeVM,
1453/// Anchored, Input, Match, PatternID,
1454/// };
1455///
1456/// let re = PikeVM::new_many(&[
1457/// r"Mrs. \w+",
1458/// r"Miss \w+",
1459/// r"Mr. \w+",
1460/// r"Ms. \w+",
1461/// ])?;
1462/// let mut cache = re.create_cache();
1463/// let hay = "Hello Mr. Springsteen!";
1464///
1465/// // The default is to do an unanchored search.
1466/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay));
1467/// // Explicitly ask for an unanchored search. Same as above.
1468/// let input = Input::new(hay).anchored(Anchored::No);
1469/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay));
1470///
1471/// // Now try an anchored search. Since the match doesn't start at the
1472/// // beginning of the haystack, no match is found!
1473/// let input = Input::new(hay).anchored(Anchored::Yes);
1474/// assert_eq!(None, re.find(&mut cache, input));
1475///
1476/// // We can try an anchored search again, but move the location of where
1477/// // we start the search. Note that the offsets reported are still in
1478/// // terms of the overall haystack and not relative to where we started
1479/// // the search.
1480/// let input = Input::new(hay).anchored(Anchored::Yes).range(6..);
1481/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
1482///
1483/// // Now try an anchored search for a specific pattern. We specifically
1484/// // choose a pattern that we know doesn't match to prove that the search
1485/// // only looks for the pattern we provide.
1486/// let input = Input::new(hay)
1487/// .anchored(Anchored::Pattern(PatternID::must(1)))
1488/// .range(6..);
1489/// assert_eq!(None, re.find(&mut cache, input));
1490///
1491/// // But if we switch it to the pattern that we know matches, then we find
1492/// // the match.
1493/// let input = Input::new(hay)
1494/// .anchored(Anchored::Pattern(PatternID::must(2)))
1495/// .range(6..);
1496/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
1497///
1498/// # Ok::<(), Box<dyn std::error::Error>>(())
1499/// ```
1500#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1501pub enum Anchored {
1502 /// Run an unanchored search. This means a match may occur anywhere at or
1503 /// after the start position of the search.
1504 ///
1505 /// This search can return a match for any pattern in the regex.
1506 No,
1507 /// Run an anchored search. This means that a match must begin at the
1508 /// start position of the search.
1509 ///
1510 /// This search can return a match for any pattern in the regex.
1511 Yes,
1512 /// Run an anchored search for a specific pattern. This means that a match
1513 /// must be for the given pattern and must begin at the start position of
1514 /// the search.
1515 Pattern(PatternID),
1516}
1517
1518impl Anchored {
1519 /// Returns true if and only if this anchor mode corresponds to any kind of
1520 /// anchored search.
1521 ///
1522 /// # Example
1523 ///
1524 /// This examples shows that both `Anchored::Yes` and `Anchored::Pattern`
1525 /// are considered anchored searches.
1526 ///
1527 /// ```
1528 /// use regex_automata::{Anchored, PatternID};
1529 ///
1530 /// assert!(!Anchored::No.is_anchored());
1531 /// assert!(Anchored::Yes.is_anchored());
1532 /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored());
1533 /// ```
1534 #[inline]
1535 pub fn is_anchored(&self) -> bool {
1536 matches!(*self, Anchored::Yes | Anchored::Pattern(_))
1537 }
1538
1539 /// Returns the pattern ID associated with this configuration if it is an
1540 /// anchored search for a specific pattern. Otherwise `None` is returned.
1541 ///
1542 /// # Example
1543 ///
1544 /// ```
1545 /// use regex_automata::{Anchored, PatternID};
1546 ///
1547 /// assert_eq!(None, Anchored::No.pattern());
1548 /// assert_eq!(None, Anchored::Yes.pattern());
1549 ///
1550 /// let pid = PatternID::must(5);
1551 /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern());
1552 /// ```
1553 #[inline]
1554 pub fn pattern(&self) -> Option<PatternID> {
1555 match *self {
1556 Anchored::Pattern(pid) => Some(pid),
1557 _ => None,
1558 }
1559 }
1560}
1561
1562/// The kind of match semantics to use for a regex pattern.
1563///
1564/// The default match kind is `LeftmostFirst`, and this corresponds to the
1565/// match semantics used by most backtracking engines, such as Perl.
1566///
1567/// # Leftmost first or "preference order" match semantics
1568///
1569/// Leftmost-first semantics determine which match to report when there are
1570/// multiple paths through a regex that match at the same position. The tie is
1571/// essentially broken by how a backtracker would behave. For example, consider
1572/// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this
1573/// case, both the `foofoo` and `foo` branches match at position `0`. So should
1574/// the end of the match be `3` or `6`?
1575///
1576/// A backtracker will conceptually work by trying `foofoofoo` and failing.
1577/// Then it will try `foofoo`, find the match and stop there. Thus, the
1578/// leftmost-first match position is `6`. This is called "leftmost-first" or
1579/// "preference order" because the order of the branches as written in the
1580/// regex pattern is what determines how to break the tie.
1581///
1582/// (Note that leftmost-longest match semantics, which break ties by always
1583/// taking the longest matching string, are not currently supported by this
1584/// crate. These match semantics tend to be found in POSIX regex engines.)
1585///
1586/// This example shows how leftmost-first semantics work, and how it even
1587/// applies to multi-pattern regexes:
1588///
1589/// ```
1590/// use regex_automata::{
1591/// nfa::thompson::pikevm::PikeVM,
1592/// Match,
1593/// };
1594///
1595/// let re = PikeVM::new_many(&[
1596/// r"foofoofoo",
1597/// r"foofoo",
1598/// r"foo",
1599/// ])?;
1600/// let mut cache = re.create_cache();
1601/// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo").collect();
1602/// let expected = vec![Match::must(1, 0..6)];
1603/// assert_eq!(expected, got);
1604///
1605/// # Ok::<(), Box<dyn std::error::Error>>(())
1606/// ```
1607///
1608/// # All matches
1609///
1610/// The `All` match semantics report any and all matches, and generally will
1611/// attempt to match as much as possible. It doesn't respect any sort of match
1612/// priority at all, so things like non-greedy matching don't work in this
1613/// mode.
1614///
1615/// The fact that non-greedy matching doesn't work generally makes most forms
1616/// of unanchored non-overlapping searches have unintuitive behavior. Namely,
1617/// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the
1618/// beginning of the pattern, which is specifically non-greedy. Since it will
1619/// be treated as greedy in `All` match semantics, this generally means that
1620/// it will first attempt to consume all of the haystack and is likely to wind
1621/// up skipping matches.
1622///
1623/// Generally speaking, `All` should only be used in two circumstances:
1624///
1625/// * When running an anchored search and there is a desire to match as much as
1626/// possible. For example, when building a reverse regex matcher to find the
1627/// start of a match after finding the end. In this case, the reverse search
1628/// is anchored to the end of the match found by the forward search.
1629/// * When running overlapping searches. Since `All` encodes all possible
1630/// matches, this is generally what you want for an overlapping search. If you
1631/// try to use leftmost-first in an overlapping search, it is likely to produce
1632/// counter-intuitive results since leftmost-first specifically excludes some
1633/// matches from its underlying finite state machine.
1634///
1635/// This example demonstrates the counter-intuitive behavior of `All` semantics
1636/// when using a standard leftmost unanchored search:
1637///
1638/// ```
1639/// use regex_automata::{
1640/// nfa::thompson::pikevm::PikeVM,
1641/// Match, MatchKind,
1642/// };
1643///
1644/// let re = PikeVM::builder()
1645/// .configure(PikeVM::config().match_kind(MatchKind::All))
1646/// .build("foo")?;
1647/// let hay = "first foo second foo wat";
1648/// let mut cache = re.create_cache();
1649/// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect();
1650/// // Notice that it completely skips the first 'foo'!
1651/// let expected = vec![Match::must(0, 17..20)];
1652/// assert_eq!(expected, got);
1653///
1654/// # Ok::<(), Box<dyn std::error::Error>>(())
1655/// ```
1656///
1657/// This second example shows how `All` semantics are useful for an overlapping
1658/// search. Note that we use lower level lazy DFA APIs here since the NFA
1659/// engines only currently support a very limited form of overlapping search.
1660///
1661/// ```
1662/// use regex_automata::{
1663/// hybrid::dfa::{DFA, OverlappingState},
1664/// HalfMatch, Input, MatchKind,
1665/// };
1666///
1667/// let re = DFA::builder()
1668/// // If we didn't set 'All' semantics here, then the regex would only
1669/// // match 'foo' at offset 3 and nothing else. Why? Because the state
1670/// // machine implements preference order and knows that the 'foofoo' and
1671/// // 'foofoofoo' branches can never match since 'foo' will always match
1672/// // when they match and take priority.
1673/// .configure(DFA::config().match_kind(MatchKind::All))
1674/// .build(r"foo|foofoo|foofoofoo")?;
1675/// let mut cache = re.create_cache();
1676/// let mut state = OverlappingState::start();
1677/// let input = Input::new("foofoofoo");
1678/// let mut got = vec![];
1679/// loop {
1680/// re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?;
1681/// let m = match state.get_match() {
1682/// None => break,
1683/// Some(m) => m,
1684/// };
1685/// got.push(m);
1686/// }
1687/// let expected = vec![
1688/// HalfMatch::must(0, 3),
1689/// HalfMatch::must(0, 6),
1690/// HalfMatch::must(0, 9),
1691/// ];
1692/// assert_eq!(expected, got);
1693///
1694/// # Ok::<(), Box<dyn std::error::Error>>(())
1695/// ```
1696#[non_exhaustive]
1697#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1698pub enum MatchKind {
1699 /// Report all possible matches.
1700 All,
1701 /// Report only the leftmost matches. When multiple leftmost matches exist,
1702 /// report the match corresponding to the part of the regex that appears
1703 /// first in the syntax.
1704 LeftmostFirst,
1705 // There is prior art in RE2 that shows that we should be able to add
1706 // LeftmostLongest too. The tricky part of it is supporting ungreedy
1707 // repetitions. Instead of treating all NFA states as having equivalent
1708 // priority (as in 'All') or treating all NFA states as having distinct
1709 // priority based on order (as in 'LeftmostFirst'), we instead group NFA
1710 // states into sets, and treat members of each set as having equivalent
1711 // priority, but having greater priority than all following members
1712 // of different sets.
1713 //
1714 // However, it's not clear whether it's really worth adding this. After
1715 // all, leftmost-longest can be emulated when using literals by using
1716 // leftmost-first and sorting the literals by length in descending order.
1717 // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
1718 // always match `a` in `ab` when using leftmost-first, but leftmost-longest
1719 // would match `ab`.
1720}
1721
1722impl MatchKind {
1723 #[cfg(feature = "alloc")]
1724 pub(crate) fn continue_past_first_match(&self) -> bool {
1725 *self == MatchKind::All
1726 }
1727}
1728
1729impl Default for MatchKind {
1730 fn default() -> MatchKind {
1731 MatchKind::LeftmostFirst
1732 }
1733}
1734
1735/// An error indicating that a search stopped before reporting whether a
1736/// match exists or not.
1737///
1738/// To be very clear, this error type implies that one cannot assume that no
1739/// matches occur, since the search stopped before completing. That is, if
1740/// you're looking for information about where a search determined that no
1741/// match can occur, then this error type does *not* give you that. (Indeed, at
1742/// the time of writing, if you need such a thing, you have to write your own
1743/// search routine.)
1744///
1745/// Normally, when one searches for something, the response is either an
1746/// affirmative "it was found at this location" or a negative "not found at
1747/// all." However, in some cases, a regex engine can be configured to stop its
1748/// search before concluding whether a match exists or not. When this happens,
1749/// it may be important for the caller to know why the regex engine gave up and
1750/// where in the input it gave up at. This error type exposes the 'why' and the
1751/// 'where.'
1752///
1753/// For example, the DFAs provided by this library generally cannot correctly
1754/// implement Unicode word boundaries. Instead, they provide an option to
1755/// eagerly support them on ASCII text (since Unicode word boundaries are
1756/// equivalent to ASCII word boundaries when searching ASCII text), but will
1757/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
1758/// required to either report the failure to the caller (unergonomic) or
1759/// otherwise fall back to some other regex engine (ergonomic, but potentially
1760/// costly).
1761///
1762/// More generally, some regex engines offer the ability for callers to specify
1763/// certain bytes that will trigger the regex engine to automatically quit if
1764/// they are seen.
1765///
1766/// Still yet, there may be other reasons for a failed match. For example,
1767/// the hybrid DFA provided by this crate can be configured to give up if it
1768/// believes that it is not efficient. This in turn permits callers to choose a
1769/// different regex engine.
1770///
1771/// (Note that DFAs are configured by default to never quit or give up in this
1772/// fashion. For example, by default, a DFA will fail to build if the regex
1773/// pattern contains a Unicode word boundary. One needs to opt into the "quit"
1774/// behavior via options, like
1775/// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).)
1776///
1777/// There are a couple other ways a search
1778/// can fail. For example, when using the
1779/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)
1780/// with a haystack that is too long, or trying to run an unanchored search
1781/// with a [one-pass DFA](crate::dfa::onepass).
1782#[derive(Clone, Debug, Eq, PartialEq)]
1783pub struct MatchError(
1784 #[cfg(feature = "alloc")] alloc::boxed::Box<MatchErrorKind>,
1785 #[cfg(not(feature = "alloc"))] MatchErrorKind,
1786);
1787
1788impl MatchError {
1789 /// Create a new error value with the given kind.
1790 ///
1791 /// This is a more verbose version of the kind-specific constructors,
1792 /// e.g., `MatchError::quit`.
1793 pub fn new(kind: MatchErrorKind) -> MatchError {
1794 #[cfg(feature = "alloc")]
1795 {
1796 MatchError(alloc::boxed::Box::new(kind))
1797 }
1798 #[cfg(not(feature = "alloc"))]
1799 {
1800 MatchError(kind)
1801 }
1802 }
1803
1804 /// Returns a reference to the underlying error kind.
1805 pub fn kind(&self) -> &MatchErrorKind {
1806 &self.0
1807 }
1808
1809 /// Create a new "quit" error. The given `byte` corresponds to the value
1810 /// that tripped a search's quit condition, and `offset` corresponds to the
1811 /// location in the haystack at which the search quit.
1812 ///
1813 /// This is the same as calling `MatchError::new` with a
1814 /// [`MatchErrorKind::Quit`] kind.
1815 pub fn quit(byte: u8, offset: usize) -> MatchError {
1816 MatchError::new(MatchErrorKind::Quit { byte, offset })
1817 }
1818
1819 /// Create a new "gave up" error. The given `offset` corresponds to the
1820 /// location in the haystack at which the search gave up.
1821 ///
1822 /// This is the same as calling `MatchError::new` with a
1823 /// [`MatchErrorKind::GaveUp`] kind.
1824 pub fn gave_up(offset: usize) -> MatchError {
1825 MatchError::new(MatchErrorKind::GaveUp { offset })
1826 }
1827
1828 /// Create a new "haystack too long" error. The given `len` corresponds to
1829 /// the length of the haystack that was problematic.
1830 ///
1831 /// This is the same as calling `MatchError::new` with a
1832 /// [`MatchErrorKind::HaystackTooLong`] kind.
1833 pub fn haystack_too_long(len: usize) -> MatchError {
1834 MatchError::new(MatchErrorKind::HaystackTooLong { len })
1835 }
1836
1837 /// Create a new "unsupported anchored" error. This occurs when the caller
1838 /// requests a search with an anchor mode that is not supported by the
1839 /// regex engine.
1840 ///
1841 /// This is the same as calling `MatchError::new` with a
1842 /// [`MatchErrorKind::UnsupportedAnchored`] kind.
1843 pub fn unsupported_anchored(mode: Anchored) -> MatchError {
1844 MatchError::new(MatchErrorKind::UnsupportedAnchored { mode })
1845 }
1846}
1847
1848/// The underlying kind of a [`MatchError`].
1849///
1850/// This is a **non-exhaustive** enum. That means new variants may be added in
1851/// a semver-compatible release.
1852#[non_exhaustive]
1853#[derive(Clone, Debug, Eq, PartialEq)]
1854pub enum MatchErrorKind {
1855 /// The search saw a "quit" byte at which it was instructed to stop
1856 /// searching.
1857 Quit {
1858 /// The "quit" byte that was observed that caused the search to stop.
1859 byte: u8,
1860 /// The offset at which the quit byte was observed.
1861 offset: usize,
1862 },
1863 /// The search, based on heuristics, determined that it would be better
1864 /// to stop, typically to provide the caller an opportunity to use an
1865 /// alternative regex engine.
1866 ///
1867 /// Currently, the only way for this to occur is via the lazy DFA and
1868 /// only when it is configured to do so (it will not return this error by
1869 /// default).
1870 GaveUp {
1871 /// The offset at which the search stopped. This corresponds to the
1872 /// position immediately following the last byte scanned.
1873 offset: usize,
1874 },
1875 /// This error occurs if the haystack given to the regex engine was too
1876 /// long to be searched. This occurs, for example, with regex engines
1877 /// like the bounded backtracker that have a configurable fixed amount of
1878 /// capacity that is tied to the length of the haystack. Anything beyond
1879 /// that configured limit will result in an error at search time.
1880 HaystackTooLong {
1881 /// The length of the haystack that exceeded the limit.
1882 len: usize,
1883 },
1884 /// An error indicating that a particular type of anchored search was
1885 /// requested, but that the regex engine does not support it.
1886 ///
1887 /// Note that this error should not be returned by a regex engine simply
1888 /// because the pattern ID is invalid (i.e., equal to or exceeds the number
1889 /// of patterns in the regex). In that case, the regex engine should report
1890 /// a non-match.
1891 UnsupportedAnchored {
1892 /// The anchored mode given that is unsupported.
1893 mode: Anchored,
1894 },
1895}
1896
1897#[cfg(feature = "std")]
1898impl std::error::Error for MatchError {}
1899
1900impl core::fmt::Display for MatchError {
1901 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1902 match *self.kind() {
1903 MatchErrorKind::Quit { byte, offset } => write!(
1904 f,
1905 "quit search after observing byte {:?} at offset {}",
1906 DebugByte(byte),
1907 offset,
1908 ),
1909 MatchErrorKind::GaveUp { offset } => {
1910 write!(f, "gave up searching at offset {}", offset)
1911 }
1912 MatchErrorKind::HaystackTooLong { len } => {
1913 write!(f, "haystack of length {} is too long", len)
1914 }
1915 MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => {
1916 write!(f, "anchored searches are not supported or enabled")
1917 }
1918 MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => {
1919 write!(f, "unanchored searches are not supported or enabled")
1920 }
1921 MatchErrorKind::UnsupportedAnchored {
1922 mode: Anchored::Pattern(pid),
1923 } => {
1924 write!(
1925 f,
1926 "anchored searches for a specific pattern ({}) are \
1927 not supported or enabled",
1928 pid.as_usize(),
1929 )
1930 }
1931 }
1932 }
1933}
1934
1935#[cfg(test)]
1936mod tests {
1937 use super::*;
1938
1939 // We test that our 'MatchError' type is the size we expect. This isn't an
1940 // API guarantee, but if the size increases, we really want to make sure we
1941 // decide to do that intentionally. So this should be a speed bump. And in
1942 // general, we should not increase the size without a very good reason.
1943 //
1944 // Why? Because low level search APIs return Result<.., MatchError>. When
1945 // MatchError gets bigger, so to does the Result type.
1946 //
1947 // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes
1948 // the importance of keeping a small error type. But without 'alloc', we
1949 // still want things to be small.
1950 #[test]
1951 fn match_error_size() {
1952 let expected_size = if cfg!(feature = "alloc") {
1953 core::mem::size_of::<usize>()
1954 } else {
1955 2 * core::mem::size_of::<usize>()
1956 };
1957 assert_eq!(expected_size, core::mem::size_of::<MatchError>());
1958 }
1959
1960 // Same as above, but for the underlying match error kind.
1961 #[cfg(target_pointer_width = "64")]
1962 #[test]
1963 fn match_error_kind_size() {
1964 let expected_size = 2 * core::mem::size_of::<usize>();
1965 assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
1966 }
1967
1968 #[cfg(target_pointer_width = "32")]
1969 #[test]
1970 fn match_error_kind_size() {
1971 let expected_size = 3 * core::mem::size_of::<usize>();
1972 assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
1973 }
1974
1975 #[test]
1976 fn incorrect_asref_guard() {
1977 struct Bad(std::cell::Cell<bool>);
1978
1979 impl AsRef<[u8]> for Bad {
1980 fn as_ref(&self) -> &[u8] {
1981 if self.0.replace(false) {
1982 &[]
1983 } else {
1984 &[0; 1000]
1985 }
1986 }
1987 }
1988
1989 let bad = Bad(std::cell::Cell::new(true));
1990 let input = Input::new(&bad);
1991 assert!(input.end() <= input.haystack().len());
1992 }
1993}
1994