1/*!
2Types and routines that support the search APIs of most regex engines.
3
4This sub-module isn't exposed directly, but rather, its contents are exported
5at the crate root due to the universality of most of the types and routines in
6this module.
7*/
8
9use core::ops::{Range, RangeBounds};
10
11use crate::util::{escape::DebugByte, primitives::PatternID, utf8};
12
13/// The parameters for a regex search including the haystack to search.
14///
15/// It turns out that regex searches have a few parameters, and in most cases,
16/// those parameters have defaults that work in the vast majority of cases.
17/// This `Input` type exists to make that common case seamnless while also
18/// providing an avenue for changing the parameters of a search. In particular,
19/// this type enables doing so without a combinatorial explosion of different
20/// methods and/or superfluous parameters in the common cases.
21///
22/// An `Input` permits configuring the following things:
23///
24/// * Search only a substring of a haystack, while taking the broader context
25/// into account for resolving look-around assertions.
26/// * Indicating whether to search for all patterns in a regex, or to
27/// only search for one pattern in particular.
28/// * Whether to perform an anchored on unanchored search.
29/// * Whether to report a match as early as possible.
30///
31/// All of these parameters, except for the haystack, have sensible default
32/// values. This means that the minimal search configuration is simply a call
33/// to [`Input::new`] with your haystack. Setting any other parameter is
34/// optional.
35///
36/// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a
37/// `From<H> for Input` implementation. This is useful because many of the
38/// search APIs in this crate accept an `Into<Input>`. This means you can
39/// provide string or byte strings to these routines directly, and they'll
40/// automatically get converted into an `Input` for you.
41///
42/// The lifetime parameter `'h` refers to the lifetime of the haystack.
43///
44/// # Organization
45///
46/// The API of `Input` is split into a few different parts:
47///
48/// * A builder-like API that transforms a `Input` by value. Examples:
49/// [`Input::span`] and [`Input::anchored`].
50/// * A setter API that permits mutating parameters in place. Examples:
51/// [`Input::set_span`] and [`Input::set_anchored`].
52/// * A getter API that permits retrieving any of the search parameters.
53/// Examples: [`Input::get_span`] and [`Input::get_anchored`].
54/// * A few convenience getter routines that don't conform to the above naming
55/// pattern due to how common they are. Examples: [`Input::haystack`],
56/// [`Input::start`] and [`Input::end`].
57/// * Miscellaneous predicates and other helper routines that are useful
58/// in some contexts. Examples: [`Input::is_char_boundary`].
59///
60/// A `Input` exposes so much because it is meant to be used by both callers of
61/// regex engines _and_ implementors of regex engines. A constraining factor is
62/// that regex engines should accept a `&Input` as its lowest level API, which
63/// means that implementors should only use the "getter" APIs of a `Input`.
64///
65/// # Valid bounds and search termination
66///
67/// An `Input` permits setting the bounds of a search via either
68/// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or
69/// else a panic will occur. Bounds are valid if and only if:
70///
71/// * The bounds represent a valid range into the input's haystack.
72/// * **or** the end bound is a valid ending bound for the haystack *and*
73/// the start bound is exactly one greater than the start bound.
74///
75/// In the latter case, [`Input::is_done`] will return true and indicates any
76/// search receiving such an input should immediately return with no match.
77///
78/// Note that while `Input` is used for reverse searches in this crate, the
79/// `Input::is_done` predicate assumes a forward search. Because unsigned
80/// offsets are used internally, there is no way to tell from only the offsets
81/// whether a reverse search is done or not.
82///
83/// # Regex engine support
84///
85/// Any regex engine accepting an `Input` must support at least the following
86/// things:
87///
88/// * Searching a `&[u8]` for matches.
89/// * Searching a substring of `&[u8]` for a match, such that any match
90/// reported must appear entirely within that substring.
91/// * For a forwards search, a match should never be reported when
92/// [`Input::is_done`] returns true. (For reverse searches, termination should
93/// be handled outside of `Input`.)
94///
95/// Supporting other aspects of an `Input` are optional, but regex engines
96/// should handle aspects they don't support gracefully. How this is done is
97/// generally up to the regex engine. This crate generally treats unsupported
98/// anchored modes as an error to report for example, but for simplicity, in
99/// the meta regex engine, trying to search with an invalid pattern ID just
100/// results in no match being reported.
101#[derive(Clone)]
102pub struct Input<'h> {
103 haystack: &'h [u8],
104 span: Span,
105 anchored: Anchored,
106 earliest: bool,
107}
108
109impl<'h> Input<'h> {
110 /// Create a new search configuration for the given haystack.
111 #[inline]
112 pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> {
113 Input {
114 haystack: haystack.as_ref(),
115 span: Span { start: 0, end: haystack.as_ref().len() },
116 anchored: Anchored::No,
117 earliest: false,
118 }
119 }
120
121 /// Set the span for this search.
122 ///
123 /// This routine does not panic if the span given is not a valid range for
124 /// this search's haystack. If this search is run with an invalid range,
125 /// then the most likely outcome is that the actual search execution will
126 /// panic.
127 ///
128 /// This routine is generic over how a span is provided. While
129 /// a [`Span`] may be given directly, one may also provide a
130 /// `std::ops::Range<usize>`. To provide anything supported by range
131 /// syntax, use the [`Input::range`] method.
132 ///
133 /// The default span is the entire haystack.
134 ///
135 /// Note that [`Input::range`] overrides this method and vice versa.
136 ///
137 /// # Panics
138 ///
139 /// This panics if the given span does not correspond to valid bounds in
140 /// the haystack or the termination of a search.
141 ///
142 /// # Example
143 ///
144 /// This example shows how the span of the search can impact whether a
145 /// match is reported or not. This is particularly relevant for look-around
146 /// operators, which might take things outside of the span into account
147 /// when determining whether they match.
148 ///
149 /// ```
150 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
151 /// use regex_automata::{
152 /// nfa::thompson::pikevm::PikeVM,
153 /// Match, Input,
154 /// };
155 ///
156 /// // Look for 'at', but as a distinct word.
157 /// let re = PikeVM::new(r"\bat\b")?;
158 /// let mut cache = re.create_cache();
159 /// let mut caps = re.create_captures();
160 ///
161 /// // Our haystack contains 'at', but not as a distinct word.
162 /// let haystack = "batter";
163 ///
164 /// // A standard search finds nothing, as expected.
165 /// let input = Input::new(haystack);
166 /// re.search(&mut cache, &input, &mut caps);
167 /// assert_eq!(None, caps.get_match());
168 ///
169 /// // But if we wanted to search starting at position '1', we might
170 /// // slice the haystack. If we do this, it's impossible for the \b
171 /// // anchors to take the surrounding context into account! And thus,
172 /// // a match is produced.
173 /// let input = Input::new(&haystack[1..3]);
174 /// re.search(&mut cache, &input, &mut caps);
175 /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match());
176 ///
177 /// // But if we specify the span of the search instead of slicing the
178 /// // haystack, then the regex engine can "see" outside of the span
179 /// // and resolve the anchors correctly.
180 /// let input = Input::new(haystack).span(1..3);
181 /// re.search(&mut cache, &input, &mut caps);
182 /// assert_eq!(None, caps.get_match());
183 ///
184 /// # Ok::<(), Box<dyn std::error::Error>>(())
185 /// ```
186 ///
187 /// This may seem a little ham-fisted, but this scenario tends to come up
188 /// if some other regex engine found the match span and now you need to
189 /// re-process that span to look for capturing groups. (e.g., Run a faster
190 /// DFA first, find a match, then run the PikeVM on just the match span to
191 /// resolve capturing groups.) In order to implement that sort of logic
192 /// correctly, you need to set the span on the search instead of slicing
193 /// the haystack directly.
194 ///
195 /// The other advantage of using this routine to specify the bounds of the
196 /// search is that the match offsets are still reported in terms of the
197 /// original haystack. For example, the second search in the example above
198 /// reported a match at position `0`, even though `at` starts at offset
199 /// `1` because we sliced the haystack.
200 #[inline]
201 pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> {
202 self.set_span(span);
203 self
204 }
205
206 /// Like `Input::span`, but accepts any range instead.
207 ///
208 /// This routine does not panic if the range given is not a valid range for
209 /// this search's haystack. If this search is run with an invalid range,
210 /// then the most likely outcome is that the actual search execution will
211 /// panic.
212 ///
213 /// The default range is the entire haystack.
214 ///
215 /// Note that [`Input::span`] overrides this method and vice versa.
216 ///
217 /// # Panics
218 ///
219 /// This routine will panic if the given range could not be converted
220 /// to a valid [`Range`]. For example, this would panic when given
221 /// `0..=usize::MAX` since it cannot be represented using a half-open
222 /// interval in terms of `usize`.
223 ///
224 /// This also panics if the given range does not correspond to valid bounds
225 /// in the haystack or the termination of a search.
226 ///
227 /// # Example
228 ///
229 /// ```
230 /// use regex_automata::Input;
231 ///
232 /// let input = Input::new("foobar");
233 /// assert_eq!(0..6, input.get_range());
234 ///
235 /// let input = Input::new("foobar").range(2..=4);
236 /// assert_eq!(2..5, input.get_range());
237 /// ```
238 #[inline]
239 pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> {
240 self.set_range(range);
241 self
242 }
243
244 /// Sets the anchor mode of a search.
245 ///
246 /// When a search is anchored (so that's [`Anchored::Yes`] or
247 /// [`Anchored::Pattern`]), a match must begin at the start of a search.
248 /// When a search is not anchored (that's [`Anchored::No`]), regex engines
249 /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix
250 /// permits a match to appear anywhere.
251 ///
252 /// By default, the anchored mode is [`Anchored::No`].
253 ///
254 /// **WARNING:** this is subtly different than using a `^` at the start of
255 /// your regex. A `^` forces a regex to match exclusively at the start of
256 /// a haystack, regardless of where you begin your search. In contrast,
257 /// anchoring a search will allow your regex to match anywhere in your
258 /// haystack, but the match must start at the beginning of a search.
259 ///
260 /// For example, consider the haystack `aba` and the following searches:
261 ///
262 /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba`
263 /// starting at position `2`. Since `^` requires the match to start at
264 /// the beginning of the haystack and `2 > 0`, no match is found.
265 /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
266 /// starting at position `2`. This reports a match at `[2, 3]` since
267 /// the match starts where the search started. Since there is no `^`,
268 /// there is no requirement for the match to start at the beginning of
269 /// the haystack.
270 /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
271 /// starting at position `1`. Since `b` corresponds to position `1` and
272 /// since the search is anchored, it finds no match. While the regex
273 /// matches at other positions, configuring the search to be anchored
274 /// requires that it only report a match that begins at the same offset
275 /// as the beginning of the search.
276 /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba`
277 /// starting at position `1`. Since the search is not anchored and
278 /// the regex does not start with `^`, the search executes as if there
279 /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it
280 /// reports a match at `[2, 3]`.
281 ///
282 /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`,
283 /// except it only reports matches for a particular pattern.
284 ///
285 /// # Example
286 ///
287 /// This demonstrates the differences between an anchored search and
288 /// a pattern that begins with `^` (as described in the above warning
289 /// message).
290 ///
291 /// ```
292 /// use regex_automata::{
293 /// nfa::thompson::pikevm::PikeVM,
294 /// Anchored, Match, Input,
295 /// };
296 ///
297 /// let haystack = "aba";
298 ///
299 /// let re = PikeVM::new(r"^a")?;
300 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
301 /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No);
302 /// re.search(&mut cache, &input, &mut caps);
303 /// // No match is found because 2 is not the beginning of the haystack,
304 /// // which is what ^ requires.
305 /// assert_eq!(None, caps.get_match());
306 ///
307 /// let re = PikeVM::new(r"a")?;
308 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
309 /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes);
310 /// re.search(&mut cache, &input, &mut caps);
311 /// // An anchored search can still match anywhere in the haystack, it just
312 /// // must begin at the start of the search which is '2' in this case.
313 /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match());
314 ///
315 /// let re = PikeVM::new(r"a")?;
316 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
317 /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes);
318 /// re.search(&mut cache, &input, &mut caps);
319 /// // No match is found since we start searching at offset 1 which
320 /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
321 /// // is found.
322 /// assert_eq!(None, caps.get_match());
323 ///
324 /// let re = PikeVM::new(r"a")?;
325 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
326 /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No);
327 /// re.search(&mut cache, &input, &mut caps);
328 /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the
329 /// // pattern. Even though the search starts at 'b', the 'match anything'
330 /// // prefix allows the search to match 'a'.
331 /// let expected = Some(Match::must(0, 2..3));
332 /// assert_eq!(expected, caps.get_match());
333 ///
334 /// # Ok::<(), Box<dyn std::error::Error>>(())
335 /// ```
336 #[inline]
337 pub fn anchored(mut self, mode: Anchored) -> Input<'h> {
338 self.set_anchored(mode);
339 self
340 }
341
342 /// Whether to execute an "earliest" search or not.
343 ///
344 /// When running a non-overlapping search, an "earliest" search will return
345 /// the match location as early as possible. For example, given a pattern
346 /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search
347 /// will return `foo12345` as a match. But an "earliest" search for regex
348 /// engines that support "earliest" semantics will return `foo1` as a
349 /// match, since as soon as the first digit following `foo` is seen, it is
350 /// known to have found a match.
351 ///
352 /// Note that "earliest" semantics generally depend on the regex engine.
353 /// Different regex engines may determine there is a match at different
354 /// points. So there is no guarantee that "earliest" matches will always
355 /// return the same offsets for all regex engines. The "earliest" notion
356 /// is really about when the particular regex engine determines there is
357 /// a match rather than a consistent semantic unto itself. This is often
358 /// useful for implementing "did a match occur or not" predicates, but
359 /// sometimes the offset is useful as well.
360 ///
361 /// This is disabled by default.
362 ///
363 /// # Example
364 ///
365 /// This example shows the difference between "earliest" searching and
366 /// normal searching.
367 ///
368 /// ```
369 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
370 ///
371 /// let re = PikeVM::new(r"foo[0-9]+")?;
372 /// let mut cache = re.create_cache();
373 /// let mut caps = re.create_captures();
374 ///
375 /// // A normal search implements greediness like you expect.
376 /// let input = Input::new("foo12345");
377 /// re.search(&mut cache, &input, &mut caps);
378 /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match());
379 ///
380 /// // When 'earliest' is enabled and the regex engine supports
381 /// // it, the search will bail once it knows a match has been
382 /// // found.
383 /// let input = Input::new("foo12345").earliest(true);
384 /// re.search(&mut cache, &input, &mut caps);
385 /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match());
386 /// # Ok::<(), Box<dyn std::error::Error>>(())
387 /// ```
388 #[inline]
389 pub fn earliest(mut self, yes: bool) -> Input<'h> {
390 self.set_earliest(yes);
391 self
392 }
393
394 /// Set the span for this search configuration.
395 ///
396 /// This is like the [`Input::span`] method, except this mutates the
397 /// span in place.
398 ///
399 /// This routine is generic over how a span is provided. While
400 /// a [`Span`] may be given directly, one may also provide a
401 /// `std::ops::Range<usize>`.
402 ///
403 /// # Panics
404 ///
405 /// This panics if the given span does not correspond to valid bounds in
406 /// the haystack or the termination of a search.
407 ///
408 /// # Example
409 ///
410 /// ```
411 /// use regex_automata::Input;
412 ///
413 /// let mut input = Input::new("foobar");
414 /// assert_eq!(0..6, input.get_range());
415 /// input.set_span(2..4);
416 /// assert_eq!(2..4, input.get_range());
417 /// ```
418 #[inline]
419 pub fn set_span<S: Into<Span>>(&mut self, span: S) {
420 let span = span.into();
421 assert!(
422 span.end <= self.haystack.len()
423 && span.start <= span.end.wrapping_add(1),
424 "invalid span {:?} for haystack of length {}",
425 span,
426 self.haystack.len(),
427 );
428 self.span = span;
429 }
430
431 /// Set the span for this search configuration given any range.
432 ///
433 /// This is like the [`Input::range`] method, except this mutates the
434 /// span in place.
435 ///
436 /// This routine does not panic if the range given is not a valid range for
437 /// this search's haystack. If this search is run with an invalid range,
438 /// then the most likely outcome is that the actual search execution will
439 /// panic.
440 ///
441 /// # Panics
442 ///
443 /// This routine will panic if the given range could not be converted
444 /// to a valid [`Range`]. For example, this would panic when given
445 /// `0..=usize::MAX` since it cannot be represented using a half-open
446 /// interval in terms of `usize`.
447 ///
448 /// This also panics if the given span does not correspond to valid bounds
449 /// in the haystack or the termination of a search.
450 ///
451 /// # Example
452 ///
453 /// ```
454 /// use regex_automata::Input;
455 ///
456 /// let mut input = Input::new("foobar");
457 /// assert_eq!(0..6, input.get_range());
458 /// input.set_range(2..=4);
459 /// assert_eq!(2..5, input.get_range());
460 /// ```
461 #[inline]
462 pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) {
463 use core::ops::Bound;
464
465 // It's a little weird to convert ranges into spans, and then spans
466 // back into ranges when we actually slice the haystack. Because
467 // of that process, we always represent everything as a half-open
468 // internal. Therefore, handling things like m..=n is a little awkward.
469 let start = match range.start_bound() {
470 Bound::Included(&i) => i,
471 // Can this case ever happen? Range syntax doesn't support it...
472 Bound::Excluded(&i) => i.checked_add(1).unwrap(),
473 Bound::Unbounded => 0,
474 };
475 let end = match range.end_bound() {
476 Bound::Included(&i) => i.checked_add(1).unwrap(),
477 Bound::Excluded(&i) => i,
478 Bound::Unbounded => self.haystack().len(),
479 };
480 self.set_span(Span { start, end });
481 }
482
483 /// Set the starting offset for the span for this search configuration.
484 ///
485 /// This is a convenience routine for only mutating the start of a span
486 /// without having to set the entire span.
487 ///
488 /// # Panics
489 ///
490 /// This panics if the span resulting from the new start position does not
491 /// correspond to valid bounds in the haystack or the termination of a
492 /// search.
493 ///
494 /// # Example
495 ///
496 /// ```
497 /// use regex_automata::Input;
498 ///
499 /// let mut input = Input::new("foobar");
500 /// assert_eq!(0..6, input.get_range());
501 /// input.set_start(5);
502 /// assert_eq!(5..6, input.get_range());
503 /// ```
504 #[inline]
505 pub fn set_start(&mut self, start: usize) {
506 self.set_span(Span { start, ..self.get_span() });
507 }
508
509 /// Set the ending offset for the span for this search configuration.
510 ///
511 /// This is a convenience routine for only mutating the end of a span
512 /// without having to set the entire span.
513 ///
514 /// # Panics
515 ///
516 /// This panics if the span resulting from the new end position does not
517 /// correspond to valid bounds in the haystack or the termination of a
518 /// search.
519 ///
520 /// # Example
521 ///
522 /// ```
523 /// use regex_automata::Input;
524 ///
525 /// let mut input = Input::new("foobar");
526 /// assert_eq!(0..6, input.get_range());
527 /// input.set_end(5);
528 /// assert_eq!(0..5, input.get_range());
529 /// ```
530 #[inline]
531 pub fn set_end(&mut self, end: usize) {
532 self.set_span(Span { end, ..self.get_span() });
533 }
534
535 /// Set the anchor mode of a search.
536 ///
537 /// This is like [`Input::anchored`], except it mutates the search
538 /// configuration in place.
539 ///
540 /// # Example
541 ///
542 /// ```
543 /// use regex_automata::{Anchored, Input, PatternID};
544 ///
545 /// let mut input = Input::new("foobar");
546 /// assert_eq!(Anchored::No, input.get_anchored());
547 ///
548 /// let pid = PatternID::must(5);
549 /// input.set_anchored(Anchored::Pattern(pid));
550 /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
551 /// ```
552 #[inline]
553 pub fn set_anchored(&mut self, mode: Anchored) {
554 self.anchored = mode;
555 }
556
557 /// Set whether the search should execute in "earliest" mode or not.
558 ///
559 /// This is like [`Input::earliest`], except it mutates the search
560 /// configuration in place.
561 ///
562 /// # Example
563 ///
564 /// ```
565 /// use regex_automata::Input;
566 ///
567 /// let mut input = Input::new("foobar");
568 /// assert!(!input.get_earliest());
569 /// input.set_earliest(true);
570 /// assert!(input.get_earliest());
571 /// ```
572 #[inline]
573 pub fn set_earliest(&mut self, yes: bool) {
574 self.earliest = yes;
575 }
576
577 /// Return a borrow of the underlying haystack as a slice of bytes.
578 ///
579 /// # Example
580 ///
581 /// ```
582 /// use regex_automata::Input;
583 ///
584 /// let input = Input::new("foobar");
585 /// assert_eq!(b"foobar", input.haystack());
586 /// ```
587 #[inline]
588 pub fn haystack(&self) -> &[u8] {
589 self.haystack
590 }
591
592 /// Return the start position of this search.
593 ///
594 /// This is a convenience routine for `search.get_span().start()`.
595 ///
596 /// When [`Input::is_done`] is `false`, this is guaranteed to return
597 /// an offset that is less than or equal to [`Input::end`]. Otherwise,
598 /// the offset is one greater than [`Input::end`].
599 ///
600 /// # Example
601 ///
602 /// ```
603 /// use regex_automata::Input;
604 ///
605 /// let input = Input::new("foobar");
606 /// assert_eq!(0, input.start());
607 ///
608 /// let input = Input::new("foobar").span(2..4);
609 /// assert_eq!(2, input.start());
610 /// ```
611 #[inline]
612 pub fn start(&self) -> usize {
613 self.get_span().start
614 }
615
616 /// Return the end position of this search.
617 ///
618 /// This is a convenience routine for `search.get_span().end()`.
619 ///
620 /// This is guaranteed to return an offset that is a valid exclusive end
621 /// bound for this input's haystack.
622 ///
623 /// # Example
624 ///
625 /// ```
626 /// use regex_automata::Input;
627 ///
628 /// let input = Input::new("foobar");
629 /// assert_eq!(6, input.end());
630 ///
631 /// let input = Input::new("foobar").span(2..4);
632 /// assert_eq!(4, input.end());
633 /// ```
634 #[inline]
635 pub fn end(&self) -> usize {
636 self.get_span().end
637 }
638
639 /// Return the span for this search configuration.
640 ///
641 /// If one was not explicitly set, then the span corresponds to the entire
642 /// range of the haystack.
643 ///
644 /// When [`Input::is_done`] is `false`, the span returned is guaranteed
645 /// to correspond to valid bounds for this input's haystack.
646 ///
647 /// # Example
648 ///
649 /// ```
650 /// use regex_automata::{Input, Span};
651 ///
652 /// let input = Input::new("foobar");
653 /// assert_eq!(Span { start: 0, end: 6 }, input.get_span());
654 /// ```
655 #[inline]
656 pub fn get_span(&self) -> Span {
657 self.span
658 }
659
660 /// Return the span as a range for this search configuration.
661 ///
662 /// If one was not explicitly set, then the span corresponds to the entire
663 /// range of the haystack.
664 ///
665 /// When [`Input::is_done`] is `false`, the range returned is guaranteed
666 /// to correspond to valid bounds for this input's haystack.
667 ///
668 /// # Example
669 ///
670 /// ```
671 /// use regex_automata::Input;
672 ///
673 /// let input = Input::new("foobar");
674 /// assert_eq!(0..6, input.get_range());
675 /// ```
676 #[inline]
677 pub fn get_range(&self) -> Range<usize> {
678 self.get_span().range()
679 }
680
681 /// Return the anchored mode for this search configuration.
682 ///
683 /// If no anchored mode was set, then it defaults to [`Anchored::No`].
684 ///
685 /// # Example
686 ///
687 /// ```
688 /// use regex_automata::{Anchored, Input, PatternID};
689 ///
690 /// let mut input = Input::new("foobar");
691 /// assert_eq!(Anchored::No, input.get_anchored());
692 ///
693 /// let pid = PatternID::must(5);
694 /// input.set_anchored(Anchored::Pattern(pid));
695 /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
696 /// ```
697 #[inline]
698 pub fn get_anchored(&self) -> Anchored {
699 self.anchored
700 }
701
702 /// Return whether this search should execute in "earliest" mode.
703 ///
704 /// # Example
705 ///
706 /// ```
707 /// use regex_automata::Input;
708 ///
709 /// let input = Input::new("foobar");
710 /// assert!(!input.get_earliest());
711 /// ```
712 #[inline]
713 pub fn get_earliest(&self) -> bool {
714 self.earliest
715 }
716
717 /// Return true if and only if this search can never return any other
718 /// matches.
719 ///
720 /// This occurs when the start position of this search is greater than the
721 /// end position of the search.
722 ///
723 /// # Example
724 ///
725 /// ```
726 /// use regex_automata::Input;
727 ///
728 /// let mut input = Input::new("foobar");
729 /// assert!(!input.is_done());
730 /// input.set_start(6);
731 /// assert!(!input.is_done());
732 /// input.set_start(7);
733 /// assert!(input.is_done());
734 /// ```
735 #[inline]
736 pub fn is_done(&self) -> bool {
737 self.get_span().start > self.get_span().end
738 }
739
740 /// Returns true if and only if the given offset in this search's haystack
741 /// falls on a valid UTF-8 encoded codepoint boundary.
742 ///
743 /// If the haystack is not valid UTF-8, then the behavior of this routine
744 /// is unspecified.
745 ///
746 /// # Example
747 ///
748 /// This shows where codepoint boundaries do and don't exist in valid
749 /// UTF-8.
750 ///
751 /// ```
752 /// use regex_automata::Input;
753 ///
754 /// let input = Input::new("☃");
755 /// assert!(input.is_char_boundary(0));
756 /// assert!(!input.is_char_boundary(1));
757 /// assert!(!input.is_char_boundary(2));
758 /// assert!(input.is_char_boundary(3));
759 /// assert!(!input.is_char_boundary(4));
760 /// ```
761 #[inline]
762 pub fn is_char_boundary(&self, offset: usize) -> bool {
763 utf8::is_boundary(self.haystack(), offset)
764 }
765}
766
767impl<'h> core::fmt::Debug for Input<'h> {
768 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
769 use crate::util::escape::DebugHaystack;
770
771 f.debug_struct("Input")
772 .field("haystack", &DebugHaystack(self.haystack()))
773 .field("span", &self.span)
774 .field("anchored", &self.anchored)
775 .field("earliest", &self.earliest)
776 .finish()
777 }
778}
779
780impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> {
781 fn from(haystack: &'h H) -> Input<'h> {
782 Input::new(haystack)
783 }
784}
785
786/// A representation of a span reported by a regex engine.
787///
788/// A span corresponds to the starting and ending _byte offsets_ of a
789/// contiguous region of bytes. The starting offset is inclusive while the
790/// ending offset is exclusive. That is, a span is a half-open interval.
791///
792/// A span is used to report the offsets of a match, but it is also used to
793/// convey which region of a haystack should be searched via routines like
794/// [`Input::span`].
795///
796/// This is basically equivalent to a `std::ops::Range<usize>`, except this
797/// type implements `Copy` which makes it more ergonomic to use in the context
798/// of this crate. Like a range, this implements `Index` for `[u8]` and `str`,
799/// and `IndexMut` for `[u8]`. For convenience, this also impls `From<Range>`,
800/// which means things like `Span::from(5..10)` work.
801#[derive(Clone, Copy, Eq, Hash, PartialEq)]
802pub struct Span {
803 /// The start offset of the span, inclusive.
804 pub start: usize,
805 /// The end offset of the span, exclusive.
806 pub end: usize,
807}
808
809impl Span {
810 /// Returns this span as a range.
811 #[inline]
812 pub fn range(&self) -> Range<usize> {
813 Range::from(*self)
814 }
815
816 /// Returns true when this span is empty. That is, when `start >= end`.
817 #[inline]
818 pub fn is_empty(&self) -> bool {
819 self.start >= self.end
820 }
821
822 /// Returns the length of this span.
823 ///
824 /// This returns `0` in precisely the cases that `is_empty` returns `true`.
825 #[inline]
826 pub fn len(&self) -> usize {
827 self.end.saturating_sub(self.start)
828 }
829
830 /// Returns true when the given offset is contained within this span.
831 ///
832 /// Note that an empty span contains no offsets and will always return
833 /// false.
834 #[inline]
835 pub fn contains(&self, offset: usize) -> bool {
836 !self.is_empty() && self.start <= offset && offset <= self.end
837 }
838
839 /// Returns a new span with `offset` added to this span's `start` and `end`
840 /// values.
841 #[inline]
842 pub fn offset(&self, offset: usize) -> Span {
843 Span { start: self.start + offset, end: self.end + offset }
844 }
845}
846
847impl core::fmt::Debug for Span {
848 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
849 write!(f, "{}..{}", self.start, self.end)
850 }
851}
852
853impl core::ops::Index<Span> for [u8] {
854 type Output = [u8];
855
856 #[inline]
857 fn index(&self, index: Span) -> &[u8] {
858 &self[index.range()]
859 }
860}
861
862impl core::ops::IndexMut<Span> for [u8] {
863 #[inline]
864 fn index_mut(&mut self, index: Span) -> &mut [u8] {
865 &mut self[index.range()]
866 }
867}
868
869impl core::ops::Index<Span> for str {
870 type Output = str;
871
872 #[inline]
873 fn index(&self, index: Span) -> &str {
874 &self[index.range()]
875 }
876}
877
878impl From<Range<usize>> for Span {
879 #[inline]
880 fn from(range: Range<usize>) -> Span {
881 Span { start: range.start, end: range.end }
882 }
883}
884
885impl From<Span> for Range<usize> {
886 #[inline]
887 fn from(span: Span) -> Range<usize> {
888 Range { start: span.start, end: span.end }
889 }
890}
891
892impl PartialEq<Range<usize>> for Span {
893 #[inline]
894 fn eq(&self, range: &Range<usize>) -> bool {
895 self.start == range.start && self.end == range.end
896 }
897}
898
899impl PartialEq<Span> for Range<usize> {
900 #[inline]
901 fn eq(&self, span: &Span) -> bool {
902 self.start == span.start && self.end == span.end
903 }
904}
905
906/// A representation of "half" of a match reported by a DFA.
907///
908/// This is called a "half" match because it only includes the end location (or
909/// start location for a reverse search) of a match. This corresponds to the
910/// information that a single DFA scan can report. Getting the other half of
911/// the match requires a second scan with a reversed DFA.
912///
913/// A half match also includes the pattern that matched. The pattern is
914/// identified by an ID, which corresponds to its position (starting from `0`)
915/// relative to other patterns used to construct the corresponding DFA. If only
916/// a single pattern is provided to the DFA, then all matches are guaranteed to
917/// have a pattern ID of `0`.
918#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
919pub struct HalfMatch {
920 /// The pattern ID.
921 pattern: PatternID,
922 /// The offset of the match.
923 ///
924 /// For forward searches, the offset is exclusive. For reverse searches,
925 /// the offset is inclusive.
926 offset: usize,
927}
928
929impl HalfMatch {
930 /// Create a new half match from a pattern ID and a byte offset.
931 #[inline]
932 pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
933 HalfMatch { pattern, offset }
934 }
935
936 /// Create a new half match from a pattern ID and a byte offset.
937 ///
938 /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
939 /// [`PatternID`]. This panics if the given `usize` is not representable
940 /// as a `PatternID`.
941 #[inline]
942 pub fn must(pattern: usize, offset: usize) -> HalfMatch {
943 HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
944 }
945
946 /// Returns the ID of the pattern that matched.
947 ///
948 /// The ID of a pattern is derived from the position in which it was
949 /// originally inserted into the corresponding DFA. The first pattern has
950 /// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
951 #[inline]
952 pub fn pattern(&self) -> PatternID {
953 self.pattern
954 }
955
956 /// The position of the match.
957 ///
958 /// If this match was produced by a forward search, then the offset is
959 /// exclusive. If this match was produced by a reverse search, then the
960 /// offset is inclusive.
961 #[inline]
962 pub fn offset(&self) -> usize {
963 self.offset
964 }
965}
966
967/// A representation of a match reported by a regex engine.
968///
969/// A match has two essential pieces of information: the [`PatternID`] that
970/// matches, and the [`Span`] of the match in a haystack.
971///
972/// The pattern is identified by an ID, which corresponds to its position
973/// (starting from `0`) relative to other patterns used to construct the
974/// corresponding regex engine. If only a single pattern is provided, then all
975/// matches are guaranteed to have a pattern ID of `0`.
976///
977/// Every match reported by a regex engine guarantees that its span has its
978/// start offset as less than or equal to its end offset.
979#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
980pub struct Match {
981 /// The pattern ID.
982 pattern: PatternID,
983 /// The underlying match span.
984 span: Span,
985}
986
987impl Match {
988 /// Create a new match from a pattern ID and a span.
989 ///
990 /// This constructor is generic over how a span is provided. While
991 /// a [`Span`] may be given directly, one may also provide a
992 /// `std::ops::Range<usize>`.
993 ///
994 /// # Panics
995 ///
996 /// This panics if `end < start`.
997 ///
998 /// # Example
999 ///
1000 /// This shows how to create a match for the first pattern in a regex
1001 /// object using convenient range syntax.
1002 ///
1003 /// ```
1004 /// use regex_automata::{Match, PatternID};
1005 ///
1006 /// let m = Match::new(PatternID::ZERO, 5..10);
1007 /// assert_eq!(0, m.pattern().as_usize());
1008 /// assert_eq!(5, m.start());
1009 /// assert_eq!(10, m.end());
1010 /// ```
1011 #[inline]
1012 pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match {
1013 let span: Span = span.into();
1014 assert!(span.start <= span.end, "invalid match span");
1015 Match { pattern, span }
1016 }
1017
1018 /// Create a new match from a pattern ID and a byte offset span.
1019 ///
1020 /// This constructor is generic over how a span is provided. While
1021 /// a [`Span`] may be given directly, one may also provide a
1022 /// `std::ops::Range<usize>`.
1023 ///
1024 /// This is like [`Match::new`], but accepts a `usize` instead of a
1025 /// [`PatternID`]. This panics if the given `usize` is not representable
1026 /// as a `PatternID`.
1027 ///
1028 /// # Panics
1029 ///
1030 /// This panics if `end < start` or if `pattern > PatternID::MAX`.
1031 ///
1032 /// # Example
1033 ///
1034 /// This shows how to create a match for the third pattern in a regex
1035 /// object using convenient range syntax.
1036 ///
1037 /// ```
1038 /// use regex_automata::Match;
1039 ///
1040 /// let m = Match::must(3, 5..10);
1041 /// assert_eq!(3, m.pattern().as_usize());
1042 /// assert_eq!(5, m.start());
1043 /// assert_eq!(10, m.end());
1044 /// ```
1045 #[inline]
1046 pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match {
1047 Match::new(PatternID::must(pattern), span)
1048 }
1049
1050 /// Returns the ID of the pattern that matched.
1051 ///
1052 /// The ID of a pattern is derived from the position in which it was
1053 /// originally inserted into the corresponding regex engine. The first
1054 /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
1055 /// so on.
1056 #[inline]
1057 pub fn pattern(&self) -> PatternID {
1058 self.pattern
1059 }
1060
1061 /// The starting position of the match.
1062 ///
1063 /// This is a convenience routine for `Match::span().start`.
1064 #[inline]
1065 pub fn start(&self) -> usize {
1066 self.span().start
1067 }
1068
1069 /// The ending position of the match.
1070 ///
1071 /// This is a convenience routine for `Match::span().end`.
1072 #[inline]
1073 pub fn end(&self) -> usize {
1074 self.span().end
1075 }
1076
1077 /// Returns the match span as a range.
1078 ///
1079 /// This is a convenience routine for `Match::span().range()`.
1080 #[inline]
1081 pub fn range(&self) -> core::ops::Range<usize> {
1082 self.span().range()
1083 }
1084
1085 /// Returns the span for this match.
1086 #[inline]
1087 pub fn span(&self) -> Span {
1088 self.span
1089 }
1090
1091 /// Returns true when the span in this match is empty.
1092 ///
1093 /// An empty match can only be returned when the regex itself can match
1094 /// the empty string.
1095 #[inline]
1096 pub fn is_empty(&self) -> bool {
1097 self.span().is_empty()
1098 }
1099
1100 /// Returns the length of this match.
1101 ///
1102 /// This returns `0` in precisely the cases that `is_empty` returns `true`.
1103 #[inline]
1104 pub fn len(&self) -> usize {
1105 self.span().len()
1106 }
1107}
1108
1109/// A set of `PatternID`s.
1110///
1111/// A set of pattern identifiers is useful for recording which patterns have
1112/// matched a particular haystack. A pattern set _only_ includes pattern
1113/// identifiers. It does not include offset information.
1114///
1115/// # Example
1116///
1117/// This shows basic usage of a set.
1118///
1119/// ```
1120/// use regex_automata::{PatternID, PatternSet};
1121///
1122/// let pid1 = PatternID::must(5);
1123/// let pid2 = PatternID::must(8);
1124/// // Create a new empty set.
1125/// let mut set = PatternSet::new(10);
1126/// // Insert pattern IDs.
1127/// set.insert(pid1);
1128/// set.insert(pid2);
1129/// // Test membership.
1130/// assert!(set.contains(pid1));
1131/// assert!(set.contains(pid2));
1132/// // Get all members.
1133/// assert_eq!(
1134/// vec![5, 8],
1135/// set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(),
1136/// );
1137/// // Clear the set.
1138/// set.clear();
1139/// // Test that it is indeed empty.
1140/// assert!(set.is_empty());
1141/// ```
1142#[cfg(feature = "alloc")]
1143#[derive(Clone, Debug, Eq, PartialEq)]
1144pub struct PatternSet {
1145 /// The number of patterns set to 'true' in this set.
1146 len: usize,
1147 /// A map from PatternID to boolean of whether a pattern matches or not.
1148 ///
1149 /// This should probably be a bitset, but it's probably unlikely to matter
1150 /// much in practice.
1151 ///
1152 /// The main downside of this representation (and similarly for a bitset)
1153 /// is that iteration scales with the capacity of the set instead of
1154 /// the length of the set. This doesn't seem likely to be a problem in
1155 /// practice.
1156 ///
1157 /// Another alternative is to just use a 'SparseSet' for this. It does use
1158 /// more memory (quite a bit more), but that seems fine I think compared
1159 /// to the memory being used by the regex engine. The real hiccup with
1160 /// it is that it yields pattern IDs in the order they were inserted.
1161 /// Which is actually kind of nice, but at the time of writing, pattern
1162 /// IDs are yielded in ascending order in the regex crate RegexSet API.
1163 /// If we did change to 'SparseSet', we could provide an additional
1164 /// 'iter_match_order' iterator, but keep the ascending order one for
1165 /// compatibility.
1166 which: alloc::boxed::Box<[bool]>,
1167}
1168
1169#[cfg(feature = "alloc")]
1170impl PatternSet {
1171 /// Create a new set of pattern identifiers with the given capacity.
1172 ///
1173 /// The given capacity typically corresponds to (at least) the number of
1174 /// patterns in a compiled regex object.
1175 ///
1176 /// # Panics
1177 ///
1178 /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is
1179 /// impossible if you use the `pattern_len()` method as defined on any of
1180 /// the regex engines in this crate. Namely, a regex will fail to build by
1181 /// returning an error if the number of patterns given to it exceeds the
1182 /// limit. Therefore, the number of patterns in a valid regex is always
1183 /// a correct capacity to provide here.
1184 pub fn new(capacity: usize) -> PatternSet {
1185 assert!(
1186 capacity <= PatternID::LIMIT,
1187 "pattern set capacity exceeds limit of {}",
1188 PatternID::LIMIT,
1189 );
1190 PatternSet {
1191 len: 0,
1192 which: alloc::vec![false; capacity].into_boxed_slice(),
1193 }
1194 }
1195
1196 /// Clear this set such that it contains no pattern IDs.
1197 pub fn clear(&mut self) {
1198 self.len = 0;
1199 for matched in self.which.iter_mut() {
1200 *matched = false;
1201 }
1202 }
1203
1204 /// Return true if and only if the given pattern identifier is in this set.
1205 pub fn contains(&self, pid: PatternID) -> bool {
1206 pid.as_usize() < self.capacity() && self.which[pid]
1207 }
1208
1209 /// Insert the given pattern identifier into this set and return `true` if
1210 /// the given pattern ID was not previously in this set.
1211 ///
1212 /// If the pattern identifier is already in this set, then this is a no-op.
1213 ///
1214 /// Use [`PatternSet::try_insert`] for a fallible version of this routine.
1215 ///
1216 /// # Panics
1217 ///
1218 /// This panics if this pattern set has insufficient capacity to
1219 /// store the given pattern ID.
1220 pub fn insert(&mut self, pid: PatternID) -> bool {
1221 self.try_insert(pid)
1222 .expect("PatternSet should have sufficient capacity")
1223 }
1224
1225 /// Insert the given pattern identifier into this set and return `true` if
1226 /// the given pattern ID was not previously in this set.
1227 ///
1228 /// If the pattern identifier is already in this set, then this is a no-op.
1229 ///
1230 /// # Errors
1231 ///
1232 /// This returns an error if this pattern set has insufficient capacity to
1233 /// store the given pattern ID.
1234 pub fn try_insert(
1235 &mut self,
1236 pid: PatternID,
1237 ) -> Result<bool, PatternSetInsertError> {
1238 if pid.as_usize() >= self.capacity() {
1239 return Err(PatternSetInsertError {
1240 attempted: pid,
1241 capacity: self.capacity(),
1242 });
1243 }
1244 if self.which[pid] {
1245 return Ok(false);
1246 }
1247 self.len += 1;
1248 self.which[pid] = true;
1249 Ok(true)
1250 }
1251
1252 /*
1253 // This is currently commented out because it is unused and it is unclear
1254 // whether it's useful or not. What's the harm in having it? When, if
1255 // we ever wanted to change our representation to a 'SparseSet', then
1256 // supporting this method would be a bit tricky. So in order to keep some
1257 // API evolution flexibility, we leave it out for now.
1258
1259 /// Remove the given pattern identifier from this set.
1260 ///
1261 /// If the pattern identifier was not previously in this set, then this
1262 /// does not change the set and returns `false`.
1263 ///
1264 /// # Panics
1265 ///
1266 /// This panics if `pid` exceeds the capacity of this set.
1267 pub fn remove(&mut self, pid: PatternID) -> bool {
1268 if !self.which[pid] {
1269 return false;
1270 }
1271 self.len -= 1;
1272 self.which[pid] = false;
1273 true
1274 }
1275 */
1276
1277 /// Return true if and only if this set has no pattern identifiers in it.
1278 pub fn is_empty(&self) -> bool {
1279 self.len() == 0
1280 }
1281
1282 /// Return true if and only if this set has the maximum number of pattern
1283 /// identifiers in the set. This occurs precisely when `PatternSet::len()
1284 /// == PatternSet::capacity()`.
1285 ///
1286 /// This particular property is useful to test because it may allow one to
1287 /// stop a search earlier than you might otherwise. Namely, if a search is
1288 /// only reporting which patterns match a haystack and if you know all of
1289 /// the patterns match at a given point, then there's no new information
1290 /// that can be learned by continuing the search. (Because a pattern set
1291 /// does not keep track of offset information.)
1292 pub fn is_full(&self) -> bool {
1293 self.len() == self.capacity()
1294 }
1295
1296 /// Returns the total number of pattern identifiers in this set.
1297 pub fn len(&self) -> usize {
1298 self.len
1299 }
1300
1301 /// Returns the total number of pattern identifiers that may be stored
1302 /// in this set.
1303 ///
1304 /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`].
1305 ///
1306 /// Typically, the capacity of a pattern set matches the number of patterns
1307 /// in a regex object with which you are searching.
1308 pub fn capacity(&self) -> usize {
1309 self.which.len()
1310 }
1311
1312 /// Returns an iterator over all pattern identifiers in this set.
1313 ///
1314 /// The iterator yields pattern identifiers in ascending order, starting
1315 /// at zero.
1316 pub fn iter(&self) -> PatternSetIter<'_> {
1317 PatternSetIter { it: self.which.iter().enumerate() }
1318 }
1319}
1320
1321/// An error that occurs when a `PatternID` failed to insert into a
1322/// `PatternSet`.
1323///
1324/// An insert fails when the given `PatternID` exceeds the configured capacity
1325/// of the `PatternSet`.
1326///
1327/// This error is created by the [`PatternSet::try_insert`] routine.
1328#[cfg(feature = "alloc")]
1329#[derive(Clone, Debug)]
1330pub struct PatternSetInsertError {
1331 attempted: PatternID,
1332 capacity: usize,
1333}
1334
1335#[cfg(feature = "std")]
1336impl std::error::Error for PatternSetInsertError {}
1337
1338#[cfg(feature = "alloc")]
1339impl core::fmt::Display for PatternSetInsertError {
1340 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1341 write!(
1342 f,
1343 "failed to insert pattern ID {} into pattern set \
1344 with insufficiet capacity of {}",
1345 self.attempted.as_usize(),
1346 self.capacity,
1347 )
1348 }
1349}
1350
1351/// An iterator over all pattern identifiers in a [`PatternSet`].
1352///
1353/// The lifetime parameter `'a` refers to the lifetime of the pattern set being
1354/// iterated over.
1355///
1356/// This iterator is created by the [`PatternSet::iter`] method.
1357#[cfg(feature = "alloc")]
1358#[derive(Clone, Debug)]
1359pub struct PatternSetIter<'a> {
1360 it: core::iter::Enumerate<core::slice::Iter<'a, bool>>,
1361}
1362
1363#[cfg(feature = "alloc")]
1364impl<'a> Iterator for PatternSetIter<'a> {
1365 type Item = PatternID;
1366
1367 fn next(&mut self) -> Option<PatternID> {
1368 while let Some((index, &yes)) = self.it.next() {
1369 if yes {
1370 // Only valid 'PatternID' values can be inserted into the set
1371 // and construction of the set panics if the capacity would
1372 // permit storing invalid pattern IDs. Thus, 'yes' is only true
1373 // precisely when 'index' corresponds to a valid 'PatternID'.
1374 return Some(PatternID::new_unchecked(index));
1375 }
1376 }
1377 None
1378 }
1379
1380 fn size_hint(&self) -> (usize, Option<usize>) {
1381 self.it.size_hint()
1382 }
1383}
1384
1385#[cfg(feature = "alloc")]
1386impl<'a> DoubleEndedIterator for PatternSetIter<'a> {
1387 fn next_back(&mut self) -> Option<PatternID> {
1388 while let Some((index, &yes)) = self.it.next_back() {
1389 if yes {
1390 // Only valid 'PatternID' values can be inserted into the set
1391 // and construction of the set panics if the capacity would
1392 // permit storing invalid pattern IDs. Thus, 'yes' is only true
1393 // precisely when 'index' corresponds to a valid 'PatternID'.
1394 return Some(PatternID::new_unchecked(index));
1395 }
1396 }
1397 None
1398 }
1399}
1400
1401/// The type of anchored search to perform.
1402///
1403/// This is *almost* a boolean option. That is, you can either do an unanchored
1404/// search for any pattern in a regex, or you can do an anchored search for any
1405/// pattern in a regex.
1406///
1407/// A third option exists that, assuming the regex engine supports it, permits
1408/// you to do an anchored search for a specific pattern.
1409///
1410/// Note that there is no way to run an unanchored search for a specific
1411/// pattern. If you need that, you'll need to build separate regexes for each
1412/// pattern.
1413///
1414/// # Errors
1415///
1416/// If a regex engine does not support the anchored mode selected, then the
1417/// regex engine will return an error. While any non-trivial regex engine
1418/// should support at least one of the available anchored modes, there is no
1419/// singular mode that is guaranteed to be universally supported. Some regex
1420/// engines might only support unanchored searches (DFAs compiled without
1421/// anchored starting states) and some regex engines might only support
1422/// anchored searches (like the one-pass DFA).
1423///
1424/// The specific error returned is a [`MatchError`] with a
1425/// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the
1426/// `Anchored` value given that is unsupported.
1427///
1428/// Note that regex engines should report "no match" if, for example, an
1429/// `Anchored::Pattern` is provided with an invalid pattern ID _but_ where
1430/// anchored searches for a specific pattern are supported. This is smooths out
1431/// behavior such that it's possible to guarantee that an error never occurs
1432/// based on how the regex engine is configured. All regex engines in this
1433/// crate report "no match" when searching for an invalid pattern ID, but where
1434/// searching for a valid pattern ID is otherwise supported.
1435///
1436/// # Example
1437///
1438/// This example shows how to use the various `Anchored` modes to run a
1439/// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)
1440/// because it supports all modes unconditionally. Some regex engines, like
1441/// the [`onepass::DFA`](crate::dfa::onepass::DFA) cannot support unanchored
1442/// searches.
1443///
1444/// ```
1445/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1446/// use regex_automata::{
1447/// nfa::thompson::pikevm::PikeVM,
1448/// Anchored, Input, Match, PatternID,
1449/// };
1450///
1451/// let re = PikeVM::new_many(&[
1452/// r"Mrs. \w+",
1453/// r"Miss \w+",
1454/// r"Mr. \w+",
1455/// r"Ms. \w+",
1456/// ])?;
1457/// let mut cache = re.create_cache();
1458/// let hay = "Hello Mr. Springsteen!";
1459///
1460/// // The default is to do an unanchored search.
1461/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay));
1462/// // Explicitly ask for an unanchored search. Same as above.
1463/// let input = Input::new(hay).anchored(Anchored::No);
1464/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay));
1465///
1466/// // Now try an anchored search. Since the match doesn't start at the
1467/// // beginning of the haystack, no match is found!
1468/// let input = Input::new(hay).anchored(Anchored::Yes);
1469/// assert_eq!(None, re.find(&mut cache, input));
1470///
1471/// // We can try an anchored search again, but move the location of where
1472/// // we start the search. Note that the offsets reported are still in
1473/// // terms of the overall haystack and not relative to where we started
1474/// // the search.
1475/// let input = Input::new(hay).anchored(Anchored::Yes).range(6..);
1476/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
1477///
1478/// // Now try an anchored search for a specific pattern. We specifically
1479/// // choose a pattern that we know doesn't match to prove that the search
1480/// // only looks for the pattern we provide.
1481/// let input = Input::new(hay)
1482/// .anchored(Anchored::Pattern(PatternID::must(1)))
1483/// .range(6..);
1484/// assert_eq!(None, re.find(&mut cache, input));
1485///
1486/// // But if we switch it to the pattern that we know matches, then we find
1487/// // the match.
1488/// let input = Input::new(hay)
1489/// .anchored(Anchored::Pattern(PatternID::must(2)))
1490/// .range(6..);
1491/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
1492///
1493/// # Ok::<(), Box<dyn std::error::Error>>(())
1494/// ```
1495#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1496pub enum Anchored {
1497 /// Run an unanchored search. This means a match may occur anywhere at or
1498 /// after the start position of the search.
1499 ///
1500 /// This search can return a match for any pattern in the regex.
1501 No,
1502 /// Run an anchored search. This means that a match must begin at the
1503 /// start position of the search.
1504 ///
1505 /// This search can return a match for any pattern in the regex.
1506 Yes,
1507 /// Run an anchored search for a specific pattern. This means that a match
1508 /// must be for the given pattern and must begin at the start position of
1509 /// the search.
1510 Pattern(PatternID),
1511}
1512
1513impl Anchored {
1514 /// Returns true if and only if this anchor mode corresponds to any kind of
1515 /// anchored search.
1516 ///
1517 /// # Example
1518 ///
1519 /// This examples shows that both `Anchored::Yes` and `Anchored::Pattern`
1520 /// are considered anchored searches.
1521 ///
1522 /// ```
1523 /// use regex_automata::{Anchored, PatternID};
1524 ///
1525 /// assert!(!Anchored::No.is_anchored());
1526 /// assert!(Anchored::Yes.is_anchored());
1527 /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored());
1528 /// ```
1529 #[inline]
1530 pub fn is_anchored(&self) -> bool {
1531 matches!(*self, Anchored::Yes | Anchored::Pattern(_))
1532 }
1533
1534 /// Returns the pattern ID associated with this configuration if it is an
1535 /// anchored search for a specific pattern. Otherwise `None` is returned.
1536 ///
1537 /// # Example
1538 ///
1539 /// ```
1540 /// use regex_automata::{Anchored, PatternID};
1541 ///
1542 /// assert_eq!(None, Anchored::No.pattern());
1543 /// assert_eq!(None, Anchored::Yes.pattern());
1544 ///
1545 /// let pid = PatternID::must(5);
1546 /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern());
1547 /// ```
1548 #[inline]
1549 pub fn pattern(&self) -> Option<PatternID> {
1550 match *self {
1551 Anchored::Pattern(pid) => Some(pid),
1552 _ => None,
1553 }
1554 }
1555}
1556
1557/// The kind of match semantics to use for a regex pattern.
1558///
1559/// The default match kind is `LeftmostFirst`, and this corresponds to the
1560/// match semantics used by most backtracking engines, such as Perl.
1561///
1562/// # Leftmost first or "preference order" match semantics
1563///
1564/// Leftmost-first semantics determine which match to report when there are
1565/// multiple paths through a regex that match at the same position. The tie is
1566/// essentially broken by how a backtracker would behave. For example, consider
1567/// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this
1568/// case, both the `foofoo` and `foo` branches match at position `0`. So should
1569/// the end of the match be `3` or `6`?
1570///
1571/// A backtracker will conceptually work by trying `foofoofoo` and failing.
1572/// Then it will try `foofoo`, find the match and stop there. Thus, the
1573/// leftmost-first match position is `6`. This is called "leftmost-first" or
1574/// "preference order" because the order of the branches as written in the
1575/// regex pattern is what determines how to break the tie.
1576///
1577/// (Note that leftmost-longest match semantics, which break ties by always
1578/// taking the longest matching string, are not currently supported by this
1579/// crate. These match semantics tend to be found in POSIX regex engines.)
1580///
1581/// This example shows how leftmost-first semantics work, and how it even
1582/// applies to multi-pattern regexes:
1583///
1584/// ```
1585/// use regex_automata::{
1586/// nfa::thompson::pikevm::PikeVM,
1587/// Match,
1588/// };
1589///
1590/// let re = PikeVM::new_many(&[
1591/// r"foofoofoo",
1592/// r"foofoo",
1593/// r"foo",
1594/// ])?;
1595/// let mut cache = re.create_cache();
1596/// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo").collect();
1597/// let expected = vec![Match::must(1, 0..6)];
1598/// assert_eq!(expected, got);
1599///
1600/// # Ok::<(), Box<dyn std::error::Error>>(())
1601/// ```
1602///
1603/// # All matches
1604///
1605/// The `All` match semantics report any and all matches, and generally will
1606/// attempt to match as much as possible. It doesn't respect any sort of match
1607/// priority at all, so things like non-greedy matching don't work in this
1608/// mode.
1609///
1610/// The fact that non-greedy matching doesn't work generally makes most forms
1611/// of unanchored non-overlapping searches have unintuitive behavior. Namely,
1612/// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the
1613/// beginning of the pattern, which is specifically non-greedy. Since it will
1614/// be treated as greedy in `All` match semantics, this generally means that
1615/// it will first attempt to consume all of the haystack and is likely to wind
1616/// up skipping matches.
1617///
1618/// Generally speaking, `All` should only be used in two circumstances:
1619///
1620/// * When running an anchored search and there is a desire to match as much as
1621/// possible. For example, when building a reverse regex matcher to find the
1622/// start of a match after finding the end. In this case, the reverse search
1623/// is anchored to the end of the match found by the forward search.
1624/// * When running overlapping searches. Since `All` encodes all possible
1625/// matches, this is generally what you want for an overlapping search. If you
1626/// try to use leftmost-first in an overlapping search, it is likely to produce
1627/// counter-intuitive results since leftmost-first specifically excludes some
1628/// matches from its underlying finite state machine.
1629///
1630/// This example demonstrates the counter-intuitive behavior of `All` semantics
1631/// when using a standard leftmost unanchored search:
1632///
1633/// ```
1634/// use regex_automata::{
1635/// nfa::thompson::pikevm::PikeVM,
1636/// Match, MatchKind,
1637/// };
1638///
1639/// let re = PikeVM::builder()
1640/// .configure(PikeVM::config().match_kind(MatchKind::All))
1641/// .build("foo")?;
1642/// let hay = "first foo second foo wat";
1643/// let mut cache = re.create_cache();
1644/// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect();
1645/// // Notice that it completely skips the first 'foo'!
1646/// let expected = vec![Match::must(0, 17..20)];
1647/// assert_eq!(expected, got);
1648///
1649/// # Ok::<(), Box<dyn std::error::Error>>(())
1650/// ```
1651///
1652/// This second example shows how `All` semantics are useful for an overlapping
1653/// search. Note that we use lower level lazy DFA APIs here since the NFA
1654/// engines only currently support a very limited form of overlapping search.
1655///
1656/// ```
1657/// use regex_automata::{
1658/// hybrid::dfa::{DFA, OverlappingState},
1659/// HalfMatch, Input, MatchKind,
1660/// };
1661///
1662/// let re = DFA::builder()
1663/// // If we didn't set 'All' semantics here, then the regex would only
1664/// // match 'foo' at offset 3 and nothing else. Why? Because the state
1665/// // machine implements preference order and knows that the 'foofoo' and
1666/// // 'foofoofoo' branches can never match since 'foo' will always match
1667/// // when they match and take priority.
1668/// .configure(DFA::config().match_kind(MatchKind::All))
1669/// .build(r"foo|foofoo|foofoofoo")?;
1670/// let mut cache = re.create_cache();
1671/// let mut state = OverlappingState::start();
1672/// let input = Input::new("foofoofoo");
1673/// let mut got = vec![];
1674/// loop {
1675/// re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?;
1676/// let m = match state.get_match() {
1677/// None => break,
1678/// Some(m) => m,
1679/// };
1680/// got.push(m);
1681/// }
1682/// let expected = vec![
1683/// HalfMatch::must(0, 3),
1684/// HalfMatch::must(0, 6),
1685/// HalfMatch::must(0, 9),
1686/// ];
1687/// assert_eq!(expected, got);
1688///
1689/// # Ok::<(), Box<dyn std::error::Error>>(())
1690/// ```
1691#[non_exhaustive]
1692#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1693pub enum MatchKind {
1694 /// Report all possible matches.
1695 All,
1696 /// Report only the leftmost matches. When multiple leftmost matches exist,
1697 /// report the match corresponding to the part of the regex that appears
1698 /// first in the syntax.
1699 LeftmostFirst,
1700 // There is prior art in RE2 that shows that we should be able to add
1701 // LeftmostLongest too. The tricky part of it is supporting ungreedy
1702 // repetitions. Instead of treating all NFA states as having equivalent
1703 // priority (as in 'All') or treating all NFA states as having distinct
1704 // priority based on order (as in 'LeftmostFirst'), we instead group NFA
1705 // states into sets, and treat members of each set as having equivalent
1706 // priority, but having greater priority than all following members
1707 // of different sets.
1708 //
1709 // However, it's not clear whether it's really worth adding this. After
1710 // all, leftmost-longest can be emulated when using literals by using
1711 // leftmost-first and sorting the literals by length in descending order.
1712 // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
1713 // always match `a` in `ab` when using leftmost-first, but leftmost-longest
1714 // would match `ab`.
1715}
1716
1717impl MatchKind {
1718 #[cfg(feature = "alloc")]
1719 pub(crate) fn continue_past_first_match(&self) -> bool {
1720 *self == MatchKind::All
1721 }
1722}
1723
1724impl Default for MatchKind {
1725 fn default() -> MatchKind {
1726 MatchKind::LeftmostFirst
1727 }
1728}
1729
1730/// An error indicating that a search stopped before reporting whether a
1731/// match exists or not.
1732///
1733/// To be very clear, this error type implies that one cannot assume that no
1734/// matches occur, since the search stopped before completing. That is, if
1735/// you're looking for information about where a search determined that no
1736/// match can occur, then this error type does *not* give you that. (Indeed, at
1737/// the time of writing, if you need such a thing, you have to write your own
1738/// search routine.)
1739///
1740/// Normally, when one searches for something, the response is either an
1741/// affirmative "it was found at this location" or a negative "not found at
1742/// all." However, in some cases, a regex engine can be configured to stop its
1743/// search before concluding whether a match exists or not. When this happens,
1744/// it may be important for the caller to know why the regex engine gave up and
1745/// where in the input it gave up at. This error type exposes the 'why' and the
1746/// 'where.'
1747///
1748/// For example, the DFAs provided by this library generally cannot correctly
1749/// implement Unicode word boundaries. Instead, they provide an option to
1750/// eagerly support them on ASCII text (since Unicode word boundaries are
1751/// equivalent to ASCII word boundaries when searching ASCII text), but will
1752/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
1753/// required to either report the failure to the caller (unergonomic) or
1754/// otherwise fall back to some other regex engine (ergonomic, but potentially
1755/// costly).
1756///
1757/// More generally, some regex engines offer the ability for callers to specify
1758/// certain bytes that will trigger the regex engine to automatically quit if
1759/// they are seen.
1760///
1761/// Still yet, there may be other reasons for a failed match. For example,
1762/// the hybrid DFA provided by this crate can be configured to give up if it
1763/// believes that it is not efficient. This in turn permits callers to choose a
1764/// different regex engine.
1765///
1766/// (Note that DFAs are configured by default to never quit or give up in this
1767/// fashion. For example, by default, a DFA will fail to build if the regex
1768/// pattern contains a Unicode word boundary. One needs to opt into the "quit"
1769/// behavior via options, like
1770/// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).)
1771///
1772/// There are a couple other ways a search
1773/// can fail. For example, when using the
1774/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)
1775/// with a haystack that is too long, or trying to run an unanchored search
1776/// with a [one-pass DFA](crate::dfa::onepass).
1777#[derive(Clone, Debug, Eq, PartialEq)]
1778pub struct MatchError(
1779 #[cfg(feature = "alloc")] alloc::boxed::Box<MatchErrorKind>,
1780 #[cfg(not(feature = "alloc"))] MatchErrorKind,
1781);
1782
1783impl MatchError {
1784 /// Create a new error value with the given kind.
1785 ///
1786 /// This is a more verbose version of the kind-specific constructors,
1787 /// e.g., `MatchError::quit`.
1788 pub fn new(kind: MatchErrorKind) -> MatchError {
1789 #[cfg(feature = "alloc")]
1790 {
1791 MatchError(alloc::boxed::Box::new(kind))
1792 }
1793 #[cfg(not(feature = "alloc"))]
1794 {
1795 MatchError(kind)
1796 }
1797 }
1798
1799 /// Returns a reference to the underlying error kind.
1800 pub fn kind(&self) -> &MatchErrorKind {
1801 &self.0
1802 }
1803
1804 /// Create a new "quit" error. The given `byte` corresponds to the value
1805 /// that tripped a search's quit condition, and `offset` corresponds to the
1806 /// location in the haystack at which the search quit.
1807 ///
1808 /// This is the same as calling `MatchError::new` with a
1809 /// [`MatchErrorKind::Quit`] kind.
1810 pub fn quit(byte: u8, offset: usize) -> MatchError {
1811 MatchError::new(MatchErrorKind::Quit { byte, offset })
1812 }
1813
1814 /// Create a new "gave up" error. The given `offset` corresponds to the
1815 /// location in the haystack at which the search gave up.
1816 ///
1817 /// This is the same as calling `MatchError::new` with a
1818 /// [`MatchErrorKind::GaveUp`] kind.
1819 pub fn gave_up(offset: usize) -> MatchError {
1820 MatchError::new(MatchErrorKind::GaveUp { offset })
1821 }
1822
1823 /// Create a new "haystack too long" error. The given `len` corresponds to
1824 /// the length of the haystack that was problematic.
1825 ///
1826 /// This is the same as calling `MatchError::new` with a
1827 /// [`MatchErrorKind::HaystackTooLong`] kind.
1828 pub fn haystack_too_long(len: usize) -> MatchError {
1829 MatchError::new(MatchErrorKind::HaystackTooLong { len })
1830 }
1831
1832 /// Create a new "unsupported anchored" error. This occurs when the caller
1833 /// requests a search with an anchor mode that is not supported by the
1834 /// regex engine.
1835 ///
1836 /// This is the same as calling `MatchError::new` with a
1837 /// [`MatchErrorKind::UnsupportedAnchored`] kind.
1838 pub fn unsupported_anchored(mode: Anchored) -> MatchError {
1839 MatchError::new(MatchErrorKind::UnsupportedAnchored { mode })
1840 }
1841}
1842
1843/// The underlying kind of a [`MatchError`].
1844///
1845/// This is a **non-exhaustive** enum. That means new variants may be added in
1846/// a semver-compatible release.
1847#[non_exhaustive]
1848#[derive(Clone, Debug, Eq, PartialEq)]
1849pub enum MatchErrorKind {
1850 /// The search saw a "quit" byte at which it was instructed to stop
1851 /// searching.
1852 Quit {
1853 /// The "quit" byte that was observed that caused the search to stop.
1854 byte: u8,
1855 /// The offset at which the quit byte was observed.
1856 offset: usize,
1857 },
1858 /// The search, based on heuristics, determined that it would be better
1859 /// to stop, typically to provide the caller an opportunity to use an
1860 /// alternative regex engine.
1861 ///
1862 /// Currently, the only way for this to occur is via the lazy DFA and
1863 /// only when it is configured to do so (it will not return this error by
1864 /// default).
1865 GaveUp {
1866 /// The offset at which the search stopped. This corresponds to the
1867 /// position immediately following the last byte scanned.
1868 offset: usize,
1869 },
1870 /// This error occurs if the haystack given to the regex engine was too
1871 /// long to be searched. This occurs, for example, with regex engines
1872 /// like the bounded backtracker that have a configurable fixed amount of
1873 /// capacity that is tied to the length of the haystack. Anything beyond
1874 /// that configured limit will result in an error at search time.
1875 HaystackTooLong {
1876 /// The length of the haystack that exceeded the limit.
1877 len: usize,
1878 },
1879 /// An error indicating that a particular type of anchored search was
1880 /// requested, but that the regex engine does not support it.
1881 ///
1882 /// Note that this error should not be returned by a regex engine simply
1883 /// because the pattern ID is invalid (i.e., equal to or exceeds the number
1884 /// of patterns in the regex). In that case, the regex engine should report
1885 /// a non-match.
1886 UnsupportedAnchored {
1887 /// The anchored mode given that is unsupported.
1888 mode: Anchored,
1889 },
1890}
1891
1892#[cfg(feature = "std")]
1893impl std::error::Error for MatchError {}
1894
1895impl core::fmt::Display for MatchError {
1896 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1897 match *self.kind() {
1898 MatchErrorKind::Quit { byte, offset } => write!(
1899 f,
1900 "quit search after observing byte {:?} at offset {}",
1901 DebugByte(byte),
1902 offset,
1903 ),
1904 MatchErrorKind::GaveUp { offset } => {
1905 write!(f, "gave up searching at offset {}", offset)
1906 }
1907 MatchErrorKind::HaystackTooLong { len } => {
1908 write!(f, "haystack of length {} is too long", len)
1909 }
1910 MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => {
1911 write!(f, "anchored searches are not supported or enabled")
1912 }
1913 MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => {
1914 write!(f, "unanchored searches are not supported or enabled")
1915 }
1916 MatchErrorKind::UnsupportedAnchored {
1917 mode: Anchored::Pattern(pid),
1918 } => {
1919 write!(
1920 f,
1921 "anchored searches for a specific pattern ({}) are \
1922 not supported or enabled",
1923 pid.as_usize(),
1924 )
1925 }
1926 }
1927 }
1928}
1929
1930#[cfg(test)]
1931mod tests {
1932 use super::*;
1933
1934 // We test that our 'MatchError' type is the size we expect. This isn't an
1935 // API guarantee, but if the size increases, we really want to make sure we
1936 // decide to do that intentionally. So this should be a speed bump. And in
1937 // general, we should not increase the size without a very good reason.
1938 //
1939 // Why? Because low level search APIs return Result<.., MatchError>. When
1940 // MatchError gets bigger, so to does the Result type.
1941 //
1942 // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes
1943 // the importance of keeping a small error type. But without 'alloc', we
1944 // still want things to be small.
1945 #[test]
1946 fn match_error_size() {
1947 let expected_size = if cfg!(feature = "alloc") {
1948 core::mem::size_of::<usize>()
1949 } else {
1950 2 * core::mem::size_of::<usize>()
1951 };
1952 assert_eq!(expected_size, core::mem::size_of::<MatchError>());
1953 }
1954
1955 // Same as above, but for the underlying match error kind.
1956 #[cfg(target_pointer_width = "64")]
1957 #[test]
1958 fn match_error_kind_size() {
1959 let expected_size = 2 * core::mem::size_of::<usize>();
1960 assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
1961 }
1962
1963 #[cfg(target_pointer_width = "32")]
1964 #[test]
1965 fn match_error_kind_size() {
1966 let expected_size = 3 * core::mem::size_of::<usize>();
1967 assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
1968 }
1969}
1970