1use alloc::string::String;
2
3use regex_automata::{meta, Input, PatternID, PatternSet, PatternSetIter};
4
5use crate::{Error, RegexSetBuilder};
6
7/// Match multiple, possibly overlapping, regexes in a single search.
8///
9/// A regex set corresponds to the union of zero or more regular expressions.
10/// That is, a regex set will match a haystack when at least one of its
11/// constituent regexes matches. A regex set as its formulated here provides a
12/// touch more power: it will also report *which* regular expressions in the
13/// set match. Indeed, this is the key difference between regex sets and a
14/// single `Regex` with many alternates, since only one alternate can match at
15/// a time.
16///
17/// For example, consider regular expressions to match email addresses and
18/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
19/// regex set is constructed from those regexes, then searching the haystack
20/// `foo@example.com` will report both regexes as matching. Of course, one
21/// could accomplish this by compiling each regex on its own and doing two
22/// searches over the haystack. The key advantage of using a regex set is
23/// that it will report the matching regexes using a *single pass through the
24/// haystack*. If one has hundreds or thousands of regexes to match repeatedly
25/// (like a URL router for a complex web application or a user agent matcher),
26/// then a regex set *can* realize huge performance gains.
27///
28/// # Limitations
29///
30/// Regex sets are limited to answering the following two questions:
31///
32/// 1. Does any regex in the set match?
33/// 2. If so, which regexes in the set match?
34///
35/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1)
36/// instead of (2) since the matching engines can stop after the first match
37/// is found.
38///
39/// You cannot directly extract [`Match`][crate::Match] or
40/// [`Captures`][crate::Captures] objects from a regex set. If you need these
41/// operations, the recommended approach is to compile each pattern in the set
42/// independently and scan the exact same haystack a second time with those
43/// independently compiled patterns:
44///
45/// ```
46/// use regex::{Regex, RegexSet};
47///
48/// let patterns = ["foo", "bar"];
49/// // Both patterns will match different ranges of this string.
50/// let hay = "barfoo";
51///
52/// // Compile a set matching any of our patterns.
53/// let set = RegexSet::new(patterns).unwrap();
54/// // Compile each pattern independently.
55/// let regexes: Vec<_> = set
56/// .patterns()
57/// .iter()
58/// .map(|pat| Regex::new(pat).unwrap())
59/// .collect();
60///
61/// // Match against the whole set first and identify the individual
62/// // matching patterns.
63/// let matches: Vec<&str> = set
64/// .matches(hay)
65/// .into_iter()
66/// // Dereference the match index to get the corresponding
67/// // compiled pattern.
68/// .map(|index| &regexes[index])
69/// // To get match locations or any other info, we then have to search the
70/// // exact same haystack again, using our separately-compiled pattern.
71/// .map(|re| re.find(hay).unwrap().as_str())
72/// .collect();
73///
74/// // Matches arrive in the order the constituent patterns were declared,
75/// // not the order they appear in the haystack.
76/// assert_eq!(vec!["foo", "bar"], matches);
77/// ```
78///
79/// # Performance
80///
81/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
82/// search takes `O(m * n)` time, where `m` is proportional to the size of the
83/// regex set and `n` is proportional to the length of the haystack.
84///
85/// # Trait implementations
86///
87/// The `Default` trait is implemented for `RegexSet`. The default value
88/// is an empty set. An empty set can also be explicitly constructed via
89/// [`RegexSet::empty`].
90///
91/// # Example
92///
93/// This shows how the above two regexes (for matching email addresses and
94/// domains) might work:
95///
96/// ```
97/// use regex::RegexSet;
98///
99/// let set = RegexSet::new(&[
100/// r"[a-z]+@[a-z]+\.(com|org|net)",
101/// r"[a-z]+\.(com|org|net)",
102/// ]).unwrap();
103///
104/// // Ask whether any regexes in the set match.
105/// assert!(set.is_match("foo@example.com"));
106///
107/// // Identify which regexes in the set match.
108/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
109/// assert_eq!(vec![0, 1], matches);
110///
111/// // Try again, but with a haystack that only matches one of the regexes.
112/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
113/// assert_eq!(vec![1], matches);
114///
115/// // Try again, but with a haystack that doesn't match any regex in the set.
116/// let matches: Vec<_> = set.matches("example").into_iter().collect();
117/// assert!(matches.is_empty());
118/// ```
119///
120/// Note that it would be possible to adapt the above example to using `Regex`
121/// with an expression like:
122///
123/// ```text
124/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
125/// ```
126///
127/// After a match, one could then inspect the capture groups to figure out
128/// which alternates matched. The problem is that it is hard to make this
129/// approach scale when there are many regexes since the overlap between each
130/// alternate isn't always obvious to reason about.
131#[derive(Clone)]
132pub struct RegexSet {
133 pub(crate) meta: meta::Regex,
134 pub(crate) patterns: alloc::sync::Arc<[String]>,
135}
136
137impl RegexSet {
138 /// Create a new regex set with the given regular expressions.
139 ///
140 /// This takes an iterator of `S`, where `S` is something that can produce
141 /// a `&str`. If any of the strings in the iterator are not valid regular
142 /// expressions, then an error is returned.
143 ///
144 /// # Example
145 ///
146 /// Create a new regex set from an iterator of strings:
147 ///
148 /// ```
149 /// use regex::RegexSet;
150 ///
151 /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap();
152 /// assert!(set.is_match("foo"));
153 /// ```
154 pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
155 where
156 S: AsRef<str>,
157 I: IntoIterator<Item = S>,
158 {
159 RegexSetBuilder::new(exprs).build()
160 }
161
162 /// Create a new empty regex set.
163 ///
164 /// An empty regex never matches anything.
165 ///
166 /// This is a convenience function for `RegexSet::new([])`, but doesn't
167 /// require one to specify the type of the input.
168 ///
169 /// # Example
170 ///
171 /// ```
172 /// use regex::RegexSet;
173 ///
174 /// let set = RegexSet::empty();
175 /// assert!(set.is_empty());
176 /// // an empty set matches nothing
177 /// assert!(!set.is_match(""));
178 /// ```
179 pub fn empty() -> RegexSet {
180 let empty: [&str; 0] = [];
181 RegexSetBuilder::new(empty).build().unwrap()
182 }
183
184 /// Returns true if and only if one of the regexes in this set matches
185 /// the haystack given.
186 ///
187 /// This method should be preferred if you only need to test whether any
188 /// of the regexes in the set should match, but don't care about *which*
189 /// regexes matched. This is because the underlying matching engine will
190 /// quit immediately after seeing the first match instead of continuing to
191 /// find all matches.
192 ///
193 /// Note that as with searches using [`Regex`](crate::Regex), the
194 /// expression is unanchored by default. That is, if the regex does not
195 /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted
196 /// to match anywhere in the haystack.
197 ///
198 /// # Example
199 ///
200 /// Tests whether a set matches somewhere in a haystack:
201 ///
202 /// ```
203 /// use regex::RegexSet;
204 ///
205 /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap();
206 /// assert!(set.is_match("foo"));
207 /// assert!(!set.is_match("☃"));
208 /// ```
209 #[inline]
210 pub fn is_match(&self, haystack: &str) -> bool {
211 self.is_match_at(haystack, 0)
212 }
213
214 /// Returns true if and only if one of the regexes in this set matches the
215 /// haystack given, with the search starting at the offset given.
216 ///
217 /// The significance of the starting point is that it takes the surrounding
218 /// context into consideration. For example, the `\A` anchor can only
219 /// match when `start == 0`.
220 ///
221 /// # Panics
222 ///
223 /// This panics when `start >= haystack.len() + 1`.
224 ///
225 /// # Example
226 ///
227 /// This example shows the significance of `start`. Namely, consider a
228 /// haystack `foobar` and a desire to execute a search starting at offset
229 /// `3`. You could search a substring explicitly, but then the look-around
230 /// assertions won't work correctly. Instead, you can use this method to
231 /// specify the start position of a search.
232 ///
233 /// ```
234 /// use regex::RegexSet;
235 ///
236 /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap();
237 /// let hay = "foobar";
238 /// // We get a match here, but it's probably not intended.
239 /// assert!(set.is_match(&hay[3..]));
240 /// // No match because the assertions take the context into account.
241 /// assert!(!set.is_match_at(hay, 3));
242 /// ```
243 #[inline]
244 pub fn is_match_at(&self, haystack: &str, start: usize) -> bool {
245 self.meta.is_match(Input::new(haystack).span(start..haystack.len()))
246 }
247
248 /// Returns the set of regexes that match in the given haystack.
249 ///
250 /// The set returned contains the index of each regex that matches in
251 /// the given haystack. The index is in correspondence with the order of
252 /// regular expressions given to `RegexSet`'s constructor.
253 ///
254 /// The set can also be used to iterate over the matched indices. The order
255 /// of iteration is always ascending with respect to the matching indices.
256 ///
257 /// Note that as with searches using [`Regex`](crate::Regex), the
258 /// expression is unanchored by default. That is, if the regex does not
259 /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted
260 /// to match anywhere in the haystack.
261 ///
262 /// # Example
263 ///
264 /// Tests which regular expressions match the given haystack:
265 ///
266 /// ```
267 /// use regex::RegexSet;
268 ///
269 /// let set = RegexSet::new([
270 /// r"\w+",
271 /// r"\d+",
272 /// r"\pL+",
273 /// r"foo",
274 /// r"bar",
275 /// r"barfoo",
276 /// r"foobar",
277 /// ]).unwrap();
278 /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
279 /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
280 ///
281 /// // You can also test whether a particular regex matched:
282 /// let matches = set.matches("foobar");
283 /// assert!(!matches.matched(5));
284 /// assert!(matches.matched(6));
285 /// ```
286 #[inline]
287 pub fn matches(&self, haystack: &str) -> SetMatches {
288 self.matches_at(haystack, 0)
289 }
290
291 /// Returns the set of regexes that match in the given haystack.
292 ///
293 /// The set returned contains the index of each regex that matches in
294 /// the given haystack. The index is in correspondence with the order of
295 /// regular expressions given to `RegexSet`'s constructor.
296 ///
297 /// The set can also be used to iterate over the matched indices. The order
298 /// of iteration is always ascending with respect to the matching indices.
299 ///
300 /// The significance of the starting point is that it takes the surrounding
301 /// context into consideration. For example, the `\A` anchor can only
302 /// match when `start == 0`.
303 ///
304 /// # Panics
305 ///
306 /// This panics when `start >= haystack.len() + 1`.
307 ///
308 /// # Example
309 ///
310 /// Tests which regular expressions match the given haystack:
311 ///
312 /// ```
313 /// use regex::RegexSet;
314 ///
315 /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap();
316 /// let hay = "foobar";
317 /// // We get matches here, but it's probably not intended.
318 /// let matches: Vec<_> = set.matches(&hay[3..]).into_iter().collect();
319 /// assert_eq!(matches, vec![0, 1]);
320 /// // No matches because the assertions take the context into account.
321 /// let matches: Vec<_> = set.matches_at(hay, 3).into_iter().collect();
322 /// assert_eq!(matches, vec![]);
323 /// ```
324 #[inline]
325 pub fn matches_at(&self, haystack: &str, start: usize) -> SetMatches {
326 let input = Input::new(haystack).span(start..haystack.len());
327 let mut patset = PatternSet::new(self.meta.pattern_len());
328 self.meta.which_overlapping_matches(&input, &mut patset);
329 SetMatches(patset)
330 }
331
332 /// Returns the same as matches, but starts the search at the given
333 /// offset and stores the matches into the slice given.
334 ///
335 /// The significance of the starting point is that it takes the surrounding
336 /// context into consideration. For example, the `\A` anchor can only
337 /// match when `start == 0`.
338 ///
339 /// `matches` must have a length that is at least the number of regexes
340 /// in this set.
341 ///
342 /// This method returns true if and only if at least one member of
343 /// `matches` is true after executing the set against `haystack`.
344 #[doc(hidden)]
345 #[inline]
346 pub fn matches_read_at(
347 &self,
348 matches: &mut [bool],
349 haystack: &str,
350 start: usize,
351 ) -> bool {
352 // This is pretty dumb. We should try to fix this, but the
353 // regex-automata API doesn't provide a way to store matches in an
354 // arbitrary &mut [bool]. Thankfully, this API is is doc(hidden) and
355 // thus not public... But regex-capi currently uses it. We should
356 // fix regex-capi to use a PatternSet, maybe? Not sure... PatternSet
357 // is in regex-automata, not regex. So maybe we should just accept a
358 // 'SetMatches', which is basically just a newtype around PatternSet.
359 let mut patset = PatternSet::new(self.meta.pattern_len());
360 let mut input = Input::new(haystack);
361 input.set_start(start);
362 self.meta.which_overlapping_matches(&input, &mut patset);
363 for pid in patset.iter() {
364 matches[pid] = true;
365 }
366 !patset.is_empty()
367 }
368
369 /// An alias for `matches_read_at` to preserve backward compatibility.
370 ///
371 /// The `regex-capi` crate used this method, so to avoid breaking that
372 /// crate, we continue to export it as an undocumented API.
373 #[doc(hidden)]
374 #[inline]
375 pub fn read_matches_at(
376 &self,
377 matches: &mut [bool],
378 haystack: &str,
379 start: usize,
380 ) -> bool {
381 self.matches_read_at(matches, haystack, start)
382 }
383
384 /// Returns the total number of regexes in this set.
385 ///
386 /// # Example
387 ///
388 /// ```
389 /// use regex::RegexSet;
390 ///
391 /// assert_eq!(0, RegexSet::empty().len());
392 /// assert_eq!(1, RegexSet::new([r"[0-9]"]).unwrap().len());
393 /// assert_eq!(2, RegexSet::new([r"[0-9]", r"[a-z]"]).unwrap().len());
394 /// ```
395 #[inline]
396 pub fn len(&self) -> usize {
397 self.meta.pattern_len()
398 }
399
400 /// Returns `true` if this set contains no regexes.
401 ///
402 /// # Example
403 ///
404 /// ```
405 /// use regex::RegexSet;
406 ///
407 /// assert!(RegexSet::empty().is_empty());
408 /// assert!(!RegexSet::new([r"[0-9]"]).unwrap().is_empty());
409 /// ```
410 #[inline]
411 pub fn is_empty(&self) -> bool {
412 self.meta.pattern_len() == 0
413 }
414
415 /// Returns the regex patterns that this regex set was constructed from.
416 ///
417 /// This function can be used to determine the pattern for a match. The
418 /// slice returned has exactly as many patterns givens to this regex set,
419 /// and the order of the slice is the same as the order of the patterns
420 /// provided to the set.
421 ///
422 /// # Example
423 ///
424 /// ```
425 /// use regex::RegexSet;
426 ///
427 /// let set = RegexSet::new(&[
428 /// r"\w+",
429 /// r"\d+",
430 /// r"\pL+",
431 /// r"foo",
432 /// r"bar",
433 /// r"barfoo",
434 /// r"foobar",
435 /// ]).unwrap();
436 /// let matches: Vec<_> = set
437 /// .matches("foobar")
438 /// .into_iter()
439 /// .map(|index| &set.patterns()[index])
440 /// .collect();
441 /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
442 /// ```
443 #[inline]
444 pub fn patterns(&self) -> &[String] {
445 &self.patterns
446 }
447}
448
449impl Default for RegexSet {
450 fn default() -> Self {
451 RegexSet::empty()
452 }
453}
454
455/// A set of matches returned by a regex set.
456///
457/// Values of this type are constructed by [`RegexSet::matches`].
458#[derive(Clone, Debug)]
459pub struct SetMatches(PatternSet);
460
461impl SetMatches {
462 /// Whether this set contains any matches.
463 ///
464 /// # Example
465 ///
466 /// ```
467 /// use regex::RegexSet;
468 ///
469 /// let set = RegexSet::new(&[
470 /// r"[a-z]+@[a-z]+\.(com|org|net)",
471 /// r"[a-z]+\.(com|org|net)",
472 /// ]).unwrap();
473 /// let matches = set.matches("foo@example.com");
474 /// assert!(matches.matched_any());
475 /// ```
476 #[inline]
477 pub fn matched_any(&self) -> bool {
478 !self.0.is_empty()
479 }
480
481 /// Whether the regex at the given index matched.
482 ///
483 /// The index for a regex is determined by its insertion order upon the
484 /// initial construction of a `RegexSet`, starting at `0`.
485 ///
486 /// # Panics
487 ///
488 /// If `index` is greater than or equal to the number of regexes in the
489 /// original set that produced these matches. Equivalently, when `index`
490 /// is greater than or equal to [`SetMatches::len`].
491 ///
492 /// # Example
493 ///
494 /// ```
495 /// use regex::RegexSet;
496 ///
497 /// let set = RegexSet::new([
498 /// r"[a-z]+@[a-z]+\.(com|org|net)",
499 /// r"[a-z]+\.(com|org|net)",
500 /// ]).unwrap();
501 /// let matches = set.matches("example.com");
502 /// assert!(!matches.matched(0));
503 /// assert!(matches.matched(1));
504 /// ```
505 #[inline]
506 pub fn matched(&self, index: usize) -> bool {
507 self.0.contains(PatternID::new_unchecked(index))
508 }
509
510 /// The total number of regexes in the set that created these matches.
511 ///
512 /// **WARNING:** This always returns the same value as [`RegexSet::len`].
513 /// In particular, it does *not* return the number of elements yielded by
514 /// [`SetMatches::iter`]. The only way to determine the total number of
515 /// matched regexes is to iterate over them.
516 ///
517 /// # Example
518 ///
519 /// Notice that this method returns the total number of regexes in the
520 /// original set, and *not* the total number of regexes that matched.
521 ///
522 /// ```
523 /// use regex::RegexSet;
524 ///
525 /// let set = RegexSet::new([
526 /// r"[a-z]+@[a-z]+\.(com|org|net)",
527 /// r"[a-z]+\.(com|org|net)",
528 /// ]).unwrap();
529 /// let matches = set.matches("example.com");
530 /// // Total number of patterns that matched.
531 /// assert_eq!(1, matches.iter().count());
532 /// // Total number of patterns in the set.
533 /// assert_eq!(2, matches.len());
534 /// ```
535 #[inline]
536 pub fn len(&self) -> usize {
537 self.0.capacity()
538 }
539
540 /// Returns an iterator over the indices of the regexes that matched.
541 ///
542 /// This will always produces matches in ascending order, where the index
543 /// yielded corresponds to the index of the regex that matched with respect
544 /// to its position when initially building the set.
545 ///
546 /// # Example
547 ///
548 /// ```
549 /// use regex::RegexSet;
550 ///
551 /// let set = RegexSet::new([
552 /// r"[0-9]",
553 /// r"[a-z]",
554 /// r"[A-Z]",
555 /// r"\p{Greek}",
556 /// ]).unwrap();
557 /// let hay = "βa1";
558 /// let matches: Vec<_> = set.matches(hay).iter().collect();
559 /// assert_eq!(matches, vec![0, 1, 3]);
560 /// ```
561 ///
562 /// Note that `SetMatches` also implemnets the `IntoIterator` trait, so
563 /// this method is not always needed. For example:
564 ///
565 /// ```
566 /// use regex::RegexSet;
567 ///
568 /// let set = RegexSet::new([
569 /// r"[0-9]",
570 /// r"[a-z]",
571 /// r"[A-Z]",
572 /// r"\p{Greek}",
573 /// ]).unwrap();
574 /// let hay = "βa1";
575 /// let mut matches = vec![];
576 /// for index in set.matches(hay) {
577 /// matches.push(index);
578 /// }
579 /// assert_eq!(matches, vec![0, 1, 3]);
580 /// ```
581 #[inline]
582 pub fn iter(&self) -> SetMatchesIter<'_> {
583 SetMatchesIter(self.0.iter())
584 }
585}
586
587impl IntoIterator for SetMatches {
588 type IntoIter = SetMatchesIntoIter;
589 type Item = usize;
590
591 fn into_iter(self) -> Self::IntoIter {
592 let it = 0..self.0.capacity();
593 SetMatchesIntoIter { patset: self.0, it }
594 }
595}
596
597impl<'a> IntoIterator for &'a SetMatches {
598 type IntoIter = SetMatchesIter<'a>;
599 type Item = usize;
600
601 fn into_iter(self) -> Self::IntoIter {
602 self.iter()
603 }
604}
605
606/// An owned iterator over the set of matches from a regex set.
607///
608/// This will always produces matches in ascending order of index, where the
609/// index corresponds to the index of the regex that matched with respect to
610/// its position when initially building the set.
611///
612/// This iterator is created by calling `SetMatches::into_iter` via the
613/// `IntoIterator` trait. This is automatically done in `for` loops.
614///
615/// # Example
616///
617/// ```
618/// use regex::RegexSet;
619///
620/// let set = RegexSet::new([
621/// r"[0-9]",
622/// r"[a-z]",
623/// r"[A-Z]",
624/// r"\p{Greek}",
625/// ]).unwrap();
626/// let hay = "βa1";
627/// let mut matches = vec![];
628/// for index in set.matches(hay) {
629/// matches.push(index);
630/// }
631/// assert_eq!(matches, vec![0, 1, 3]);
632/// ```
633#[derive(Debug)]
634pub struct SetMatchesIntoIter {
635 patset: PatternSet,
636 it: core::ops::Range<usize>,
637}
638
639impl Iterator for SetMatchesIntoIter {
640 type Item = usize;
641
642 fn next(&mut self) -> Option<usize> {
643 loop {
644 let id = self.it.next()?;
645 if self.patset.contains(PatternID::new_unchecked(id)) {
646 return Some(id);
647 }
648 }
649 }
650
651 fn size_hint(&self) -> (usize, Option<usize>) {
652 self.it.size_hint()
653 }
654}
655
656impl DoubleEndedIterator for SetMatchesIntoIter {
657 fn next_back(&mut self) -> Option<usize> {
658 loop {
659 let id = self.it.next_back()?;
660 if self.patset.contains(PatternID::new_unchecked(id)) {
661 return Some(id);
662 }
663 }
664 }
665}
666
667impl core::iter::FusedIterator for SetMatchesIntoIter {}
668
669/// A borrowed iterator over the set of matches from a regex set.
670///
671/// The lifetime `'a` refers to the lifetime of the [`SetMatches`] value that
672/// created this iterator.
673///
674/// This will always produces matches in ascending order, where the index
675/// corresponds to the index of the regex that matched with respect to its
676/// position when initially building the set.
677///
678/// This iterator is created by the [`SetMatches::iter`] method.
679#[derive(Clone, Debug)]
680pub struct SetMatchesIter<'a>(PatternSetIter<'a>);
681
682impl<'a> Iterator for SetMatchesIter<'a> {
683 type Item = usize;
684
685 fn next(&mut self) -> Option<usize> {
686 self.0.next().map(|pid| pid.as_usize())
687 }
688
689 fn size_hint(&self) -> (usize, Option<usize>) {
690 self.0.size_hint()
691 }
692}
693
694impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
695 fn next_back(&mut self) -> Option<usize> {
696 self.0.next_back().map(|pid| pid.as_usize())
697 }
698}
699
700impl<'a> core::iter::FusedIterator for SetMatchesIter<'a> {}
701
702impl core::fmt::Debug for RegexSet {
703 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
704 write!(f, "RegexSet({:?})", self.patterns())
705 }
706}
707