1macro_rules! define_set {
2 ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
3 $(#[$doc_regexset_example:meta])* ) => {
4 pub mod $name {
5 use std::fmt;
6 use std::iter;
7 use std::slice;
8 use std::vec;
9
10 use crate::error::Error;
11 use crate::exec::Exec;
12 use crate::re_builder::$builder_mod::RegexSetBuilder;
13 use crate::re_trait::RegularExpression;
14
/// Match multiple (possibly overlapping) regular expressions in a single scan.
///
/// A regex set corresponds to the union of two or more regular expressions.
/// That is, a regex set will match text where at least one of its
/// constituent regular expressions matches. A regex set as it is formulated
/// here provides a touch more power: it will also report *which* regular
/// expressions in the set match. Indeed, this is the key difference between
/// regex sets and a single `Regex` with many alternates, since only one
/// alternate can match at a time.
///
/// For example, consider regular expressions to match email addresses and
/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
/// regex set is constructed from those regexes, then searching the text
/// `foo@example.com` will report both regexes as matching. Of course, one
/// could accomplish this by compiling each regex on its own and doing two
/// searches over the text. The key advantage of using a regex set is that it
/// will report the matching regexes using a *single pass through the text*.
/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
/// router for a complex web application or a user agent matcher), then a regex
/// set can realize huge performance gains.
///
/// # Example
///
/// This shows how the above two regexes (for matching email addresses and
/// domains) might work:
///
// The example itself is supplied by each `define_set!` invocation (the
// trailing doc attributes of the macro input) and is spliced in here.
$(#[$doc_regexset_example])*
///
/// Note that it would be possible to adapt the above example to using `Regex`
/// with an expression like:
///
/// ```text
/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
/// ```
///
/// After a match, one could then inspect the capture groups to figure out
/// which alternates matched. The problem is that it is hard to make this
/// approach scale when there are many regexes since the overlap between each
/// alternate isn't always obvious to reason about.
///
/// # Limitations
///
/// Regex sets are limited to answering the following two questions:
///
/// 1. Does any regex in the set match?
/// 2. If so, which regexes in the set match?
///
/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1)
/// instead of (2) since the matching engines can stop after the first match
/// is found.
///
/// You cannot directly extract [`Match`][crate::Match] or
/// [`Captures`][crate::Captures] objects from a regex set. If you need these
/// operations, the recommended approach is to compile each pattern in the set
/// independently and scan the exact same input a second time with those
/// independently compiled patterns:
///
/// ```rust
/// use regex::{Regex, RegexSet};
///
/// let patterns = ["foo", "bar"];
/// // Both patterns will match different ranges of this string.
/// let text = "barfoo";
///
/// // Compile a set matching any of our patterns.
/// let set = RegexSet::new(&patterns).unwrap();
/// // Compile each pattern independently.
/// let regexes: Vec<_> = set.patterns().iter()
///     .map(|pat| Regex::new(pat).unwrap())
///     .collect();
///
/// // Match against the whole set first and identify the individual
/// // matching patterns.
/// let matches: Vec<&str> = set.matches(text).into_iter()
///     // Dereference the match index to get the corresponding
///     // compiled pattern.
///     .map(|match_idx| &regexes[match_idx])
///     // To get match locations or any other info, we then have to search
///     // the exact same text again, using our separately-compiled pattern.
///     .map(|pat| pat.find(text).unwrap().as_str())
///     .collect();
///
/// // Matches arrive in the order the constituent patterns were declared,
/// // not the order they appear in the input.
/// assert_eq!(vec!["foo", "bar"], matches);
/// ```
///
/// # Performance
///
/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
/// search takes `O(mn)` time, where `m` is proportional to the size of the
/// regex set and `n` is proportional to the length of the search text.
#[derive(Clone)]
pub struct RegexSet(Exec);
109
110impl RegexSet {
111 /// Create a new regex set with the given regular expressions.
112 ///
113 /// This takes an iterator of `S`, where `S` is something that can produce
114 /// a `&str`. If any of the strings in the iterator are not valid regular
115 /// expressions, then an error is returned.
116 ///
117 /// # Example
118 ///
119 /// Create a new regex set from an iterator of strings:
120 ///
121 /// ```rust
122 /// # use regex::RegexSet;
123 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
124 /// assert!(set.is_match("foo"));
125 /// ```
126 pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
127 where S: AsRef<str>, I: IntoIterator<Item=S> {
128 RegexSetBuilder::new(exprs).build()
129 }
130
131 /// Create a new empty regex set.
132 ///
133 /// # Example
134 ///
135 /// ```rust
136 /// # use regex::RegexSet;
137 /// let set = RegexSet::empty();
138 /// assert!(set.is_empty());
139 /// ```
140 pub fn empty() -> RegexSet {
141 RegexSetBuilder::new(&[""; 0]).build().unwrap()
142 }
143
144 /// Returns true if and only if one of the regexes in this set matches
145 /// the text given.
146 ///
147 /// This method should be preferred if you only need to test whether any
148 /// of the regexes in the set should match, but don't care about *which*
149 /// regexes matched. This is because the underlying matching engine will
150 /// quit immediately after seeing the first match instead of continuing to
151 /// find all matches.
152 ///
153 /// Note that as with searches using `Regex`, the expression is unanchored
154 /// by default. That is, if the regex does not start with `^` or `\A`, or
155 /// end with `$` or `\z`, then it is permitted to match anywhere in the
156 /// text.
157 ///
158 /// # Example
159 ///
160 /// Tests whether a set matches some text:
161 ///
162 /// ```rust
163 /// # use regex::RegexSet;
164 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
165 /// assert!(set.is_match("foo"));
166 /// assert!(!set.is_match("☃"));
167 /// ```
168 pub fn is_match(&self, text: $text_ty) -> bool {
169 self.is_match_at(text, 0)
170 }
171
172 /// Returns the same as is_match, but starts the search at the given
173 /// offset.
174 ///
175 /// The significance of the starting point is that it takes the surrounding
176 /// context into consideration. For example, the `\A` anchor can only
177 /// match when `start == 0`.
178 #[doc(hidden)]
179 pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
180 self.0.searcher().is_match_at($as_bytes(text), start)
181 }
182
183 /// Returns the set of regular expressions that match in the given text.
184 ///
185 /// The set returned contains the index of each regular expression that
186 /// matches in the given text. The index is in correspondence with the
187 /// order of regular expressions given to `RegexSet`'s constructor.
188 ///
189 /// The set can also be used to iterate over the matched indices.
190 ///
191 /// Note that as with searches using `Regex`, the expression is unanchored
192 /// by default. That is, if the regex does not start with `^` or `\A`, or
193 /// end with `$` or `\z`, then it is permitted to match anywhere in the
194 /// text.
195 ///
196 /// # Example
197 ///
198 /// Tests which regular expressions match the given text:
199 ///
200 /// ```rust
201 /// # use regex::RegexSet;
202 /// let set = RegexSet::new(&[
203 /// r"\w+",
204 /// r"\d+",
205 /// r"\pL+",
206 /// r"foo",
207 /// r"bar",
208 /// r"barfoo",
209 /// r"foobar",
210 /// ]).unwrap();
211 /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
212 /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
213 ///
214 /// // You can also test whether a particular regex matched:
215 /// let matches = set.matches("foobar");
216 /// assert!(!matches.matched(5));
217 /// assert!(matches.matched(6));
218 /// ```
219 pub fn matches(&self, text: $text_ty) -> SetMatches {
220 let mut matches = vec![false; self.0.regex_strings().len()];
221 let any = self.read_matches_at(&mut matches, text, 0);
222 SetMatches {
223 matched_any: any,
224 matches: matches,
225 }
226 }
227
228 /// Returns the same as matches, but starts the search at the given
229 /// offset and stores the matches into the slice given.
230 ///
231 /// The significance of the starting point is that it takes the surrounding
232 /// context into consideration. For example, the `\A` anchor can only
233 /// match when `start == 0`.
234 ///
235 /// `matches` must have a length that is at least the number of regexes
236 /// in this set.
237 ///
238 /// This method returns true if and only if at least one member of
239 /// `matches` is true after executing the set against `text`.
240 #[doc(hidden)]
241 pub fn read_matches_at(
242 &self,
243 matches: &mut [bool],
244 text: $text_ty,
245 start: usize,
246 ) -> bool {
247 self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
248 }
249
250 /// Returns the total number of regular expressions in this set.
251 pub fn len(&self) -> usize {
252 self.0.regex_strings().len()
253 }
254
255 /// Returns `true` if this set contains no regular expressions.
256 pub fn is_empty(&self) -> bool {
257 self.0.regex_strings().is_empty()
258 }
259
260 /// Returns the patterns that this set will match on.
261 ///
262 /// This function can be used to determine the pattern for a match. The
263 /// slice returned has exactly as many patterns givens to this regex set,
264 /// and the order of the slice is the same as the order of the patterns
265 /// provided to the set.
266 ///
267 /// # Example
268 ///
269 /// ```rust
270 /// # use regex::RegexSet;
271 /// let set = RegexSet::new(&[
272 /// r"\w+",
273 /// r"\d+",
274 /// r"\pL+",
275 /// r"foo",
276 /// r"bar",
277 /// r"barfoo",
278 /// r"foobar",
279 /// ]).unwrap();
280 /// let matches: Vec<_> = set
281 /// .matches("foobar")
282 /// .into_iter()
283 /// .map(|match_idx| &set.patterns()[match_idx])
284 /// .collect();
285 /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
286 /// ```
287 pub fn patterns(&self) -> &[String] {
288 self.0.regex_strings()
289 }
290}
291
292impl Default for RegexSet {
293 fn default() -> Self {
294 RegexSet::empty()
295 }
296}
297
298/// A set of matches returned by a regex set.
299#[derive(Clone, Debug)]
300pub struct SetMatches {
301 matched_any: bool,
302 matches: Vec<bool>,
303}
304
305impl SetMatches {
306 /// Whether this set contains any matches.
307 pub fn matched_any(&self) -> bool {
308 self.matched_any
309 }
310
311 /// Whether the regex at the given index matched.
312 ///
313 /// The index for a regex is determined by its insertion order upon the
314 /// initial construction of a `RegexSet`, starting at `0`.
315 ///
316 /// # Panics
317 ///
318 /// If `regex_index` is greater than or equal to `self.len()`.
319 pub fn matched(&self, regex_index: usize) -> bool {
320 self.matches[regex_index]
321 }
322
323 /// The total number of regexes in the set that created these matches.
324 ///
325 /// **WARNING:** This always returns the same value as [`RegexSet::len`].
326 /// In particular, it does *not* return the number of elements yielded by
327 /// [`SetMatches::iter`]. The only way to determine the total number of
328 /// matched regexes is to iterate over them.
329 pub fn len(&self) -> usize {
330 self.matches.len()
331 }
332
333 /// Returns an iterator over indexes in the regex that matched.
334 ///
335 /// This will always produces matches in ascending order of index, where
336 /// the index corresponds to the index of the regex that matched with
337 /// respect to its position when initially building the set.
338 pub fn iter(&self) -> SetMatchesIter<'_> {
339 SetMatchesIter((&*self.matches).into_iter().enumerate())
340 }
341}
342
343impl IntoIterator for SetMatches {
344 type IntoIter = SetMatchesIntoIter;
345 type Item = usize;
346
347 fn into_iter(self) -> Self::IntoIter {
348 SetMatchesIntoIter(self.matches.into_iter().enumerate())
349 }
350}
351
352impl<'a> IntoIterator for &'a SetMatches {
353 type IntoIter = SetMatchesIter<'a>;
354 type Item = usize;
355
356 fn into_iter(self) -> Self::IntoIter {
357 self.iter()
358 }
359}
360
/// An owned iterator over the set of matches from a regex set.
///
/// This always produces matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Debug)]
pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

impl Iterator for SetMatchesIntoIter {
    type Item = usize;

    /// Advances to the next `true` flag and returns its index.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, matched)| matched).map(|(index, _)| index)
    }

    /// Returns bounds on the number of remaining matched indices.
    ///
    /// Entries whose flag is `false` are skipped, so nothing can be promised
    /// about the lower bound: it is always `0`. The upper bound is the number
    /// of flags that have not been visited yet.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl DoubleEndedIterator for SetMatchesIntoIter {
    /// Walks backwards to the closest `true` flag and returns its index.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, matched)| matched).map(|(index, _)| index)
    }
}

impl iter::FusedIterator for SetMatchesIntoIter {}
400
/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
///
/// This always produces matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Clone, Debug)]
pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

impl<'a> Iterator for SetMatchesIter<'a> {
    type Item = usize;

    /// Advances to the next `true` flag and returns its index.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, &matched)| matched).map(|(index, _)| index)
    }

    /// Returns bounds on the number of remaining matched indices.
    ///
    /// Entries whose flag is `false` are skipped, so nothing can be promised
    /// about the lower bound: it is always `0`. The upper bound is the number
    /// of flags that have not been visited yet.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
    /// Walks backwards to the closest `true` flag and returns its index.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, &matched)| matched).map(|(index, _)| index)
    }
}

impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
442
443#[doc(hidden)]
444impl From<Exec> for RegexSet {
445 fn from(exec: Exec) -> Self {
446 RegexSet(exec)
447 }
448}
449
450impl fmt::Debug for RegexSet {
451 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
452 write!(f, "RegexSet({:?})", self.0.regex_strings())
453 }
454}
455
/// Views a string slice as its underlying bytes.
// Both conversion helpers are emitted for every `define_set!` instantiation
// but each instantiation names only one of them, hence `allow(dead_code)`.
#[allow(dead_code)]
fn as_bytes_str(text: &str) -> &[u8] {
    str::as_bytes(text)
}
/// Identity conversion, used when the search text is already a byte slice.
// Both conversion helpers are emitted for every `define_set!` instantiation
// but each instantiation names only one of them, hence `allow(dead_code)`.
#[allow(dead_code)]
fn as_bytes_bytes(text: &[u8]) -> &[u8] {
    let bytes = text;
    bytes
}
458 }
459 }
460}
461
// Generates `pub mod unicode`: a `RegexSet` whose search text is `&str`,
// converted to bytes with `as_bytes_str` and built through
// `crate::re_builder::set_unicode::RegexSetBuilder`. The doc lines passed as
// the last argument are spliced into the `RegexSet` type-level documentation
// as its example.
define_set! {
    unicode,
    set_unicode,
    &str,
    as_bytes_str,
/// ```rust
/// # use regex::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match("foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches("example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}
490
// Generates `pub mod bytes`: a `RegexSet` whose search text is `&[u8]`,
// passed through unchanged by `as_bytes_bytes` and built through
// `crate::re_builder::set_bytes::RegexSetBuilder`. The doc lines passed as
// the last argument are spliced into the `RegexSet` type-level documentation
// as its example.
define_set! {
    bytes,
    set_bytes,
    &[u8],
    as_bytes_bytes,
/// ```rust
/// # use regex::bytes::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match(b"foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}
519