re_set.rs source code [crates/regex-1.8.4/src/re_set.rs]

1	macro_rules! define_set {
2	($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
3	$(#[$doc_regexset_example:meta])* ) => {
4	pub mod $name {
5	use std::fmt;
6	use std::iter;
7	use std::slice;
8	use std::vec;
9
10	use crate::error::Error;
11	use crate::exec::Exec;
12	use crate::re_builder::$builder_mod::RegexSetBuilder;
13	use crate::re_trait::RegularExpression;
14
15	/// Match multiple (possibly overlapping) regular expressions in a single scan.
16	///
17	/// A regex set corresponds to the union of two or more regular expressions.
18	/// That is, a regex set will match text where at least one of its
19	/// constituent regular expressions matches. A regex set as its formulated here
20	/// provides a touch more power: it will also report which* regular*
21	/// expressions in the set match. Indeed, this is the key difference between
22	/// regex sets and a single `Regex` with many alternates, since only one
23	/// alternate can match at a time.
24	///
25	/// For example, consider regular expressions to match email addresses and
26	/// domains: `[a-z]+@[a-z]+\.(com\|org\|net)` and `[a-z]+\.(com\|org\|net)`. If a
27	/// regex set is constructed from those regexes, then searching the text
28	/// `foo@example.com` will report both regexes as matching. Of course, one
29	/// could accomplish this by compiling each regex on its own and doing two
30	/// searches over the text. The key advantage of using a regex set is that it
31	/// will report the matching regexes using a single pass through the text.
32	/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
33	/// router for a complex web application or a user agent matcher), then a regex
34	/// set can realize huge performance gains.
35	///
36	/// # Example
37	///
38	/// This shows how the above two regexes (for matching email addresses and
39	/// domains) might work:
40	///
41	$(#[$doc_regexset_example])*
42	///
43	/// Note that it would be possible to adapt the above example to using `Regex`
44	/// with an expression like:
45	///
46	/// ```text
47	/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com\|org\|net)))\|(?P<domain>[a-z]+[.](com\|org\|net))
48	/// ```
49	///
50	/// After a match, one could then inspect the capture groups to figure out
51	/// which alternates matched. The problem is that it is hard to make this
52	/// approach scale when there are many regexes since the overlap between each
53	/// alternate isn't always obvious to reason about.
54	///
55	/// # Limitations
56	///
57	/// Regex sets are limited to answering the following two questions:
58	///
59	/// 1. Does any regex in the set match?
60	/// 2. If so, which regexes in the set match?
61	///
62	/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1)
63	/// instead of (2) since the matching engines can stop after the first match
64	/// is found.
65	///
66	/// You cannot directly extract [`Match`][crate::Match] or
67	/// [`Captures`][crate::Captures] objects from a regex set. If you need these
68	/// operations, the recommended approach is to compile each pattern in the set
69	/// independently and scan the exact same input a second time with those
70	/// independently compiled patterns:
71	///
72	/// ```rust
73	/// use regex::{Regex, RegexSet};
74	///
75	/// let patterns = ["foo", "bar"];
76	/// // Both patterns will match different ranges of this string.
77	/// let text = "barfoo";
78	///
79	/// // Compile a set matching any of our patterns.
80	/// let set = RegexSet::new(&patterns).unwrap();
81	/// // Compile each pattern independently.
82	/// let regexes: Vec<_> = set.patterns().iter()
83	/// .map(\|pat\| Regex::new(pat).unwrap())
84	/// .collect();
85	///
86	/// // Match against the whole set first and identify the individual
87	/// // matching patterns.
88	/// let matches: Vec<&str> = set.matches(text).into_iter()
89	/// // Dereference the match index to get the corresponding
90	/// // compiled pattern.
91	/// .map(\|match_idx\| &regexes[match_idx])
92	/// // To get match locations or any other info, we then have to search
93	/// // the exact same text again, using our separately-compiled pattern.
94	/// .map(\|pat\| pat.find(text).unwrap().as_str())
95	/// .collect();
96	///
97	/// // Matches arrive in the order the constituent patterns were declared,
98	/// // not the order they appear in the input.
99	/// assert_eq!(vec!["foo", "bar"], matches);
100	/// ```
101	///
102	/// # Performance
103	///
104	/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
105	/// search takes `O(mn)` time, where `m` is proportional to the size of the
106	/// regex set and `n` is proportional to the length of the search text.
107	#[derive(Clone)]
108	pub struct RegexSet(Exec);
109
110	impl RegexSet {
111	/// Create a new regex set with the given regular expressions.
112	///
113	/// This takes an iterator of `S`, where `S` is something that can produce
114	/// a `&str`. If any of the strings in the iterator are not valid regular
115	/// expressions, then an error is returned.
116	///
117	/// # Example
118	///
119	/// Create a new regex set from an iterator of strings:
120	///
121	/// ```rust
122	/// # use regex::RegexSet;
123	/// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
124	/// assert!(set.is_match("foo"));
125	/// ```
126	pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
127	where S: AsRef<str>, I: IntoIterator<Item=S> {
128	RegexSetBuilder::new(exprs).build()
129	}
130
131	/// Create a new empty regex set.
132	///
133	/// # Example
134	///
135	/// ```rust
136	/// # use regex::RegexSet;
137	/// let set = RegexSet::empty();
138	/// assert!(set.is_empty());
139	/// ```
140	pub fn empty() -> RegexSet {
141	RegexSetBuilder::new(&[""; `0`]).build().unwrap()
142	}
143
144	/// Returns true if and only if one of the regexes in this set matches
145	/// the text given.
146	///
147	/// This method should be preferred if you only need to test whether any
148	/// of the regexes in the set should match, but don't care about which
149	/// regexes matched. This is because the underlying matching engine will
150	/// quit immediately after seeing the first match instead of continuing to
151	/// find all matches.
152	///
153	/// Note that as with searches using `Regex`, the expression is unanchored
154	/// by default. That is, if the regex does not start with `^` or `\A`, or
155	/// end with `$` or `\z`, then it is permitted to match anywhere in the
156	/// text.
157	///
158	/// # Example
159	///
160	/// Tests whether a set matches some text:
161	///
162	/// ```rust
163	/// # use regex::RegexSet;
164	/// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
165	/// assert!(set.is_match("foo"));
166	/// assert!(!set.is_match("☃"));
167	/// ```
168	pub fn is_match(&self, text: $text_ty) -> bool {
169	self.is_match_at(text, `0`)
170	}
171
172	/// Returns the same as is_match, but starts the search at the given
173	/// offset.
174	///
175	/// The significance of the starting point is that it takes the surrounding
176	/// context into consideration. For example, the `\A` anchor can only
177	/// match when `start == 0`.
178	#[doc(hidden)]
179	pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
180	self.`0`.searcher().is_match_at($as_bytes(text), start)
181	}
182
183	/// Returns the set of regular expressions that match in the given text.
184	///
185	/// The set returned contains the index of each regular expression that
186	/// matches in the given text. The index is in correspondence with the
187	/// order of regular expressions given to `RegexSet`'s constructor.
188	///
189	/// The set can also be used to iterate over the matched indices.
190	///
191	/// Note that as with searches using `Regex`, the expression is unanchored
192	/// by default. That is, if the regex does not start with `^` or `\A`, or
193	/// end with `$` or `\z`, then it is permitted to match anywhere in the
194	/// text.
195	///
196	/// # Example
197	///
198	/// Tests which regular expressions match the given text:
199	///
200	/// ```rust
201	/// # use regex::RegexSet;
202	/// let set = RegexSet::new(&[
203	/// r"\w+",
204	/// r"\d+",
205	/// r"\pL+",
206	/// r"foo",
207	/// r"bar",
208	/// r"barfoo",
209	/// r"foobar",
210	/// ]).unwrap();
211	/// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
212	/// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
213	///
214	/// // You can also test whether a particular regex matched:
215	/// let matches = set.matches("foobar");
216	/// assert!(!matches.matched(5));
217	/// assert!(matches.matched(6));
218	/// ```
219	pub fn matches(&self, text: $text_ty) -> SetMatches {
220	let mut matches = vec![`false`; self.`0`.regex_strings().len()];
221	let any = self.read_matches_at(&mut matches, text, `0`);
222	SetMatches {
223	matched_any: any,
224	matches: matches,
225	}
226	}
227
228	/// Returns the same as matches, but starts the search at the given
229	/// offset and stores the matches into the slice given.
230	///
231	/// The significance of the starting point is that it takes the surrounding
232	/// context into consideration. For example, the `\A` anchor can only
233	/// match when `start == 0`.
234	///
235	/// `matches` must have a length that is at least the number of regexes
236	/// in this set.
237	///
238	/// This method returns true if and only if at least one member of
239	/// `matches` is true after executing the set against `text`.
240	#[doc(hidden)]
241	pub fn read_matches_at(
242	&self,
243	matches: &mut [bool],
244	text: $text_ty,
245	start: usize,
246	) -> bool {
247	self.`0`.searcher().many_matches_at(matches, $as_bytes(text), start)
248	}
249
250	/// Returns the total number of regular expressions in this set.
251	pub fn len(&self) -> usize {
252	self.`0`.regex_strings().len()
253	}
254
255	/// Returns `true` if this set contains no regular expressions.
256	pub fn is_empty(&self) -> bool {
257	self.`0`.regex_strings().is_empty()
258	}
259
260	/// Returns the patterns that this set will match on.
261	///
262	/// This function can be used to determine the pattern for a match. The
263	/// slice returned has exactly as many patterns givens to this regex set,
264	/// and the order of the slice is the same as the order of the patterns
265	/// provided to the set.
266	///
267	/// # Example
268	///
269	/// ```rust
270	/// # use regex::RegexSet;
271	/// let set = RegexSet::new(&[
272	/// r"\w+",
273	/// r"\d+",
274	/// r"\pL+",
275	/// r"foo",
276	/// r"bar",
277	/// r"barfoo",
278	/// r"foobar",
279	/// ]).unwrap();
280	/// let matches: Vec<_> = set
281	/// .matches("foobar")
282	/// .into_iter()
283	/// .map(\|match_idx\| &set.patterns()[match_idx])
284	/// .collect();
285	/// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
286	/// ```
287	pub fn patterns(&self) -> &[String] {
288	self.`0`.regex_strings()
289	}
290	}
291
292	impl Default for RegexSet {
293	fn default() -> Self {
294	RegexSet::empty()
295	}
296	}
297
298	/// A set of matches returned by a regex set.
299	#[derive(Clone, Debug)]
300	pub struct SetMatches {
301	matched_any: bool,
302	matches: Vec<bool>,
303	}
304
305	impl SetMatches {
306	/// Whether this set contains any matches.
307	pub fn matched_any(&self) -> bool {
308	self.matched_any
309	}
310
311	/// Whether the regex at the given index matched.
312	///
313	/// The index for a regex is determined by its insertion order upon the
314	/// initial construction of a `RegexSet`, starting at `0`.
315	///
316	/// # Panics
317	///
318	/// If `regex_index` is greater than or equal to `self.len()`.
319	pub fn matched(&self, regex_index: usize) -> bool {
320	self.matches[regex_index]
321	}
322
323	/// The total number of regexes in the set that created these matches.
324	///
325	/// WARNING:* This always returns the same value as [`RegexSet::len`].*
326	/// In particular, it does not* return the number of elements yielded by*
327	/// [`SetMatches::iter`]. The only way to determine the total number of
328	/// matched regexes is to iterate over them.
329	pub fn len(&self) -> usize {
330	self.matches.len()
331	}
332
333	/// Returns an iterator over indexes in the regex that matched.
334	///
335	/// This will always produces matches in ascending order of index, where
336	/// the index corresponds to the index of the regex that matched with
337	/// respect to its position when initially building the set.
338	pub fn iter(&self) -> SetMatchesIter<'_> {
339	SetMatchesIter((&*self.matches).into_iter().enumerate())
340	}
341	}
342
343	impl IntoIterator for SetMatches {
344	type IntoIter = SetMatchesIntoIter;
345	type Item = usize;
346
347	fn into_iter(self) -> Self::IntoIter {
348	SetMatchesIntoIter(self.matches.into_iter().enumerate())
349	}
350	}
351
352	impl<'a> IntoIterator for &'a SetMatches {
353	type IntoIter = SetMatchesIter<'a>;
354	type Item = usize;
355
356	fn into_iter(self) -> Self::IntoIter {
357	self.iter()
358	}
359	}
360
/// An owned iterator over the set of matches from a regex set.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Debug)]
pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

impl Iterator for SetMatchesIntoIter {
    type Item = usize;

    /// Advances to the next flag that is `true` and returns its index.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the number of flags left.
    ///
    /// Unmatched entries are skipped, so the number of remaining flags is
    /// only an upper bound on the number of indices still to be yielded; the
    /// lower bound must therefore be zero.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl DoubleEndedIterator for SetMatchesIntoIter {
    /// Walks backwards to the last remaining flag that is `true` and returns
    /// its index.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, matched)| matched).map(|(index, _)| index)
    }
}

impl iter::FusedIterator for SetMatchesIntoIter {}
400
/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Clone, Debug)]
pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

impl<'a> Iterator for SetMatchesIter<'a> {
    type Item = usize;

    /// Advances to the next flag that is `true` and returns its index.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, &matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the number of flags left.
    ///
    /// Unmatched entries are skipped, so the number of remaining flags is
    /// only an upper bound on the number of indices still to be yielded; the
    /// lower bound must therefore be zero.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
    /// Walks backwards to the last remaining flag that is `true` and returns
    /// its index.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, &matched)| matched).map(|(index, _)| index)
    }
}

impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
442
443	#[doc(hidden)]
444	impl From<Exec> for RegexSet {
445	fn from(exec: Exec) -> Self {
446	RegexSet(exec)
447	}
448	}
449
450	impl fmt::Debug for RegexSet {
451	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
452	write!(f, "RegexSet({:?})", self.`0`.regex_strings())
453	}
454	}
455
/// Views a `&str` as its underlying bytes.
// Each `define_set!` invocation names only one of the two adapters, so the
// other one is unused inside that generated module.
#[allow(dead_code)]
fn as_bytes_str(text: &str) -> &[u8] {
    str::as_bytes(text)
}
/// Identity adapter: the input is already a byte slice.
// Each `define_set!` invocation names only one of the two adapters, so the
// other one is unused inside that generated module.
#[allow(dead_code)]
fn as_bytes_bytes(text: &[u8]) -> &[u8] {
    // Nothing to convert; hand the same slice back.
    text
}
458	}
459	}
460	}
461
462	define_set! {
463	unicode,
464	set_unicode,
465	&str,
466	as_bytes_str,
467	/// ```rust
468	/// # use regex::RegexSet;
469	/// let set = RegexSet::new(&[
470	/// r"[a-z]+@[a-z]+\.(com\|org\|net)",
471	/// r"[a-z]+\.(com\|org\|net)",
472	/// ]).unwrap();
473	///
474	/// // Ask whether any regexes in the set match.
475	/// assert!(set.is_match("foo@example.com"));
476	///
477	/// // Identify which regexes in the set match.
478	/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
479	/// assert_eq!(vec![0, 1], matches);
480	///
481	/// // Try again, but with text that only matches one of the regexes.
482	/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
483	/// assert_eq!(vec![1], matches);
484	///
485	/// // Try again, but with text that doesn't match any regex in the set.
486	/// let matches: Vec<_> = set.matches("example").into_iter().collect();
487	/// assert!(matches.is_empty());
488	/// ```
489	}
490
491	define_set! {
492	bytes,
493	set_bytes,
494	&[u8],
495	as_bytes_bytes,
496	/// ```rust
497	/// # use regex::bytes::RegexSet;
498	/// let set = RegexSet::new(&[
499	/// r"[a-z]+@[a-z]+\.(com\|org\|net)",
500	/// r"[a-z]+\.(com\|org\|net)",
501	/// ]).unwrap();
502	///
503	/// // Ask whether any regexes in the set match.
504	/// assert!(set.is_match(b"foo@example.com"));
505	///
506	/// // Identify which regexes in the set match.
507	/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
508	/// assert_eq!(vec![0, 1], matches);
509	///
510	/// // Try again, but with text that only matches one of the regexes.
511	/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
512	/// assert_eq!(vec![1], matches);
513	///
514	/// // Try again, but with text that doesn't match any regex in the set.
515	/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
516	/// assert!(matches.is_empty());
517	/// ```
518	}
519