regex.rs - Codebrowser

1	/!*
2	A DFA-backed `Regex`.
3
4	This module provides [`Regex`], which is defined generically over the
5	[`Automaton`] trait. A `Regex` implements convenience routines you might have
6	come to expect, such as finding the start/end of a match and iterating over
7	all non-overlapping matches. This `Regex` type is limited in its capabilities
8	to what a DFA can provide. Therefore, APIs involving capturing groups, for
9	example, are not provided.
10
11	Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
12	finds the end offset of a match, where as the other is a "reverse" DFA that
13	find the start offset of a match.
14
15	See the [parent module](crate::dfa) for examples.
16	*/
17
18	#[cfg(feature = "alloc")]
19	use alloc::vec::Vec;
20
21	#[cfg(feature = "dfa-build")]
22	use crate::dfa::dense::BuildError;
23	use crate::{
24	dfa::{automaton::Automaton, dense},
25	util::{iter, search::Input},
26	Anchored, Match, MatchError,
27	};
28	#[cfg(feature = "alloc")]
29	use crate::{
30	dfa::{sparse, StartKind},
31	util::search::MatchKind,
32	};
33
34	// When the alloc feature is enabled, the regex type sets its A type parameter
35	// to default to an owned dense DFA. But without alloc, we set no default. This
36	// makes things a lot more convenient in the common case, since writing out the
37	// DFA types is pretty annoying.
38	//
39	// Since we have two different definitions but only want to write one doc
40	// string, we use a macro to capture the doc and other attributes once and then
41	// repeat them for each definition.
42	macro_rules! define_regex_type {
43	($(#[$doc:meta])*) => {
44	#[cfg(feature = "alloc")]
45	$(#[$doc])*
46	pub struct Regex<A = dense::OwnedDFA> {
47	forward: A,
48	reverse: A,
49	}
50
51	#[cfg(not(feature = "alloc"))]
52	$(#[$doc])*
53	pub struct Regex<A> {
54	forward: A,
55	reverse: A,
56	}
57	};
58	}
59
60	define_regex_type!(
61	/// A regular expression that uses deterministic finite automata for fast
62	/// searching.
63	///
64	/// A regular expression is comprised of two DFAs, a "forward" DFA and a
65	/// "reverse" DFA. The forward DFA is responsible for detecting the end of
66	/// a match while the reverse DFA is responsible for detecting the start
67	/// of a match. Thus, in order to find the bounds of any given match, a
68	/// forward search must first be run followed by a reverse search. A match
69	/// found by the forward DFA guarantees that the reverse DFA will also find
70	/// a match.
71	///
72	/// The type of the DFA used by a `Regex` corresponds to the `A` type
73	/// parameter, which must satisfy the [`Automaton`] trait. Typically,
74	/// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
75	/// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
76	/// memory but search faster, while sparse DFAs use less memory but search
77	/// more slowly.
78	///
79	/// # Crate features
80	///
81	/// Note that despite what the documentation auto-generates, the _only_
82	/// crate feature needed to use this type is `dfa-search`. You do _not_
83	/// need to enable the `alloc` feature.
84	///
85	/// By default, a regex's automaton type parameter is set to
86	/// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
87	/// in-memory work loads, this is the most convenient type that gives the
88	/// best search performance. When the `alloc` feature is disabled, no
89	/// default type is used.
90	///
91	/// # When should I use this?
92	///
93	/// Generally speaking, if you can afford the overhead of building a full
94	/// DFA for your regex, and you don't need things like capturing groups,
95	/// then this is a good choice if you're looking to optimize for matching
96	/// speed. Note however that its speed may be worse than a general purpose
97	/// regex engine if you don't provide a [`dense::Config::prefilter`] to the
98	/// underlying DFA.
99	///
100	/// # Sparse DFAs
101	///
102	/// Since a `Regex` is generic over the [`Automaton`] trait, it can be
103	/// used with any kind of DFA. While this crate constructs dense DFAs by
104	/// default, it is easy enough to build corresponding sparse DFAs, and then
105	/// build a regex from them:
106	///
107	/// ```
108	/// use regex_automata::dfa::regex::Regex;
109	///
110	/// // First, build a regex that uses dense DFAs.
111	/// let dense_re = Regex::new("foo[0-9]+")?;
112	///
113	/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
114	/// let fwd = dense_re.forward().to_sparse()?;
115	/// let rev = dense_re.reverse().to_sparse()?;
116	///
117	/// // Third, build a new regex from the constituent sparse DFAs.
118	/// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
119	///
120	/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
121	/// assert_eq!(true, sparse_re.is_match(b"foo123"));
122	///
123	/// # Ok::<(), Box<dyn std::error::Error>>(())
124	/// ```
125	///
126	/// Alternatively, one can use a [`Builder`] to construct a sparse DFA
127	/// more succinctly. (Note though that dense DFAs are still constructed
128	/// first internally, and then converted to sparse DFAs, as in the example
129	/// above.)
130	///
131	/// ```
132	/// use regex_automata::dfa::regex::Regex;
133	///
134	/// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
135	/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
136	/// assert!(sparse_re.is_match(b"foo123"));
137	///
138	/// # Ok::<(), Box<dyn std::error::Error>>(())
139	/// ```
140	///
141	/// # Fallibility
142	///
143	/// Most of the search routines defined on this type will _panic_ when the
144	/// underlying search fails. This might be because the DFA gave up because
145	/// it saw a quit byte, whether configured explicitly or via heuristic
146	/// Unicode word boundary support, although neither are enabled by default.
147	/// Or it might fail because an invalid `Input` configuration is given,
148	/// for example, with an unsupported [`Anchored`] mode.
149	///
150	/// If you need to handle these error cases instead of allowing them to
151	/// trigger a panic, then the lower level [`Regex::try_search`] provides
152	/// a fallible API that never panics.
153	///
154	/// # Example
155	///
156	/// This example shows how to cause a search to terminate if it sees a
157	/// `\n` byte, and handle the error returned. This could be useful if, for
158	/// example, you wanted to prevent a user supplied pattern from matching
159	/// across a line boundary.
160	///
161	/// ```
162	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
163	/// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};
164	///
165	/// let re = Regex::builder()
166	/// .dense(dfa::dense::Config::new().quit(b'\n', true))
167	/// .build(r"foo\p{any}+bar")?;
168	///
169	/// let input = Input::new("foo\nbar");
170	/// // Normally this would produce a match, since \p{any} contains '\n'.
171	/// // But since we instructed the automaton to enter a quit state if a
172	/// // '\n' is observed, this produces a match error instead.
173	/// let expected = MatchError::quit(b'\n', 3);
174	/// let got = re.try_search(&input).unwrap_err();
175	/// assert_eq!(expected, got);
176	///
177	/// # Ok::<(), Box<dyn std::error::Error>>(())
178	/// ```
179	#[derive(Clone, Debug)]
180	);
181
182	#[cfg(all(feature = "syntax", feature = "dfa-build"))]
183	impl Regex {
184	/// Parse the given regular expression using the default configuration and
185	/// return the corresponding regex.
186	///
187	/// If you want a non-default configuration, then use the [`Builder`] to
188	/// set your own configuration.
189	///
190	/// # Example
191	///
192	/// ```
193	/// use regex_automata::{Match, dfa::regex::Regex};
194	///
195	/// let re = Regex::new("foo[0-9]+bar")?;
196	/// assert_eq!(
197	/// Some(Match::must(`0`, `3`..`14`)),
198	/// re.find(b"zzzfoo12345barzzz"),
199	/// );
200	/// # Ok::<(), Box<dyn std::error::Error>>(())
201	/// ```
202	pub fn new(pattern: &str) -> Result<Regex, BuildError> {
203	Builder::new().build(pattern)
204	}
205
206	/// Like `new`, but parses multiple patterns into a single "regex set."
207	/// This similarly uses the default regex configuration.
208	///
209	/// # Example
210	///
211	/// ```
212	/// use regex_automata::{Match, dfa::regex::Regex};
213	///
214	/// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
215	///
216	/// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
217	/// assert_eq!(Some(Match::must(`0`, `0`..`3`)), it.next());
218	/// assert_eq!(Some(Match::must(`1`, `4`..`5`)), it.next());
219	/// assert_eq!(Some(Match::must(`0`, `6`..`9`)), it.next());
220	/// assert_eq!(Some(Match::must(`1`, `10`..`14`)), it.next());
221	/// assert_eq!(Some(Match::must(`1`, `15`..`16`)), it.next());
222	/// assert_eq!(Some(Match::must(`0`, `17`..`21`)), it.next());
223	/// assert_eq!(None, it.next());
224	/// # Ok::<(), Box<dyn std::error::Error>>(())
225	/// ```
226	pub fn new_many<P: AsRef<str>>(
227	patterns: &[P],
228	) -> Result<Regex, BuildError> {
229	Builder::new().build_many(patterns)
230	}
231	}
232
233	#[cfg(all(feature = "syntax", feature = "dfa-build"))]
234	impl Regex<sparse::DFA<Vec<u8>>> {
235	/// Parse the given regular expression using the default configuration,
236	/// except using sparse DFAs, and return the corresponding regex.
237	///
238	/// If you want a non-default configuration, then use the [`Builder`] to
239	/// set your own configuration.
240	///
241	/// # Example
242	///
243	/// ```
244	/// use regex_automata::{Match, dfa::regex::Regex};
245	///
246	/// let re = Regex::new_sparse("foo[0-9]+bar")?;
247	/// assert_eq!(
248	/// Some(Match::must(`0`, `3`..`14`)),
249	/// re.find(b"zzzfoo12345barzzz"),
250	/// );
251	/// # Ok::<(), Box<dyn std::error::Error>>(())
252	/// ```
253	pub fn new_sparse(
254	pattern: &str,
255	) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
256	Builder::new().build_sparse(pattern)
257	}
258
259	/// Like `new`, but parses multiple patterns into a single "regex set"
260	/// using sparse DFAs. This otherwise similarly uses the default regex
261	/// configuration.
262	///
263	/// # Example
264	///
265	/// ```
266	/// use regex_automata::{Match, dfa::regex::Regex};
267	///
268	/// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
269	///
270	/// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
271	/// assert_eq!(Some(Match::must(`0`, `0`..`3`)), it.next());
272	/// assert_eq!(Some(Match::must(`1`, `4`..`5`)), it.next());
273	/// assert_eq!(Some(Match::must(`0`, `6`..`9`)), it.next());
274	/// assert_eq!(Some(Match::must(`1`, `10`..`14`)), it.next());
275	/// assert_eq!(Some(Match::must(`1`, `15`..`16`)), it.next());
276	/// assert_eq!(Some(Match::must(`0`, `17`..`21`)), it.next());
277	/// assert_eq!(None, it.next());
278	/// # Ok::<(), Box<dyn std::error::Error>>(())
279	/// ```
280	pub fn new_many_sparse<P: AsRef<str>>(
281	patterns: &[P],
282	) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
283	Builder::new().build_many_sparse(patterns)
284	}
285	}
286
287	/// Convenience routines for regex construction.
288	impl Regex<dense::DFA<&'static [u32]>> {
289	/// Return a builder for configuring the construction of a `Regex`.
290	///
291	/// This is a convenience routine to avoid needing to import the
292	/// [`Builder`] type in common cases.
293	///
294	/// # Example
295	///
296	/// This example shows how to use the builder to disable UTF-8 mode
297	/// everywhere.
298	///
299	/// ```
300	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
301	/// use regex_automata::{
302	/// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
303	/// };
304	///
305	/// let re = Regex::builder()
306	/// .syntax(syntax::Config::new().utf8(`false`))
307	/// .thompson(thompson::Config::new().utf8(`false`))
308	/// .build(r"foo(?-u:[^b])ar.*")?;
309	/// let haystack = b"`\xFE`foo`\xFF`arzz`\xE2\x98\xFF\n`";
310	/// let expected = Some(Match::must(`0`, `1`..`9`));
311	/// let got = re.find(haystack);
312	/// assert_eq!(expected, got);
313	///
314	/// # Ok::<(), Box<dyn std::error::Error>>(())
315	/// ```
316	pub fn builder() -> Builder {
317	Builder::new()
318	}
319	}
320
321	/// Standard search routines for finding and iterating over matches.
322	impl<A: Automaton> Regex<A> {
323	/// Returns true if and only if this regex matches the given haystack.
324	///
325	/// This routine may short circuit if it knows that scanning future input
326	/// will never lead to a different result. In particular, if the underlying
327	/// DFA enters a match state or a dead state, then this routine will return
328	/// `true` or `false`, respectively, without inspecting any future input.
329	///
330	/// # Panics
331	///
332	/// This routine panics if the search could not complete. This can occur
333	/// in a number of circumstances:
334	///
335	/// The configuration of the DFA may permit it to "quit" the search.*
336	/// For example, setting quit bytes or enabling heuristic support for
337	/// Unicode word boundaries. The default configuration does not enable any
338	/// option that could result in the DFA quitting.
339	/// When the provided `Input` configuration is not supported. For*
340	/// example, by providing an unsupported anchor mode.
341	///
342	/// When a search panics, callers cannot know whether a match exists or
343	/// not.
344	///
345	/// Use [`Regex::try_search`] if you want to handle these error conditions.
346	///
347	/// # Example
348	///
349	/// ```
350	/// use regex_automata::dfa::regex::Regex;
351	///
352	/// let re = Regex::new("foo[0-9]+bar")?;
353	/// assert_eq!(`true`, re.is_match("foo12345bar"));
354	/// assert_eq!(`false`, re.is_match("foobar"));
355	/// # Ok::<(), Box<dyn std::error::Error>>(())
356	/// ```
357	#[inline]
358	pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
359	// Not only can we do an "earliest" search, but we can avoid doing a
360	// reverse scan too.
361	let input = input.into().earliest(`true`);
362	self.forward().try_search_fwd(&input).map(\|x\| x.is_some()).unwrap()
363	}
364
365	/// Returns the start and end offset of the leftmost match. If no match
366	/// exists, then `None` is returned.
367	///
368	/// # Panics
369	///
370	/// This routine panics if the search could not complete. This can occur
371	/// in a number of circumstances:
372	///
373	/// The configuration of the DFA may permit it to "quit" the search.*
374	/// For example, setting quit bytes or enabling heuristic support for
375	/// Unicode word boundaries. The default configuration does not enable any
376	/// option that could result in the DFA quitting.
377	/// When the provided `Input` configuration is not supported. For*
378	/// example, by providing an unsupported anchor mode.
379	///
380	/// When a search panics, callers cannot know whether a match exists or
381	/// not.
382	///
383	/// Use [`Regex::try_search`] if you want to handle these error conditions.
384	///
385	/// # Example
386	///
387	/// ```
388	/// use regex_automata::{Match, dfa::regex::Regex};
389	///
390	/// // Greediness is applied appropriately.
391	/// let re = Regex::new("foo[0-9]+")?;
392	/// assert_eq!(Some(Match::must(`0`, `3`..`11`)), re.find("zzzfoo12345zzz"));
393	///
394	/// // Even though a match is found after reading the first byte (`a`),
395	/// // the default leftmost-first match semantics demand that we find the
396	/// // earliest match that prefers earlier parts of the pattern over latter
397	/// // parts.
398	/// let re = Regex::new("abc\|a")?;
399	/// assert_eq!(Some(Match::must(`0`, `0`..`3`)), re.find("abc"));
400	/// # Ok::<(), Box<dyn std::error::Error>>(())
401	/// ```
402	#[inline]
403	pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
404	self.try_search(&input.into()).unwrap()
405	}
406
407	/// Returns an iterator over all non-overlapping leftmost matches in the
408	/// given bytes. If no match exists, then the iterator yields no elements.
409	///
410	/// This corresponds to the "standard" regex search iterator.
411	///
412	/// # Panics
413	///
414	/// If the search returns an error during iteration, then iteration
415	/// panics. See [`Regex::find`] for the panic conditions.
416	///
417	/// Use [`Regex::try_search`] with
418	/// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to
419	/// handle these error conditions.
420	///
421	/// # Example
422	///
423	/// ```
424	/// use regex_automata::{Match, dfa::regex::Regex};
425	///
426	/// let re = Regex::new("foo[0-9]+")?;
427	/// let text = "foo1 foo12 foo123";
428	/// let matches: Vec<Match> = re.find_iter(text).collect();
429	/// assert_eq!(matches, vec![
430	/// Match::must(`0`, `0`..`4`),
431	/// Match::must(`0`, `5`..`10`),
432	/// Match::must(`0`, `11`..`17`),
433	/// ]);
434	/// # Ok::<(), Box<dyn std::error::Error>>(())
435	/// ```
436	#[inline]
437	pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
438	&'r self,
439	input: I,
440	) -> FindMatches<'r, 'h, A> {
441	let it = iter::Searcher::new(input.into());
442	FindMatches { re: self, it }
443	}
444	}
445
446	/// Lower level fallible search routines that permit controlling where the
447	/// search starts and ends in a particular sequence.
448	impl<A: Automaton> Regex<A> {
449	/// Returns the start and end offset of the leftmost match. If no match
450	/// exists, then `None` is returned.
451	///
452	/// This is like [`Regex::find`] but with two differences:
453	///
454	/// 1. It is not generic over `Into<Input>` and instead accepts a
455	/// `&Input`. This permits reusing the same `Input` for multiple searches
456	/// without needing to create a new one. This _may_ help with latency.
457	/// 2. It returns an error if the search could not complete where as
458	/// [`Regex::find`] will panic.
459	///
460	/// # Errors
461	///
462	/// This routine errors if the search could not complete. This can occur
463	/// in the following circumstances:
464	///
465	/// The configuration of the DFA may permit it to "quit" the search.*
466	/// For example, setting quit bytes or enabling heuristic support for
467	/// Unicode word boundaries. The default configuration does not enable any
468	/// option that could result in the DFA quitting.
469	/// When the provided `Input` configuration is not supported. For*
470	/// example, by providing an unsupported anchor mode.
471	///
472	/// When a search returns an error, callers cannot know whether a match
473	/// exists or not.
474	#[inline]
475	pub fn try_search(
476	&self,
477	input: &Input<'_>,
478	) -> Result<Option<Match>, MatchError> {
479	let (fwd, rev) = (self.forward(), self.reverse());
480	let end = match fwd.try_search_fwd(input)? {
481	None => return Ok(None),
482	Some(end) => end,
483	};
484	// This special cases an empty match at the beginning of the search. If
485	// our end matches our start, then since a reverse DFA can't match past
486	// the start, it must follow that our starting position is also our end
487	// position. So short circuit and skip the reverse search.
488	if input.start() == end.offset() {
489	return Ok(Some(Match::new(
490	end.pattern(),
491	end.offset()..end.offset(),
492	)));
493	}
494	// We can also skip the reverse search if we know our search was
495	// anchored. This occurs either when the input config is anchored or
496	// when we know the regex itself is anchored. In this case, we know the
497	// start of the match, if one is found, must be the start of the
498	// search.
499	if self.is_anchored(input) {
500	return Ok(Some(Match::new(
501	end.pattern(),
502	input.start()..end.offset(),
503	)));
504	}
505	// N.B. I have tentatively convinced myself that it isn't necessary
506	// to specify the specific pattern for the reverse search since the
507	// reverse search will always find the same pattern to match as the
508	// forward search. But I lack a rigorous proof. Why not just provide
509	// the pattern anyway? Well, if it is needed, then leaving it out
510	// gives us a chance to find a witness. (Also, if we don't need to
511	// specify the pattern, then we don't need to build the reverse DFA
512	// with 'starts_for_each_pattern' enabled.)
513	//
514	// We also need to be careful to disable 'earliest' for the reverse
515	// search, since it could be enabled for the forward search. In the
516	// reverse case, to satisfy "leftmost" criteria, we need to match
517	// as much as we can. We also need to be careful to make the search
518	// anchored. We don't want the reverse search to report any matches
519	// other than the one beginning at the end of our forward search.
520	let revsearch = input
521	.clone()
522	.span(input.start()..end.offset())
523	.anchored(Anchored::Yes)
524	.earliest(`false`);
525	let start = rev
526	.try_search_rev(&revsearch)?
527	.expect("reverse search must match if forward search does");
528	assert_eq!(
529	start.pattern(),
530	end.pattern(),
531	"forward and reverse search must match same pattern",
532	);
533	assert!(start.offset() <= end.offset());
534	Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
535	}
536
537	/// Returns true if either the given input specifies an anchored search
538	/// or if the underlying DFA is always anchored.
539	fn is_anchored(&self, input: &Input<'_>) -> bool {
540	match input.get_anchored() {
541	Anchored::No => self.forward().is_always_start_anchored(),
542	Anchored::Yes \| Anchored::Pattern(_) => `true`,
543	}
544	}
545	}
546
547	/// Non-search APIs for querying information about the regex and setting a
548	/// prefilter.
549	impl<A: Automaton> Regex<A> {
550	/// Return the underlying DFA responsible for forward matching.
551	///
552	/// This is useful for accessing the underlying DFA and converting it to
553	/// some other format or size. See the [`Builder::build_from_dfas`] docs
554	/// for an example of where this might be useful.
555	pub fn forward(&self) -> &A {
556	&self.forward
557	}
558
559	/// Return the underlying DFA responsible for reverse matching.
560	///
561	/// This is useful for accessing the underlying DFA and converting it to
562	/// some other format or size. See the [`Builder::build_from_dfas`] docs
563	/// for an example of where this might be useful.
564	pub fn reverse(&self) -> &A {
565	&self.reverse
566	}
567
568	/// Returns the total number of patterns matched by this regex.
569	///
570	/// # Example
571	///
572	/// ```
573	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
574	/// use regex_automata::dfa::regex::Regex;
575	///
576	/// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
577	/// assert_eq!(`3`, re.pattern_len());
578	/// # Ok::<(), Box<dyn std::error::Error>>(())
579	/// ```
580	pub fn pattern_len(&self) -> usize {
581	assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
582	self.forward().pattern_len()
583	}
584	}
585
586	/// An iterator over all non-overlapping matches for an infallible search.
587	///
588	/// The iterator yields a [`Match`] value until no more matches could be found.
589	/// If the underlying regex engine returns an error, then a panic occurs.
590	///
591	/// The type parameters are as follows:
592	///
593	/// `A` represents the type of the underlying DFA that implements the*
594	/// [`Automaton`] trait.
595	///
596	/// The lifetime parameters are as follows:
597	///
598	/// `'h` represents the lifetime of the haystack being searched.*
599	/// `'r` represents the lifetime of the regex object itself.*
600	///
601	/// This iterator can be created with the [`Regex::find_iter`] method.
602	#[derive(Debug)]
603	pub struct FindMatches<'r, 'h, A> {
604	re: &'r Regex<A>,
605	it: iter::Searcher<'h>,
606	}
607
608	impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {
609	type Item = Match;
610
611	#[inline]
612	fn next(&mut self) -> Option<Match> {
613	let FindMatches { re, ref mut it } = *self;
614	it.advance(\|input\| re.try_search(input))
615	}
616	}
617
618	/// A builder for a regex based on deterministic finite automatons.
619	///
620	/// This builder permits configuring options for the syntax of a pattern, the
621	/// NFA construction, the DFA construction and finally the regex searching
622	/// itself. This builder is different from a general purpose regex builder in
623	/// that it permits fine grain configuration of the construction process. The
624	/// trade off for this is complexity, and the possibility of setting a
625	/// configuration that might not make sense. For example, there are two
626	/// different UTF-8 modes:
627	///
628	/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
629	/// whether the pattern itself can contain sub-expressions that match invalid
630	/// UTF-8.
631	/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls
632	/// how the regex iterators themselves advance the starting position of the
633	/// next search when a match with zero length is found.
634	///
635	/// Generally speaking, callers will want to either enable all of these or
636	/// disable all of these.
637	///
638	/// Internally, building a regex requires building two DFAs, where one is
639	/// responsible for finding the end of a match and the other is responsible
640	/// for finding the start of a match. If you only need to detect whether
641	/// something matched, or only the end of a match, then you should use a
642	/// [`dense::Builder`] to construct a single DFA, which is cheaper than
643	/// building two DFAs.
644	///
645	/// # Build methods
646	///
647	/// This builder has a few "build" methods. In general, it's the result of
648	/// combining the following parameters:
649	///
650	/// Building one or many regexes.*
651	/// Building a regex with dense or sparse DFAs.*
652	///
653	/// The simplest "build" method is [`Builder::build`]. It accepts a single
654	/// pattern and builds a dense DFA using `usize` for the state identifier
655	/// representation.
656	///
657	/// The most general "build" method is [`Builder::build_many`], which permits
658	/// building a regex that searches for multiple patterns simultaneously while
659	/// using a specific state identifier representation.
660	///
661	/// The most flexible "build" method, but hardest to use, is
662	/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
663	/// just a pair of DFAs, and this method allows you to specify those DFAs
664	/// exactly.
665	///
666	/// # Example
667	///
668	/// This example shows how to disable UTF-8 mode in the syntax and the regex
669	/// itself. This is generally what you want for matching on arbitrary bytes.
670	///
671	/// ```
672	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
673	/// use regex_automata::{
674	/// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
675	/// };
676	///
677	/// let re = Regex::builder()
678	/// .syntax(syntax::Config::new().utf8(`false`))
679	/// .thompson(thompson::Config::new().utf8(`false`))
680	/// .build(r"foo(?-u:[^b])ar.*")?;
681	/// let haystack = b"`\xFE`foo`\xFF`arzz`\xE2\x98\xFF\n`";
682	/// let expected = Some(Match::must(`0`, `1`..`9`));
683	/// let got = re.find(haystack);
684	/// assert_eq!(expected, got);
685	/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
686	/// // but the subsequent `.` does not! Disabling UTF-8*
687	/// // on the syntax permits this.
688	/// assert_eq!(b"foo`\xFF`arzz", &haystack[got.unwrap().range()]);
689	///
690	/// # Ok::<(), Box<dyn std::error::Error>>(())
691	/// ```
692	#[derive(Clone, Debug)]
693	pub struct Builder {
694	#[cfg(feature = "dfa-build")]
695	dfa: dense::Builder,
696	}
697
698	impl Builder {
699	/// Create a new regex builder with the default configuration.
700	pub fn new() -> Builder {
701	Builder {
702	#[cfg(feature = "dfa-build")]
703	dfa: dense::Builder::new(),
704	}
705	}
706
707	/// Build a regex from the given pattern.
708	///
709	/// If there was a problem parsing or compiling the pattern, then an error
710	/// is returned.
711	#[cfg(all(feature = "syntax", feature = "dfa-build"))]
712	pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
713	self.build_many(&[pattern])
714	}
715
716	/// Build a regex from the given pattern using sparse DFAs.
717	///
718	/// If there was a problem parsing or compiling the pattern, then an error
719	/// is returned.
720	#[cfg(all(feature = "syntax", feature = "dfa-build"))]
721	pub fn build_sparse(
722	&self,
723	pattern: &str,
724	) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
725	self.build_many_sparse(&[pattern])
726	}
727
728	/// Build a regex from the given patterns.
729	#[cfg(all(feature = "syntax", feature = "dfa-build"))]
730	pub fn build_many<P: AsRef<str>>(
731	&self,
732	patterns: &[P],
733	) -> Result<Regex, BuildError> {
734	let forward = self.dfa.build_many(patterns)?;
735	let reverse = self
736	.dfa
737	.clone()
738	.configure(
739	dense::Config::new()
740	.prefilter(None)
741	.specialize_start_states(`false`)
742	.start_kind(StartKind::Anchored)
743	.match_kind(MatchKind::All),
744	)
745	.thompson(crate::nfa::thompson::Config::new().reverse(`true`))
746	.build_many(patterns)?;
747	Ok(self.build_from_dfas(forward, reverse))
748	}
749
750	/// Build a sparse regex from the given patterns.
751	#[cfg(all(feature = "syntax", feature = "dfa-build"))]
752	pub fn build_many_sparse<P: AsRef<str>>(
753	&self,
754	patterns: &[P],
755	) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
756	let re = self.build_many(patterns)?;
757	let forward = re.forward().to_sparse()?;
758	let reverse = re.reverse().to_sparse()?;
759	Ok(self.build_from_dfas(forward, reverse))
760	}
761
762	/// Build a regex from its component forward and reverse DFAs.
763	///
764	/// This is useful when deserializing a regex from some arbitrary
765	/// memory region. This is also useful for building regexes from other
766	/// types of DFAs.
767	///
768	/// If you're building the DFAs from scratch instead of building new DFAs
769	/// from other DFAs, then you'll need to make sure that the reverse DFA is
770	/// configured correctly to match the intended semantics. Namely:
771	///
772	/// It should be anchored.*
773	/// It should use* [`MatchKind::All`] semantics.
774	/// It should match in reverse.*
775	/// Otherwise, its configuration should match the forward DFA.*
776	///
777	/// If these conditions aren't satisfied, then the behavior of searches is
778	/// unspecified.
779	///
780	/// Note that when using this constructor, no configuration is applied.
781	/// Since this routine provides the DFAs to the builder, there is no
782	/// opportunity to apply other configuration options.
783	///
784	/// # Example
785	///
786	/// This example is a bit a contrived. The usual use of these methods
787	/// would involve serializing `initial_re` somewhere and then deserializing
788	/// it later to build a regex. But in this case, we do everything in
789	/// memory.
790	///
791	/// ```
792	/// use regex_automata::dfa::regex::Regex;
793	///
794	/// let initial_re = Regex::new("foo[0-9]+")?;
795	/// assert_eq!(`true`, initial_re.is_match(b"foo123"));
796	///
797	/// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
798	/// let re = Regex::builder().build_from_dfas(fwd, rev);
799	/// assert_eq!(`true`, re.is_match(b"foo123"));
800	/// # Ok::<(), Box<dyn std::error::Error>>(())
801	/// ```
802	///
803	/// This example shows how to build a `Regex` that uses sparse DFAs instead
804	/// of dense DFAs without using one of the convenience `build_sparse`
805	/// routines:
806	///
807	/// ```
808	/// use regex_automata::dfa::regex::Regex;
809	///
810	/// let initial_re = Regex::new("foo[0-9]+")?;
811	/// assert_eq!(`true`, initial_re.is_match(b"foo123"));
812	///
813	/// let fwd = initial_re.forward().to_sparse()?;
814	/// let rev = initial_re.reverse().to_sparse()?;
815	/// let re = Regex::builder().build_from_dfas(fwd, rev);
816	/// assert_eq!(`true`, re.is_match(b"foo123"));
817	/// # Ok::<(), Box<dyn std::error::Error>>(())
818	/// ```
819	pub fn build_from_dfas<A: Automaton>(
820	&self,
821	forward: A,
822	reverse: A,
823	) -> Regex<A> {
824	Regex { forward, reverse }
825	}
826
827	/// Set the syntax configuration for this builder using
828	/// [`syntax::Config`](crate::util::syntax::Config).
829	///
830	/// This permits setting things like case insensitivity, Unicode and multi
831	/// line mode.
832	#[cfg(all(feature = "syntax", feature = "dfa-build"))]
833	pub fn syntax(
834	&mut self,
835	config: crate::util::syntax::Config,
836	) -> &mut Builder {
837	self.dfa.syntax(config);
838	self
839	}
840
841	/// Set the Thompson NFA configuration for this builder using
842	/// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
843	///
844	/// This permits setting things like whether additional time should be
845	/// spent shrinking the size of the NFA.
846	#[cfg(all(feature = "syntax", feature = "dfa-build"))]
847	pub fn thompson(
848	&mut self,
849	config: crate::nfa::thompson::Config,
850	) -> &mut Builder {
851	self.dfa.thompson(config);
852	self
853	}
854
855	/// Set the dense DFA compilation configuration for this builder using
856	/// [`dense::Config`].
857	///
858	/// This permits setting things like whether the underlying DFAs should
859	/// be minimized.
860	#[cfg(feature = "dfa-build")]
861	pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
862	self.dfa.configure(config);
863	self
864	}
865	}
866
867	impl Default for Builder {
868	fn default() -> Builder {
869	Builder::new()
870	}
871	}
872