regex.rs source code [crates/regex-automata-0.4.6/src/hybrid/regex.rs]

1	/!*
2	A lazy DFA backed `Regex`.
3
4	This module provides a [`Regex`] backed by a lazy DFA. A `Regex` implements
5	convenience routines you might have come to expect, such as finding a match
6	and iterating over all non-overlapping matches. This `Regex` type is limited
7	in its capabilities to what a lazy DFA can provide. Therefore, APIs involving
8	capturing groups, for example, are not provided.
9
10	Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
11	finds the end offset of a match, where as the other is a "reverse" DFA that
12	find the start offset of a match.
13
14	See the [parent module](crate::hybrid) for examples.
15	*/
16
17	use crate::{
18	hybrid::{
19	dfa::{self, DFA},
20	error::BuildError,
21	},
22	nfa::thompson,
23	util::{
24	iter,
25	search::{Anchored, Input, Match, MatchError, MatchKind},
26	},
27	};
28
29	/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs")
30	/// for searching.
31	///
32	/// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a
33	/// "reverse" DFA. The forward DFA is responsible for detecting the end of
34	/// a match while the reverse DFA is responsible for detecting the start
35	/// of a match. Thus, in order to find the bounds of any given match, a
36	/// forward search must first be run followed by a reverse search. A match
37	/// found by the forward DFA guarantees that the reverse DFA will also find
38	/// a match.
39	///
40	/// # Fallibility
41	///
42	/// Most of the search routines defined on this type will _panic_ when the
43	/// underlying search fails. This might be because the DFA gave up because it
44	/// saw a quit byte, whether configured explicitly or via heuristic Unicode
45	/// word boundary support, although neither are enabled by default. It might
46	/// also fail if the underlying DFA determines it isn't making effective use of
47	/// the cache (which also never happens by default). Or it might fail because
48	/// an invalid `Input` configuration is given, for example, with an unsupported
49	/// [`Anchored`] mode.
50	///
51	/// If you need to handle these error cases instead of allowing them to trigger
52	/// a panic, then the lower level [`Regex::try_search`] provides a fallible API
53	/// that never panics.
54	///
55	/// # Example
56	///
57	/// This example shows how to cause a search to terminate if it sees a
58	/// `\n` byte, and handle the error returned. This could be useful if, for
59	/// example, you wanted to prevent a user supplied pattern from matching
60	/// across a line boundary.
61	///
62	/// ```
63	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
64	/// use regex_automata::{hybrid::{dfa, regex::Regex}, Input, MatchError};
65	///
66	/// let re = Regex::builder()
67	/// .dfa(dfa::Config::new().quit(b'`\n`', `true`))
68	/// .build(r"foo\p{any}+bar")?;
69	/// let mut cache = re.create_cache();
70	///
71	/// let input = Input::new("foo`\n`bar");
72	/// // Normally this would produce a match, since \p{any} contains '\n'.
73	/// // But since we instructed the automaton to enter a quit state if a
74	/// // '\n' is observed, this produces a match error instead.
75	/// let expected = MatchError::quit(b'`\n`', `3`);
76	/// let got = re.try_search(&mut cache, &input).unwrap_err();
77	/// assert_eq!(expected, got);
78	///
79	/// # Ok::<(), Box<dyn std::error::Error>>(())
80	/// ```
81	#[derive(Debug)]
82	pub struct Regex {
83	/// The forward lazy DFA. This can only find the end of a match.
84	forward: DFA,
85	/// The reverse lazy DFA. This can only find the start of a match.
86	///
87	/// This is built with 'all' match semantics (instead of leftmost-first)
88	/// so that it always finds the longest possible match (which corresponds
89	/// to the leftmost starting position). It is also compiled as an anchored
90	/// matcher and has 'starts_for_each_pattern' enabled. Including starting
91	/// states for each pattern is necessary to ensure that we only look for
92	/// matches of a pattern that matched in the forward direction. Otherwise,
93	/// we might wind up finding the "leftmost" starting position of a totally
94	/// different pattern!
95	reverse: DFA,
96	}
97
98	/// Convenience routines for regex and cache construction.
99	impl Regex {
100	/// Parse the given regular expression using the default configuration and
101	/// return the corresponding regex.
102	///
103	/// If you want a non-default configuration, then use the [`Builder`] to
104	/// set your own configuration.
105	///
106	/// # Example
107	///
108	/// ```
109	/// use regex_automata::{hybrid::regex::Regex, Match};
110	///
111	/// let re = Regex::new("foo[0-9]+bar")?;
112	/// let mut cache = re.create_cache();
113	/// assert_eq!(
114	/// Some(Match::must(`0`, `3`..`14`)),
115	/// re.find(&mut cache, "zzzfoo12345barzzz"),
116	/// );
117	/// # Ok::<(), Box<dyn std::error::Error>>(())
118	/// ```
119	#[cfg(feature = "syntax")]
120	pub fn new(pattern: &str) -> Result<Regex, BuildError> {
121	Regex::builder().build(pattern)
122	}
123
124	/// Like `new`, but parses multiple patterns into a single "multi regex."
125	/// This similarly uses the default regex configuration.
126	///
127	/// # Example
128	///
129	/// ```
130	/// use regex_automata::{hybrid::regex::Regex, Match};
131	///
132	/// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
133	/// let mut cache = re.create_cache();
134	///
135	/// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");
136	/// assert_eq!(Some(Match::must(`0`, `0`..`3`)), it.next());
137	/// assert_eq!(Some(Match::must(`1`, `4`..`5`)), it.next());
138	/// assert_eq!(Some(Match::must(`0`, `6`..`9`)), it.next());
139	/// assert_eq!(Some(Match::must(`1`, `10`..`14`)), it.next());
140	/// assert_eq!(Some(Match::must(`1`, `15`..`16`)), it.next());
141	/// assert_eq!(Some(Match::must(`0`, `17`..`21`)), it.next());
142	/// assert_eq!(None, it.next());
143	/// # Ok::<(), Box<dyn std::error::Error>>(())
144	/// ```
145	#[cfg(feature = "syntax")]
146	pub fn new_many<P: AsRef<str>>(
147	patterns: &[P],
148	) -> Result<Regex, BuildError> {
149	Regex::builder().build_many(patterns)
150	}
151
152	/// Return a builder for configuring the construction of a `Regex`.
153	///
154	/// This is a convenience routine to avoid needing to import the
155	/// [`Builder`] type in common cases.
156	///
157	/// # Example
158	///
159	/// This example shows how to use the builder to disable UTF-8 mode
160	/// everywhere.
161	///
162	/// ```
163	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
164	/// use regex_automata::{
165	/// hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
166	/// };
167	///
168	/// let re = Regex::builder()
169	/// .syntax(syntax::Config::new().utf8(`false`))
170	/// .thompson(thompson::Config::new().utf8(`false`))
171	/// .build(r"foo(?-u:[^b])ar.*")?;
172	/// let mut cache = re.create_cache();
173	///
174	/// let haystack = b"`\xFE`foo`\xFF`arzz`\xE2\x98\xFF\n`";
175	/// let expected = Some(Match::must(`0`, `1`..`9`));
176	/// let got = re.find(&mut cache, haystack);
177	/// assert_eq!(expected, got);
178	///
179	/// # Ok::<(), Box<dyn std::error::Error>>(())
180	/// ```
181	pub fn builder() -> Builder {
182	Builder::new()
183	}
184
185	/// Create a new cache for this `Regex`.
186	///
187	/// The cache returned should only be used for searches for this
188	/// `Regex`. If you want to reuse the cache for another `Regex`, then
189	/// you must call [`Cache::reset`] with that `Regex` (or, equivalently,
190	/// [`Regex::reset_cache`]).
191	pub fn create_cache(&self) -> Cache {
192	Cache::new(self)
193	}
194
195	/// Reset the given cache such that it can be used for searching with the
196	/// this `Regex` (and only this `Regex`).
197	///
198	/// A cache reset permits reusing memory already allocated in this cache
199	/// with a different `Regex`.
200	///
201	/// Resetting a cache sets its "clear count" to 0. This is relevant if the
202	/// `Regex` has been configured to "give up" after it has cleared the cache
203	/// a certain number of times.
204	///
205	/// # Example
206	///
207	/// This shows how to re-purpose a cache for use with a different `Regex`.
208	///
209	/// ```
210	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
211	/// use regex_automata::{hybrid::regex::Regex, Match};
212	///
213	/// let re1 = Regex::new(r"\w")?;
214	/// let re2 = Regex::new(r"\W")?;
215	///
216	/// let mut cache = re1.create_cache();
217	/// assert_eq!(
218	/// Some(Match::must(`0`, `0`..`2`)),
219	/// re1.find(&mut cache, "Δ"),
220	/// );
221	///
222	/// // Using 'cache' with re2 is not allowed. It may result in panics or
223	/// // incorrect results. In order to re-purpose the cache, we must reset
224	/// // it with the Regex we'd like to use it with.
225	/// //
226	/// // Similarly, after this reset, using the cache with 're1' is also not
227	/// // allowed.
228	/// re2.reset_cache(&mut cache);
229	/// assert_eq!(
230	/// Some(Match::must(`0`, `0`..`3`)),
231	/// re2.find(&mut cache, "☃"),
232	/// );
233	///
234	/// # Ok::<(), Box<dyn std::error::Error>>(())
235	/// ```
236	pub fn reset_cache(&self, cache: &mut Cache) {
237	self.forward().reset_cache(&mut cache.forward);
238	self.reverse().reset_cache(&mut cache.reverse);
239	}
240	}
241
242	/// Standard infallible search routines for finding and iterating over matches.
243	impl Regex {
244	/// Returns true if and only if this regex matches the given haystack.
245	///
246	/// This routine may short circuit if it knows that scanning future input
247	/// will never lead to a different result. In particular, if the underlying
248	/// DFA enters a match state or a dead state, then this routine will return
249	/// `true` or `false`, respectively, without inspecting any future input.
250	///
251	/// # Panics
252	///
253	/// This routine panics if the search could not complete. This can occur
254	/// in a number of circumstances:
255	///
256	/// The configuration of the lazy DFA may permit it to "quit" the search.*
257	/// For example, setting quit bytes or enabling heuristic support for
258	/// Unicode word boundaries. The default configuration does not enable any
259	/// option that could result in the lazy DFA quitting.
260	/// The configuration of the lazy DFA may also permit it to "give up"*
261	/// on a search if it makes ineffective use of its transition table
262	/// cache. The default configuration does not enable this by default,
263	/// although it is typically a good idea to.
264	/// When the provided `Input` configuration is not supported. For*
265	/// example, by providing an unsupported anchor mode.
266	///
267	/// When a search panics, callers cannot know whether a match exists or
268	/// not.
269	///
270	/// Use [`Regex::try_search`] if you want to handle these error conditions.
271	///
272	/// # Example
273	///
274	/// ```
275	/// use regex_automata::hybrid::regex::Regex;
276	///
277	/// let re = Regex::new("foo[0-9]+bar")?;
278	/// let mut cache = re.create_cache();
279	///
280	/// assert!(re.is_match(&mut cache, "foo12345bar"));
281	/// assert!(!re.is_match(&mut cache, "foobar"));
282	/// # Ok::<(), Box<dyn std::error::Error>>(())
283	/// ```
284	#[inline]
285	pub fn is_match<'h, I: Into<Input<'h>>>(
286	&self,
287	cache: &mut Cache,
288	input: I,
289	) -> bool {
290	// Not only can we do an "earliest" search, but we can avoid doing a
291	// reverse scan too.
292	self.forward()
293	.try_search_fwd(&mut cache.forward, &input.into().earliest(`true`))
294	.unwrap()
295	.is_some()
296	}
297
298	/// Returns the start and end offset of the leftmost match. If no match
299	/// exists, then `None` is returned.
300	///
301	/// # Panics
302	///
303	/// This routine panics if the search could not complete. This can occur
304	/// in a number of circumstances:
305	///
306	/// The configuration of the lazy DFA may permit it to "quit" the search.*
307	/// For example, setting quit bytes or enabling heuristic support for
308	/// Unicode word boundaries. The default configuration does not enable any
309	/// option that could result in the lazy DFA quitting.
310	/// The configuration of the lazy DFA may also permit it to "give up"*
311	/// on a search if it makes ineffective use of its transition table
312	/// cache. The default configuration does not enable this by default,
313	/// although it is typically a good idea to.
314	/// When the provided `Input` configuration is not supported. For*
315	/// example, by providing an unsupported anchor mode.
316	///
317	/// When a search panics, callers cannot know whether a match exists or
318	/// not.
319	///
320	/// Use [`Regex::try_search`] if you want to handle these error conditions.
321	///
322	/// # Example
323	///
324	/// ```
325	/// use regex_automata::{Match, hybrid::regex::Regex};
326	///
327	/// let re = Regex::new("foo[0-9]+")?;
328	/// let mut cache = re.create_cache();
329	/// assert_eq!(
330	/// Some(Match::must(`0`, `3`..`11`)),
331	/// re.find(&mut cache, "zzzfoo12345zzz"),
332	/// );
333	///
334	/// // Even though a match is found after reading the first byte (`a`),
335	/// // the default leftmost-first match semantics demand that we find the
336	/// // earliest match that prefers earlier parts of the pattern over latter
337	/// // parts.
338	/// let re = Regex::new("abc\|a")?;
339	/// let mut cache = re.create_cache();
340	/// assert_eq!(Some(Match::must(`0`, `0`..`3`)), re.find(&mut cache, "abc"));
341	/// # Ok::<(), Box<dyn std::error::Error>>(())
342	/// ```
343	#[inline]
344	pub fn find<'h, I: Into<Input<'h>>>(
345	&self,
346	cache: &mut Cache,
347	input: I,
348	) -> Option<Match> {
349	self.try_search(cache, &input.into()).unwrap()
350	}
351
352	/// Returns an iterator over all non-overlapping leftmost matches in the
353	/// given bytes. If no match exists, then the iterator yields no elements.
354	///
355	/// # Panics
356	///
357	/// This routine panics if the search could not complete. This can occur
358	/// in a number of circumstances:
359	///
360	/// The configuration of the lazy DFA may permit it to "quit" the search.*
361	/// For example, setting quit bytes or enabling heuristic support for
362	/// Unicode word boundaries. The default configuration does not enable any
363	/// option that could result in the lazy DFA quitting.
364	/// The configuration of the lazy DFA may also permit it to "give up"*
365	/// on a search if it makes ineffective use of its transition table
366	/// cache. The default configuration does not enable this by default,
367	/// although it is typically a good idea to.
368	/// When the provided `Input` configuration is not supported. For*
369	/// example, by providing an unsupported anchor mode.
370	///
371	/// When a search panics, callers cannot know whether a match exists or
372	/// not.
373	///
374	/// The above conditions also apply to the iterator returned as well. For
375	/// example, if the lazy DFA gives up or quits during a search using this
376	/// method, then a panic will occur during iteration.
377	///
378	/// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher)
379	/// if you want to handle these error conditions.
380	///
381	/// # Example
382	///
383	/// ```
384	/// use regex_automata::{hybrid::regex::Regex, Match};
385	///
386	/// let re = Regex::new("foo[0-9]+")?;
387	/// let mut cache = re.create_cache();
388	///
389	/// let text = "foo1 foo12 foo123";
390	/// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
391	/// assert_eq!(matches, vec![
392	/// Match::must(`0`, `0`..`4`),
393	/// Match::must(`0`, `5`..`10`),
394	/// Match::must(`0`, `11`..`17`),
395	/// ]);
396	/// # Ok::<(), Box<dyn std::error::Error>>(())
397	/// ```
398	#[inline]
399	pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
400	&'r self,
401	cache: &'c mut Cache,
402	input: I,
403	) -> FindMatches<'r, 'c, 'h> {
404	let it = iter::Searcher::new(input.into());
405	FindMatches { re: self, cache, it }
406	}
407	}
408
409	/// Lower level "search" primitives that accept a `&Input` for cheap reuse
410	/// and return an error if one occurs instead of panicking.
411	impl Regex {
412	/// Returns the start and end offset of the leftmost match. If no match
413	/// exists, then `None` is returned.
414	///
415	/// This is like [`Regex::find`] but with two differences:
416	///
417	/// 1. It is not generic over `Into<Input>` and instead accepts a
418	/// `&Input`. This permits reusing the same `Input` for multiple searches
419	/// without needing to create a new one. This _may_ help with latency.
420	/// 2. It returns an error if the search could not complete where as
421	/// [`Regex::find`] will panic.
422	///
423	/// # Errors
424	///
425	/// This routine errors if the search could not complete. This can occur
426	/// in a number of circumstances:
427	///
428	/// The configuration of the lazy DFA may permit it to "quit" the search.*
429	/// For example, setting quit bytes or enabling heuristic support for
430	/// Unicode word boundaries. The default configuration does not enable any
431	/// option that could result in the lazy DFA quitting.
432	/// The configuration of the lazy DFA may also permit it to "give up"*
433	/// on a search if it makes ineffective use of its transition table
434	/// cache. The default configuration does not enable this by default,
435	/// although it is typically a good idea to.
436	/// When the provided `Input` configuration is not supported. For*
437	/// example, by providing an unsupported anchor mode.
438	///
439	/// When a search returns an error, callers cannot know whether a match
440	/// exists or not.
441	#[inline]
442	pub fn try_search(
443	&self,
444	cache: &mut Cache,
445	input: &Input<'_>,
446	) -> Result<Option<Match>, MatchError> {
447	let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
448	let end = match self.forward().try_search_fwd(fcache, input)? {
449	None => return Ok(None),
450	Some(end) => end,
451	};
452	// This special cases an empty match at the beginning of the search. If
453	// our end matches our start, then since a reverse DFA can't match past
454	// the start, it must follow that our starting position is also our end
455	// position. So short circuit and skip the reverse search.
456	if input.start() == end.offset() {
457	return Ok(Some(Match::new(
458	end.pattern(),
459	end.offset()..end.offset(),
460	)));
461	}
462	// We can also skip the reverse search if we know our search was
463	// anchored. This occurs either when the input config is anchored or
464	// when we know the regex itself is anchored. In this case, we know the
465	// start of the match, if one is found, must be the start of the
466	// search.
467	if self.is_anchored(input) {
468	return Ok(Some(Match::new(
469	end.pattern(),
470	input.start()..end.offset(),
471	)));
472	}
473	// N.B. I have tentatively convinced myself that it isn't necessary
474	// to specify the specific pattern for the reverse search since the
475	// reverse search will always find the same pattern to match as the
476	// forward search. But I lack a rigorous proof. Why not just provide
477	// the pattern anyway? Well, if it is needed, then leaving it out
478	// gives us a chance to find a witness. (Also, if we don't need to
479	// specify the pattern, then we don't need to build the reverse DFA
480	// with 'starts_for_each_pattern' enabled. It doesn't matter too much
481	// for the lazy DFA, but does make the overall DFA bigger.)
482	//
483	// We also need to be careful to disable 'earliest' for the reverse
484	// search, since it could be enabled for the forward search. In the
485	// reverse case, to satisfy "leftmost" criteria, we need to match as
486	// much as we can. We also need to be careful to make the search
487	// anchored. We don't want the reverse search to report any matches
488	// other than the one beginning at the end of our forward search.
489	let revsearch = input
490	.clone()
491	.span(input.start()..end.offset())
492	.anchored(Anchored::Yes)
493	.earliest(`false`);
494	let start = self
495	.reverse()
496	.try_search_rev(rcache, &revsearch)?
497	.expect("reverse search must match if forward search does");
498	debug_assert_eq!(
499	start.pattern(),
500	end.pattern(),
501	"forward and reverse search must match same pattern",
502	);
503	debug_assert!(start.offset() <= end.offset());
504	Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
505	}
506
507	/// Returns true if either the given input specifies an anchored search
508	/// or if the underlying NFA is always anchored.
509	fn is_anchored(&self, input: &Input<'_>) -> bool {
510	match input.get_anchored() {
511	Anchored::No => {
512	self.forward().get_nfa().is_always_start_anchored()
513	}
514	Anchored::Yes \| Anchored::Pattern(_) => `true`,
515	}
516	}
517	}
518
519	/// Non-search APIs for querying information about the regex and setting a
520	/// prefilter.
521	impl Regex {
522	/// Return the underlying lazy DFA responsible for forward matching.
523	///
524	/// This is useful for accessing the underlying lazy DFA and using it
525	/// directly if the situation calls for it.
526	pub fn forward(&self) -> &DFA {
527	&self.forward
528	}
529
530	/// Return the underlying lazy DFA responsible for reverse matching.
531	///
532	/// This is useful for accessing the underlying lazy DFA and using it
533	/// directly if the situation calls for it.
534	pub fn reverse(&self) -> &DFA {
535	&self.reverse
536	}
537
538	/// Returns the total number of patterns matched by this regex.
539	///
540	/// # Example
541	///
542	/// ```
543	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
544	/// use regex_automata::hybrid::regex::Regex;
545	///
546	/// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
547	/// assert_eq!(`3`, re.pattern_len());
548	/// # Ok::<(), Box<dyn std::error::Error>>(())
549	/// ```
550	pub fn pattern_len(&self) -> usize {
551	assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
552	self.forward().pattern_len()
553	}
554	}
555
556	/// An iterator over all non-overlapping matches for an infallible search.
557	///
558	/// The iterator yields a [`Match`] value until no more matches could be found.
559	/// If the underlying regex engine returns an error, then a panic occurs.
560	///
561	/// The lifetime parameters are as follows:
562	///
563	/// `'r` represents the lifetime of the regex object.*
564	/// `'h` represents the lifetime of the haystack being searched.*
565	/// `'c` represents the lifetime of the regex cache.*
566	///
567	/// This iterator can be created with the [`Regex::find_iter`] method.
568	#[derive(Debug)]
569	pub struct FindMatches<'r, 'c, 'h> {
570	re: &'r Regex,
571	cache: &'c mut Cache,
572	it: iter::Searcher<'h>,
573	}
574
575	impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
576	type Item = Match;
577
578	#[inline]
579	fn next(&mut self) -> Option<Match> {
580	let FindMatches { re: &Regex, ref mut cache: &mut &mut Cache, ref mut it: &mut Searcher<'_> } = *self;
581	it.advance(\|input: &Input<'_>\| re.try_search(cache, input))
582	}
583	}
584
585	/// A cache represents a partially computed forward and reverse DFA.
586	///
587	/// A cache is the key component that differentiates a classical DFA and a
588	/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
589	/// complete transition table that can handle all possible inputs, a hybrid
590	/// NFA/DFA starts with an empty transition table and builds only the parts
591	/// required during search. The parts that are built are stored in a cache. For
592	/// this reason, a cache is a required parameter for nearly every operation on
593	/// a [`Regex`].
594	///
595	/// Caches can be created from their corresponding `Regex` via
596	/// [`Regex::create_cache`]. A cache can only be used with either the `Regex`
597	/// that created it, or the `Regex` that was most recently used to reset it
598	/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in
599	/// panics or incorrect results.
600	#[derive(Debug, Clone)]
601	pub struct Cache {
602	forward: dfa::Cache,
603	reverse: dfa::Cache,
604	}
605
606	impl Cache {
607	/// Create a new cache for the given `Regex`.
608	///
609	/// The cache returned should only be used for searches for the given
610	/// `Regex`. If you want to reuse the cache for another `Regex`, then you
611	/// must call [`Cache::reset`] with that `Regex`.
612	pub fn new(re: &Regex) -> Cache {
613	let forward = dfa::Cache::new(re.forward());
614	let reverse = dfa::Cache::new(re.reverse());
615	Cache { forward, reverse }
616	}
617
618	/// Reset this cache such that it can be used for searching with the given
619	/// `Regex` (and only that `Regex`).
620	///
621	/// A cache reset permits reusing memory already allocated in this cache
622	/// with a different `Regex`.
623	///
624	/// Resetting a cache sets its "clear count" to 0. This is relevant if the
625	/// `Regex` has been configured to "give up" after it has cleared the cache
626	/// a certain number of times.
627	///
628	/// # Example
629	///
630	/// This shows how to re-purpose a cache for use with a different `Regex`.
631	///
632	/// ```
633	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
634	/// use regex_automata::{hybrid::regex::Regex, Match};
635	///
636	/// let re1 = Regex::new(r"\w")?;
637	/// let re2 = Regex::new(r"\W")?;
638	///
639	/// let mut cache = re1.create_cache();
640	/// assert_eq!(
641	/// Some(Match::must(`0`, `0`..`2`)),
642	/// re1.find(&mut cache, "Δ"),
643	/// );
644	///
645	/// // Using 'cache' with re2 is not allowed. It may result in panics or
646	/// // incorrect results. In order to re-purpose the cache, we must reset
647	/// // it with the Regex we'd like to use it with.
648	/// //
649	/// // Similarly, after this reset, using the cache with 're1' is also not
650	/// // allowed.
651	/// cache.reset(&re2);
652	/// assert_eq!(
653	/// Some(Match::must(`0`, `0`..`3`)),
654	/// re2.find(&mut cache, "☃"),
655	/// );
656	///
657	/// # Ok::<(), Box<dyn std::error::Error>>(())
658	/// ```
659	pub fn reset(&mut self, re: &Regex) {
660	self.forward.reset(re.forward());
661	self.reverse.reset(re.reverse());
662	}
663
664	/// Return a reference to the forward cache.
665	pub fn forward(&mut self) -> &dfa::Cache {
666	&self.forward
667	}
668
669	/// Return a reference to the reverse cache.
670	pub fn reverse(&mut self) -> &dfa::Cache {
671	&self.reverse
672	}
673
674	/// Return a mutable reference to the forward cache.
675	///
676	/// If you need mutable references to both the forward and reverse caches,
677	/// then use [`Cache::as_parts_mut`].
678	pub fn forward_mut(&mut self) -> &mut dfa::Cache {
679	&mut self.forward
680	}
681
682	/// Return a mutable reference to the reverse cache.
683	///
684	/// If you need mutable references to both the forward and reverse caches,
685	/// then use [`Cache::as_parts_mut`].
686	pub fn reverse_mut(&mut self) -> &mut dfa::Cache {
687	&mut self.reverse
688	}
689
690	/// Return references to the forward and reverse caches, respectively.
691	pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) {
692	(&self.forward, &self.reverse)
693	}
694
695	/// Return mutable references to the forward and reverse caches,
696	/// respectively.
697	pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) {
698	(&mut self.forward, &mut self.reverse)
699	}
700
701	/// Returns the heap memory usage, in bytes, as a sum of the forward and
702	/// reverse lazy DFA caches.
703	///
704	/// This does not* include the stack size used up by this cache. To*
705	/// compute that, use `std::mem::size_of::<Cache>()`.
706	pub fn memory_usage(&self) -> usize {
707	self.forward.memory_usage() + self.reverse.memory_usage()
708	}
709	}
710
711	/// A builder for a regex based on a hybrid NFA/DFA.
712	///
713	/// This builder permits configuring options for the syntax of a pattern, the
714	/// NFA construction, the lazy DFA construction and finally the regex searching
715	/// itself. This builder is different from a general purpose regex builder
716	/// in that it permits fine grain configuration of the construction process.
717	/// The trade off for this is complexity, and the possibility of setting a
718	/// configuration that might not make sense. For example, there are two
719	/// different UTF-8 modes:
720	///
721	/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
722	/// whether the pattern itself can contain sub-expressions that match invalid
723	/// UTF-8.
724	/// * [`thompson::Config::utf8`] controls how the regex iterators themselves
725	/// advance the starting position of the next search when a match with zero
726	/// length is found.
727	///
728	/// Generally speaking, callers will want to either enable all of these or
729	/// disable all of these.
730	///
731	/// Internally, building a regex requires building two hybrid NFA/DFAs,
732	/// where one is responsible for finding the end of a match and the other is
733	/// responsible for finding the start of a match. If you only need to detect
734	/// whether something matched, or only the end of a match, then you should use
735	/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper
736	/// than building two of them.
737	///
738	/// # Example
739	///
740	/// This example shows how to disable UTF-8 mode in the syntax and the regex
741	/// itself. This is generally what you want for matching on arbitrary bytes.
742	///
743	/// ```
744	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
745	/// use regex_automata::{
746	/// hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
747	/// };
748	///
749	/// let re = Regex::builder()
750	/// .syntax(syntax::Config::new().utf8(`false`))
751	/// .thompson(thompson::Config::new().utf8(`false`))
752	/// .build(r"foo(?-u:[^b])ar.*")?;
753	/// let mut cache = re.create_cache();
754	///
755	/// let haystack = b"`\xFE`foo`\xFF`arzz`\xE2\x98\xFF\n`";
756	/// let expected = Some(Match::must(`0`, `1`..`9`));
757	/// let got = re.find(&mut cache, haystack);
758	/// assert_eq!(expected, got);
759	/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
760	/// // but the subsequent `.` does not! Disabling UTF-8*
761	/// // on the syntax permits this.
762	/// assert_eq!(b"foo`\xFF`arzz", &haystack[got.unwrap().range()]);
763	///
764	/// # Ok::<(), Box<dyn std::error::Error>>(())
765	/// ```
766	#[derive(Clone, Debug)]
767	pub struct Builder {
768	dfa: dfa::Builder,
769	}
770
771	impl Builder {
772	/// Create a new regex builder with the default configuration.
773	pub fn new() -> Builder {
774	Builder { dfa: DFA::builder() }
775	}
776
777	/// Build a regex from the given pattern.
778	///
779	/// If there was a problem parsing or compiling the pattern, then an error
780	/// is returned.
781	#[cfg(feature = "syntax")]
782	pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
783	self.build_many(&[pattern])
784	}
785
786	/// Build a regex from the given patterns.
787	#[cfg(feature = "syntax")]
788	pub fn build_many<P: AsRef<str>>(
789	&self,
790	patterns: &[P],
791	) -> Result<Regex, BuildError> {
792	let forward = self.dfa.build_many(patterns)?;
793	let reverse = self
794	.dfa
795	.clone()
796	.configure(
797	DFA::config()
798	.prefilter(None)
799	.specialize_start_states(`false`)
800	.match_kind(MatchKind::All),
801	)
802	.thompson(thompson::Config::new().reverse(`true`))
803	.build_many(patterns)?;
804	Ok(self.build_from_dfas(forward, reverse))
805	}
806
807	/// Build a regex from its component forward and reverse hybrid NFA/DFAs.
808	///
809	/// This is useful when you've built a forward and reverse lazy DFA
810	/// separately, and want to combine them into a single regex. Once build,
811	/// the individual DFAs given can still be accessed via [`Regex::forward`]
812	/// and [`Regex::reverse`].
813	///
814	/// It is important that the reverse lazy DFA be compiled under the
815	/// following conditions:
816	///
817	/// It should use* [`MatchKind::All`] semantics.
818	/// It should match in reverse.*
819	/// Otherwise, its configuration should match the forward DFA.*
820	///
821	/// If these conditions aren't satisfied, then the behavior of searches is
822	/// unspecified.
823	///
824	/// Note that when using this constructor, no configuration is applied.
825	/// Since this routine provides the DFAs to the builder, there is no
826	/// opportunity to apply other configuration options.
827	///
828	/// # Example
829	///
830	/// This shows how to build individual lazy forward and reverse DFAs, and
831	/// then combine them into a single `Regex`.
832	///
833	/// ```
834	/// use regex_automata::{
835	/// hybrid::{dfa::DFA, regex::Regex},
836	/// nfa::thompson,
837	/// MatchKind,
838	/// };
839	///
840	/// let fwd = DFA::new(r"foo[0-9]+")?;
841	/// let rev = DFA::builder()
842	/// .configure(DFA::config().match_kind(MatchKind::All))
843	/// .thompson(thompson::Config::new().reverse(`true`))
844	/// .build(r"foo[0-9]+")?;
845	///
846	/// let re = Regex::builder().build_from_dfas(fwd, rev);
847	/// let mut cache = re.create_cache();
848	/// assert_eq!(`true`, re.is_match(&mut cache, "foo123"));
849	/// # Ok::<(), Box<dyn std::error::Error>>(())
850	/// ```
851	pub fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex {
852	Regex { forward, reverse }
853	}
854
855	/// Set the syntax configuration for this builder using
856	/// [`syntax::Config`](crate::util::syntax::Config).
857	///
858	/// This permits setting things like case insensitivity, Unicode and multi
859	/// line mode.
860	#[cfg(feature = "syntax")]
861	pub fn syntax(
862	&mut self,
863	config: crate::util::syntax::Config,
864	) -> &mut Builder {
865	self.dfa.syntax(config);
866	self
867	}
868
869	/// Set the Thompson NFA configuration for this builder using
870	/// [`nfa::thompson::Config`](thompson::Config).
871	///
872	/// This permits setting things like whether additional time should be
873	/// spent shrinking the size of the NFA.
874	#[cfg(feature = "syntax")]
875	pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
876	self.dfa.thompson(config);
877	self
878	}
879
880	/// Set the lazy DFA compilation configuration for this builder using
881	/// [`dfa::Config`].
882	///
883	/// This permits setting things like whether Unicode word boundaries should
884	/// be heuristically supported or settings how the behavior of the cache.
885	pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder {
886	self.dfa.configure(config);
887	self
888	}
889	}
890
891	impl Default for Builder {
892	fn default() -> Builder {
893	Builder::new()
894	}
895	}
896

Provided by KDAB

Definitions