dfa.rs source code [crates/regex-automata-0.4.6/src/hybrid/dfa.rs]

1	/!*
2	Types and routines specific to lazy DFAs.
3
4	This module is the home of [`hybrid::dfa::DFA`](DFA).
5
6	This module also contains a [`hybrid::dfa::Builder`](Builder) and a
7	[`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA.
8	*/
9
10	use core::{iter, mem::size_of};
11
12	use alloc::vec::Vec;
13
14	use crate::{
15	hybrid::{
16	error::{BuildError, CacheError, StartError},
17	id::{LazyStateID, LazyStateIDError},
18	search,
19	},
20	nfa::thompson,
21	util::{
22	alphabet::{self, ByteClasses, ByteSet},
23	determinize::{self, State, StateBuilderEmpty, StateBuilderNFA},
24	empty,
25	prefilter::Prefilter,
26	primitives::{PatternID, StateID as NFAStateID},
27	search::{
28	Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet,
29	},
30	sparse_set::SparseSets,
31	start::{self, Start, StartByteMap},
32	},
33	};
34
35	/// The minimum number of states that a lazy DFA's cache size must support.
36	///
37	/// This is checked at time of construction to ensure that at least some small
38	/// number of states can fit in the given capacity allotment. If we can't fit
39	/// at least this number of states, then the thinking is that it's pretty
40	/// senseless to use the lazy DFA. More to the point, parts of the code do
41	/// assume that the cache can fit at least some small number of states.
42	const MIN_STATES: usize = SENTINEL_STATES + `2`;
43
44	/// The number of "sentinel" states that get added to every lazy DFA.
45	///
46	/// These are special states indicating status conditions of a search: unknown,
47	/// dead and quit. These states in particular also use zero NFA states, so
48	/// their memory usage is quite small. This is relevant for computing the
49	/// minimum memory needed for a lazy DFA cache.
50	const SENTINEL_STATES: usize = `3`;
51
52	/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching.
53	///
54	/// A lazy DFA is a DFA that builds itself at search time. It otherwise has
55	/// very similar characteristics as a [`dense::DFA`](crate::dfa::dense::DFA).
56	/// Indeed, both support precisely the same regex features with precisely the
57	/// same semantics.
58	///
59	/// Where as a `dense::DFA` must be completely built to handle any input before
60	/// it may be used for search, a lazy DFA starts off effectively empty. During
61	/// a search, a lazy DFA will build itself depending on whether it has already
62	/// computed the next transition or not. If it has, then it looks a lot like
63	/// a `dense::DFA` internally: it does a very fast table based access to find
64	/// the next transition. Otherwise, if the state hasn't been computed, then it
65	/// does determinization _for that specific transition_ to compute the next DFA
66	/// state.
67	///
68	/// The main selling point of a lazy DFA is that, in practice, it has
69	/// the performance profile of a `dense::DFA` without the weakness of it
70	/// taking worst case exponential time to build. Indeed, for each byte of
71	/// input, the lazy DFA will construct as most one new DFA state. Thus, a
72	/// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~
73	/// pattern.len()` and `n ~ haystack.len()`).
74	///
75	/// The main downsides of a lazy DFA are:
76	///
77	/// 1. It requires mutable "cache" space during search. This is where the
78	/// transition table, among other things, is stored.
79	/// 2. In pathological cases (e.g., if the cache is too small), it will run
80	/// out of room and either require a bigger cache capacity or will repeatedly
81	/// clear the cache and thus repeatedly regenerate DFA states. Overall, this
82	/// will tend to be slower than a typical NFA simulation.
83	///
84	/// # Capabilities
85	///
86	/// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following
87	/// operations:
88	///
89	/// 1. Detection of a match.
90	/// 2. Location of the end of a match.
91	/// 3. In the case of a lazy DFA with multiple patterns, which pattern matched
92	/// is reported as well.
93	///
94	/// A notable absence from the above list of capabilities is the location of
95	/// the start* of a match. In order to provide both the start and end of*
96	/// a match, two* lazy DFAs are required. This functionality is provided by a*
97	/// [`Regex`](crate::hybrid::regex::Regex).
98	///
99	/// # Example
100	///
101	/// This shows how to build a lazy DFA with the default configuration and
102	/// execute a search. Notice how, in contrast to a `dense::DFA`, we must create
103	/// a cache and pass it to our search routine.
104	///
105	/// ```
106	/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
107	///
108	/// let dfa = DFA::new("foo[0-9]+")?;
109	/// let mut cache = dfa.create_cache();
110	///
111	/// let expected = Some(HalfMatch::must(`0`, `8`));
112	/// assert_eq!(expected, dfa.try_search_fwd(
113	/// &mut cache, &Input::new("foo12345"))?,
114	/// );
115	/// # Ok::<(), Box<dyn std::error::Error>>(())
116	/// ```
117	#[derive(Clone, Debug)]
118	pub struct DFA {
119	config: Config,
120	nfa: thompson::NFA,
121	stride2: usize,
122	start_map: StartByteMap,
123	classes: ByteClasses,
124	quitset: ByteSet,
125	cache_capacity: usize,
126	}
127
128	impl DFA {
129	/// Parse the given regular expression using a default configuration and
130	/// return the corresponding lazy DFA.
131	///
132	/// If you want a non-default configuration, then use the [`Builder`] to
133	/// set your own configuration.
134	///
135	/// # Example
136	///
137	/// ```
138	/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
139	///
140	/// let dfa = DFA::new("foo[0-9]+bar")?;
141	/// let mut cache = dfa.create_cache();
142	///
143	/// let expected = HalfMatch::must(`0`, `11`);
144	/// assert_eq!(
145	/// Some(expected),
146	/// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
147	/// );
148	/// # Ok::<(), Box<dyn std::error::Error>>(())
149	/// ```
150	#[cfg(feature = "syntax")]
151	pub fn new(pattern: &str) -> Result<DFA, BuildError> {
152	DFA::builder().build(pattern)
153	}
154
155	/// Parse the given regular expressions using a default configuration and
156	/// return the corresponding lazy multi-DFA.
157	///
158	/// If you want a non-default configuration, then use the [`Builder`] to
159	/// set your own configuration.
160	///
161	/// # Example
162	///
163	/// ```
164	/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
165	///
166	/// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?;
167	/// let mut cache = dfa.create_cache();
168	///
169	/// let expected = HalfMatch::must(`1`, `3`);
170	/// assert_eq!(
171	/// Some(expected),
172	/// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
173	/// );
174	/// # Ok::<(), Box<dyn std::error::Error>>(())
175	/// ```
176	#[cfg(feature = "syntax")]
177	pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
178	DFA::builder().build_many(patterns)
179	}
180
181	/// Create a new lazy DFA that matches every input.
182	///
183	/// # Example
184	///
185	/// ```
186	/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
187	///
188	/// let dfa = DFA::always_match()?;
189	/// let mut cache = dfa.create_cache();
190	///
191	/// let expected = HalfMatch::must(`0`, `0`);
192	/// assert_eq!(Some(expected), dfa.try_search_fwd(
193	/// &mut cache, &Input::new(""))?,
194	/// );
195	/// assert_eq!(Some(expected), dfa.try_search_fwd(
196	/// &mut cache, &Input::new("foo"))?,
197	/// );
198	/// # Ok::<(), Box<dyn std::error::Error>>(())
199	/// ```
200	pub fn always_match() -> Result<DFA, BuildError> {
201	let nfa = thompson::NFA::always_match();
202	Builder::new().build_from_nfa(nfa)
203	}
204
205	/// Create a new lazy DFA that never matches any input.
206	///
207	/// # Example
208	///
209	/// ```
210	/// use regex_automata::{hybrid::dfa::DFA, Input};
211	///
212	/// let dfa = DFA::never_match()?;
213	/// let mut cache = dfa.create_cache();
214	///
215	/// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new(""))?);
216	/// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new("foo"))?);
217	/// # Ok::<(), Box<dyn std::error::Error>>(())
218	/// ```
219	pub fn never_match() -> Result<DFA, BuildError> {
220	let nfa = thompson::NFA::never_match();
221	Builder::new().build_from_nfa(nfa)
222	}
223
224	/// Return a default configuration for a `DFA`.
225	///
226	/// This is a convenience routine to avoid needing to import the [`Config`]
227	/// type when customizing the construction of a lazy DFA.
228	///
229	/// # Example
230	///
231	/// This example shows how to build a lazy DFA that heuristically supports
232	/// Unicode word boundaries.
233	///
234	/// ```
235	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
236	/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError, Input};
237	///
238	/// let re = DFA::builder()
239	/// .configure(DFA::config().unicode_word_boundary(`true`))
240	/// .build(r"\b\w+\b")?;
241	/// let mut cache = re.create_cache();
242	///
243	/// // Since our haystack is all ASCII, the DFA search sees then and knows
244	/// // it is legal to interpret Unicode word boundaries as ASCII word
245	/// // boundaries.
246	/// let input = Input::new("!!foo!!");
247	/// let expected = HalfMatch::must(`0`, `5`);
248	/// assert_eq!(Some(expected), re.try_search_fwd(&mut cache, &input)?);
249	///
250	/// // But if our haystack contains non-ASCII, then the search will fail
251	/// // with an error.
252	/// let input = Input::new("!!βββ!!");
253	/// let expected = MatchError::quit(b'`\xCE`', `2`);
254	/// assert_eq!(Err(expected), re.try_search_fwd(&mut cache, &input));
255	///
256	/// # Ok::<(), Box<dyn std::error::Error>>(())
257	/// ```
258	pub fn config() -> Config {
259	Config::new()
260	}
261
262	/// Return a builder for configuring the construction of a `Regex`.
263	///
264	/// This is a convenience routine to avoid needing to import the
265	/// [`Builder`] type in common cases.
266	///
267	/// # Example
268	///
269	/// This example shows how to use the builder to disable UTF-8 mode
270	/// everywhere for lazy DFAs.
271	///
272	/// ```
273	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
274	/// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input};
275	///
276	/// let re = DFA::builder()
277	/// .syntax(syntax::Config::new().utf8(`false`))
278	/// .build(r"foo(?-u:[^b])ar.*")?;
279	/// let mut cache = re.create_cache();
280	///
281	/// let input = Input::new(b"`\xFE`foo`\xFF`arzz`\xE2\x98\xFF\n`");
282	/// let expected = Some(HalfMatch::must(`0`, `9`));
283	/// let got = re.try_search_fwd(&mut cache, &input)?;
284	/// assert_eq!(expected, got);
285	///
286	/// # Ok::<(), Box<dyn std::error::Error>>(())
287	/// ```
288	pub fn builder() -> Builder {
289	Builder::new()
290	}
291
292	/// Create a new cache for this lazy DFA.
293	///
294	/// The cache returned should only be used for searches for this
295	/// lazy DFA. If you want to reuse the cache for another DFA, then
296	/// you must call [`Cache::reset`] with that DFA (or, equivalently,
297	/// [`DFA::reset_cache`]).
298	pub fn create_cache(&self) -> Cache {
299	Cache::new(self)
300	}
301
302	/// Reset the given cache such that it can be used for searching with the
303	/// this lazy DFA (and only this DFA).
304	///
305	/// A cache reset permits reusing memory already allocated in this cache
306	/// with a different lazy DFA.
307	///
308	/// Resetting a cache sets its "clear count" to 0. This is relevant if the
309	/// lazy DFA has been configured to "give up" after it has cleared the
310	/// cache a certain number of times.
311	///
312	/// Any lazy state ID generated by the cache prior to resetting it is
313	/// invalid after the reset.
314	///
315	/// # Example
316	///
317	/// This shows how to re-purpose a cache for use with a different DFA.
318	///
319	/// ```
320	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
321	/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
322	///
323	/// let dfa1 = DFA::new(r"\w")?;
324	/// let dfa2 = DFA::new(r"\W")?;
325	///
326	/// let mut cache = dfa1.create_cache();
327	/// assert_eq!(
328	/// Some(HalfMatch::must(`0`, `2`)),
329	/// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
330	/// );
331	///
332	/// // Using 'cache' with dfa2 is not allowed. It may result in panics or
333	/// // incorrect results. In order to re-purpose the cache, we must reset
334	/// // it with the DFA we'd like to use it with.
335	/// //
336	/// // Similarly, after this reset, using the cache with 'dfa1' is also not
337	/// // allowed.
338	/// dfa2.reset_cache(&mut cache);
339	/// assert_eq!(
340	/// Some(HalfMatch::must(`0`, `3`)),
341	/// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
342	/// );
343	///
344	/// # Ok::<(), Box<dyn std::error::Error>>(())
345	/// ```
346	pub fn reset_cache(&self, cache: &mut Cache) {
347	Lazy::new(self, cache).reset_cache()
348	}
349
350	/// Returns the total number of patterns compiled into this lazy DFA.
351	///
352	/// In the case of a DFA that contains no patterns, this returns `0`.
353	///
354	/// # Example
355	///
356	/// This example shows the pattern length for a DFA that never matches:
357	///
358	/// ```
359	/// use regex_automata::hybrid::dfa::DFA;
360	///
361	/// let dfa = DFA::never_match()?;
362	/// assert_eq!(dfa.pattern_len(), `0`);
363	/// # Ok::<(), Box<dyn std::error::Error>>(())
364	/// ```
365	///
366	/// And another example for a DFA that matches at every position:
367	///
368	/// ```
369	/// use regex_automata::hybrid::dfa::DFA;
370	///
371	/// let dfa = DFA::always_match()?;
372	/// assert_eq!(dfa.pattern_len(), `1`);
373	/// # Ok::<(), Box<dyn std::error::Error>>(())
374	/// ```
375	///
376	/// And finally, a DFA that was constructed from multiple patterns:
377	///
378	/// ```
379	/// use regex_automata::hybrid::dfa::DFA;
380	///
381	/// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
382	/// assert_eq!(dfa.pattern_len(), `3`);
383	/// # Ok::<(), Box<dyn std::error::Error>>(())
384	/// ```
385	pub fn pattern_len(&self) -> usize {
386	self.nfa.pattern_len()
387	}
388
389	/// Returns the equivalence classes that make up the alphabet for this DFA.
390	///
391	/// Unless [`Config::byte_classes`] was disabled, it is possible that
392	/// multiple distinct bytes are grouped into the same equivalence class
393	/// if it is impossible for them to discriminate between a match and a
394	/// non-match. This has the effect of reducing the overall alphabet size
395	/// and in turn potentially substantially reducing the size of the DFA's
396	/// transition table.
397	///
398	/// The downside of using equivalence classes like this is that every state
399	/// transition will automatically use this map to convert an arbitrary
400	/// byte to its corresponding equivalence class. In practice this has a
401	/// negligible impact on performance.
402	pub fn byte_classes(&self) -> &ByteClasses {
403	&self.classes
404	}
405
406	/// Returns this lazy DFA's configuration.
407	pub fn get_config(&self) -> &Config {
408	&self.config
409	}
410
411	/// Returns a reference to the underlying NFA.
412	pub fn get_nfa(&self) -> &thompson::NFA {
413	&self.nfa
414	}
415
416	/// Returns the stride, as a base-2 exponent, required for these
417	/// equivalence classes.
418	///
419	/// The stride is always the smallest power of 2 that is greater than or
420	/// equal to the alphabet length. This is done so that converting between
421	/// state IDs and indices can be done with shifts alone, which is much
422	/// faster than integer division.
423	fn stride2(&self) -> usize {
424	self.stride2
425	}
426
427	/// Returns the total stride for every state in this lazy DFA. This
428	/// corresponds to the total number of transitions used by each state in
429	/// this DFA's transition table.
430	fn stride(&self) -> usize {
431	`1` << self.stride2()
432	}
433
434	/// Returns the memory usage, in bytes, of this lazy DFA.
435	///
436	/// This does not* include the stack size used up by this lazy DFA. To*
437	/// compute that, use `std::mem::size_of::<DFA>()`. This also does not
438	/// include the size of the `Cache` used.
439	///
440	/// This also does not include any heap memory used by the NFA inside of
441	/// this hybrid NFA/DFA. This is because the NFA's ownership is shared, and
442	/// thus not owned by this hybrid NFA/DFA. More practically, several regex
443	/// engines in this crate embed an NFA, and reporting the NFA's memory
444	/// usage in all of them would likely result in reporting higher heap
445	/// memory than is actually used.
446	pub fn memory_usage(&self) -> usize {
447	// The only thing that uses heap memory in a DFA is the NFA. But the
448	// NFA has shared ownership, so reporting its memory as part of the
449	// hybrid DFA is likely to lead to double-counting the NFA memory
450	// somehow. In particular, this DFA does not really own an NFA, so
451	// including it in the DFA's memory usage doesn't seem semantically
452	// correct.
453	`0`
454	}
455	}
456
457	impl DFA {
458	/// Executes a forward search and returns the end position of the leftmost
459	/// match that is found. If no match exists, then `None` is returned.
460	///
461	/// In particular, this method continues searching even after it enters
462	/// a match state. The search only terminates once it has reached the
463	/// end of the input or when it has entered a dead or quit state. Upon
464	/// termination, the position of the last byte seen while still in a match
465	/// state is returned.
466	///
467	/// # Errors
468	///
469	/// This routine errors if the search could not complete. This can occur
470	/// in a number of circumstances:
471	///
472	/// The configuration of the lazy DFA may permit it to "quit" the search.*
473	/// For example, setting quit bytes or enabling heuristic support for
474	/// Unicode word boundaries. The default configuration does not enable any
475	/// option that could result in the lazy DFA quitting.
476	/// The configuration of the lazy DFA may also permit it to "give up"*
477	/// on a search if it makes ineffective use of its transition table
478	/// cache. The default configuration does not enable this by default,
479	/// although it is typically a good idea to.
480	/// When the provided `Input` configuration is not supported. For*
481	/// example, by providing an unsupported anchor mode.
482	///
483	/// When a search returns an error, callers cannot know whether a match
484	/// exists or not.
485	///
486	/// # Example
487	///
488	/// This example shows how to run a basic search.
489	///
490	/// ```
491	/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
492	///
493	/// let dfa = DFA::new("foo[0-9]+")?;
494	/// let mut cache = dfa.create_cache();
495	/// let expected = HalfMatch::must(`0`, `8`);
496	/// assert_eq!(Some(expected), dfa.try_search_fwd(
497	/// &mut cache, &Input::new("foo12345"))?,
498	/// );
499	///
500	/// // Even though a match is found after reading the first byte (`a`),
501	/// // the leftmost first match semantics demand that we find the earliest
502	/// // match that prefers earlier parts of the pattern over later parts.
503	/// let dfa = DFA::new("abc\|a")?;
504	/// let mut cache = dfa.create_cache();
505	/// let expected = HalfMatch::must(`0`, `3`);
506	/// assert_eq!(Some(expected), dfa.try_search_fwd(
507	/// &mut cache, &Input::new("abc"))?,
508	/// );
509	///
510	/// # Ok::<(), Box<dyn std::error::Error>>(())
511	/// ```
512	///
513	/// # Example: specific pattern search
514	///
515	/// This example shows how to build a lazy multi-DFA that permits searching
516	/// for specific patterns.
517	///
518	/// ```
519	/// use regex_automata::{
520	/// hybrid::dfa::DFA,
521	/// Anchored, HalfMatch, PatternID, Input,
522	/// };
523	///
524	/// let dfa = DFA::builder()
525	/// .configure(DFA::config().starts_for_each_pattern(`true`))
526	/// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
527	/// let mut cache = dfa.create_cache();
528	/// let haystack = "foo123";
529	///
530	/// // Since we are using the default leftmost-first match and both
531	/// // patterns match at the same starting position, only the first pattern
532	/// // will be returned in this case when doing a search for any of the
533	/// // patterns.
534	/// let expected = Some(HalfMatch::must(`0`, `6`));
535	/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
536	/// assert_eq!(expected, got);
537	///
538	/// // But if we want to check whether some other pattern matches, then we
539	/// // can provide its pattern ID.
540	/// let expected = Some(HalfMatch::must(`1`, `6`));
541	/// let input = Input::new(haystack)
542	/// .anchored(Anchored::Pattern(PatternID::must(`1`)));
543	/// let got = dfa.try_search_fwd(&mut cache, &input)?;
544	/// assert_eq!(expected, got);
545	///
546	/// # Ok::<(), Box<dyn std::error::Error>>(())
547	/// ```
548	///
549	/// # Example: specifying the bounds of a search
550	///
551	/// This example shows how providing the bounds of a search can produce
552	/// different results than simply sub-slicing the haystack.
553	///
554	/// ```
555	/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
556	///
557	/// // N.B. We disable Unicode here so that we use a simple ASCII word
558	/// // boundary. Alternatively, we could enable heuristic support for
559	/// // Unicode word boundaries since our haystack is pure ASCII.
560	/// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?;
561	/// let mut cache = dfa.create_cache();
562	/// let haystack = "foo123bar";
563	///
564	/// // Since we sub-slice the haystack, the search doesn't know about the
565	/// // larger context and assumes that `123` is surrounded by word
566	/// // boundaries. And of course, the match position is reported relative
567	/// // to the sub-slice as well, which means we get `3` instead of `6`.
568	/// let expected = Some(HalfMatch::must(`0`, `3`));
569	/// let got = dfa.try_search_fwd(
570	/// &mut cache,
571	/// &Input::new(&haystack[`3`..`6`]),
572	/// )?;
573	/// assert_eq!(expected, got);
574	///
575	/// // But if we provide the bounds of the search within the context of the
576	/// // entire haystack, then the search can take the surrounding context
577	/// // into account. (And if we did find a match, it would be reported
578	/// // as a valid offset into `haystack` instead of its sub-slice.)
579	/// let expected = None;
580	/// let got = dfa.try_search_fwd(
581	/// &mut cache,
582	/// &Input::new(haystack).range(`3`..`6`),
583	/// )?;
584	/// assert_eq!(expected, got);
585	///
586	/// # Ok::<(), Box<dyn std::error::Error>>(())
587	/// ```
588	#[inline]
589	pub fn try_search_fwd(
590	&self,
591	cache: &mut Cache,
592	input: &Input<'_>,
593	) -> Result<Option<HalfMatch>, MatchError> {
594	let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
595	let hm = match search::find_fwd(self, cache, input)? {
596	None => return Ok(None),
597	Some(hm) if !utf8empty => return Ok(Some(hm)),
598	Some(hm) => hm,
599	};
600	// We get to this point when we know our DFA can match the empty string
601	// AND when UTF-8 mode is enabled. In this case, we skip any matches
602	// whose offset splits a codepoint. Such a match is necessarily a
603	// zero-width match, because UTF-8 mode requires the underlying NFA
604	// to be built such that all non-empty matches span valid UTF-8.
605	// Therefore, any match that ends in the middle of a codepoint cannot
606	// be part of a span of valid UTF-8 and thus must be an empty match.
607	// In such cases, we skip it, so as not to report matches that split a
608	// codepoint.
609	//
610	// Note that this is not a checked assumption. Callers can* provide an*
611	// NFA with UTF-8 mode enabled but produces non-empty matches that span
612	// invalid UTF-8. But doing so is documented to result in unspecified
613	// behavior.
614	empty::skip_splits_fwd(input, hm, hm.offset(), \|input\| {
615	let got = search::find_fwd(self, cache, input)?;
616	Ok(got.map(\|hm\| (hm, hm.offset())))
617	})
618	}
619
620	/// Executes a reverse search and returns the start of the position of the
621	/// leftmost match that is found. If no match exists, then `None` is
622	/// returned.
623	///
624	/// # Errors
625	///
626	/// This routine errors if the search could not complete. This can occur
627	/// in a number of circumstances:
628	///
629	/// The configuration of the lazy DFA may permit it to "quit" the search.*
630	/// For example, setting quit bytes or enabling heuristic support for
631	/// Unicode word boundaries. The default configuration does not enable any
632	/// option that could result in the lazy DFA quitting.
633	/// The configuration of the lazy DFA may also permit it to "give up"*
634	/// on a search if it makes ineffective use of its transition table
635	/// cache. The default configuration does not enable this by default,
636	/// although it is typically a good idea to.
637	/// When the provided `Input` configuration is not supported. For*
638	/// example, by providing an unsupported anchor mode.
639	///
640	/// When a search returns an error, callers cannot know whether a match
641	/// exists or not.
642	///
643	/// # Example
644	///
645	/// This routine is principally useful when used in
646	/// conjunction with the
647	/// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
648	/// configuration. In general, it's unlikely to be correct to use both
649	/// `try_search_fwd` and `try_search_rev` with the same DFA since any
650	/// particular DFA will only support searching in one direction with
651	/// respect to the pattern.
652	///
653	/// ```
654	/// use regex_automata::{
655	/// nfa::thompson,
656	/// hybrid::dfa::DFA,
657	/// HalfMatch, Input,
658	/// };
659	///
660	/// let dfa = DFA::builder()
661	/// .thompson(thompson::Config::new().reverse(`true`))
662	/// .build("foo[0-9]+")?;
663	/// let mut cache = dfa.create_cache();
664	/// let expected = HalfMatch::must(`0`, `0`);
665	/// assert_eq!(
666	/// Some(expected),
667	/// dfa.try_search_rev(&mut cache, &Input::new("foo12345"))?,
668	/// );
669	///
670	/// // Even though a match is found after reading the last byte (`c`),
671	/// // the leftmost first match semantics demand that we find the earliest
672	/// // match that prefers earlier parts of the pattern over latter parts.
673	/// let dfa = DFA::builder()
674	/// .thompson(thompson::Config::new().reverse(`true`))
675	/// .build("abc\|c")?;
676	/// let mut cache = dfa.create_cache();
677	/// let expected = HalfMatch::must(`0`, `0`);
678	/// assert_eq!(Some(expected), dfa.try_search_rev(
679	/// &mut cache, &Input::new("abc"))?,
680	/// );
681	///
682	/// # Ok::<(), Box<dyn std::error::Error>>(())
683	/// ```
684	///
685	/// # Example: UTF-8 mode
686	///
687	/// This examples demonstrates that UTF-8 mode applies to reverse
688	/// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
689	/// matches reported must correspond to valid UTF-8 spans. This includes
690	/// prohibiting zero-width matches that split a codepoint.
691	///
692	/// UTF-8 mode is enabled by default. Notice below how the only zero-width
693	/// matches reported are those at UTF-8 boundaries:
694	///
695	/// ```
696	/// use regex_automata::{
697	/// hybrid::dfa::DFA,
698	/// nfa::thompson,
699	/// HalfMatch, Input, MatchKind,
700	/// };
701	///
702	/// let dfa = DFA::builder()
703	/// .thompson(thompson::Config::new().reverse(`true`))
704	/// .build(r"")?;
705	/// let mut cache = dfa.create_cache();
706	///
707	/// // Run the reverse DFA to collect all matches.
708	/// let mut input = Input::new("☃");
709	/// let mut matches = vec![];
710	/// loop {
711	/// match dfa.try_search_rev(&mut cache, &input)? {
712	/// None => break,
713	/// Some(hm) => {
714	/// matches.push(hm);
715	/// if hm.offset() == `0` \|\| input.end() == `0` {
716	/// break;
717	/// } else if hm.offset() < input.end() {
718	/// input.set_end(hm.offset());
719	/// } else {
720	/// // This is only necessary to handle zero-width
721	/// // matches, which of course occur in this example.
722	/// // Without this, the search would never advance
723	/// // backwards beyond the initial match.
724	/// input.set_end(input.end() - `1`);
725	/// }
726	/// }
727	/// }
728	/// }
729	///
730	/// // No matches split a codepoint.
731	/// let expected = vec![
732	/// HalfMatch::must(`0`, `3`),
733	/// HalfMatch::must(`0`, `0`),
734	/// ];
735	/// assert_eq!(expected, matches);
736	///
737	/// # Ok::<(), Box<dyn std::error::Error>>(())
738	/// ```
739	///
740	/// Now let's look at the same example, but with UTF-8 mode on the
741	/// underlying NFA disabled:
742	///
743	/// ```
744	/// use regex_automata::{
745	/// hybrid::dfa::DFA,
746	/// nfa::thompson,
747	/// HalfMatch, Input, MatchKind,
748	/// };
749	///
750	/// let dfa = DFA::builder()
751	/// .thompson(thompson::Config::new().reverse(`true`).utf8(`false`))
752	/// .build(r"")?;
753	/// let mut cache = dfa.create_cache();
754	///
755	/// // Run the reverse DFA to collect all matches.
756	/// let mut input = Input::new("☃");
757	/// let mut matches = vec![];
758	/// loop {
759	/// match dfa.try_search_rev(&mut cache, &input)? {
760	/// None => break,
761	/// Some(hm) => {
762	/// matches.push(hm);
763	/// if hm.offset() == `0` \|\| input.end() == `0` {
764	/// break;
765	/// } else if hm.offset() < input.end() {
766	/// input.set_end(hm.offset());
767	/// } else {
768	/// // This is only necessary to handle zero-width
769	/// // matches, which of course occur in this example.
770	/// // Without this, the search would never advance
771	/// // backwards beyond the initial match.
772	/// input.set_end(input.end() - `1`);
773	/// }
774	/// }
775	/// }
776	/// }
777	///
778	/// // No matches split a codepoint.
779	/// let expected = vec![
780	/// HalfMatch::must(`0`, `3`),
781	/// HalfMatch::must(`0`, `2`),
782	/// HalfMatch::must(`0`, `1`),
783	/// HalfMatch::must(`0`, `0`),
784	/// ];
785	/// assert_eq!(expected, matches);
786	///
787	/// # Ok::<(), Box<dyn std::error::Error>>(())
788	/// ```
789	#[inline]
790	pub fn try_search_rev(
791	&self,
792	cache: &mut Cache,
793	input: &Input<'_>,
794	) -> Result<Option<HalfMatch>, MatchError> {
795	let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
796	let hm = match search::find_rev(self, cache, input)? {
797	None => return Ok(None),
798	Some(hm) if !utf8empty => return Ok(Some(hm)),
799	Some(hm) => hm,
800	};
801	empty::skip_splits_rev(input, hm, hm.offset(), \|input\| {
802	let got = search::find_rev(self, cache, input)?;
803	Ok(got.map(\|hm\| (hm, hm.offset())))
804	})
805	}
806
807	/// Executes an overlapping forward search and returns the end position of
808	/// matches as they are found. If no match exists, then `None` is returned.
809	///
810	/// This routine is principally only useful when searching for multiple
811	/// patterns on inputs where multiple patterns may match the same regions
812	/// of text. In particular, callers must preserve the automaton's search
813	/// state from prior calls so that the implementation knows where the last
814	/// match occurred.
815	///
816	/// When using this routine to implement an iterator of overlapping
817	/// matches, the `start` of the search should remain invariant throughout
818	/// iteration. The `OverlappingState` given to the search will keep track
819	/// of the current position of the search. (This is because multiple
820	/// matches may be reported at the same position, so only the search
821	/// implementation itself knows when to advance the position.)
822	///
823	/// If for some reason you want the search to forget about its previous
824	/// state and restart the search at a particular position, then setting the
825	/// state to [`OverlappingState::start`] will accomplish that.
826	///
827	/// # Errors
828	///
829	/// This routine errors if the search could not complete. This can occur
830	/// in a number of circumstances:
831	///
832	/// The configuration of the lazy DFA may permit it to "quit" the search.*
833	/// For example, setting quit bytes or enabling heuristic support for
834	/// Unicode word boundaries. The default configuration does not enable any
835	/// option that could result in the lazy DFA quitting.
836	/// The configuration of the lazy DFA may also permit it to "give up"*
837	/// on a search if it makes ineffective use of its transition table
838	/// cache. The default configuration does not enable this by default,
839	/// although it is typically a good idea to.
840	/// When the provided `Input` configuration is not supported. For*
841	/// example, by providing an unsupported anchor mode.
842	///
843	/// When a search returns an error, callers cannot know whether a match
844	/// exists or not.
845	///
846	/// # Example
847	///
848	/// This example shows how to run a basic overlapping search. Notice
849	/// that we build the automaton with a `MatchKind::All` configuration.
850	/// Overlapping searches are unlikely to work as one would expect when
851	/// using the default `MatchKind::LeftmostFirst` match semantics, since
852	/// leftmost-first matching is fundamentally incompatible with overlapping
853	/// searches. Namely, overlapping searches need to report matches as they
854	/// are seen, where as leftmost-first searches will continue searching even
855	/// after a match has been observed in order to find the conventional end
856	/// position of the match. More concretely, leftmost-first searches use
857	/// dead states to terminate a search after a specific match can no longer
858	/// be extended. Overlapping searches instead do the opposite by continuing
859	/// the search to find totally new matches (potentially of other patterns).
860	///
861	/// ```
862	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
863	/// use regex_automata::{
864	/// hybrid::dfa::{DFA, OverlappingState},
865	/// HalfMatch, Input, MatchKind,
866	/// };
867	///
868	/// let dfa = DFA::builder()
869	/// .configure(DFA::config().match_kind(MatchKind::All))
870	/// .build_many(&[r"\w+$", r"\S+$"])?;
871	/// let mut cache = dfa.create_cache();
872	///
873	/// let haystack = "@foo";
874	/// let mut state = OverlappingState::start();
875	///
876	/// let expected = Some(HalfMatch::must(`1`, `4`));
877	/// dfa.try_search_overlapping_fwd(
878	/// &mut cache, &Input::new(haystack), &mut state,
879	/// )?;
880	/// assert_eq!(expected, state.get_match());
881	///
882	/// // The first pattern also matches at the same position, so re-running
883	/// // the search will yield another match. Notice also that the first
884	/// // pattern is returned after the second. This is because the second
885	/// // pattern begins its match before the first, is therefore an earlier
886	/// // match and is thus reported first.
887	/// let expected = Some(HalfMatch::must(`0`, `4`));
888	/// dfa.try_search_overlapping_fwd(
889	/// &mut cache, &Input::new(haystack), &mut state,
890	/// )?;
891	/// assert_eq!(expected, state.get_match());
892	///
893	/// # Ok::<(), Box<dyn std::error::Error>>(())
894	/// ```
895	#[inline]
896	pub fn try_search_overlapping_fwd(
897	&self,
898	cache: &mut Cache,
899	input: &Input<'_>,
900	state: &mut OverlappingState,
901	) -> Result<(), MatchError> {
902	let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
903	search::find_overlapping_fwd(self, cache, input, state)?;
904	match state.get_match() {
905	None => Ok(()),
906	Some(_) if !utf8empty => Ok(()),
907	Some(_) => skip_empty_utf8_splits_overlapping(
908	input,
909	state,
910	\|input, state\| {
911	search::find_overlapping_fwd(self, cache, input, state)
912	},
913	),
914	}
915	}
916
917	/// Executes a reverse overlapping search and returns the start of the
918	/// position of the leftmost match that is found. If no match exists, then
919	/// `None` is returned.
920	///
921	/// When using this routine to implement an iterator of overlapping
922	/// matches, the `start` of the search should remain invariant throughout
923	/// iteration. The `OverlappingState` given to the search will keep track
924	/// of the current position of the search. (This is because multiple
925	/// matches may be reported at the same position, so only the search
926	/// implementation itself knows when to advance the position.)
927	///
928	/// If for some reason you want the search to forget about its previous
929	/// state and restart the search at a particular position, then setting the
930	/// state to [`OverlappingState::start`] will accomplish that.
931	///
932	/// # Errors
933	///
934	/// This routine errors if the search could not complete. This can occur
935	/// in a number of circumstances:
936	///
937	/// The configuration of the lazy DFA may permit it to "quit" the search.*
938	/// For example, setting quit bytes or enabling heuristic support for
939	/// Unicode word boundaries. The default configuration does not enable any
940	/// option that could result in the lazy DFA quitting.
941	/// The configuration of the lazy DFA may also permit it to "give up"*
942	/// on a search if it makes ineffective use of its transition table
943	/// cache. The default configuration does not enable this by default,
944	/// although it is typically a good idea to.
945	/// When the provided `Input` configuration is not supported. For*
946	/// example, by providing an unsupported anchor mode.
947	///
948	/// When a search returns an error, callers cannot know whether a match
949	/// exists or not.
950	///
951	/// # Example: UTF-8 mode
952	///
953	/// This examples demonstrates that UTF-8 mode applies to reverse
954	/// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
955	/// matches reported must correspond to valid UTF-8 spans. This includes
956	/// prohibiting zero-width matches that split a codepoint.
957	///
958	/// UTF-8 mode is enabled by default. Notice below how the only zero-width
959	/// matches reported are those at UTF-8 boundaries:
960	///
961	/// ```
962	/// use regex_automata::{
963	/// hybrid::dfa::{DFA, OverlappingState},
964	/// nfa::thompson,
965	/// HalfMatch, Input, MatchKind,
966	/// };
967	///
968	/// let dfa = DFA::builder()
969	/// .configure(DFA::config().match_kind(MatchKind::All))
970	/// .thompson(thompson::Config::new().reverse(`true`))
971	/// .build_many(&[r"", r"☃"])?;
972	/// let mut cache = dfa.create_cache();
973	///
974	/// // Run the reverse DFA to collect all matches.
975	/// let input = Input::new("☃");
976	/// let mut state = OverlappingState::start();
977	/// let mut matches = vec![];
978	/// loop {
979	/// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
980	/// match state.get_match() {
981	/// None => break,
982	/// Some(hm) => matches.push(hm),
983	/// }
984	/// }
985	///
986	/// // No matches split a codepoint.
987	/// let expected = vec![
988	/// HalfMatch::must(`0`, `3`),
989	/// HalfMatch::must(`1`, `0`),
990	/// HalfMatch::must(`0`, `0`),
991	/// ];
992	/// assert_eq!(expected, matches);
993	///
994	/// # Ok::<(), Box<dyn std::error::Error>>(())
995	/// ```
996	///
997	/// Now let's look at the same example, but with UTF-8 mode on the
998	/// underlying NFA disabled:
999	///
1000	/// ```
1001	/// use regex_automata::{
1002	/// hybrid::dfa::{DFA, OverlappingState},
1003	/// nfa::thompson,
1004	/// HalfMatch, Input, MatchKind,
1005	/// };
1006	///
1007	/// let dfa = DFA::builder()
1008	/// .configure(DFA::config().match_kind(MatchKind::All))
1009	/// .thompson(thompson::Config::new().reverse(`true`).utf8(`false`))
1010	/// .build_many(&[r"", r"☃"])?;
1011	/// let mut cache = dfa.create_cache();
1012	///
1013	/// // Run the reverse DFA to collect all matches.
1014	/// let input = Input::new("☃");
1015	/// let mut state = OverlappingState::start();
1016	/// let mut matches = vec![];
1017	/// loop {
1018	/// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
1019	/// match state.get_match() {
1020	/// None => break,
1021	/// Some(hm) => matches.push(hm),
1022	/// }
1023	/// }
1024	///
1025	/// // Now all* positions match, even within a codepoint,*
1026	/// // because we lifted the requirement that matches
1027	/// // correspond to valid UTF-8 spans.
1028	/// let expected = vec![
1029	/// HalfMatch::must(`0`, `3`),
1030	/// HalfMatch::must(`0`, `2`),
1031	/// HalfMatch::must(`0`, `1`),
1032	/// HalfMatch::must(`1`, `0`),
1033	/// HalfMatch::must(`0`, `0`),
1034	/// ];
1035	/// assert_eq!(expected, matches);
1036	///
1037	/// # Ok::<(), Box<dyn std::error::Error>>(())
1038	/// ```
1039	#[inline]
1040	pub fn try_search_overlapping_rev(
1041	&self,
1042	cache: &mut Cache,
1043	input: &Input<'_>,
1044	state: &mut OverlappingState,
1045	) -> Result<(), MatchError> {
1046	let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1047	search::find_overlapping_rev(self, cache, input, state)?;
1048	match state.get_match() {
1049	None => Ok(()),
1050	Some(_) if !utf8empty => Ok(()),
1051	Some(_) => skip_empty_utf8_splits_overlapping(
1052	input,
1053	state,
1054	\|input, state\| {
1055	search::find_overlapping_rev(self, cache, input, state)
1056	},
1057	),
1058	}
1059	}
1060
1061	/// Writes the set of patterns that match anywhere in the given search
1062	/// configuration to `patset`. If multiple patterns match at the same
1063	/// position and the underlying DFA supports overlapping matches, then all
1064	/// matching patterns are written to the given set.
1065	///
1066	/// Unless all of the patterns in this DFA are anchored, then generally
1067	/// speaking, this will visit every byte in the haystack.
1068	///
1069	/// This search routine does not* clear the pattern set. This gives some*
1070	/// flexibility to the caller (e.g., running multiple searches with the
1071	/// same pattern set), but does make the API bug-prone if you're reusing
1072	/// the same pattern set for multiple searches but intended them to be
1073	/// independent.
1074	///
1075	/// If a pattern ID matched but the given `PatternSet` does not have
1076	/// sufficient capacity to store it, then it is not inserted and silently
1077	/// dropped.
1078	///
1079	/// # Errors
1080	///
1081	/// This routine errors if the search could not complete. This can occur
1082	/// in a number of circumstances:
1083	///
1084	/// The configuration of the lazy DFA may permit it to "quit" the search.*
1085	/// For example, setting quit bytes or enabling heuristic support for
1086	/// Unicode word boundaries. The default configuration does not enable any
1087	/// option that could result in the lazy DFA quitting.
1088	/// The configuration of the lazy DFA may also permit it to "give up"*
1089	/// on a search if it makes ineffective use of its transition table
1090	/// cache. The default configuration does not enable this by default,
1091	/// although it is typically a good idea to.
1092	/// When the provided `Input` configuration is not supported. For*
1093	/// example, by providing an unsupported anchor mode.
1094	///
1095	/// When a search returns an error, callers cannot know whether a match
1096	/// exists or not.
1097	///
1098	/// # Example
1099	///
1100	/// This example shows how to find all matching patterns in a haystack,
1101	/// even when some patterns match at the same position as other patterns.
1102	///
1103	/// ```
1104	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1105	/// use regex_automata::{
1106	/// hybrid::dfa::DFA,
1107	/// Input, MatchKind, PatternSet,
1108	/// };
1109	///
1110	/// let patterns = &[
1111	/// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
1112	/// ];
1113	/// let dfa = DFA::builder()
1114	/// .configure(DFA::config().match_kind(MatchKind::All))
1115	/// .build_many(patterns)?;
1116	/// let mut cache = dfa.create_cache();
1117	///
1118	/// let input = Input::new("foobar");
1119	/// let mut patset = PatternSet::new(dfa.pattern_len());
1120	/// dfa.try_which_overlapping_matches(&mut cache, &input, &mut patset)?;
1121	/// let expected = vec![`0`, `2`, `3`, `4`, `6`];
1122	/// let got: Vec<usize> = patset.iter().map(\|p\| p.as_usize()).collect();
1123	/// assert_eq!(expected, got);
1124	///
1125	/// # Ok::<(), Box<dyn std::error::Error>>(())
1126	/// ```
1127	#[inline]
1128	pub fn try_which_overlapping_matches(
1129	&self,
1130	cache: &mut Cache,
1131	input: &Input<'_>,
1132	patset: &mut PatternSet,
1133	) -> Result<(), MatchError> {
1134	let mut state = OverlappingState::start();
1135	while let Some(m) = {
1136	self.try_search_overlapping_fwd(cache, input, &mut state)?;
1137	state.get_match()
1138	} {
1139	let _ = patset.try_insert(m.pattern());
1140	// There's nothing left to find, so we can stop. Or the caller
1141	// asked us to.
1142	if patset.is_full() \|\| input.get_earliest() {
1143	break;
1144	}
1145	}
1146	Ok(())
1147	}
1148	}
1149
1150	impl DFA {
1151	/// Transitions from the current state to the next state, given the next
1152	/// byte of input.
1153	///
1154	/// The given cache is used to either reuse pre-computed state
1155	/// transitions, or to store this newly computed transition for future
1156	/// reuse. Thus, this routine guarantees that it will never return a state
1157	/// ID that has an "unknown" tag.
1158	///
1159	/// # State identifier validity
1160	///
1161	/// The only valid value for `current` is the lazy state ID returned
1162	/// by the most recent call to `next_state`, `next_state_untagged`,
1163	/// `next_state_untagged_unchecked`, `start_state_forward` or
1164	/// `state_state_reverse` for the given `cache`. Any state ID returned from
1165	/// prior calls to these routines (with the same `cache`) is considered
1166	/// invalid (even if it gives an appearance of working). State IDs returned
1167	/// from _any_ prior call for different `cache` values are also always
1168	/// invalid.
1169	///
1170	/// The returned ID is always a valid ID when `current` refers to a valid
1171	/// ID. Moreover, this routine is defined for all possible values of
1172	/// `input`.
1173	///
1174	/// These validity rules are not checked, even in debug mode. Callers are
1175	/// required to uphold these rules themselves.
1176	///
1177	/// Violating these state ID validity rules will not sacrifice memory
1178	/// safety, but _may_ produce an incorrect result or a panic.
1179	///
1180	/// # Panics
1181	///
1182	/// If the given ID does not refer to a valid state, then this routine
1183	/// may panic but it also may not panic and instead return an invalid or
1184	/// incorrect ID.
1185	///
1186	/// # Example
1187	///
1188	/// This shows a simplistic example for walking a lazy DFA for a given
1189	/// haystack by using the `next_state` method.
1190	///
1191	/// ```
1192	/// use regex_automata::{hybrid::dfa::DFA, Input};
1193	///
1194	/// let dfa = DFA::new(r"[a-z]+r")?;
1195	/// let mut cache = dfa.create_cache();
1196	/// let haystack = "bar".as_bytes();
1197	///
1198	/// // The start state is determined by inspecting the position and the
1199	/// // initial bytes of the haystack.
1200	/// let mut sid = dfa.start_state_forward(
1201	/// &mut cache, &Input::new(haystack),
1202	/// )?;
1203	/// // Walk all the bytes in the haystack.
1204	/// for &b in haystack {
1205	/// sid = dfa.next_state(&mut cache, sid, b)?;
1206	/// }
1207	/// // Matches are always delayed by 1 byte, so we must explicitly walk the
1208	/// // special "EOI" transition at the end of the search.
1209	/// sid = dfa.next_eoi_state(&mut cache, sid)?;
1210	/// assert!(sid.is_match());
1211	///
1212	/// # Ok::<(), Box<dyn std::error::Error>>(())
1213	/// ```
1214	#[inline]
1215	pub fn next_state(
1216	&self,
1217	cache: &mut Cache,
1218	current: LazyStateID,
1219	input: u8,
1220	) -> Result<LazyStateID, CacheError> {
1221	let class = usize::from(self.classes.get(input));
1222	let offset = current.as_usize_untagged() + class;
1223	let sid = cache.trans[offset];
1224	if !sid.is_unknown() {
1225	return Ok(sid);
1226	}
1227	let unit = alphabet::Unit::u8(input);
1228	Lazy::new(self, cache).cache_next_state(current, unit)
1229	}
1230
1231	/// Transitions from the current state to the next state, given the next
1232	/// byte of input and a state ID that is not tagged.
1233	///
1234	/// The only reason to use this routine is performance. In particular, the
1235	/// `next_state` method needs to do some additional checks, among them is
1236	/// to account for identifiers to states that are not yet computed. In
1237	/// such a case, the transition is computed on the fly. However, if it is
1238	/// known that the `current` state ID is untagged, then these checks can be
1239	/// omitted.
1240	///
1241	/// Since this routine does not compute states on the fly, it does not
1242	/// modify the cache and thus cannot return an error. Consequently, `cache`
1243	/// does not need to be mutable and it is possible for this routine to
1244	/// return a state ID corresponding to the special "unknown" state. In
1245	/// this case, it is the caller's responsibility to use the prior state
1246	/// ID and `input` with `next_state` in order to force the computation of
1247	/// the unknown transition. Otherwise, trying to use the "unknown" state
1248	/// ID will just result in transitioning back to itself, and thus never
1249	/// terminating. (This is technically a special exemption to the state ID
1250	/// validity rules, but is permissible since this routine is guarateed to
1251	/// never mutate the given `cache`, and thus the identifier is guaranteed
1252	/// to remain valid.)
1253	///
1254	/// See [`LazyStateID`] for more details on what it means for a state ID
1255	/// to be tagged. Also, see
1256	/// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked)
1257	/// for this same idea, but with bounds checks forcefully elided.
1258	///
1259	/// # State identifier validity
1260	///
1261	/// The only valid value for `current` is an untagged* lazy*
1262	/// state ID returned by the most recent call to `next_state`,
1263	/// `next_state_untagged`, `next_state_untagged_unchecked`,
1264	/// `start_state_forward` or `state_state_reverse` for the given `cache`.
1265	/// Any state ID returned from prior calls to these routines (with the
1266	/// same `cache`) is considered invalid (even if it gives an appearance
1267	/// of working). State IDs returned from _any_ prior call for different
1268	/// `cache` values are also always invalid.
1269	///
1270	/// The returned ID is always a valid ID when `current` refers to a valid
1271	/// ID, although it may be tagged. Moreover, this routine is defined for
1272	/// all possible values of `input`.
1273	///
1274	/// Not all validity rules are checked, even in debug mode. Callers are
1275	/// required to uphold these rules themselves.
1276	///
1277	/// Violating these state ID validity rules will not sacrifice memory
1278	/// safety, but _may_ produce an incorrect result or a panic.
1279	///
1280	/// # Panics
1281	///
1282	/// If the given ID does not refer to a valid state, then this routine
1283	/// may panic but it also may not panic and instead return an invalid or
1284	/// incorrect ID.
1285	///
1286	/// # Example
1287	///
1288	/// This shows a simplistic example for walking a lazy DFA for a given
1289	/// haystack by using the `next_state_untagged` method where possible.
1290	///
1291	/// ```
1292	/// use regex_automata::{hybrid::dfa::DFA, Input};
1293	///
1294	/// let dfa = DFA::new(r"[a-z]+r")?;
1295	/// let mut cache = dfa.create_cache();
1296	/// let haystack = "bar".as_bytes();
1297	///
1298	/// // The start state is determined by inspecting the position and the
1299	/// // initial bytes of the haystack.
1300	/// let mut sid = dfa.start_state_forward(
1301	/// &mut cache, &Input::new(haystack),
1302	/// )?;
1303	/// // Walk all the bytes in the haystack.
1304	/// let mut at = `0`;
1305	/// while at < haystack.len() {
1306	/// if sid.is_tagged() {
1307	/// sid = dfa.next_state(&mut cache, sid, haystack[at])?;
1308	/// } else {
1309	/// let mut prev_sid = sid;
1310	/// // We attempt to chew through as much as we can while moving
1311	/// // through untagged state IDs. Thus, the transition function
1312	/// // does less work on average per byte. (Unrolling this loop
1313	/// // may help even more.)
1314	/// while at < haystack.len() {
1315	/// prev_sid = sid;
1316	/// sid = dfa.next_state_untagged(
1317	/// &mut cache, sid, haystack[at],
1318	/// );
1319	/// at += `1`;
1320	/// if sid.is_tagged() {
1321	/// break;
1322	/// }
1323	/// }
1324	/// // We must ensure that we never proceed to the next iteration
1325	/// // with an unknown state ID. If we don't account for this
1326	/// // case, then search isn't guaranteed to terminate since all
1327	/// // transitions on unknown states loop back to itself.
1328	/// if sid.is_unknown() {
1329	/// sid = dfa.next_state(
1330	/// &mut cache, prev_sid, haystack[at - `1`],
1331	/// )?;
1332	/// }
1333	/// }
1334	/// }
1335	/// // Matches are always delayed by 1 byte, so we must explicitly walk the
1336	/// // special "EOI" transition at the end of the search.
1337	/// sid = dfa.next_eoi_state(&mut cache, sid)?;
1338	/// assert!(sid.is_match());
1339	///
1340	/// # Ok::<(), Box<dyn std::error::Error>>(())
1341	/// ```
1342	#[inline]
1343	pub fn next_state_untagged(
1344	&self,
1345	cache: &Cache,
1346	current: LazyStateID,
1347	input: u8,
1348	) -> LazyStateID {
1349	debug_assert!(!current.is_tagged());
1350	let class = usize::from(self.classes.get(input));
1351	let offset = current.as_usize_unchecked() + class;
1352	cache.trans[offset]
1353	}
1354
1355	/// Transitions from the current state to the next state, eliding bounds
1356	/// checks, given the next byte of input and a state ID that is not tagged.
1357	///
1358	/// The only reason to use this routine is performance. In particular, the
1359	/// `next_state` method needs to do some additional checks, among them is
1360	/// to account for identifiers to states that are not yet computed. In
1361	/// such a case, the transition is computed on the fly. However, if it is
1362	/// known that the `current` state ID is untagged, then these checks can be
1363	/// omitted.
1364	///
1365	/// Since this routine does not compute states on the fly, it does not
1366	/// modify the cache and thus cannot return an error. Consequently, `cache`
1367	/// does not need to be mutable and it is possible for this routine to
1368	/// return a state ID corresponding to the special "unknown" state. In
1369	/// this case, it is the caller's responsibility to use the prior state
1370	/// ID and `input` with `next_state` in order to force the computation of
1371	/// the unknown transition. Otherwise, trying to use the "unknown" state
1372	/// ID will just result in transitioning back to itself, and thus never
1373	/// terminating. (This is technically a special exemption to the state ID
1374	/// validity rules, but is permissible since this routine is guarateed to
1375	/// never mutate the given `cache`, and thus the identifier is guaranteed
1376	/// to remain valid.)
1377	///
1378	/// See [`LazyStateID`] for more details on what it means for a state ID
1379	/// to be tagged. Also, see
1380	/// [`next_state_untagged`](DFA::next_state_untagged)
1381	/// for this same idea, but with memory safety guaranteed by retaining
1382	/// bounds checks.
1383	///
1384	/// # State identifier validity
1385	///
1386	/// The only valid value for `current` is an untagged* lazy*
1387	/// state ID returned by the most recent call to `next_state`,
1388	/// `next_state_untagged`, `next_state_untagged_unchecked`,
1389	/// `start_state_forward` or `state_state_reverse` for the given `cache`.
1390	/// Any state ID returned from prior calls to these routines (with the
1391	/// same `cache`) is considered invalid (even if it gives an appearance
1392	/// of working). State IDs returned from _any_ prior call for different
1393	/// `cache` values are also always invalid.
1394	///
1395	/// The returned ID is always a valid ID when `current` refers to a valid
1396	/// ID, although it may be tagged. Moreover, this routine is defined for
1397	/// all possible values of `input`.
1398	///
1399	/// Not all validity rules are checked, even in debug mode. Callers are
1400	/// required to uphold these rules themselves.
1401	///
1402	/// Violating these state ID validity rules will not sacrifice memory
1403	/// safety, but _may_ produce an incorrect result or a panic.
1404	///
1405	/// # Safety
1406	///
1407	/// Callers of this method must guarantee that `current` refers to a valid
1408	/// state ID according to the rules described above. If `current` is not a
1409	/// valid state ID for this automaton, then calling this routine may result
1410	/// in undefined behavior.
1411	///
1412	/// If `current` is valid, then the ID returned is valid for all possible
1413	/// values of `input`.
1414	#[inline]
1415	pub unsafe fn next_state_untagged_unchecked(
1416	&self,
1417	cache: &Cache,
1418	current: LazyStateID,
1419	input: u8,
1420	) -> LazyStateID {
1421	debug_assert!(!current.is_tagged());
1422	let class = usize::from(self.classes.get(input));
1423	let offset = current.as_usize_unchecked() + class;
1424	*cache.trans.get_unchecked(offset)
1425	}
1426
1427	/// Transitions from the current state to the next state for the special
1428	/// EOI symbol.
1429	///
1430	/// The given cache is used to either reuse pre-computed state
1431	/// transitions, or to store this newly computed transition for future
1432	/// reuse. Thus, this routine guarantees that it will never return a state
1433	/// ID that has an "unknown" tag.
1434	///
1435	/// This routine must be called at the end of every search in a correct
1436	/// implementation of search. Namely, lazy DFAs in this crate delay matches
1437	/// by one byte in order to support look-around operators. Thus, after
1438	/// reaching the end of a haystack, a search implementation must follow one
1439	/// last EOI transition.
1440	///
1441	/// It is best to think of EOI as an additional symbol in the alphabet of a
1442	/// DFA that is distinct from every other symbol. That is, the alphabet of
1443	/// lazy DFAs in this crate has a logical size of 257 instead of 256, where
1444	/// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
1445	/// physical alphabet size may be smaller because of alphabet compression
1446	/// via equivalence classes, but EOI is always represented somehow in the
1447	/// alphabet.)
1448	///
1449	/// # State identifier validity
1450	///
1451	/// The only valid value for `current` is the lazy state ID returned
1452	/// by the most recent call to `next_state`, `next_state_untagged`,
1453	/// `next_state_untagged_unchecked`, `start_state_forward` or
1454	/// `state_state_reverse` for the given `cache`. Any state ID returned from
1455	/// prior calls to these routines (with the same `cache`) is considered
1456	/// invalid (even if it gives an appearance of working). State IDs returned
1457	/// from _any_ prior call for different `cache` values are also always
1458	/// invalid.
1459	///
1460	/// The returned ID is always a valid ID when `current` refers to a valid
1461	/// ID.
1462	///
1463	/// These validity rules are not checked, even in debug mode. Callers are
1464	/// required to uphold these rules themselves.
1465	///
1466	/// Violating these state ID validity rules will not sacrifice memory
1467	/// safety, but _may_ produce an incorrect result or a panic.
1468	///
1469	/// # Panics
1470	///
1471	/// If the given ID does not refer to a valid state, then this routine
1472	/// may panic but it also may not panic and instead return an invalid or
1473	/// incorrect ID.
1474	///
1475	/// # Example
1476	///
1477	/// This shows a simplistic example for walking a DFA for a given haystack,
1478	/// and then finishing the search with the final EOI transition.
1479	///
1480	/// ```
1481	/// use regex_automata::{hybrid::dfa::DFA, Input};
1482	///
1483	/// let dfa = DFA::new(r"[a-z]+r")?;
1484	/// let mut cache = dfa.create_cache();
1485	/// let haystack = "bar".as_bytes();
1486	///
1487	/// // The start state is determined by inspecting the position and the
1488	/// // initial bytes of the haystack.
1489	/// let mut sid = dfa.start_state_forward(
1490	/// &mut cache, &Input::new(haystack),
1491	/// )?;
1492	/// // Walk all the bytes in the haystack.
1493	/// for &b in haystack {
1494	/// sid = dfa.next_state(&mut cache, sid, b)?;
1495	/// }
1496	/// // Matches are always delayed by 1 byte, so we must explicitly walk
1497	/// // the special "EOI" transition at the end of the search. Without this
1498	/// // final transition, the assert below will fail since the DFA will not
1499	/// // have entered a match state yet!
1500	/// sid = dfa.next_eoi_state(&mut cache, sid)?;
1501	/// assert!(sid.is_match());
1502	///
1503	/// # Ok::<(), Box<dyn std::error::Error>>(())
1504	/// ```
1505	#[inline]
1506	pub fn next_eoi_state(
1507	&self,
1508	cache: &mut Cache,
1509	current: LazyStateID,
1510	) -> Result<LazyStateID, CacheError> {
1511	let eoi = self.classes.eoi().as_usize();
1512	let offset = current.as_usize_untagged() + eoi;
1513	let sid = cache.trans[offset];
1514	if !sid.is_unknown() {
1515	return Ok(sid);
1516	}
1517	let unit = self.classes.eoi();
1518	Lazy::new(self, cache).cache_next_state(current, unit)
1519	}
1520
1521	/// Return the ID of the start state for this lazy DFA for the given
1522	/// starting configuration.
1523	///
1524	/// Unlike typical DFA implementations, the start state for DFAs in this
1525	/// crate is dependent on a few different factors:
1526	///
1527	/// The* [`Anchored`] mode of the search. Unanchored, anchored and
1528	/// anchored searches for a specific [`PatternID`] all use different start
1529	/// states.
1530	/// Whether a "look-behind" byte exists. For example, the `^` anchor*
1531	/// matches if and only if there is no look-behind byte.
1532	/// The specific value of that look-behind byte. For example, a `(?m:^)`*
1533	/// assertion only matches when there is either no look-behind byte, or
1534	/// when the look-behind byte is a line terminator.
1535	///
1536	/// The [starting configuration](start::Config) provides the above
1537	/// information.
1538	///
1539	/// This routine can be used for either forward or reverse searches.
1540	/// Although, as a convenience, if you have an [`Input`], then it
1541	/// may be more succinct to use [`DFA::start_state_forward`] or
1542	/// [`DFA::start_state_reverse`]. Note, for example, that the convenience
1543	/// routines return a [`MatchError`] on failure where as this routine
1544	/// returns a [`StartError`].
1545	///
1546	/// # Errors
1547	///
1548	/// This may return a [`StartError`] if the search needs to give up when
1549	/// determining the start state (for example, if it sees a "quit" byte
1550	/// or if the cache has become inefficient). This can also return an
1551	/// error if the given configuration contains an unsupported [`Anchored`]
1552	/// configuration.
1553	#[cfg_attr(feature = "perf-inline", inline(always))]
1554	pub fn start_state(
1555	&self,
1556	cache: &mut Cache,
1557	config: &start::Config,
1558	) -> Result<LazyStateID, StartError> {
1559	let lazy = LazyRef::new(self, cache);
1560	let anchored = config.get_anchored();
1561	let start = match config.get_look_behind() {
1562	None => Start::Text,
1563	Some(byte) => {
1564	if !self.quitset.is_empty() && self.quitset.contains(byte) {
1565	return Err(StartError::quit(byte));
1566	}
1567	self.start_map.get(byte)
1568	}
1569	};
1570	let start_id = lazy.get_cached_start_id(anchored, start)?;
1571	if !start_id.is_unknown() {
1572	return Ok(start_id);
1573	}
1574	Lazy::new(self, cache).cache_start_group(anchored, start)
1575	}
1576
1577	/// Return the ID of the start state for this lazy DFA when executing a
1578	/// forward search.
1579	///
1580	/// This is a convenience routine for calling [`DFA::start_state`] that
1581	/// converts the given [`Input`] to a [start configuration](start::Config).
1582	/// Additionally, if an error occurs, it is converted from a [`StartError`]
1583	/// to a [`MatchError`] using the offset information in the given
1584	/// [`Input`].
1585	///
1586	/// # Errors
1587	///
1588	/// This may return a [`MatchError`] if the search needs to give up when
1589	/// determining the start state (for example, if it sees a "quit" byte or
1590	/// if the cache has become inefficient). This can also return an error if
1591	/// the given `Input` contains an unsupported [`Anchored`] configuration.
1592	#[cfg_attr(feature = "perf-inline", inline(always))]
1593	pub fn start_state_forward(
1594	&self,
1595	cache: &mut Cache,
1596	input: &Input<'_>,
1597	) -> Result<LazyStateID, MatchError> {
1598	let config = start::Config::from_input_forward(input);
1599	self.start_state(cache, &config).map_err(\|err\| match err {
1600	StartError::Cache { .. } => MatchError::gave_up(input.start()),
1601	StartError::Quit { byte } => {
1602	let offset = input
1603	.start()
1604	.checked_sub(`1`)
1605	.expect("no quit in start without look-behind");
1606	MatchError::quit(byte, offset)
1607	}
1608	StartError::UnsupportedAnchored { mode } => {
1609	MatchError::unsupported_anchored(mode)
1610	}
1611	})
1612	}
1613
1614	/// Return the ID of the start state for this lazy DFA when executing a
1615	/// reverse search.
1616	///
1617	/// This is a convenience routine for calling [`DFA::start_state`] that
1618	/// converts the given [`Input`] to a [start configuration](start::Config).
1619	/// Additionally, if an error occurs, it is converted from a [`StartError`]
1620	/// to a [`MatchError`] using the offset information in the given
1621	/// [`Input`].
1622	///
1623	/// # Errors
1624	///
1625	/// This may return a [`MatchError`] if the search needs to give up when
1626	/// determining the start state (for example, if it sees a "quit" byte or
1627	/// if the cache has become inefficient). This can also return an error if
1628	/// the given `Input` contains an unsupported [`Anchored`] configuration.
1629	#[cfg_attr(feature = "perf-inline", inline(always))]
1630	pub fn start_state_reverse(
1631	&self,
1632	cache: &mut Cache,
1633	input: &Input<'_>,
1634	) -> Result<LazyStateID, MatchError> {
1635	let config = start::Config::from_input_reverse(input);
1636	self.start_state(cache, &config).map_err(\|err\| match err {
1637	StartError::Cache { .. } => MatchError::gave_up(input.end()),
1638	StartError::Quit { byte } => {
1639	let offset = input.end();
1640	MatchError::quit(byte, offset)
1641	}
1642	StartError::UnsupportedAnchored { mode } => {
1643	MatchError::unsupported_anchored(mode)
1644	}
1645	})
1646	}
1647
1648	/// Returns the total number of patterns that match in this state.
1649	///
1650	/// If the lazy DFA was compiled with one pattern, then this must
1651	/// necessarily always return `1` for all match states.
1652	///
1653	/// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with
1654	/// indices up to (but not including) the length returned by this routine
1655	/// without panicking.
1656	///
1657	/// # Panics
1658	///
1659	/// If the given state is not a match state, then this may either panic
1660	/// or return an incorrect result.
1661	///
1662	/// # Example
1663	///
1664	/// This example shows a simple instance of implementing overlapping
1665	/// matches. In particular, it shows not only how to determine how many
1666	/// patterns have matched in a particular state, but also how to access
1667	/// which specific patterns have matched.
1668	///
1669	/// Notice that we must use [`MatchKind::All`] when building the DFA. If we
1670	/// used [`MatchKind::LeftmostFirst`] instead, then the DFA would not be
1671	/// constructed in a way that supports overlapping matches. (It would only
1672	/// report a single pattern that matches at any particular point in time.)
1673	///
1674	/// Another thing to take note of is the patterns used and the order in
1675	/// which the pattern IDs are reported. In the example below, pattern `3`
1676	/// is yielded first. Why? Because it corresponds to the match that
1677	/// appears first. Namely, the `@` symbol is part of `\S+` but not part
1678	/// of any of the other patterns. Since the `\S+` pattern has a match that
1679	/// starts to the left of any other pattern, its ID is returned before any
1680	/// other.
1681	///
1682	/// ```
1683	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1684	/// use regex_automata::{hybrid::dfa::DFA, Input, MatchKind};
1685	///
1686	/// let dfa = DFA::builder()
1687	/// .configure(DFA::config().match_kind(MatchKind::All))
1688	/// .build_many(&[
1689	/// r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+",
1690	/// ])?;
1691	/// let mut cache = dfa.create_cache();
1692	/// let haystack = "@bar".as_bytes();
1693	///
1694	/// // The start state is determined by inspecting the position and the
1695	/// // initial bytes of the haystack.
1696	/// let mut sid = dfa.start_state_forward(
1697	/// &mut cache, &Input::new(haystack),
1698	/// )?;
1699	/// // Walk all the bytes in the haystack.
1700	/// for &b in haystack {
1701	/// sid = dfa.next_state(&mut cache, sid, b)?;
1702	/// }
1703	/// sid = dfa.next_eoi_state(&mut cache, sid)?;
1704	///
1705	/// assert!(sid.is_match());
1706	/// assert_eq!(dfa.match_len(&mut cache, sid), `3`);
1707	/// // The following calls are guaranteed to not panic since `match_len`
1708	/// // returned `3` above.
1709	/// assert_eq!(dfa.match_pattern(&mut cache, sid, `0`).as_usize(), `3`);
1710	/// assert_eq!(dfa.match_pattern(&mut cache, sid, `1`).as_usize(), `0`);
1711	/// assert_eq!(dfa.match_pattern(&mut cache, sid, `2`).as_usize(), `1`);
1712	///
1713	/// # Ok::<(), Box<dyn std::error::Error>>(())
1714	/// ```
1715	#[inline]
1716	pub fn match_len(&self, cache: &Cache, id: LazyStateID) -> usize {
1717	assert!(id.is_match());
1718	LazyRef::new(self, cache).get_cached_state(id).match_len()
1719	}
1720
1721	/// Returns the pattern ID corresponding to the given match index in the
1722	/// given state.
1723	///
1724	/// See [`DFA::match_len`] for an example of how to use this method
1725	/// correctly. Note that if you know your lazy DFA is configured with a
1726	/// single pattern, then this routine is never necessary since it will
1727	/// always return a pattern ID of `0` for an index of `0` when `id`
1728	/// corresponds to a match state.
1729	///
1730	/// Typically, this routine is used when implementing an overlapping
1731	/// search, as the example for `DFA::match_len` does.
1732	///
1733	/// # Panics
1734	///
1735	/// If the state ID is not a match state or if the match index is out
1736	/// of bounds for the given state, then this routine may either panic
1737	/// or produce an incorrect result. If the state ID is correct and the
1738	/// match index is correct, then this routine always produces a valid
1739	/// `PatternID`.
1740	#[inline]
1741	pub fn match_pattern(
1742	&self,
1743	cache: &Cache,
1744	id: LazyStateID,
1745	match_index: usize,
1746	) -> PatternID {
1747	// This is an optimization for the very common case of a DFA with a
1748	// single pattern. This conditional avoids a somewhat more costly path
1749	// that finds the pattern ID from the corresponding `State`, which
1750	// requires a bit of slicing/pointer-chasing. This optimization tends
1751	// to only matter when matches are frequent.
1752	if self.pattern_len() == `1` {
1753	return PatternID::ZERO;
1754	}
1755	LazyRef::new(self, cache)
1756	.get_cached_state(id)
1757	.match_pattern(match_index)
1758	}
1759	}
1760
1761	/// A cache represents a partially computed DFA.
1762	///
1763	/// A cache is the key component that differentiates a classical DFA and a
1764	/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
1765	/// complete transition table that can handle all possible inputs, a hybrid
1766	/// NFA/DFA starts with an empty transition table and builds only the parts
1767	/// required during search. The parts that are built are stored in a cache. For
1768	/// this reason, a cache is a required parameter for nearly every operation on
1769	/// a [`DFA`].
1770	///
1771	/// Caches can be created from their corresponding DFA via
1772	/// [`DFA::create_cache`]. A cache can only be used with either the DFA that
1773	/// created it, or the DFA that was most recently used to reset it with
1774	/// [`Cache::reset`]. Using a cache with any other DFA may result in panics
1775	/// or incorrect results.
1776	#[derive(Clone, Debug)]
1777	pub struct Cache {
1778	// N.B. If you're looking to understand how determinization works, it
1779	// is probably simpler to first grok src/dfa/determinize.rs, since that
1780	// doesn't have the "laziness" component.
1781	/// The transition table.
1782	///
1783	/// Given a `current` LazyStateID and an `input` byte, the next state can
1784	/// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice
1785	/// that no multiplication is used. That's because state identifiers are
1786	/// "premultiplied."
1787	///
1788	/// Note that the next state may be the "unknown" state. In this case, the
1789	/// next state is not known and determinization for `current` on `input`
1790	/// must be performed.
1791	trans: Vec<LazyStateID>,
1792	/// The starting states for this DFA.
1793	///
1794	/// These are computed lazily. Initially, these are all set to "unknown"
1795	/// lazy state IDs.
1796	///
1797	/// When 'starts_for_each_pattern' is disabled (the default), then the size
1798	/// of this is constrained to the possible starting configurations based
1799	/// on the search parameters. (At time of writing, that's 4.) However,
1800	/// when starting states for each pattern is enabled, then there are N
1801	/// additional groups of starting states, where each group reflects the
1802	/// different possible configurations and N is the number of patterns.
1803	starts: Vec<LazyStateID>,
1804	/// A sequence of NFA/DFA powerset states that have been computed for this
1805	/// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every
1806	/// tagged LazyStateID can be used to index this sequence by converting it
1807	/// to its untagged form.)
1808	states: Vec<State>,
1809	/// A map from states to their corresponding IDs. This map may be accessed
1810	/// via the raw byte representation of a state, which means that a `State`
1811	/// does not need to be allocated to determine whether it already exists
1812	/// in this map. Indeed, the existence of such a state is what determines
1813	/// whether we allocate a new `State` or not.
1814	///
1815	/// The higher level idea here is that we do just enough determinization
1816	/// for a state to check whether we've already computed it. If we have,
1817	/// then we can save a little (albeit not much) work. The real savings is
1818	/// in memory usage. If we never checked for trivially duplicate states,
1819	/// then our memory usage would explode to unreasonable levels.
1820	states_to_id: StateMap,
1821	/// Sparse sets used to track which NFA states have been visited during
1822	/// various traversals.
1823	sparses: SparseSets,
1824	/// Scratch space for traversing the NFA graph. (We use space on the heap
1825	/// instead of the call stack.)
1826	stack: Vec<NFAStateID>,
1827	/// Scratch space for building a NFA/DFA powerset state. This is used to
1828	/// help amortize allocation since not every powerset state generated is
1829	/// added to the cache. In particular, if it already exists in the cache,
1830	/// then there is no need to allocate a new `State` for it.
1831	scratch_state_builder: StateBuilderEmpty,
1832	/// A simple abstraction for handling the saving of at most a single state
1833	/// across a cache clearing. This is required for correctness. Namely, if
1834	/// adding a new state after clearing the cache fails, then the caller
1835	/// must retain the ability to continue using the state ID given. The
1836	/// state corresponding to the state ID is what we preserve across cache
1837	/// clearings.
1838	state_saver: StateSaver,
1839	/// The memory usage, in bytes, used by 'states' and 'states_to_id'. We
1840	/// track this as new states are added since states use a variable amount
1841	/// of heap. Tracking this as we add states makes it possible to compute
1842	/// the total amount of memory used by the determinizer in constant time.
1843	memory_usage_state: usize,
1844	/// The number of times the cache has been cleared. When a minimum cache
1845	/// clear count is set, then the cache will return an error instead of
1846	/// clearing the cache if the count has been exceeded.
1847	clear_count: usize,
1848	/// The total number of bytes searched since the last time this cache was
1849	/// cleared, not including the current search.
1850	///
1851	/// This can be added to the length of the current search to get the true
1852	/// total number of bytes searched.
1853	///
1854	/// This is generally only non-zero when the
1855	/// `Cache::search_{start,update,finish}` APIs are used to track search
1856	/// progress.
1857	bytes_searched: usize,
1858	/// The progress of the current search.
1859	///
1860	/// This is only non-`None` when callers utlize the `Cache::search_start`,
1861	/// `Cache::search_update` and `Cache::search_finish` APIs.
1862	///
1863	/// The purpose of recording search progress is to be able to make a
1864	/// determination about the efficiency of the cache. Namely, by keeping
1865	/// track of the
1866	progress: Option<SearchProgress>,
1867	}
1868
1869	impl Cache {
1870	/// Create a new cache for the given lazy DFA.
1871	///
1872	/// The cache returned should only be used for searches for the given DFA.
1873	/// If you want to reuse the cache for another DFA, then you must call
1874	/// [`Cache::reset`] with that DFA.
1875	pub fn new(dfa: &DFA) -> Cache {
1876	let mut cache = Cache {
1877	trans: alloc::vec![],
1878	starts: alloc::vec![],
1879	states: alloc::vec![],
1880	states_to_id: StateMap::new(),
1881	sparses: SparseSets::new(dfa.get_nfa().states().len()),
1882	stack: alloc::vec![],
1883	scratch_state_builder: StateBuilderEmpty::new(),
1884	state_saver: StateSaver::none(),
1885	memory_usage_state: `0`,
1886	clear_count: `0`,
1887	bytes_searched: `0`,
1888	progress: None,
1889	};
1890	debug!("pre-init lazy DFA cache size: {}", cache.memory_usage());
1891	Lazy { dfa, cache: &mut cache }.init_cache();
1892	debug!("post-init lazy DFA cache size: {}", cache.memory_usage());
1893	cache
1894	}
1895
1896	/// Reset this cache such that it can be used for searching with the given
1897	/// lazy DFA (and only that DFA).
1898	///
1899	/// A cache reset permits reusing memory already allocated in this cache
1900	/// with a different lazy DFA.
1901	///
1902	/// Resetting a cache sets its "clear count" to 0. This is relevant if the
1903	/// lazy DFA has been configured to "give up" after it has cleared the
1904	/// cache a certain number of times.
1905	///
1906	/// Any lazy state ID generated by the cache prior to resetting it is
1907	/// invalid after the reset.
1908	///
1909	/// # Example
1910	///
1911	/// This shows how to re-purpose a cache for use with a different DFA.
1912	///
1913	/// ```
1914	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1915	/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
1916	///
1917	/// let dfa1 = DFA::new(r"\w")?;
1918	/// let dfa2 = DFA::new(r"\W")?;
1919	///
1920	/// let mut cache = dfa1.create_cache();
1921	/// assert_eq!(
1922	/// Some(HalfMatch::must(`0`, `2`)),
1923	/// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
1924	/// );
1925	///
1926	/// // Using 'cache' with dfa2 is not allowed. It may result in panics or
1927	/// // incorrect results. In order to re-purpose the cache, we must reset
1928	/// // it with the DFA we'd like to use it with.
1929	/// //
1930	/// // Similarly, after this reset, using the cache with 'dfa1' is also not
1931	/// // allowed.
1932	/// cache.reset(&dfa2);
1933	/// assert_eq!(
1934	/// Some(HalfMatch::must(`0`, `3`)),
1935	/// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
1936	/// );
1937	///
1938	/// # Ok::<(), Box<dyn std::error::Error>>(())
1939	/// ```
1940	pub fn reset(&mut self, dfa: &DFA) {
1941	Lazy::new(dfa, self).reset_cache()
1942	}
1943
1944	/// Initializes a new search starting at the given position.
1945	///
1946	/// If a previous search was unfinished, then it is finished automatically
1947	/// and a new search is begun.
1948	///
1949	/// Note that keeping track of search progress is _not necessary_
1950	/// for correct implementations of search using a lazy DFA. Keeping
1951	/// track of search progress is only necessary if you want the
1952	/// [`Config::minimum_bytes_per_state`] configuration knob to work.
1953	#[inline]
1954	pub fn search_start(&mut self, at: usize) {
1955	// If a previous search wasn't marked as finished, then finish it
1956	// now automatically.
1957	if let Some(p) = self.progress.take() {
1958	self.bytes_searched += p.len();
1959	}
1960	self.progress = Some(SearchProgress { start: at, at });
1961	}
1962
1963	/// Updates the current search to indicate that it has search to the
1964	/// current position.
1965	///
1966	/// No special care needs to be taken for reverse searches. Namely, the
1967	/// position given may be _less than_ the starting position of the search.
1968	///
1969	/// # Panics
1970	///
1971	/// This panics if no search has been started by [`Cache::search_start`].
1972	#[inline]
1973	pub fn search_update(&mut self, at: usize) {
1974	let p =
1975	self.progress.as_mut().expect("no in-progress search to update");
1976	p.at = at;
1977	}
1978
1979	/// Indicates that a search has finished at the given position.
1980	///
1981	/// # Panics
1982	///
1983	/// This panics if no search has been started by [`Cache::search_start`].
1984	#[inline]
1985	pub fn search_finish(&mut self, at: usize) {
1986	let mut p =
1987	self.progress.take().expect("no in-progress search to finish");
1988	p.at = at;
1989	self.bytes_searched += p.len();
1990	}
1991
1992	/// Returns the total number of bytes that have been searched since this
1993	/// cache was last cleared.
1994	///
1995	/// This is useful for determining the efficiency of the cache. For
1996	/// example, the lazy DFA uses this value in conjunction with the
1997	/// [`Config::minimum_bytes_per_state`] knob to help determine whether it
1998	/// should quit searching.
1999	///
2000	/// This always returns `0` if search progress isn't being tracked. Note
2001	/// that the lazy DFA search routines in this crate always track search
2002	/// progress.
2003	pub fn search_total_len(&self) -> usize {
2004	self.bytes_searched + self.progress.as_ref().map_or(`0`, \|p\| p.len())
2005	}
2006
2007	/// Returns the total number of times this cache has been cleared since it
2008	/// was either created or last reset.
2009	///
2010	/// This is useful for informational purposes or if you want to change
2011	/// search strategies based on the number of times the cache has been
2012	/// cleared.
2013	pub fn clear_count(&self) -> usize {
2014	self.clear_count
2015	}
2016
2017	/// Returns the heap memory usage, in bytes, of this cache.
2018	///
2019	/// This does not* include the stack size used up by this cache. To*
2020	/// compute that, use `std::mem::size_of::<Cache>()`.
2021	pub fn memory_usage(&self) -> usize {
2022	const ID_SIZE: usize = size_of::<LazyStateID>();
2023	const STATE_SIZE: usize = size_of::<State>();
2024
2025	// NOTE: If you make changes to the below, then
2026	// 'minimum_cache_capacity' should be updated correspondingly.
2027
2028	self.trans.len() * ID_SIZE
2029	+ self.starts.len() * ID_SIZE
2030	+ self.states.len() * STATE_SIZE
2031	// Maps likely use more memory than this, but it's probably close.
2032	+ self.states_to_id.len() * (STATE_SIZE + ID_SIZE)
2033	+ self.sparses.memory_usage()
2034	+ self.stack.capacity() * ID_SIZE
2035	+ self.scratch_state_builder.capacity()
2036	// Heap memory used by 'State' in both 'states' and 'states_to_id'.
2037	+ self.memory_usage_state
2038	}
2039	}
2040
2041	/// Keeps track of the progress of the current search.
2042	///
2043	/// This is updated via the `Cache::search_{start,update,finish}` APIs to
2044	/// record how many bytes have been searched. This permits computing a
2045	/// heuristic that represents the efficiency of a cache, and thus helps inform
2046	/// whether the lazy DFA should give up or not.
2047	#[derive(Clone, Debug)]
2048	struct SearchProgress {
2049	start: usize,
2050	at: usize,
2051	}
2052
2053	impl SearchProgress {
2054	/// Returns the length, in bytes, of this search so far.
2055	///
2056	/// This automatically handles the case of a reverse search, where `at`
2057	/// is likely to be less than `start`.
2058	fn len(&self) -> usize {
2059	if self.start <= self.at {
2060	self.at - self.start
2061	} else {
2062	self.start - self.at
2063	}
2064	}
2065	}
2066
2067	/// A map from states to state identifiers. When using std, we use a standard
2068	/// hashmap, since it's a bit faster for this use case. (Other maps, like
2069	/// one's based on FNV, have not yet been benchmarked.)
2070	///
2071	/// The main purpose of this map is to reuse states where possible. This won't
2072	/// fully minimize the DFA, but it works well in a lot of cases.
2073	#[cfg(feature = "std")]
2074	type StateMap = std::collections::HashMap<State, LazyStateID>;
2075	#[cfg(not(feature = "std"))]
2076	type StateMap = alloc::collections::BTreeMap<State, LazyStateID>;
2077
2078	/// A type that groups methods that require the base NFA/DFA and writable
2079	/// access to the cache.
2080	#[derive(Debug)]
2081	struct Lazy<'i, 'c> {
2082	dfa: &'i DFA,
2083	cache: &'c mut Cache,
2084	}
2085
2086	impl<'i, 'c> Lazy<'i, 'c> {
2087	/// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
2088	fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> {
2089	Lazy { dfa, cache }
2090	}
2091
2092	/// Return an immutable view by downgrading a writable cache to a read-only
2093	/// cache.
2094	fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> {
2095	LazyRef::new(self.dfa, self.cache)
2096	}
2097
2098	/// This is marked as 'inline(never)' to avoid bloating methods on 'DFA'
2099	/// like 'next_state' and 'next_eoi_state' that are called in critical
2100	/// areas. The idea is to let the optimizer focus on the other areas of
2101	/// those methods as the hot path.
2102	///
2103	/// Here's an example that justifies 'inline(never)'
2104	///
2105	/// ```ignore
2106	/// regex-cli find match hybrid \
2107	/// --cache-capacity `100000000` \
2108	/// -p '\pL{100}'
2109	/// all-codepoints-utf8-`100x`
2110	/// ```
2111	///
2112	/// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
2113	/// codepoint, in sequence, repeated 100 times.
2114	///
2115	/// With 'inline(never)' hyperfine reports 1.1s per run. With
2116	/// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement.
2117	#[cold]
2118	#[inline(never)]
2119	fn cache_next_state(
2120	&mut self,
2121	mut current: LazyStateID,
2122	unit: alphabet::Unit,
2123	) -> Result<LazyStateID, CacheError> {
2124	let stride2 = self.dfa.stride2();
2125	let empty_builder = self.get_state_builder();
2126	let builder = determinize::next(
2127	self.dfa.get_nfa(),
2128	self.dfa.get_config().get_match_kind(),
2129	&mut self.cache.sparses,
2130	&mut self.cache.stack,
2131	&self.cache.states[current.as_usize_untagged() >> stride2],
2132	unit,
2133	empty_builder,
2134	);
2135	let save_state = !self.as_ref().state_builder_fits_in_cache(&builder);
2136	if save_state {
2137	self.save_state(current);
2138	}
2139	let next = self.add_builder_state(builder, \|sid\| sid)?;
2140	if save_state {
2141	current = self.saved_state_id();
2142	}
2143	// This is the payoff. The next time 'next_state' is called with this
2144	// state and alphabet unit, it will find this transition and avoid
2145	// having to re-determinize this transition.
2146	self.set_transition(current, unit, next);
2147	Ok(next)
2148	}
2149
2150	/// Compute and cache the starting state for the given pattern ID (if
2151	/// present) and the starting configuration.
2152	///
2153	/// This panics if a pattern ID is given and the DFA isn't configured to
2154	/// build anchored start states for each pattern.
2155	///
2156	/// This will never return an unknown lazy state ID.
2157	///
2158	/// If caching this state would otherwise result in a cache that has been
2159	/// cleared too many times, then an error is returned.
2160	#[cold]
2161	#[inline(never)]
2162	fn cache_start_group(
2163	&mut self,
2164	anchored: Anchored,
2165	start: Start,
2166	) -> Result<LazyStateID, StartError> {
2167	let nfa_start_id = match anchored {
2168	Anchored::No => self.dfa.get_nfa().start_unanchored(),
2169	Anchored::Yes => self.dfa.get_nfa().start_anchored(),
2170	Anchored::Pattern(pid) => {
2171	if !self.dfa.get_config().get_starts_for_each_pattern() {
2172	return Err(StartError::unsupported_anchored(anchored));
2173	}
2174	match self.dfa.get_nfa().start_pattern(pid) {
2175	None => return Ok(self.as_ref().dead_id()),
2176	Some(sid) => sid,
2177	}
2178	}
2179	};
2180
2181	let id = self
2182	.cache_start_one(nfa_start_id, start)
2183	.map_err(StartError::cache)?;
2184	self.set_start_state(anchored, start, id);
2185	Ok(id)
2186	}
2187
2188	/// Compute and cache the starting state for the given NFA state ID and the
2189	/// starting configuration. The NFA state ID might be one of the following:
2190	///
2191	/// 1) An unanchored start state to match any pattern.
2192	/// 2) An anchored start state to match any pattern.
2193	/// 3) An anchored start state for a particular pattern.
2194	///
2195	/// This will never return an unknown lazy state ID.
2196	///
2197	/// If caching this state would otherwise result in a cache that has been
2198	/// cleared too many times, then an error is returned.
2199	fn cache_start_one(
2200	&mut self,
2201	nfa_start_id: NFAStateID,
2202	start: Start,
2203	) -> Result<LazyStateID, CacheError> {
2204	let mut builder_matches = self.get_state_builder().into_matches();
2205	determinize::set_lookbehind_from_start(
2206	self.dfa.get_nfa(),
2207	&start,
2208	&mut builder_matches,
2209	);
2210	self.cache.sparses.set1.clear();
2211	determinize::epsilon_closure(
2212	self.dfa.get_nfa(),
2213	nfa_start_id,
2214	builder_matches.look_have(),
2215	&mut self.cache.stack,
2216	&mut self.cache.sparses.set1,
2217	);
2218	let mut builder = builder_matches.into_nfa();
2219	determinize::add_nfa_states(
2220	&self.dfa.get_nfa(),
2221	&self.cache.sparses.set1,
2222	&mut builder,
2223	);
2224	let tag_starts = self.dfa.get_config().get_specialize_start_states();
2225	self.add_builder_state(builder, \|id\| {
2226	if tag_starts {
2227	id.to_start()
2228	} else {
2229	id
2230	}
2231	})
2232	}
2233
2234	/// Either add the given builder state to this cache, or return an ID to an
2235	/// equivalent state already in this cache.
2236	///
2237	/// In the case where no equivalent state exists, the idmap function given
2238	/// may be used to transform the identifier allocated. This is useful if
2239	/// the caller needs to tag the ID with additional information.
2240	///
2241	/// This will never return an unknown lazy state ID.
2242	///
2243	/// If caching this state would otherwise result in a cache that has been
2244	/// cleared too many times, then an error is returned.
2245	fn add_builder_state(
2246	&mut self,
2247	builder: StateBuilderNFA,
2248	idmap: impl Fn(LazyStateID) -> LazyStateID,
2249	) -> Result<LazyStateID, CacheError> {
2250	if let Some(&cached_id) =
2251	self.cache.states_to_id.get(builder.as_bytes())
2252	{
2253	// Since we have a cached state, put the constructed state's
2254	// memory back into our scratch space, so that it can be reused.
2255	self.put_state_builder(builder);
2256	return Ok(cached_id);
2257	}
2258	let result = self.add_state(builder.to_state(), idmap);
2259	self.put_state_builder(builder);
2260	result
2261	}
2262
2263	/// Allocate a new state ID and add the given state to this cache.
2264	///
2265	/// The idmap function given may be used to transform the identifier
2266	/// allocated. This is useful if the caller needs to tag the ID with
2267	/// additional information.
2268	///
2269	/// This will never return an unknown lazy state ID.
2270	///
2271	/// If caching this state would otherwise result in a cache that has been
2272	/// cleared too many times, then an error is returned.
2273	fn add_state(
2274	&mut self,
2275	state: State,
2276	idmap: impl Fn(LazyStateID) -> LazyStateID,
2277	) -> Result<LazyStateID, CacheError> {
2278	if !self.as_ref().state_fits_in_cache(&state) {
2279	self.try_clear_cache()?;
2280	}
2281	// It's important for this to come second, since the above may clear
2282	// the cache. If we clear the cache after ID generation, then the ID
2283	// is likely bunk since it would have been generated based on a larger
2284	// transition table.
2285	let mut id = idmap(self.next_state_id()?);
2286	if state.is_match() {
2287	id = id.to_match();
2288	}
2289	// Add room in the transition table. Since this is a fresh state, all
2290	// of its transitions are unknown.
2291	self.cache.trans.extend(
2292	iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()),
2293	);
2294	// When we add a sentinel state, we never want to set any quit
2295	// transitions. Technically, this is harmless, since sentinel states
2296	// have all of their transitions set to loop back to themselves. But
2297	// when creating sentinel states before the quit sentinel state,
2298	// this will try to call 'set_transition' on a state ID that doesn't
2299	// actually exist yet, which isn't allowed. So we just skip doing so
2300	// entirely.
2301	if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) {
2302	let quit_id = self.as_ref().quit_id();
2303	for b in self.dfa.quitset.iter() {
2304	self.set_transition(id, alphabet::Unit::u8(b), quit_id);
2305	}
2306	}
2307	self.cache.memory_usage_state += state.memory_usage();
2308	self.cache.states.push(state.clone());
2309	self.cache.states_to_id.insert(state, id);
2310	Ok(id)
2311	}
2312
2313	/// Allocate a new state ID.
2314	///
2315	/// This will never return an unknown lazy state ID.
2316	///
2317	/// If caching this state would otherwise result in a cache that has been
2318	/// cleared too many times, then an error is returned.
2319	fn next_state_id(&mut self) -> Result<LazyStateID, CacheError> {
2320	let sid = match LazyStateID::new(self.cache.trans.len()) {
2321	Ok(sid) => sid,
2322	Err(_) => {
2323	self.try_clear_cache()?;
2324	// This has to pass since we check that ID capacity at
2325	// construction time can fit at least MIN_STATES states.
2326	LazyStateID::new(self.cache.trans.len()).unwrap()
2327	}
2328	};
2329	Ok(sid)
2330	}
2331
2332	/// Attempt to clear the cache used by this lazy DFA.
2333	///
2334	/// If clearing the cache exceeds the minimum number of required cache
2335	/// clearings, then this will return a cache error. In this case,
2336	/// callers should bubble this up as the cache can't be used until it is
2337	/// reset. Implementations of search should convert this error into a
2338	/// [`MatchError::gave_up`].
2339	///
2340	/// If 'self.state_saver' is set to save a state, then this state is
2341	/// persisted through cache clearing. Otherwise, the cache is returned to
2342	/// its state after initialization with two exceptions: its clear count
2343	/// is incremented and some of its memory likely has additional capacity.
2344	/// That is, clearing a cache does _not_ release memory.
2345	///
2346	/// Otherwise, any lazy state ID generated by the cache prior to resetting
2347	/// it is invalid after the reset.
2348	fn try_clear_cache(&mut self) -> Result<(), CacheError> {
2349	let c = self.dfa.get_config();
2350	if let Some(min_count) = c.get_minimum_cache_clear_count() {
2351	if self.cache.clear_count >= min_count {
2352	if let Some(min_bytes_per) = c.get_minimum_bytes_per_state() {
2353	let len = self.cache.search_total_len();
2354	let min_bytes =
2355	min_bytes_per.saturating_mul(self.cache.states.len());
2356	// If we've searched 0 bytes then probably something has
2357	// gone wrong and the lazy DFA search implementation isn't
2358	// correctly updating the search progress state.
2359	if len == `0` {
2360	trace!(
2361	"number of bytes searched is 0, but \
2362	a minimum bytes per state searched ({}) is \
2363	enabled, maybe Cache::search_update \
2364	is not being used?",
2365	min_bytes_per,
2366	);
2367	}
2368	if len < min_bytes {
2369	trace!(
2370	"lazy DFA cache has been cleared {} times, \
2371	which exceeds the limit of {}, \
2372	AND its bytes searched per state is less \
2373	than the configured minimum of {}, \
2374	therefore lazy DFA is giving up \
2375	(bytes searched since cache clear = {}, \
2376	number of states = {})",
2377	self.cache.clear_count,
2378	min_count,
2379	min_bytes_per,
2380	len,
2381	self.cache.states.len(),
2382	);
2383	return Err(CacheError::bad_efficiency());
2384	} else {
2385	trace!(
2386	"lazy DFA cache has been cleared {} times, \
2387	which exceeds the limit of {}, \
2388	AND its bytes searched per state is greater \
2389	than the configured minimum of {}, \
2390	therefore lazy DFA is continuing! \
2391	(bytes searched since cache clear = {}, \
2392	number of states = {})",
2393	self.cache.clear_count,
2394	min_count,
2395	min_bytes_per,
2396	len,
2397	self.cache.states.len(),
2398	);
2399	}
2400	} else {
2401	trace!(
2402	"lazy DFA cache has been cleared {} times, \
2403	which exceeds the limit of {}, \
2404	since there is no configured bytes per state \
2405	minimum, lazy DFA is giving up",
2406	self.cache.clear_count,
2407	min_count,
2408	);
2409	return Err(CacheError::too_many_cache_clears());
2410	}
2411	}
2412	}
2413	self.clear_cache();
2414	Ok(())
2415	}
2416
2417	/// Clears _and_ resets the cache. Resetting the cache means that no
2418	/// states are persisted and the clear count is reset to 0. No heap memory
2419	/// is released.
2420	///
2421	/// Note that the caller may reset a cache with a different DFA than what
2422	/// it was created from. In which case, the cache can now be used with the
2423	/// new DFA (and not the old DFA).
2424	fn reset_cache(&mut self) {
2425	self.cache.state_saver = StateSaver::none();
2426	self.clear_cache();
2427	// If a new DFA is used, it might have a different number of NFA
2428	// states, so we need to make sure our sparse sets have the appropriate
2429	// size.
2430	self.cache.sparses.resize(self.dfa.get_nfa().states().len());
2431	self.cache.clear_count = `0`;
2432	self.cache.progress = None;
2433	}
2434
2435	/// Clear the cache used by this lazy DFA.
2436	///
2437	/// If 'self.state_saver' is set to save a state, then this state is
2438	/// persisted through cache clearing. Otherwise, the cache is returned to
2439	/// its state after initialization with two exceptions: its clear count
2440	/// is incremented and some of its memory likely has additional capacity.
2441	/// That is, clearing a cache does _not_ release memory.
2442	///
2443	/// Otherwise, any lazy state ID generated by the cache prior to resetting
2444	/// it is invalid after the reset.
2445	fn clear_cache(&mut self) {
2446	self.cache.trans.clear();
2447	self.cache.starts.clear();
2448	self.cache.states.clear();
2449	self.cache.states_to_id.clear();
2450	self.cache.memory_usage_state = `0`;
2451	self.cache.clear_count += `1`;
2452	self.cache.bytes_searched = `0`;
2453	if let Some(ref mut progress) = self.cache.progress {
2454	progress.start = progress.at;
2455	}
2456	trace!(
2457	"lazy DFA cache has been cleared (count: {})",
2458	self.cache.clear_count
2459	);
2460	self.init_cache();
2461	// If the state we want to save is one of the sentinel
2462	// (unknown/dead/quit) states, then 'init_cache' adds those back, and
2463	// their identifier values remains invariant. So there's no need to add
2464	// it again. (And indeed, doing so would be incorrect!)
2465	if let Some((old_id, state)) = self.cache.state_saver.take_to_save() {
2466	// If the state is one of the special sentinel states, then it is
2467	// automatically added by cache initialization and its ID always
2468	// remains the same. With that said, this should never occur since
2469	// the sentinel states are all loop states back to themselves. So
2470	// we should never be in a position where we're attempting to save
2471	// a sentinel state since we never compute transitions out of a
2472	// sentinel state.
2473	assert!(
2474	!self.as_ref().is_sentinel(old_id),
2475	"cannot save sentinel state"
2476	);
2477	let new_id = self
2478	.add_state(state, \|id\| {
2479	if old_id.is_start() {
2480	// We don't need to consult the
2481	// 'specialize_start_states' config knob here, because
2482	// if it's disabled, old_id.is_start() will never
2483	// return true.
2484	id.to_start()
2485	} else {
2486	id
2487	}
2488	})
2489	// The unwrap here is OK because lazy DFA creation ensures that
2490	// we have room in the cache to add MIN_STATES states. Since
2491	// 'init_cache' above adds 3, this adds a 4th.
2492	.expect("adding one state after cache clear must work");
2493	self.cache.state_saver = StateSaver::Saved(new_id);
2494	}
2495	}
2496
2497	/// Initialize this cache from emptiness to a place where it can be used
2498	/// for search.
2499	///
2500	/// This is called both at cache creation time and after the cache has been
2501	/// cleared.
2502	///
2503	/// Primarily, this adds the three sentinel states and allocates some
2504	/// initial memory.
2505	fn init_cache(&mut self) {
2506	// Why multiply by 2 here? Because we make room for both the unanchored
2507	// and anchored start states. Unanchored is first and then anchored.
2508	let mut starts_len = Start::len().checked_mul(`2`).unwrap();
2509	// ... but if we also want start states for every pattern, we make room
2510	// for that too.
2511	if self.dfa.get_config().get_starts_for_each_pattern() {
2512	starts_len += Start::len() * self.dfa.pattern_len();
2513	}
2514	self.cache
2515	.starts
2516	.extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len));
2517	// This is the set of NFA states that corresponds to each of our three
2518	// sentinel states: the empty set.
2519	let dead = State::dead();
2520	// This sets up some states that we use as sentinels that are present
2521	// in every DFA. While it would be technically possible to implement
2522	// this DFA without explicitly putting these states in the transition
2523	// table, this is convenient to do to make `next_state` correct for all
2524	// valid state IDs without needing explicit conditionals to special
2525	// case these sentinel states.
2526	//
2527	// All three of these states are "dead" states. That is, all of
2528	// them transition only to themselves. So once you enter one of
2529	// these states, it's impossible to leave them. Thus, any correct
2530	// search routine must explicitly check for these state types. (Sans
2531	// `unknown`, since that is only used internally to represent missing
2532	// states.)
2533	let unk_id =
2534	self.add_state(dead.clone(), \|id\| id.to_unknown()).unwrap();
2535	let dead_id = self.add_state(dead.clone(), \|id\| id.to_dead()).unwrap();
2536	let quit_id = self.add_state(dead.clone(), \|id\| id.to_quit()).unwrap();
2537	assert_eq!(unk_id, self.as_ref().unknown_id());
2538	assert_eq!(dead_id, self.as_ref().dead_id());
2539	assert_eq!(quit_id, self.as_ref().quit_id());
2540	// The idea here is that if you start in an unknown/dead/quit state and
2541	// try to transition on them, then you should end up where you started.
2542	self.set_all_transitions(unk_id, unk_id);
2543	self.set_all_transitions(dead_id, dead_id);
2544	self.set_all_transitions(quit_id, quit_id);
2545	// All of these states are technically equivalent from the FSM
2546	// perspective, so putting all three of them in the cache isn't
2547	// possible. (They are distinct merely because we use their
2548	// identifiers as sentinels to mean something, as indicated by the
2549	// names.) Moreover, we wouldn't want to do that. Unknown and quit
2550	// states are special in that they are artificial constructions
2551	// this implementation. But dead states are a natural part of
2552	// determinization. When you reach a point in the NFA where you cannot
2553	// go anywhere else, a dead state will naturally arise and we MUST
2554	// reuse the canonical dead state that we've created here. Why? Because
2555	// it is the state ID that tells the search routine whether a state is
2556	// dead or not, and thus, whether to stop the search. Having a bunch of
2557	// distinct dead states would be quite wasteful!
2558	self.cache.states_to_id.insert(dead, dead_id);
2559	}
2560
2561	/// Save the state corresponding to the ID given such that the state
2562	/// persists through a cache clearing.
2563	///
2564	/// While the state may persist, the ID may not. In order to discover the
2565	/// new state ID, one must call 'saved_state_id' after a cache clearing.
2566	fn save_state(&mut self, id: LazyStateID) {
2567	let state = self.as_ref().get_cached_state(id).clone();
2568	self.cache.state_saver = StateSaver::ToSave { id, state };
2569	}
2570
2571	/// Returns the updated lazy state ID for a state that was persisted
2572	/// through a cache clearing.
2573	///
2574	/// It is only correct to call this routine when both a state has been
2575	/// saved and the cache has just been cleared. Otherwise, this panics.
2576	fn saved_state_id(&mut self) -> LazyStateID {
2577	self.cache
2578	.state_saver
2579	.take_saved()
2580	.expect("state saver does not have saved state ID")
2581	}
2582
2583	/// Set all transitions on the state 'from' to 'to'.
2584	fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) {
2585	for unit in self.dfa.classes.representatives(..) {
2586	self.set_transition(from, unit, to);
2587	}
2588	}
2589
2590	/// Set the transition on 'from' for 'unit' to 'to'.
2591	///
2592	/// This panics if either 'from' or 'to' is invalid.
2593	///
2594	/// All unit values are OK.
2595	fn set_transition(
2596	&mut self,
2597	from: LazyStateID,
2598	unit: alphabet::Unit,
2599	to: LazyStateID,
2600	) {
2601	assert!(self.as_ref().is_valid(from), "invalid 'from' id: {:?}", from);
2602	assert!(self.as_ref().is_valid(to), "invalid 'to' id: {:?}", to);
2603	let offset =
2604	from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit);
2605	self.cache.trans[offset] = to;
2606	}
2607
2608	/// Set the start ID for the given pattern ID (if given) and starting
2609	/// configuration to the ID given.
2610	///
2611	/// This panics if 'id' is not valid or if a pattern ID is given and
2612	/// 'starts_for_each_pattern' is not enabled.
2613	fn set_start_state(
2614	&mut self,
2615	anchored: Anchored,
2616	start: Start,
2617	id: LazyStateID,
2618	) {
2619	assert!(self.as_ref().is_valid(id));
2620	let start_index = start.as_usize();
2621	let index = match anchored {
2622	Anchored::No => start_index,
2623	Anchored::Yes => Start::len() + start_index,
2624	Anchored::Pattern(pid) => {
2625	assert!(
2626	self.dfa.get_config().get_starts_for_each_pattern(),
2627	"attempted to search for a specific pattern \
2628	without enabling starts_for_each_pattern",
2629	);
2630	let pid = pid.as_usize();
2631	(`2` * Start::len()) + (Start::len() * pid) + start_index
2632	}
2633	};
2634	self.cache.starts[index] = id;
2635	}
2636
2637	/// Returns a state builder from this DFA that might have existing
2638	/// capacity. This helps avoid allocs in cases where a state is built that
2639	/// turns out to already be cached.
2640	///
2641	/// Callers must put the state builder back with 'put_state_builder',
2642	/// otherwise the allocation reuse won't work.
2643	fn get_state_builder(&mut self) -> StateBuilderEmpty {
2644	core::mem::replace(
2645	&mut self.cache.scratch_state_builder,
2646	StateBuilderEmpty::new(),
2647	)
2648	}
2649
2650	/// Puts the given state builder back into this DFA for reuse.
2651	///
2652	/// Note that building a 'State' from a builder always creates a new alloc,
2653	/// so callers should always put the builder back.
2654	fn put_state_builder(&mut self, builder: StateBuilderNFA) {
2655	let _ = core::mem::replace(
2656	&mut self.cache.scratch_state_builder,
2657	builder.clear(),
2658	);
2659	}
2660	}
2661
2662	/// A type that groups methods that require the base NFA/DFA and read-only
2663	/// access to the cache.
2664	#[derive(Debug)]
2665	struct LazyRef<'i, 'c> {
2666	dfa: &'i DFA,
2667	cache: &'c Cache,
2668	}
2669
2670	impl<'i, 'c> LazyRef<'i, 'c> {
2671	/// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
2672	fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> {
2673	LazyRef { dfa, cache }
2674	}
2675
2676	/// Return the ID of the start state for the given configuration.
2677	///
2678	/// If the start state has not yet been computed, then this returns an
2679	/// unknown lazy state ID.
2680	#[cfg_attr(feature = "perf-inline", inline(always))]
2681	fn get_cached_start_id(
2682	&self,
2683	anchored: Anchored,
2684	start: Start,
2685	) -> Result<LazyStateID, StartError> {
2686	let start_index = start.as_usize();
2687	let index = match anchored {
2688	Anchored::No => start_index,
2689	Anchored::Yes => Start::len() + start_index,
2690	Anchored::Pattern(pid) => {
2691	if !self.dfa.get_config().get_starts_for_each_pattern() {
2692	return Err(StartError::unsupported_anchored(anchored));
2693	}
2694	if pid.as_usize() >= self.dfa.pattern_len() {
2695	return Ok(self.dead_id());
2696	}
2697	(`2` * Start::len())
2698	+ (Start::len() * pid.as_usize())
2699	+ start_index
2700	}
2701	};
2702	Ok(self.cache.starts[index])
2703	}
2704
2705	/// Return the cached NFA/DFA powerset state for the given ID.
2706	///
2707	/// This panics if the given ID does not address a valid state.
2708	fn get_cached_state(&self, sid: LazyStateID) -> &State {
2709	let index = sid.as_usize_untagged() >> self.dfa.stride2();
2710	&self.cache.states[index]
2711	}
2712
2713	/// Returns true if and only if the given ID corresponds to a "sentinel"
2714	/// state.
2715	///
2716	/// A sentinel state is a state that signifies a special condition of
2717	/// search, and where every transition maps back to itself. See LazyStateID
2718	/// for more details. Note that start and match states are _not_ sentinels
2719	/// since they may otherwise be real states with non-trivial transitions.
2720	/// The purposes of sentinel states is purely to indicate something. Their
2721	/// transitions are not meant to be followed.
2722	fn is_sentinel(&self, id: LazyStateID) -> bool {
2723	id == self.unknown_id() \|\| id == self.dead_id() \|\| id == self.quit_id()
2724	}
2725
2726	/// Returns the ID of the unknown state for this lazy DFA.
2727	fn unknown_id(&self) -> LazyStateID {
2728	// This unwrap is OK since 0 is always a valid state ID.
2729	LazyStateID::new(`0`).unwrap().to_unknown()
2730	}
2731
2732	/// Returns the ID of the dead state for this lazy DFA.
2733	fn dead_id(&self) -> LazyStateID {
2734	// This unwrap is OK since the maximum value here is 1 512 = 512,*
2735	// which is <= 2047 (the maximum state ID on 16-bit systems). Where
2736	// 512 is the worst case for our equivalence classes (every byte is a
2737	// distinct class).
2738	LazyStateID::new(`1` << self.dfa.stride2()).unwrap().to_dead()
2739	}
2740
2741	/// Returns the ID of the quit state for this lazy DFA.
2742	fn quit_id(&self) -> LazyStateID {
2743	// This unwrap is OK since the maximum value here is 2 512 = 1024,*
2744	// which is <= 2047 (the maximum state ID on 16-bit systems). Where
2745	// 512 is the worst case for our equivalence classes (every byte is a
2746	// distinct class).
2747	LazyStateID::new(`2` << self.dfa.stride2()).unwrap().to_quit()
2748	}
2749
2750	/// Returns true if and only if the given ID is valid.
2751	///
2752	/// An ID is valid if it is both a valid index into the transition table
2753	/// and is a multiple of the DFA's stride.
2754	fn is_valid(&self, id: LazyStateID) -> bool {
2755	let id = id.as_usize_untagged();
2756	id < self.cache.trans.len() && id % self.dfa.stride() == `0`
2757	}
2758
2759	/// Returns true if adding the state given would fit in this cache.
2760	fn state_fits_in_cache(&self, state: &State) -> bool {
2761	let needed = self.cache.memory_usage()
2762	+ self.memory_usage_for_one_more_state(state.memory_usage());
2763	trace!(
2764	"lazy DFA cache capacity check: {:?} ?<=? {:?}",
2765	needed,
2766	self.dfa.cache_capacity
2767	);
2768	needed <= self.dfa.cache_capacity
2769	}
2770
2771	/// Returns true if adding the state to be built by the given builder would
2772	/// fit in this cache.
2773	fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool {
2774	let needed = self.cache.memory_usage()
2775	+ self.memory_usage_for_one_more_state(state.as_bytes().len());
2776	needed <= self.dfa.cache_capacity
2777	}
2778
2779	/// Returns the additional memory usage, in bytes, required to add one more
2780	/// state to this cache. The given size should be the heap size, in bytes,
2781	/// that would be used by the new state being added.
2782	fn memory_usage_for_one_more_state(
2783	&self,
2784	state_heap_size: usize,
2785	) -> usize {
2786	const ID_SIZE: usize = size_of::<LazyStateID>();
2787	const STATE_SIZE: usize = size_of::<State>();
2788
2789	self.dfa.stride() * ID_SIZE // additional space needed in trans table
2790	+ STATE_SIZE // space in cache.states
2791	+ (STATE_SIZE + ID_SIZE) // space in cache.states_to_id
2792	+ state_heap_size // heap memory used by state itself
2793	}
2794	}
2795
2796	/// A simple type that encapsulates the saving of a state ID through a cache
2797	/// clearing.
2798	///
2799	/// A state ID can be marked for saving with ToSave, while a state ID can be
2800	/// saved itself with Saved.
2801	#[derive(Clone, Debug)]
2802	enum StateSaver {
2803	/// An empty state saver. In this case, no states (other than the special
2804	/// sentinel states) are preserved after clearing the cache.
2805	None,
2806	/// An ID of a state (and the state itself) that should be preserved after
2807	/// the lazy DFA's cache has been cleared. After clearing, the updated ID
2808	/// is stored in 'Saved' since it may have changed.
2809	ToSave { id: LazyStateID, state: State },
2810	/// An ID that of a state that has been persisted through a lazy DFA
2811	/// cache clearing. The ID recorded here corresponds to an ID that was
2812	/// once marked as ToSave. The IDs are likely not equivalent even though
2813	/// the states they point to are.
2814	Saved(LazyStateID),
2815	}
2816
2817	impl StateSaver {
2818	/// Create an empty state saver.
2819	fn none() -> StateSaver {
2820	StateSaver::None
2821	}
2822
2823	/// Replace this state saver with an empty saver, and if this saver is a
2824	/// request to save a state, return that request.
2825	fn take_to_save(&mut self) -> Option<(LazyStateID, State)> {
2826	match core::mem::replace(self, StateSaver::None) {
2827	StateSaver::None \| StateSaver::Saved(_) => None,
2828	StateSaver::ToSave { id, state } => Some((id, state)),
2829	}
2830	}
2831
2832	/// Replace this state saver with an empty saver, and if this saver is a
2833	/// saved state (or a request to save a state), return that state's ID.
2834	///
2835	/// The idea here is that a request to save a state isn't necessarily
2836	/// honored because it might not be needed. e.g., Some higher level code
2837	/// might request a state to be saved on the off chance that the cache gets
2838	/// cleared when a new state is added at a lower level. But if that new
2839	/// state is never added, then the cache is never cleared and the state and
2840	/// its ID remain unchanged.
2841	fn take_saved(&mut self) -> Option<LazyStateID> {
2842	match core::mem::replace(self, StateSaver::None) {
2843	StateSaver::None => None,
2844	StateSaver::Saved(id) \| StateSaver::ToSave { id, .. } => Some(id),
2845	}
2846	}
2847	}
2848
2849	/// The configuration used for building a lazy DFA.
2850	///
2851	/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The
2852	/// advantage of the former is that it often lets you avoid importing the
2853	/// `Config` type directly.
2854	///
2855	/// A lazy DFA configuration is a simple data object that is typically used
2856	/// with [`Builder::configure`].
2857	///
2858	/// The default configuration guarantees that a search will never return a
2859	/// "gave up" or "quit" error, although it is possible for a search to fail
2860	/// if [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by
2861	/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`].
2862	#[derive(Clone, Debug, Default)]
2863	pub struct Config {
2864	// As with other configuration types in this crate, we put all our knobs
2865	// in options so that we can distinguish between "default" and "not set."
2866	// This makes it possible to easily combine multiple configurations
2867	// without default values overwriting explicitly specified values. See the
2868	// 'overwrite' method.
2869	//
2870	// For docs on the fields below, see the corresponding method setters.
2871	match_kind: Option<MatchKind>,
2872	pre: Option<Option<Prefilter>>,
2873	starts_for_each_pattern: Option<bool>,
2874	byte_classes: Option<bool>,
2875	unicode_word_boundary: Option<bool>,
2876	quitset: Option<ByteSet>,
2877	specialize_start_states: Option<bool>,
2878	cache_capacity: Option<usize>,
2879	skip_cache_capacity_check: Option<bool>,
2880	minimum_cache_clear_count: Option<Option<usize>>,
2881	minimum_bytes_per_state: Option<Option<usize>>,
2882	}
2883
2884	impl Config {
2885	/// Return a new default lazy DFA builder configuration.
2886	pub fn new() -> Config {
2887	Config::default()
2888	}
2889
2890	/// Set the desired match semantics.
2891	///
2892	/// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
2893	/// match semantics of Perl-like regex engines. That is, when multiple
2894	/// patterns would match at the same leftmost position, the pattern that
2895	/// appears first in the concrete syntax is chosen.
2896	///
2897	/// Currently, the only other kind of match semantics supported is
2898	/// [`MatchKind::All`]. This corresponds to classical DFA construction
2899	/// where all possible matches are added to the lazy DFA.
2900	///
2901	/// Typically, `All` is used when one wants to execute an overlapping
2902	/// search and `LeftmostFirst` otherwise. In particular, it rarely makes
2903	/// sense to use `All` with the various "leftmost" find routines, since the
2904	/// leftmost routines depend on the `LeftmostFirst` automata construction
2905	/// strategy. Specifically, `LeftmostFirst` adds dead states to the
2906	/// lazy DFA as a way to terminate the search and report a match.
2907	/// `LeftmostFirst` also supports non-greedy matches using this strategy
2908	/// where as `All` does not.
2909	///
2910	/// # Example: overlapping search
2911	///
2912	/// This example shows the typical use of `MatchKind::All`, which is to
2913	/// report overlapping matches.
2914	///
2915	/// ```
2916	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
2917	/// use regex_automata::{
2918	/// hybrid::dfa::{DFA, OverlappingState},
2919	/// HalfMatch, Input, MatchKind,
2920	/// };
2921	///
2922	/// let dfa = DFA::builder()
2923	/// .configure(DFA::config().match_kind(MatchKind::All))
2924	/// .build_many(&[r"\w+$", r"\S+$"])?;
2925	/// let mut cache = dfa.create_cache();
2926	/// let haystack = "@foo";
2927	/// let mut state = OverlappingState::start();
2928	///
2929	/// let expected = Some(HalfMatch::must(`1`, `4`));
2930	/// dfa.try_search_overlapping_fwd(
2931	/// &mut cache, &Input::new(haystack), &mut state,
2932	/// )?;
2933	/// assert_eq!(expected, state.get_match());
2934	///
2935	/// // The first pattern also matches at the same position, so re-running
2936	/// // the search will yield another match. Notice also that the first
2937	/// // pattern is returned after the second. This is because the second
2938	/// // pattern begins its match before the first, is therefore an earlier
2939	/// // match and is thus reported first.
2940	/// let expected = Some(HalfMatch::must(`0`, `4`));
2941	/// dfa.try_search_overlapping_fwd(
2942	/// &mut cache, &Input::new(haystack), &mut state,
2943	/// )?;
2944	/// assert_eq!(expected, state.get_match());
2945	///
2946	/// # Ok::<(), Box<dyn std::error::Error>>(())
2947	/// ```
2948	///
2949	/// # Example: reverse automaton to find start of match
2950	///
2951	/// Another example for using `MatchKind::All` is for constructing a
2952	/// reverse automaton to find the start of a match. `All` semantics are
2953	/// used for this in order to find the longest possible match, which
2954	/// corresponds to the leftmost starting position.
2955	///
2956	/// Note that if you need the starting position then
2957	/// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this
2958	/// for you, so it's usually not necessary to do this yourself.
2959	///
2960	/// ```
2961	/// use regex_automata::{
2962	/// hybrid::dfa::DFA,
2963	/// nfa::thompson::NFA,
2964	/// Anchored, HalfMatch, Input, MatchKind,
2965	/// };
2966	///
2967	/// let input = Input::new("123foobar456");
2968	/// let pattern = r"[a-z]+r";
2969	///
2970	/// let dfa_fwd = DFA::new(pattern)?;
2971	/// let dfa_rev = DFA::builder()
2972	/// .thompson(NFA::config().reverse(`true`))
2973	/// .configure(DFA::config().match_kind(MatchKind::All))
2974	/// .build(pattern)?;
2975	/// let mut cache_fwd = dfa_fwd.create_cache();
2976	/// let mut cache_rev = dfa_rev.create_cache();
2977	///
2978	/// let expected_fwd = HalfMatch::must(`0`, `9`);
2979	/// let expected_rev = HalfMatch::must(`0`, `3`);
2980	/// let got_fwd = dfa_fwd.try_search_fwd(&mut cache_fwd, &input)?.unwrap();
2981	/// // Here we don't specify the pattern to search for since there's only
2982	/// // one pattern and we're doing a leftmost search. But if this were an
2983	/// // overlapping search, you'd need to specify the pattern that matched
2984	/// // in the forward direction. (Otherwise, you might wind up finding the
2985	/// // starting position of a match of some other pattern.) That in turn
2986	/// // requires building the reverse automaton with starts_for_each_pattern
2987	/// // enabled.
2988	/// let input = input
2989	/// .clone()
2990	/// .range(..got_fwd.offset())
2991	/// .anchored(Anchored::Yes);
2992	/// let got_rev = dfa_rev.try_search_rev(&mut cache_rev, &input)?.unwrap();
2993	/// assert_eq!(expected_fwd, got_fwd);
2994	/// assert_eq!(expected_rev, got_rev);
2995	///
2996	/// # Ok::<(), Box<dyn std::error::Error>>(())
2997	/// ```
2998	pub fn match_kind(mut self, kind: MatchKind) -> Config {
2999	self.match_kind = Some(kind);
3000	self
3001	}
3002
3003	/// Set a prefilter to be used whenever a start state is entered.
3004	///
3005	/// A [`Prefilter`] in this context is meant to accelerate searches by
3006	/// looking for literal prefixes that every match for the corresponding
3007	/// pattern (or patterns) must start with. Once a prefilter produces a
3008	/// match, the underlying search routine continues on to try and confirm
3009	/// the match.
3010	///
3011	/// Be warned that setting a prefilter does not guarantee that the search
3012	/// will be faster. While it's usually a good bet, if the prefilter
3013	/// produces a lot of false positive candidates (i.e., positions matched
3014	/// by the prefilter but not by the regex), then the overall result can
3015	/// be slower than if you had just executed the regex engine without any
3016	/// prefilters.
3017	///
3018	/// Note that unless [`Config::specialize_start_states`] has been
3019	/// explicitly set, then setting this will also enable (when `pre` is
3020	/// `Some`) or disable (when `pre` is `None`) start state specialization.
3021	/// This occurs because without start state specialization, a prefilter
3022	/// is likely to be less effective. And without a prefilter, start state
3023	/// specialization is usually pointless.
3024	///
3025	/// By default no prefilter is set.
3026	///
3027	/// # Example
3028	///
3029	/// ```
3030	/// use regex_automata::{
3031	/// hybrid::dfa::DFA,
3032	/// util::prefilter::Prefilter,
3033	/// Input, HalfMatch, MatchKind,
3034	/// };
3035	///
3036	/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
3037	/// let re = DFA::builder()
3038	/// .configure(DFA::config().prefilter(pre))
3039	/// .build(r"(foo\|bar)[a-z]+")?;
3040	/// let mut cache = re.create_cache();
3041	/// let input = Input::new("foo1 barfox bar");
3042	/// assert_eq!(
3043	/// Some(HalfMatch::must(`0`, `11`)),
3044	/// re.try_search_fwd(&mut cache, &input)?,
3045	/// );
3046	///
3047	/// # Ok::<(), Box<dyn std::error::Error>>(())
3048	/// ```
3049	///
3050	/// Be warned though that an incorrect prefilter can lead to incorrect
3051	/// results!
3052	///
3053	/// ```
3054	/// use regex_automata::{
3055	/// hybrid::dfa::DFA,
3056	/// util::prefilter::Prefilter,
3057	/// Input, HalfMatch, MatchKind,
3058	/// };
3059	///
3060	/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
3061	/// let re = DFA::builder()
3062	/// .configure(DFA::config().prefilter(pre))
3063	/// .build(r"(foo\|bar)[a-z]+")?;
3064	/// let mut cache = re.create_cache();
3065	/// let input = Input::new("foo1 barfox bar");
3066	/// assert_eq!(
3067	/// // No match reported even though there clearly is one!
3068	/// None,
3069	/// re.try_search_fwd(&mut cache, &input)?,
3070	/// );
3071	///
3072	/// # Ok::<(), Box<dyn std::error::Error>>(())
3073	/// ```
3074	pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
3075	self.pre = Some(pre);
3076	if self.specialize_start_states.is_none() {
3077	self.specialize_start_states =
3078	Some(self.get_prefilter().is_some());
3079	}
3080	self
3081	}
3082
3083	/// Whether to compile a separate start state for each pattern in the
3084	/// lazy DFA.
3085	///
3086	/// When enabled, a separate anchored* start state is added for each*
3087	/// pattern in the lazy DFA. When this start state is used, then the DFA
3088	/// will only search for matches for the pattern specified, even if there
3089	/// are other patterns in the DFA.
3090	///
3091	/// The main downside of this option is that it can potentially increase
3092	/// the size of the DFA and/or increase the time it takes to build the
3093	/// DFA at search time. However, since this is configuration for a lazy
3094	/// DFA, these states aren't actually built unless they're used. Enabling
3095	/// this isn't necessarily free, however, as it may result in higher cache
3096	/// usage.
3097	///
3098	/// There are a few reasons one might want to enable this (it's disabled
3099	/// by default):
3100	///
3101	/// 1. When looking for the start of an overlapping match (using a reverse
3102	/// DFA), doing it correctly requires starting the reverse search using the
3103	/// starting state of the pattern that matched in the forward direction.
3104	/// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it
3105	/// will automatically enable this option when building the reverse DFA
3106	/// internally.
3107	/// 2. When you want to use a DFA with multiple patterns to both search
3108	/// for matches of any pattern or to search for anchored matches of one
3109	/// particular pattern while using the same DFA. (Otherwise, you would need
3110	/// to compile a new DFA for each pattern.)
3111	///
3112	/// By default this is disabled.
3113	///
3114	/// # Example
3115	///
3116	/// This example shows how to use this option to permit the same lazy DFA
3117	/// to run both general searches for any pattern and anchored searches for
3118	/// a specific pattern.
3119	///
3120	/// ```
3121	/// use regex_automata::{
3122	/// hybrid::dfa::DFA,
3123	/// Anchored, HalfMatch, Input, PatternID,
3124	/// };
3125	///
3126	/// let dfa = DFA::builder()
3127	/// .configure(DFA::config().starts_for_each_pattern(`true`))
3128	/// .build_many(&[r"[a-z0-9]{6}", r"[a-z][a-z0-9]{5}"])?;
3129	/// let mut cache = dfa.create_cache();
3130	/// let haystack = "bar foo123";
3131	///
3132	/// // Here's a normal unanchored search that looks for any pattern.
3133	/// let expected = HalfMatch::must(`0`, `10`);
3134	/// let input = Input::new(haystack);
3135	/// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3136	/// // We can also do a normal anchored search for any pattern. Since it's
3137	/// // an anchored search, we position the start of the search where we
3138	/// // know the match will begin.
3139	/// let expected = HalfMatch::must(`0`, `10`);
3140	/// let input = Input::new(haystack).range(`4`..);
3141	/// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3142	/// // Since we compiled anchored start states for each pattern, we can
3143	/// // also look for matches of other patterns explicitly, even if a
3144	/// // different pattern would have normally matched.
3145	/// let expected = HalfMatch::must(`1`, `10`);
3146	/// let input = Input::new(haystack)
3147	/// .range(`4`..)
3148	/// .anchored(Anchored::Pattern(PatternID::must(`1`)));
3149	/// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3150	///
3151	/// # Ok::<(), Box<dyn std::error::Error>>(())
3152	/// ```
3153	pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
3154	self.starts_for_each_pattern = Some(yes);
3155	self
3156	}
3157
3158	/// Whether to attempt to shrink the size of the lazy DFA's alphabet or
3159	/// not.
3160	///
3161	/// This option is enabled by default and should never be disabled unless
3162	/// one is debugging the lazy DFA.
3163	///
3164	/// When enabled, the lazy DFA will use a map from all possible bytes
3165	/// to their corresponding equivalence class. Each equivalence class
3166	/// represents a set of bytes that does not discriminate between a match
3167	/// and a non-match in the DFA. For example, the pattern `[ab]+` has at
3168	/// least two equivalence classes: a set containing `a` and `b` and a set
3169	/// containing every byte except for `a` and `b`. `a` and `b` are in the
3170	/// same equivalence classes because they never discriminate between a
3171	/// match and a non-match.
3172	///
3173	/// The advantage of this map is that the size of the transition table
3174	/// can be reduced drastically from `#states 256 * sizeof(LazyStateID)`*
3175	/// to `#states k * sizeof(LazyStateID)` where `k` is the number of*
3176	/// equivalence classes (rounded up to the nearest power of 2). As a
3177	/// result, total space usage can decrease substantially. Moreover, since a
3178	/// smaller alphabet is used, DFA compilation during search becomes faster
3179	/// as well since it will potentially be able to reuse a single transition
3180	/// for multiple bytes.
3181	///
3182	/// WARNING:* This is only useful for debugging lazy DFAs. Disabling*
3183	/// this does not yield any speed advantages. Namely, even when this is
3184	/// disabled, a byte class map is still used while searching. The only
3185	/// difference is that every byte will be forced into its own distinct
3186	/// equivalence class. This is useful for debugging the actual generated
3187	/// transitions because it lets one see the transitions defined on actual
3188	/// bytes instead of the equivalence classes.
3189	pub fn byte_classes(mut self, yes: bool) -> Config {
3190	self.byte_classes = Some(yes);
3191	self
3192	}
3193
3194	/// Heuristically enable Unicode word boundaries.
3195	///
3196	/// When set, this will attempt to implement Unicode word boundaries as if
3197	/// they were ASCII word boundaries. This only works when the search input
3198	/// is ASCII only. If a non-ASCII byte is observed while searching, then a
3199	/// [`MatchError::quit`] error is returned.
3200	///
3201	/// A possible alternative to enabling this option is to simply use an
3202	/// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
3203	/// option is if you absolutely need Unicode support. This option lets one
3204	/// use a fast search implementation (a DFA) for some potentially very
3205	/// common cases, while providing the option to fall back to some other
3206	/// regex engine to handle the general case when an error is returned.
3207	///
3208	/// If the pattern provided has no Unicode word boundary in it, then this
3209	/// option has no effect. (That is, quitting on a non-ASCII byte only
3210	/// occurs when this option is enabled _and_ a Unicode word boundary is
3211	/// present in the pattern.)
3212	///
3213	/// This is almost equivalent to setting all non-ASCII bytes to be quit
3214	/// bytes. The only difference is that this will cause non-ASCII bytes to
3215	/// be quit bytes _only_ when a Unicode word boundary is present in the
3216	/// pattern.
3217	///
3218	/// When enabling this option, callers _must_ be prepared to
3219	/// handle a [`MatchError`] error during search. When using a
3220	/// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
3221	/// `try_` suite of methods. Alternatively, if callers can guarantee that
3222	/// their input is ASCII only, then a [`MatchError::quit`] error will never
3223	/// be returned while searching.
3224	///
3225	/// This is disabled by default.
3226	///
3227	/// # Example
3228	///
3229	/// This example shows how to heuristically enable Unicode word boundaries
3230	/// in a pattern. It also shows what happens when a search comes across a
3231	/// non-ASCII byte.
3232	///
3233	/// ```
3234	/// use regex_automata::{
3235	/// hybrid::dfa::DFA,
3236	/// HalfMatch, Input, MatchError,
3237	/// };
3238	///
3239	/// let dfa = DFA::builder()
3240	/// .configure(DFA::config().unicode_word_boundary(`true`))
3241	/// .build(r"\b[0-9]+\b")?;
3242	/// let mut cache = dfa.create_cache();
3243	///
3244	/// // The match occurs before the search ever observes the snowman
3245	/// // character, so no error occurs.
3246	/// let haystack = "foo 123 ☃";
3247	/// let expected = Some(HalfMatch::must(`0`, `7`));
3248	/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
3249	/// assert_eq!(expected, got);
3250	///
3251	/// // Notice that this search fails, even though the snowman character
3252	/// // occurs after the ending match offset. This is because search
3253	/// // routines read one byte past the end of the search to account for
3254	/// // look-around, and indeed, this is required here to determine whether
3255	/// // the trailing \b matches.
3256	/// let haystack = "foo 123 ☃";
3257	/// let expected = MatchError::quit(`0xE2`, `8`);
3258	/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack));
3259	/// assert_eq!(Err(expected), got);
3260	///
3261	/// // Another example is executing a search where the span of the haystack
3262	/// // we specify is all ASCII, but there is non-ASCII just before it. This
3263	/// // correctly also reports an error.
3264	/// let input = Input::new("β123").range(`2`..);
3265	/// let expected = MatchError::quit(`0xB2`, `1`);
3266	/// let got = dfa.try_search_fwd(&mut cache, &input);
3267	/// assert_eq!(Err(expected), got);
3268	///
3269	/// // And similarly for the trailing word boundary.
3270	/// let input = Input::new("123β").range(..`3`);
3271	/// let expected = MatchError::quit(`0xCE`, `3`);
3272	/// let got = dfa.try_search_fwd(&mut cache, &input);
3273	/// assert_eq!(Err(expected), got);
3274	///
3275	/// # Ok::<(), Box<dyn std::error::Error>>(())
3276	/// ```
3277	pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
3278	// We have a separate option for this instead of just setting the
3279	// appropriate quit bytes here because we don't want to set quit bytes
3280	// for every regex. We only want to set them when the regex contains a
3281	// Unicode word boundary.
3282	self.unicode_word_boundary = Some(yes);
3283	self
3284	}
3285
3286	/// Add a "quit" byte to the lazy DFA.
3287	///
3288	/// When a quit byte is seen during search time, then search will return a
3289	/// [`MatchError::quit`] error indicating the offset at which the search
3290	/// stopped.
3291	///
3292	/// A quit byte will always overrule any other aspects of a regex. For
3293	/// example, if the `x` byte is added as a quit byte and the regex `\w` is
3294	/// used, then observing `x` will cause the search to quit immediately
3295	/// despite the fact that `x` is in the `\w` class.
3296	///
3297	/// This mechanism is primarily useful for heuristically enabling certain
3298	/// features like Unicode word boundaries in a DFA. Namely, if the input
3299	/// to search is ASCII, then a Unicode word boundary can be implemented
3300	/// via an ASCII word boundary with no change in semantics. Thus, a DFA
3301	/// can attempt to match a Unicode word boundary but give up as soon as it
3302	/// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
3303	/// to be quit bytes, then Unicode word boundaries will be permitted when
3304	/// building lazy DFAs. Of course, callers should enable
3305	/// [`Config::unicode_word_boundary`] if they want this behavior instead.
3306	/// (The advantage being that non-ASCII quit bytes will only be added if a
3307	/// Unicode word boundary is in the pattern.)
3308	///
3309	/// When enabling this option, callers _must_ be prepared to
3310	/// handle a [`MatchError`] error during search. When using a
3311	/// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
3312	/// `try_` suite of methods.
3313	///
3314	/// By default, there are no quit bytes set.
3315	///
3316	/// # Panics
3317	///
3318	/// This panics if heuristic Unicode word boundaries are enabled and any
3319	/// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
3320	/// Unicode word boundaries requires setting every non-ASCII byte to a quit
3321	/// byte. So if the caller attempts to undo any of that, then this will
3322	/// panic.
3323	///
3324	/// # Example
3325	///
3326	/// This example shows how to cause a search to terminate if it sees a
3327	/// `\n` byte. This could be useful if, for example, you wanted to prevent
3328	/// a user supplied pattern from matching across a line boundary.
3329	///
3330	/// ```
3331	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
3332	/// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3333	///
3334	/// let dfa = DFA::builder()
3335	/// .configure(DFA::config().quit(b'`\n`', `true`))
3336	/// .build(r"foo\p{any}+bar")?;
3337	/// let mut cache = dfa.create_cache();
3338	///
3339	/// let haystack = "foo`\n`bar";
3340	/// // Normally this would produce a match, since \p{any} contains '\n'.
3341	/// // But since we instructed the automaton to enter a quit state if a
3342	/// // '\n' is observed, this produces a match error instead.
3343	/// let expected = MatchError::quit(b'`\n`', `3`);
3344	/// let got = dfa.try_search_fwd(
3345	/// &mut cache,
3346	/// &Input::new(haystack),
3347	/// ).unwrap_err();
3348	/// assert_eq!(expected, got);
3349	///
3350	/// # Ok::<(), Box<dyn std::error::Error>>(())
3351	/// ```
3352	pub fn quit(mut self, byte: u8, yes: bool) -> Config {
3353	if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
3354	panic!(
3355	"cannot set non-ASCII byte to be non-quit when \
3356	Unicode word boundaries are enabled"
3357	);
3358	}
3359	if self.quitset.is_none() {
3360	self.quitset = Some(ByteSet::empty());
3361	}
3362	if yes {
3363	self.quitset.as_mut().unwrap().add(byte);
3364	} else {
3365	self.quitset.as_mut().unwrap().remove(byte);
3366	}
3367	self
3368	}
3369
3370	/// Enable specializing start states in the lazy DFA.
3371	///
3372	/// When start states are specialized, an implementor of a search routine
3373	/// using a lazy DFA can tell when the search has entered a starting state.
3374	/// When start states aren't specialized, then it is impossible to know
3375	/// whether the search has entered a start state.
3376	///
3377	/// Ideally, this option wouldn't need to exist and we could always
3378	/// specialize start states. The problem is that start states can be quite
3379	/// active. This in turn means that an efficient search routine is likely
3380	/// to ping-pong between a heavily optimized hot loop that handles most
3381	/// states and to a less optimized specialized handling of start states.
3382	/// This causes branches to get heavily mispredicted and overall can
3383	/// materially decrease throughput. Therefore, specializing start states
3384	/// should only be enabled when it is needed.
3385	///
3386	/// Knowing whether a search is in a start state is typically useful when a
3387	/// prefilter is active for the search. A prefilter is typically only run
3388	/// when in a start state and a prefilter can greatly accelerate a search.
3389	/// Therefore, the possible cost of specializing start states is worth it
3390	/// in this case. Otherwise, if you have no prefilter, there is likely no
3391	/// reason to specialize start states.
3392	///
3393	/// This is disabled by default, but note that it is automatically
3394	/// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless
3395	/// `specialize_start_states` has already been set, [`Config::prefilter`]
3396	/// will automatically enable or disable it based on whether a prefilter
3397	/// is present or not, respectively. This is done because a prefilter's
3398	/// effectiveness is rooted in being executed whenever the DFA is in a
3399	/// start state, and that's only possible to do when they are specialized.
3400	///
3401	/// Note that it is plausibly reasonable to _disable_ this option
3402	/// explicitly while _enabling_ a prefilter. In that case, a prefilter
3403	/// will still be run at the beginning of a search, but never again. This
3404	/// in theory could strike a good balance if you're in a situation where a
3405	/// prefilter is likely to produce many false positive candidates.
3406	///
3407	/// # Example
3408	///
3409	/// This example shows how to enable start state specialization and then
3410	/// shows how to check whether a state is a start state or not.
3411	///
3412	/// ```
3413	/// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3414	///
3415	/// let dfa = DFA::builder()
3416	/// .configure(DFA::config().specialize_start_states(`true`))
3417	/// .build(r"[a-z]+")?;
3418	/// let mut cache = dfa.create_cache();
3419	///
3420	/// let haystack = "123 foobar 4567".as_bytes();
3421	/// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?;
3422	/// // The ID returned by 'start_state_forward' will always be tagged as
3423	/// // a start state when start state specialization is enabled.
3424	/// assert!(sid.is_tagged());
3425	/// assert!(sid.is_start());
3426	///
3427	/// # Ok::<(), Box<dyn std::error::Error>>(())
3428	/// ```
3429	///
3430	/// Compare the above with the default lazy DFA configuration where
3431	/// start states are _not_ specialized. In this case, the start state
3432	/// is not tagged and `sid.is_start()` returns false.
3433	///
3434	/// ```
3435	/// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3436	///
3437	/// let dfa = DFA::new(r"[a-z]+")?;
3438	/// let mut cache = dfa.create_cache();
3439	///
3440	/// let haystack = "123 foobar 4567".as_bytes();
3441	/// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?;
3442	/// // Start states are not tagged in the default configuration!
3443	/// assert!(!sid.is_tagged());
3444	/// assert!(!sid.is_start());
3445	///
3446	/// # Ok::<(), Box<dyn std::error::Error>>(())
3447	/// ```
3448	pub fn specialize_start_states(mut self, yes: bool) -> Config {
3449	self.specialize_start_states = Some(yes);
3450	self
3451	}
3452
3453	/// Sets the maximum amount of heap memory, in bytes, to allocate to the
3454	/// cache for use during a lazy DFA search. If the lazy DFA would otherwise
3455	/// use more heap memory, then, depending on other configuration knobs,
3456	/// either stop the search and return an error or clear the cache and
3457	/// continue the search.
3458	///
3459	/// The default cache capacity is some "reasonable" number that will
3460	/// accommodate most regular expressions. You may find that if you need
3461	/// to build a large DFA then it may be necessary to increase the cache
3462	/// capacity.
3463	///
3464	/// Note that while building a lazy DFA will do a "minimum" check to ensure
3465	/// the capacity is big enough, this is more or less about correctness.
3466	/// If the cache is bigger than the minimum but still "too small," then the
3467	/// lazy DFA could wind up spending a lot of time clearing the cache and
3468	/// recomputing transitions, thus negating the performance benefits of a
3469	/// lazy DFA. Thus, setting the cache capacity is mostly an experimental
3470	/// endeavor. For most common patterns, however, the default should be
3471	/// sufficient.
3472	///
3473	/// For more details on how the lazy DFA's cache is used, see the
3474	/// documentation for [`Cache`].
3475	///
3476	/// # Example
3477	///
3478	/// This example shows what happens if the configured cache capacity is
3479	/// too small. In such cases, one can override the cache capacity to make
3480	/// it bigger. Alternatively, one might want to use less memory by setting
3481	/// a smaller cache capacity.
3482	///
3483	/// ```
3484	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
3485	/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
3486	///
3487	/// let pattern = r"\p{L}{1000}";
3488	///
3489	/// // The default cache capacity is likely too small to deal with regexes
3490	/// // that are very large. Large repetitions of large Unicode character
3491	/// // classes are a common way to make very large regexes.
3492	/// let _ = DFA::new(pattern).unwrap_err();
3493	/// // Bump up the capacity to something bigger.
3494	/// let dfa = DFA::builder()
3495	/// .configure(DFA::config().cache_capacity(`100` * (`1`<<`20`))) // 100 MB
3496	/// .build(pattern)?;
3497	/// let mut cache = dfa.create_cache();
3498	///
3499	/// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(`50`);
3500	/// let expected = Some(HalfMatch::must(`0`, `2000`));
3501	/// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
3502	/// assert_eq!(expected, got);
3503	///
3504	/// # Ok::<(), Box<dyn std::error::Error>>(())
3505	/// ```
3506	pub fn cache_capacity(mut self, bytes: usize) -> Config {
3507	self.cache_capacity = Some(bytes);
3508	self
3509	}
3510
3511	/// Configures construction of a lazy DFA to use the minimum cache capacity
3512	/// if the configured capacity is otherwise too small for the provided NFA.
3513	///
3514	/// This is useful if you never want lazy DFA construction to fail because
3515	/// of a capacity that is too small.
3516	///
3517	/// In general, this option is typically not a good idea. In particular,
3518	/// while a minimum cache capacity does permit the lazy DFA to function
3519	/// where it otherwise couldn't, it's plausible that it may not function
3520	/// well if it's constantly running out of room. In that case, the speed
3521	/// advantages of the lazy DFA may be negated. On the other hand, the
3522	/// "minimum" cache capacity computed may not be completely accurate and
3523	/// could actually be bigger than what is really necessary. Therefore, it
3524	/// is plausible that using the minimum cache capacity could still result
3525	/// in very good performance.
3526	///
3527	/// This is disabled by default.
3528	///
3529	/// # Example
3530	///
3531	/// This example shows what happens if the configured cache capacity is
3532	/// too small. In such cases, one could override the capacity explicitly.
3533	/// An alternative, demonstrated here, let's us force construction to use
3534	/// the minimum cache capacity if the configured capacity is otherwise
3535	/// too small.
3536	///
3537	/// ```
3538	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
3539	/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
3540	///
3541	/// let pattern = r"\p{L}{1000}";
3542	///
3543	/// // The default cache capacity is likely too small to deal with regexes
3544	/// // that are very large. Large repetitions of large Unicode character
3545	/// // classes are a common way to make very large regexes.
3546	/// let _ = DFA::new(pattern).unwrap_err();
3547	/// // Configure construction such it automatically selects the minimum
3548	/// // cache capacity if it would otherwise be too small.
3549	/// let dfa = DFA::builder()
3550	/// .configure(DFA::config().skip_cache_capacity_check(`true`))
3551	/// .build(pattern)?;
3552	/// let mut cache = dfa.create_cache();
3553	///
3554	/// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(`50`);
3555	/// let expected = Some(HalfMatch::must(`0`, `2000`));
3556	/// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
3557	/// assert_eq!(expected, got);
3558	///
3559	/// # Ok::<(), Box<dyn std::error::Error>>(())
3560	/// ```
3561	pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config {
3562	self.skip_cache_capacity_check = Some(yes);
3563	self
3564	}
3565
3566	/// Configure a lazy DFA search to quit after a certain number of cache
3567	/// clearings.
3568	///
3569	/// When a minimum is set, then a lazy DFA search will possibly* "give*
3570	/// up" after the minimum number of cache clearings has occurred. This is
3571	/// typically useful in scenarios where callers want to detect whether the
3572	/// lazy DFA search is "efficient" or not. If the cache is cleared too many
3573	/// times, this is a good indicator that it is not efficient, and thus, the
3574	/// caller may wish to use some other regex engine.
3575	///
3576	/// Note that the number of times a cache is cleared is a property of
3577	/// the cache itself. Thus, if a cache is used in a subsequent search
3578	/// with a similarly configured lazy DFA, then it could cause the
3579	/// search to "give up" if the cache needed to be cleared, depending
3580	/// on its internal count and configured minimum. The cache clear
3581	/// count can only be reset to `0` via [`DFA::reset_cache`] (or
3582	/// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if
3583	/// you're using the `Regex` API).
3584	///
3585	/// By default, no minimum is configured. Thus, a lazy DFA search will
3586	/// never give up due to cache clearings. If you do set this option, you
3587	/// might consider also setting [`Config::minimum_bytes_per_state`] in
3588	/// order for the lazy DFA to take efficiency into account before giving
3589	/// up.
3590	///
3591	/// # Example
3592	///
3593	/// This example uses a somewhat pathological configuration to demonstrate
3594	/// the _possible_ behavior of cache clearing and how it might result
3595	/// in a search that returns an error.
3596	///
3597	/// It is important to note that the precise mechanics of how and when
3598	/// a cache gets cleared is an implementation detail.
3599	///
3600	/// ```
3601	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
3602	/// use regex_automata::{hybrid::dfa::DFA, Input, MatchError, MatchErrorKind};
3603	///
3604	/// // This is a carefully chosen regex. The idea is to pick one
3605	/// // that requires some decent number of states (hence the bounded
3606	/// // repetition). But we specifically choose to create a class with an
3607	/// // ASCII letter and a non-ASCII letter so that we can check that no new
3608	/// // states are created once the cache is full. Namely, if we fill up the
3609	/// // cache on a haystack of 'a's, then in order to match one 'β', a new
3610	/// // state will need to be created since a 'β' is encoded with multiple
3611	/// // bytes. Since there's no room for this state, the search should quit
3612	/// // at the very first position.
3613	/// let pattern = r"[aβ]{100}";
3614	/// let dfa = DFA::builder()
3615	/// .configure(
3616	/// // Configure it so that we have the minimum cache capacity
3617	/// // possible. And that if any clearings occur, the search quits.
3618	/// DFA::config()
3619	/// .skip_cache_capacity_check(`true`)
3620	/// .cache_capacity(`0`)
3621	/// .minimum_cache_clear_count(Some(`0`)),
3622	/// )
3623	/// .build(pattern)?;
3624	/// let mut cache = dfa.create_cache();
3625	///
3626	/// // Our search will give up before reaching the end!
3627	/// let haystack = "a".repeat(`101`).into_bytes();
3628	/// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3629	/// assert!(matches!(
3630	/// *result.unwrap_err().kind(),
3631	/// MatchErrorKind::GaveUp { .. },
3632	/// ));
3633	///
3634	/// // Now that we know the cache is full, if we search a haystack that we
3635	/// // know will require creating at least one new state, it should not
3636	/// // be able to make much progress.
3637	/// let haystack = "β".repeat(`101`).into_bytes();
3638	/// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3639	/// assert!(matches!(
3640	/// *result.unwrap_err().kind(),
3641	/// MatchErrorKind::GaveUp { .. },
3642	/// ));
3643	///
3644	/// // If we reset the cache, then we should be able to create more states
3645	/// // and make more progress with searching for betas.
3646	/// cache.reset(&dfa);
3647	/// let haystack = "β".repeat(`101`).into_bytes();
3648	/// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3649	/// assert!(matches!(
3650	/// *result.unwrap_err().kind(),
3651	/// MatchErrorKind::GaveUp { .. },
3652	/// ));
3653	///
3654	/// // ... switching back to ASCII still makes progress since it just needs
3655	/// // to set transitions on existing states!
3656	/// let haystack = "a".repeat(`101`).into_bytes();
3657	/// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3658	/// assert!(matches!(
3659	/// *result.unwrap_err().kind(),
3660	/// MatchErrorKind::GaveUp { .. },
3661	/// ));
3662	///
3663	/// # Ok::<(), Box<dyn std::error::Error>>(())
3664	/// ```
3665	pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config {
3666	self.minimum_cache_clear_count = Some(min);
3667	self
3668	}
3669
3670	/// Configure a lazy DFA search to quit only when its efficiency drops
3671	/// below the given minimum.
3672	///
3673	/// The efficiency of the cache is determined by the number of DFA states
3674	/// compiled per byte of haystack searched. For example, if the efficiency
3675	/// is 2, then it means the lazy DFA is creating a new DFA state after
3676	/// searching approximately 2 bytes in a haystack. Generally speaking, 2
3677	/// is quite bad and it's likely that even a slower regex engine like the
3678	/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) would be faster.
3679	///
3680	/// This has no effect if [`Config::minimum_cache_clear_count`] is not set.
3681	/// Namely, this option only kicks in when the cache has been cleared more
3682	/// than the minimum number. If no minimum is set, then the cache is simply
3683	/// cleared whenever it fills up and it is impossible for the lazy DFA to
3684	/// quit due to ineffective use of the cache.
3685	///
3686	/// In general, if one is setting [`Config::minimum_cache_clear_count`],
3687	/// then one should probably also set this knob as well. The reason is
3688	/// that the absolute number of times the cache is cleared is generally
3689	/// not a great predictor of efficiency. For example, if a new DFA state
3690	/// is created for every 1,000 bytes searched, then it wouldn't be hard
3691	/// for the cache to get cleared more than `N` times and then cause the
3692	/// lazy DFA to quit. But a new DFA state every 1,000 bytes is likely quite
3693	/// good from a performance perspective, and it's likely that the lazy
3694	/// DFA should continue searching, even if it requires clearing the cache
3695	/// occasionally.
3696	///
3697	/// Finally, note that if you're implementing your own lazy DFA search
3698	/// routine and also want this efficiency check to work correctly, then
3699	/// you'll need to use the following routines to record search progress:
3700	///
3701	/// Call* [`Cache::search_start`] at the beginning of every search.
3702	/// Call* [`Cache::search_update`] whenever [`DFA::next_state`] is
3703	/// called.
3704	/// Call* [`Cache::search_finish`] before completing a search. (It is
3705	/// not strictly necessary to call this when an error is returned, as
3706	/// `Cache::search_start` will automatically finish the previous search
3707	/// for you. But calling it where possible before returning helps improve
3708	/// the accuracy of how many bytes have actually been searched.)
3709	pub fn minimum_bytes_per_state(mut self, min: Option<usize>) -> Config {
3710	self.minimum_bytes_per_state = Some(min);
3711	self
3712	}
3713
3714	/// Returns the match semantics set in this configuration.
3715	pub fn get_match_kind(&self) -> MatchKind {
3716	self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
3717	}
3718
3719	/// Returns the prefilter set in this configuration, if one at all.
3720	pub fn get_prefilter(&self) -> Option<&Prefilter> {
3721	self.pre.as_ref().unwrap_or(&None).as_ref()
3722	}
3723
3724	/// Returns whether this configuration has enabled anchored starting states
3725	/// for every pattern in the DFA.
3726	pub fn get_starts_for_each_pattern(&self) -> bool {
3727	self.starts_for_each_pattern.unwrap_or(`false`)
3728	}
3729
3730	/// Returns whether this configuration has enabled byte classes or not.
3731	/// This is typically a debugging oriented option, as disabling it confers
3732	/// no speed benefit.
3733	pub fn get_byte_classes(&self) -> bool {
3734	self.byte_classes.unwrap_or(`true`)
3735	}
3736
3737	/// Returns whether this configuration has enabled heuristic Unicode word
3738	/// boundary support. When enabled, it is possible for a search to return
3739	/// an error.
3740	pub fn get_unicode_word_boundary(&self) -> bool {
3741	self.unicode_word_boundary.unwrap_or(`false`)
3742	}
3743
3744	/// Returns whether this configuration will instruct the lazy DFA to enter
3745	/// a quit state whenever the given byte is seen during a search. When at
3746	/// least one byte has this enabled, it is possible for a search to return
3747	/// an error.
3748	pub fn get_quit(&self, byte: u8) -> bool {
3749	self.quitset.map_or(`false`, \|q\| q.contains(byte))
3750	}
3751
3752	/// Returns whether this configuration will instruct the lazy DFA to
3753	/// "specialize" start states. When enabled, the lazy DFA will tag start
3754	/// states so that search routines using the lazy DFA can detect when
3755	/// it's in a start state and do some kind of optimization (like run a
3756	/// prefilter).
3757	pub fn get_specialize_start_states(&self) -> bool {
3758	self.specialize_start_states.unwrap_or(`false`)
3759	}
3760
3761	/// Returns the cache capacity set on this configuration.
3762	pub fn get_cache_capacity(&self) -> usize {
3763	self.cache_capacity.unwrap_or(`2` * (`1` << `20`))
3764	}
3765
3766	/// Returns whether the cache capacity check should be skipped.
3767	pub fn get_skip_cache_capacity_check(&self) -> bool {
3768	self.skip_cache_capacity_check.unwrap_or(`false`)
3769	}
3770
3771	/// Returns, if set, the minimum number of times the cache must be cleared
3772	/// before a lazy DFA search can give up. When no minimum is set, then a
3773	/// search will never quit and will always clear the cache whenever it
3774	/// fills up.
3775	pub fn get_minimum_cache_clear_count(&self) -> Option<usize> {
3776	self.minimum_cache_clear_count.unwrap_or(None)
3777	}
3778
3779	/// Returns, if set, the minimum number of bytes per state that need to be
3780	/// processed in order for the lazy DFA to keep going. If the minimum falls
3781	/// below this number (and the cache has been cleared a minimum number of
3782	/// times), then the lazy DFA will return a "gave up" error.
3783	pub fn get_minimum_bytes_per_state(&self) -> Option<usize> {
3784	self.minimum_bytes_per_state.unwrap_or(None)
3785	}
3786
3787	/// Returns the minimum lazy DFA cache capacity required for the given NFA.
3788	///
3789	/// The cache capacity required for a particular NFA may change without
3790	/// notice. Callers should not rely on it being stable.
3791	///
3792	/// This is useful for informational purposes, but can also be useful for
3793	/// other reasons. For example, if one wants to check the minimum cache
3794	/// capacity themselves or if one wants to set the capacity based on the
3795	/// minimum.
3796	///
3797	/// This may return an error if this configuration does not support all of
3798	/// the instructions used in the given NFA. For example, if the NFA has a
3799	/// Unicode word boundary but this configuration does not enable heuristic
3800	/// support for Unicode word boundaries.
3801	pub fn get_minimum_cache_capacity(
3802	&self,
3803	nfa: &thompson::NFA,
3804	) -> Result<usize, BuildError> {
3805	let quitset = self.quit_set_from_nfa(nfa)?;
3806	let classes = self.byte_classes_from_nfa(nfa, &quitset);
3807	let starts = self.get_starts_for_each_pattern();
3808	Ok(minimum_cache_capacity(nfa, &classes, starts))
3809	}
3810
3811	/// Returns the byte class map used during search from the given NFA.
3812	///
3813	/// If byte classes are disabled on this configuration, then a map is
3814	/// returned that puts each byte in its own equivalent class.
3815	fn byte_classes_from_nfa(
3816	&self,
3817	nfa: &thompson::NFA,
3818	quit: &ByteSet,
3819	) -> ByteClasses {
3820	if !self.get_byte_classes() {
3821	// The lazy DFA will always use the equivalence class map, but
3822	// enabling this option is useful for debugging. Namely, this will
3823	// cause all transitions to be defined over their actual bytes
3824	// instead of an opaque equivalence class identifier. The former is
3825	// much easier to grok as a human.
3826	ByteClasses::singletons()
3827	} else {
3828	let mut set = nfa.byte_class_set().clone();
3829	// It is important to distinguish any "quit" bytes from all other
3830	// bytes. Otherwise, a non-quit byte may end up in the same class
3831	// as a quit byte, and thus cause the DFA stop when it shouldn't.
3832	//
3833	// Test case:
3834	//
3835	// regex-cli find match hybrid --unicode-word-boundary \
3836	// -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log
3837	if !quit.is_empty() {
3838	set.add_set(&quit);
3839	}
3840	set.byte_classes()
3841	}
3842	}
3843
3844	/// Return the quit set for this configuration and the given NFA.
3845	///
3846	/// This may return an error if the NFA is incompatible with this
3847	/// configuration's quit set. For example, if the NFA has a Unicode word
3848	/// boundary and the quit set doesn't include non-ASCII bytes.
3849	fn quit_set_from_nfa(
3850	&self,
3851	nfa: &thompson::NFA,
3852	) -> Result<ByteSet, BuildError> {
3853	let mut quit = self.quitset.unwrap_or(ByteSet::empty());
3854	if nfa.look_set_any().contains_word_unicode() {
3855	if self.get_unicode_word_boundary() {
3856	for b in `0x80`..=`0xFF` {
3857	quit.add(b);
3858	}
3859	} else {
3860	// If heuristic support for Unicode word boundaries wasn't
3861	// enabled, then we can still check if our quit set is correct.
3862	// If the caller set their quit bytes in a way that causes the
3863	// DFA to quit on at least all non-ASCII bytes, then that's all
3864	// we need for heuristic support to work.
3865	if !quit.contains_range(`0x80`, `0xFF`) {
3866	return Err(
3867	BuildError::unsupported_dfa_word_boundary_unicode(),
3868	);
3869	}
3870	}
3871	}
3872	Ok(quit)
3873	}
3874
3875	/// Overwrite the default configuration such that the options in `o` are
3876	/// always used. If an option in `o` is not set, then the corresponding
3877	/// option in `self` is used. If it's not set in `self` either, then it
3878	/// remains not set.
3879	fn overwrite(&self, o: Config) -> Config {
3880	Config {
3881	match_kind: o.match_kind.or(self.match_kind),
3882	pre: o.pre.or_else(\|\| self.pre.clone()),
3883	starts_for_each_pattern: o
3884	.starts_for_each_pattern
3885	.or(self.starts_for_each_pattern),
3886	byte_classes: o.byte_classes.or(self.byte_classes),
3887	unicode_word_boundary: o
3888	.unicode_word_boundary
3889	.or(self.unicode_word_boundary),
3890	quitset: o.quitset.or(self.quitset),
3891	specialize_start_states: o
3892	.specialize_start_states
3893	.or(self.specialize_start_states),
3894	cache_capacity: o.cache_capacity.or(self.cache_capacity),
3895	skip_cache_capacity_check: o
3896	.skip_cache_capacity_check
3897	.or(self.skip_cache_capacity_check),
3898	minimum_cache_clear_count: o
3899	.minimum_cache_clear_count
3900	.or(self.minimum_cache_clear_count),
3901	minimum_bytes_per_state: o
3902	.minimum_bytes_per_state
3903	.or(self.minimum_bytes_per_state),
3904	}
3905	}
3906	}
3907
3908	/// A builder for constructing a lazy deterministic finite automaton from
3909	/// regular expressions.
3910	///
3911	/// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. The
3912	/// advantage of the former is that it often lets you avoid importing the
3913	/// `Builder` type directly.
3914	///
3915	/// This builder provides two main things:
3916	///
3917	/// 1. It provides a few different `build` routines for actually constructing
3918	/// a DFA from different kinds of inputs. The most convenient is
3919	/// [`Builder::build`], which builds a DFA directly from a pattern string. The
3920	/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
3921	/// from an NFA.
3922	/// 2. The builder permits configuring a number of things.
3923	/// [`Builder::configure`] is used with [`Config`] to configure aspects of
3924	/// the DFA and the construction process itself. [`Builder::syntax`] and
3925	/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
3926	/// construction, respectively. The syntax and thompson configurations only
3927	/// apply when building from a pattern string.
3928	///
3929	/// This builder always constructs a single* lazy DFA. As such, this builder*
3930	/// can only be used to construct regexes that either detect the presence
3931	/// of a match or find the end location of a match. A single DFA cannot
3932	/// produce both the start and end of a match. For that information, use a
3933	/// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured
3934	/// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason
3935	/// to use a DFA directly is if the end location of a match is enough for your
3936	/// use case. Namely, a `Regex` will construct two lazy DFAs instead of one,
3937	/// since a second reverse DFA is needed to find the start of a match.
3938	///
3939	/// # Example
3940	///
3941	/// This example shows how to build a lazy DFA that uses a tiny cache capacity
3942	/// and completely disables Unicode. That is:
3943	///
3944	/// Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`*
3945	/// and `\b` are ASCII-only while `.` matches any byte except for `\n`
3946	/// (instead of any UTF-8 encoding of a Unicode scalar value except for
3947	/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
3948	/// The pattern itself is permitted to match invalid UTF-8. For example,*
3949	/// things like `[^a]` that match any byte except for `a` are permitted.
3950	///
3951	/// ```
3952	/// use regex_automata::{
3953	/// hybrid::dfa::DFA,
3954	/// nfa::thompson,
3955	/// util::syntax,
3956	/// HalfMatch, Input,
3957	/// };
3958	///
3959	/// let dfa = DFA::builder()
3960	/// .configure(DFA::config().cache_capacity(`5_000`))
3961	/// .thompson(thompson::Config::new().utf8(`false`))
3962	/// .syntax(syntax::Config::new().unicode(`false`).utf8(`false`))
3963	/// .build(r"foo[^b]ar.*")?;
3964	/// let mut cache = dfa.create_cache();
3965	///
3966	/// let haystack = b"`\xFE`foo`\xFF`ar`\xE2\x98\xFF\n`";
3967	/// let expected = Some(HalfMatch::must(`0`, `10`));
3968	/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
3969	/// assert_eq!(expected, got);
3970	///
3971	/// # Ok::<(), Box<dyn std::error::Error>>(())
3972	/// ```
3973	#[derive(Clone, Debug)]
3974	pub struct Builder {
3975	config: Config,
3976	#[cfg(feature = "syntax")]
3977	thompson: thompson::Compiler,
3978	}
3979
3980	impl Builder {
3981	/// Create a new lazy DFA builder with the default configuration.
3982	pub fn new() -> Builder {
3983	Builder {
3984	config: Config::default(),
3985	#[cfg(feature = "syntax")]
3986	thompson: thompson::Compiler::new(),
3987	}
3988	}
3989
3990	/// Build a lazy DFA from the given pattern.
3991	///
3992	/// If there was a problem parsing or compiling the pattern, then an error
3993	/// is returned.
3994	#[cfg(feature = "syntax")]
3995	pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
3996	self.build_many(&[pattern])
3997	}
3998
3999	/// Build a lazy DFA from the given patterns.
4000	///
4001	/// When matches are returned, the pattern ID corresponds to the index of
4002	/// the pattern in the slice given.
4003	#[cfg(feature = "syntax")]
4004	pub fn build_many<P: AsRef<str>>(
4005	&self,
4006	patterns: &[P],
4007	) -> Result<DFA, BuildError> {
4008	let nfa = self
4009	.thompson
4010	.clone()
4011	// We can always forcefully disable captures because DFAs do not
4012	// support them.
4013	.configure(
4014	thompson::Config::new()
4015	.which_captures(thompson::WhichCaptures::None),
4016	)
4017	.build_many(patterns)
4018	.map_err(BuildError::nfa)?;
4019	self.build_from_nfa(nfa)
4020	}
4021
4022	/// Build a DFA from the given NFA.
4023	///
4024	/// Note that this requires owning a `thompson::NFA`. While this may force
4025	/// you to clone the NFA, such a clone is not a deep clone. Namely, NFAs
4026	/// are defined internally to support shared ownership such that cloning is
4027	/// very cheap.
4028	///
4029	/// # Example
4030	///
4031	/// This example shows how to build a lazy DFA if you already have an NFA
4032	/// in hand.
4033	///
4034	/// ```
4035	/// use regex_automata::{
4036	/// hybrid::dfa::DFA,
4037	/// nfa::thompson,
4038	/// HalfMatch, Input,
4039	/// };
4040	///
4041	/// let haystack = "foo123bar";
4042	///
4043	/// // This shows how to set non-default options for building an NFA.
4044	/// let nfa = thompson::Compiler::new()
4045	/// .configure(thompson::Config::new().shrink(`true`))
4046	/// .build(r"[0-9]+")?;
4047	/// let dfa = DFA::builder().build_from_nfa(nfa)?;
4048	/// let mut cache = dfa.create_cache();
4049	/// let expected = Some(HalfMatch::must(`0`, `6`));
4050	/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
4051	/// assert_eq!(expected, got);
4052	///
4053	/// # Ok::<(), Box<dyn std::error::Error>>(())
4054	/// ```
4055	pub fn build_from_nfa(
4056	&self,
4057	nfa: thompson::NFA,
4058	) -> Result<DFA, BuildError> {
4059	let quitset = self.config.quit_set_from_nfa(&nfa)?;
4060	let classes = self.config.byte_classes_from_nfa(&nfa, &quitset);
4061	// Check that we can fit at least a few states into our cache,
4062	// otherwise it's pretty senseless to use the lazy DFA. This does have
4063	// a possible failure mode though. This assumes the maximum size of a
4064	// state in powerset space (so, the total number of NFA states), which
4065	// may never actually materialize, and could be quite a bit larger
4066	// than the actual biggest state. If this turns out to be a problem,
4067	// we could expose a knob that disables this check. But if so, we have
4068	// to be careful not to panic in other areas of the code (the cache
4069	// clearing and init code) that tend to assume some minimum useful
4070	// cache capacity.
4071	let min_cache = minimum_cache_capacity(
4072	&nfa,
4073	&classes,
4074	self.config.get_starts_for_each_pattern(),
4075	);
4076	let mut cache_capacity = self.config.get_cache_capacity();
4077	if cache_capacity < min_cache {
4078	// When the caller has asked us to skip the cache capacity check,
4079	// then we simply force the cache capacity to its minimum amount
4080	// and mush on.
4081	if self.config.get_skip_cache_capacity_check() {
4082	debug!(
4083	"given capacity ({}) is too small, \
4084	since skip_cache_capacity_check is enabled, \
4085	setting cache capacity to minimum ({})",
4086	cache_capacity, min_cache,
4087	);
4088	cache_capacity = min_cache;
4089	} else {
4090	return Err(BuildError::insufficient_cache_capacity(
4091	min_cache,
4092	cache_capacity,
4093	));
4094	}
4095	}
4096	// We also need to check that we can fit at least some small number
4097	// of states in our state ID space. This is unlikely to trigger in
4098	// >=32-bit systems, but 16-bit systems have a pretty small state ID
4099	// space since a number of bits are used up as sentinels.
4100	if let Err(err) = minimum_lazy_state_id(&classes) {
4101	return Err(BuildError::insufficient_state_id_capacity(err));
4102	}
4103	let stride2 = classes.stride2();
4104	let start_map = StartByteMap::new(nfa.look_matcher());
4105	Ok(DFA {
4106	config: self.config.clone(),
4107	nfa,
4108	stride2,
4109	start_map,
4110	classes,
4111	quitset,
4112	cache_capacity,
4113	})
4114	}
4115
4116	/// Apply the given lazy DFA configuration options to this builder.
4117	pub fn configure(&mut self, config: Config) -> &mut Builder {
4118	self.config = self.config.overwrite(config);
4119	self
4120	}
4121
4122	/// Set the syntax configuration for this builder using
4123	/// [`syntax::Config`](crate::util::syntax::Config).
4124	///
4125	/// This permits setting things like case insensitivity, Unicode and multi
4126	/// line mode.
4127	///
4128	/// These settings only apply when constructing a lazy DFA directly from a
4129	/// pattern.
4130	#[cfg(feature = "syntax")]
4131	pub fn syntax(
4132	&mut self,
4133	config: crate::util::syntax::Config,
4134	) -> &mut Builder {
4135	self.thompson.syntax(config);
4136	self
4137	}
4138
4139	/// Set the Thompson NFA configuration for this builder using
4140	/// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
4141	///
4142	/// This permits setting things like whether the DFA should match the regex
4143	/// in reverse or if additional time should be spent shrinking the size of
4144	/// the NFA.
4145	///
4146	/// These settings only apply when constructing a DFA directly from a
4147	/// pattern.
4148	#[cfg(feature = "syntax")]
4149	pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
4150	self.thompson.configure(config);
4151	self
4152	}
4153	}
4154
4155	/// Represents the current state of an overlapping search.
4156	///
4157	/// This is used for overlapping searches since they need to know something
4158	/// about the previous search. For example, when multiple patterns match at the
4159	/// same position, this state tracks the last reported pattern so that the next
4160	/// search knows whether to report another matching pattern or continue with
4161	/// the search at the next position. Additionally, it also tracks which state
4162	/// the last search call terminated in.
4163	///
4164	/// This type provides little introspection capabilities. The only thing a
4165	/// caller can do is construct it and pass it around to permit search routines
4166	/// to use it to track state, and also ask whether a match has been found.
4167	///
4168	/// Callers should always provide a fresh state constructed via
4169	/// [`OverlappingState::start`] when starting a new search. Reusing state from
4170	/// a previous search may result in incorrect results.
4171	#[derive(Clone, Debug, Eq, PartialEq)]
4172	pub struct OverlappingState {
4173	/// The match reported by the most recent overlapping search to use this
4174	/// state.
4175	///
4176	/// If a search does not find any matches, then it is expected to clear
4177	/// this value.
4178	pub(crate) mat: Option<HalfMatch>,
4179	/// The state ID of the state at which the search was in when the call
4180	/// terminated. When this is a match state, `last_match` must be set to a
4181	/// non-None value.
4182	///
4183	/// A `None` value indicates the start state of the corresponding
4184	/// automaton. We cannot use the actual ID, since any one automaton may
4185	/// have many start states, and which one is in use depends on several
4186	/// search-time factors.
4187	pub(crate) id: Option<LazyStateID>,
4188	/// The position of the search.
4189	///
4190	/// When `id` is None (i.e., we are starting a search), this is set to
4191	/// the beginning of the search as given by the caller regardless of its
4192	/// current value. Subsequent calls to an overlapping search pick up at
4193	/// this offset.
4194	pub(crate) at: usize,
4195	/// The index into the matching patterns of the next match to report if the
4196	/// current state is a match state. Note that this may be 1 greater than
4197	/// the total number of matches to report for the current match state. (In
4198	/// which case, no more matches should be reported at the current position
4199	/// and the search should advance to the next position.)
4200	pub(crate) next_match_index: Option<usize>,
4201	/// This is set to true when a reverse overlapping search has entered its
4202	/// EOI transitions.
4203	///
4204	/// This isn't used in a forward search because it knows to stop once the
4205	/// position exceeds the end of the search range. In a reverse search,
4206	/// since we use unsigned offsets, we don't "know" once we've gone past
4207	/// `0`. So the only way to detect it is with this extra flag. The reverse
4208	/// overlapping search knows to terminate specifically after it has
4209	/// reported all matches after following the EOI transition.
4210	pub(crate) rev_eoi: bool,
4211	}
4212
4213	impl OverlappingState {
4214	/// Create a new overlapping state that begins at the start state of any
4215	/// automaton.
4216	pub fn start() -> OverlappingState {
4217	OverlappingState {
4218	mat: None,
4219	id: None,
4220	at: `0`,
4221	next_match_index: None,
4222	rev_eoi: `false`,
4223	}
4224	}
4225
4226	/// Return the match result of the most recent search to execute with this
4227	/// state.
4228	///
4229	/// A searches will clear this result automatically, such that if no
4230	/// match is found, this will correctly report `None`.
4231	pub fn get_match(&self) -> Option<HalfMatch> {
4232	self.mat
4233	}
4234	}
4235
4236	/// Runs the given overlapping `search` function (forwards or backwards) until
4237	/// a match is found whose offset does not split a codepoint.
4238	///
4239	/// This is not* always correct to call. It should only be called when the*
4240	/// underlying NFA has UTF-8 mode enabled and* it can produce zero-width*
4241	/// matches. Calling this when both of those things aren't true might result
4242	/// in legitimate matches getting skipped.
4243	#[cold]
4244	#[inline(never)]
4245	fn skip_empty_utf8_splits_overlapping<F>(
4246	input: &Input<'_>,
4247	state: &mut OverlappingState,
4248	mut search: F,
4249	) -> Result<(), MatchError>
4250	where
4251	F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>,
4252	{
4253	// Note that this routine works for forwards and reverse searches
4254	// even though there's no code here to handle those cases. That's
4255	// because overlapping searches drive themselves to completion via
4256	// `OverlappingState`. So all we have to do is push it until no matches are
4257	// found.
4258
4259	let mut hm = match state.get_match() {
4260	None => return Ok(()),
4261	Some(hm) => hm,
4262	};
4263	if input.get_anchored().is_anchored() {
4264	if !input.is_char_boundary(hm.offset()) {
4265	state.mat = None;
4266	}
4267	return Ok(());
4268	}
4269	while !input.is_char_boundary(hm.offset()) {
4270	search(input, state)?;
4271	hm = match state.get_match() {
4272	None => return Ok(()),
4273	Some(hm) => hm,
4274	};
4275	}
4276	Ok(())
4277	}
4278
4279	/// Based on the minimum number of states required for a useful lazy DFA cache,
4280	/// this returns the minimum lazy state ID that must be representable.
4281	///
4282	/// It's not likely for this to have any impact 32-bit systems (or higher), but
4283	/// on 16-bit systems, the lazy state ID space is quite constrained and thus
4284	/// may be insufficient if our MIN_STATES value is (for some reason) too high.
4285	fn minimum_lazy_state_id(
4286	classes: &ByteClasses,
4287	) -> Result<LazyStateID, LazyStateIDError> {
4288	let stride: usize = `1` << classes.stride2();
4289	let min_state_index: usize = MIN_STATES.checked_sub(`1`).unwrap();
4290	LazyStateID::new(id:min_state_index * stride)
4291	}
4292
4293	/// Based on the minimum number of states required for a useful lazy DFA cache,
4294	/// this returns a heuristic minimum number of bytes of heap space required.
4295	///
4296	/// This is a "heuristic" because the minimum it returns is likely bigger than
4297	/// the true minimum. Namely, it assumes that each powerset NFA/DFA state uses
4298	/// the maximum number of NFA states (all of them). This is likely bigger
4299	/// than what is required in practice. Computing the true minimum effectively
4300	/// requires determinization, which is probably too much work to do for a
4301	/// simple check like this.
4302	///
4303	/// One of the issues with this approach IMO is that it requires that this
4304	/// be in sync with the calculation above for computing how much heap memory
4305	/// the DFA cache uses. If we get it wrong, it's possible for example for the
4306	/// minimum to be smaller than the computed heap memory, and thus, it may be
4307	/// the case that we can't add the required minimum number of states. That in
4308	/// turn will make lazy DFA panic because we assume that we can add at least a
4309	/// minimum number of states.
4310	///
4311	/// Another approach would be to always allow the minimum number of states to
4312	/// be added to the lazy DFA cache, even if it exceeds the configured cache
4313	/// limit. This does mean that the limit isn't really a limit in all cases,
4314	/// which is unfortunate. But it does at least guarantee that the lazy DFA can
4315	/// always make progress, even if it is slow. (This approach is very similar to
4316	/// enabling the 'skip_cache_capacity_check' config knob, except it wouldn't
4317	/// rely on cache size calculation. Instead, it would just always permit a
4318	/// minimum number of states to be added.)
4319	fn minimum_cache_capacity(
4320	nfa: &thompson::NFA,
4321	classes: &ByteClasses,
4322	starts_for_each_pattern: bool,
4323	) -> usize {
4324	const ID_SIZE: usize = size_of::<LazyStateID>();
4325	const STATE_SIZE: usize = size_of::<State>();
4326
4327	let stride = `1` << classes.stride2();
4328	let states_len = nfa.states().len();
4329	let sparses = `2` * states_len * NFAStateID::SIZE;
4330	let trans = MIN_STATES * stride * ID_SIZE;
4331
4332	let mut starts = Start::len() * ID_SIZE;
4333	if starts_for_each_pattern {
4334	starts += (Start::len() * nfa.pattern_len()) * ID_SIZE;
4335	}
4336
4337	// The min number of states HAS to be at least 4: we have 3 sentinel states
4338	// and then we need space for one more when we save a state after clearing
4339	// the cache. We also need space for one more, otherwise we get stuck in a
4340	// loop where we try to add a 5th state, which gets rejected, which clears
4341	// the cache, which adds back a saved state (4th total state) which then
4342	// tries to add the 5th state again.
4343	assert!(MIN_STATES >= `5`, "minimum number of states has to be at least 5");
4344	// The minimum number of non-sentinel states. We consider this separately
4345	// because sentinel states are much smaller in that they contain no NFA
4346	// states. Given our aggressive calculation here, it's worth being more
4347	// precise with the number of states we need.
4348	let non_sentinel = MIN_STATES.checked_sub(SENTINEL_STATES).unwrap();
4349
4350	// Every `State` has 5 bytes for flags, 4 bytes (max) for the number of
4351	// patterns, followed by 32-bit encodings of patterns and then delta
4352	// varint encodings of NFA state IDs. We use the worst case (which isn't
4353	// technically possible) of 5 bytes for each NFA state ID.
4354	//
4355	// HOWEVER, three of the states needed by a lazy DFA are just the sentinel
4356	// unknown, dead and quit states. Those states have a known size and it is
4357	// small.
4358	let dead_state_size = State::dead().memory_usage();
4359	let max_state_size = `5` + `4` + (nfa.pattern_len() * `4`) + (states_len * `5`);
4360	let states = (SENTINEL_STATES * (STATE_SIZE + dead_state_size))
4361	+ (non_sentinel * (STATE_SIZE + max_state_size));
4362	// NOTE: We don't double count heap memory used by State for this map since
4363	// we use reference counting to avoid doubling memory usage. (This tends to
4364	// be where most memory is allocated in the cache.)
4365	let states_to_sid = (MIN_STATES * STATE_SIZE) + (MIN_STATES * ID_SIZE);
4366	let stack = states_len * NFAStateID::SIZE;
4367	let scratch_state_builder = max_state_size;
4368
4369	trans
4370	+ starts
4371	+ states
4372	+ states_to_sid
4373	+ sparses
4374	+ stack
4375	+ scratch_state_builder
4376	}
4377
4378	#[cfg(all(test, feature = "syntax"))]
4379	mod tests {
4380	use super::*;
4381
4382	// Tests that we handle heuristic Unicode word boundary support in reverse
4383	// DFAs in the specific case of contextual searches.
4384	//
4385	// I wrote this test when I discovered a bug in how heuristic word
4386	// boundaries were handled. Namely, that the starting state selection
4387	// didn't consider the DFA's quit byte set when looking at the byte
4388	// immediately before the start of the search (or immediately after the
4389	// end of the search in the case of a reverse search). As a result, it was
4390	// possible for '\bfoo\b' to match 'β123' because the trailing \xB2 byte
4391	// in the 'β' codepoint would be treated as a non-word character. But of
4392	// course, this search should trigger the DFA to quit, since there is a
4393	// non-ASCII byte in consideration.
4394	//
4395	// Thus, I fixed 'start_state_{forward,reverse}' to check the quit byte set
4396	// if it wasn't empty. The forward case is tested in the doc test for the
4397	// Config::unicode_word_boundary API. We test the reverse case here, which
4398	// is sufficiently niche that it doesn't really belong in a doc test.
4399	#[test]
4400	fn heuristic_unicode_reverse() {
4401	let dfa = DFA::builder()
4402	.configure(DFA::config().unicode_word_boundary(`true`))
4403	.thompson(thompson::Config::new().reverse(`true`))
4404	.build(r"\b[0-9]+\b")
4405	.unwrap();
4406	let mut cache = dfa.create_cache();
4407
4408	let input = Input::new("β123").range(`2`..);
4409	let expected = MatchError::quit(`0xB2`, `1`);
4410	let got = dfa.try_search_rev(&mut cache, &input);
4411	assert_eq!(Err(expected), got);
4412
4413	let input = Input::new("123β").range(..`3`);
4414	let expected = MatchError::quit(`0xCE`, `3`);
4415	let got = dfa.try_search_rev(&mut cache, &input);
4416	assert_eq!(Err(expected), got);
4417	}
4418	}
4419

Provided by KDAB

Definitions