backtrack.rs - Codebrowser

1	/!*
2	An NFA backed bounded backtracker for executing regex searches with capturing
3	groups.
4
5	This module provides a [`BoundedBacktracker`] that works by simulating an NFA
6	using the classical backtracking algorithm with a twist: it avoids redoing
7	work that it has done before and thereby avoids worst case exponential time.
8	In exchange, it can only be used on "short" haystacks. Its advantage is that
9	is can be faster than the [`PikeVM`](thompson::pikevm::PikeVM) in many cases
10	because it does less book-keeping.
11	*/
12
13	use alloc::{vec, vec::Vec};
14
15	use crate::{
16	nfa::thompson::{self, BuildError, State, NFA},
17	util::{
18	captures::Captures,
19	empty, iter,
20	prefilter::Prefilter,
21	primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
22	search::{Anchored, HalfMatch, Input, Match, MatchError, Span},
23	},
24	};
25
26	/// Returns the minimum visited capacity for the given haystack.
27	///
28	/// This function can be used as the argument to [`Config::visited_capacity`]
29	/// in order to guarantee that a backtracking search for the given `input`
30	/// won't return an error when using a [`BoundedBacktracker`] built from the
31	/// given `NFA`.
32	///
33	/// This routine exists primarily as a way to test that the bounded backtracker
34	/// works correctly when its capacity is set to the smallest possible amount.
35	/// Still, it may be useful in cases where you know you want to use the bounded
36	/// backtracker for a specific input, and just need to know what visited
37	/// capacity to provide to make it work.
38	///
39	/// Be warned that this number could be quite large as it is multiplicative in
40	/// the size the given NFA and haystack.
41	pub fn min_visited_capacity(nfa: &NFA, input: &Input<'_>) -> usize {
42	div_ceil(nfa.states().len() * (input.get_span().len() + `1`), `8`)
43	}
44
45	/// The configuration used for building a bounded backtracker.
46	///
47	/// A bounded backtracker configuration is a simple data object that is
48	/// typically used with [`Builder::configure`].
49	#[derive(Clone, Debug, Default)]
50	pub struct Config {
51	pre: Option<Option<Prefilter>>,
52	visited_capacity: Option<usize>,
53	}
54
55	impl Config {
56	/// Return a new default regex configuration.
57	pub fn new() -> Config {
58	Config::default()
59	}
60
61	/// Set a prefilter to be used whenever a start state is entered.
62	///
63	/// A [`Prefilter`] in this context is meant to accelerate searches by
64	/// looking for literal prefixes that every match for the corresponding
65	/// pattern (or patterns) must start with. Once a prefilter produces a
66	/// match, the underlying search routine continues on to try and confirm
67	/// the match.
68	///
69	/// Be warned that setting a prefilter does not guarantee that the search
70	/// will be faster. While it's usually a good bet, if the prefilter
71	/// produces a lot of false positive candidates (i.e., positions matched
72	/// by the prefilter but not by the regex), then the overall result can
73	/// be slower than if you had just executed the regex engine without any
74	/// prefilters.
75	///
76	/// By default no prefilter is set.
77	///
78	/// # Example
79	///
80	/// ```
81	/// use regex_automata::{
82	/// nfa::thompson::backtrack::BoundedBacktracker,
83	/// util::prefilter::Prefilter,
84	/// Input, Match, MatchKind,
85	/// };
86	///
87	/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
88	/// let re = BoundedBacktracker::builder()
89	/// .configure(BoundedBacktracker::config().prefilter(pre))
90	/// .build(r"(foo\|bar)[a-z]+")?;
91	/// let mut cache = re.create_cache();
92	/// let input = Input::new("foo1 barfox bar");
93	/// assert_eq!(
94	/// Some(Match::must(`0`, `5`..`11`)),
95	/// re.try_find(&mut cache, input)?,
96	/// );
97	///
98	/// # Ok::<(), Box<dyn std::error::Error>>(())
99	/// ```
100	///
101	/// Be warned though that an incorrect prefilter can lead to incorrect
102	/// results!
103	///
104	/// ```
105	/// use regex_automata::{
106	/// nfa::thompson::backtrack::BoundedBacktracker,
107	/// util::prefilter::Prefilter,
108	/// Input, HalfMatch, MatchKind,
109	/// };
110	///
111	/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
112	/// let re = BoundedBacktracker::builder()
113	/// .configure(BoundedBacktracker::config().prefilter(pre))
114	/// .build(r"(foo\|bar)[a-z]+")?;
115	/// let mut cache = re.create_cache();
116	/// let input = Input::new("foo1 barfox bar");
117	/// // No match reported even though there clearly is one!
118	/// assert_eq!(None, re.try_find(&mut cache, input)?);
119	///
120	/// # Ok::<(), Box<dyn std::error::Error>>(())
121	/// ```
122	pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
123	self.pre = Some(pre);
124	self
125	}
126
127	/// Set the visited capacity used to bound backtracking.
128	///
129	/// The visited capacity represents the amount of heap memory (in bytes) to
130	/// allocate toward tracking which parts of the backtracking search have
131	/// been done before. The heap memory needed for any particular search is
132	/// proportional to `haystack.len() nfa.states().len()`, which an be*
133	/// quite large. Therefore, the bounded backtracker is typically only able
134	/// to run on shorter haystacks.
135	///
136	/// For a given regex, increasing the visited capacity means that the
137	/// maximum haystack length that can be searched is increased. The
138	/// [`BoundedBacktracker::max_haystack_len`] method returns that maximum.
139	///
140	/// The default capacity is a reasonable but empirically chosen size.
141	///
142	/// # Example
143	///
144	/// As with other regex engines, Unicode is what tends to make the bounded
145	/// backtracker less useful by making the maximum haystack length quite
146	/// small. If necessary, increasing the visited capacity using this routine
147	/// will increase the maximum haystack length at the cost of using more
148	/// memory.
149	///
150	/// Note though that the specific maximum values here are not an API
151	/// guarantee. The default visited capacity is subject to change and not
152	/// covered by semver.
153	///
154	/// ```
155	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
156	/// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
157	///
158	/// // Unicode inflates the size of the underlying NFA quite a bit, and
159	/// // thus means that the backtracker can only handle smaller haystacks,
160	/// // assuming that the visited capacity remains unchanged.
161	/// let re = BoundedBacktracker::new(r"\w+")?;
162	/// assert!(re.max_haystack_len() <= `7_000`);
163	/// // But we can increase the visited capacity to handle bigger haystacks!
164	/// let re = BoundedBacktracker::builder()
165	/// .configure(BoundedBacktracker::config().visited_capacity(`1`<<`20`))
166	/// .build(r"\w+")?;
167	/// assert!(re.max_haystack_len() >= `25_000`);
168	/// assert!(re.max_haystack_len() <= `28_000`);
169	/// # Ok::<(), Box<dyn std::error::Error>>(())
170	/// ```
171	pub fn visited_capacity(mut self, capacity: usize) -> Config {
172	self.visited_capacity = Some(capacity);
173	self
174	}
175
176	/// Returns the prefilter set in this configuration, if one at all.
177	pub fn get_prefilter(&self) -> Option<&Prefilter> {
178	self.pre.as_ref().unwrap_or(&None).as_ref()
179	}
180
181	/// Returns the configured visited capacity.
182	///
183	/// Note that the actual capacity used may be slightly bigger than the
184	/// configured capacity.
185	pub fn get_visited_capacity(&self) -> usize {
186	const DEFAULT: usize = `256` * (`1` << `10`); // 256 KB
187	self.visited_capacity.unwrap_or(DEFAULT)
188	}
189
190	/// Overwrite the default configuration such that the options in `o` are
191	/// always used. If an option in `o` is not set, then the corresponding
192	/// option in `self` is used. If it's not set in `self` either, then it
193	/// remains not set.
194	pub(crate) fn overwrite(&self, o: Config) -> Config {
195	Config {
196	pre: o.pre.or_else(\|\| self.pre.clone()),
197	visited_capacity: o.visited_capacity.or(self.visited_capacity),
198	}
199	}
200	}
201
202	/// A builder for a bounded backtracker.
203	///
204	/// This builder permits configuring options for the syntax of a pattern, the
205	/// NFA construction and the `BoundedBacktracker` construction. This builder
206	/// is different from a general purpose regex builder in that it permits fine
207	/// grain configuration of the construction process. The trade off for this is
208	/// complexity, and the possibility of setting a configuration that might not
209	/// make sense. For example, there are two different UTF-8 modes:
210	///
211	/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
212	/// whether the pattern itself can contain sub-expressions that match invalid
213	/// UTF-8.
214	/// * [`thompson::Config::utf8`] controls how the regex iterators themselves
215	/// advance the starting position of the next search when a match with zero
216	/// length is found.
217	///
218	/// Generally speaking, callers will want to either enable all of these or
219	/// disable all of these.
220	///
221	/// # Example
222	///
223	/// This example shows how to disable UTF-8 mode in the syntax and the regex
224	/// itself. This is generally what you want for matching on arbitrary bytes.
225	///
226	/// ```
227	/// use regex_automata::{
228	/// nfa::thompson::{self, backtrack::BoundedBacktracker},
229	/// util::syntax,
230	/// Match,
231	/// };
232	///
233	/// let re = BoundedBacktracker::builder()
234	/// .syntax(syntax::Config::new().utf8(`false`))
235	/// .thompson(thompson::Config::new().utf8(`false`))
236	/// .build(r"foo(?-u:[^b])ar.*")?;
237	/// let mut cache = re.create_cache();
238	///
239	/// let haystack = b"`\xFE`foo`\xFF`arzz`\xE2\x98\xFF\n`";
240	/// let expected = Some(Ok(Match::must(`0`, `1`..`9`)));
241	/// let got = re.try_find_iter(&mut cache, haystack).next();
242	/// assert_eq!(expected, got);
243	/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
244	/// // but the subsequent `.` does not! Disabling UTF-8*
245	/// // on the syntax permits this.
246	/// //
247	/// // N.B. This example does not show the impact of
248	/// // disabling UTF-8 mode on a BoundedBacktracker Config, since that
249	/// // only impacts regexes that can produce matches of
250	/// // length 0.
251	/// assert_eq!(b"foo`\xFF`arzz", &haystack[got.unwrap()?.range()]);
252	///
253	/// # Ok::<(), Box<dyn std::error::Error>>(())
254	/// ```
255	#[derive(Clone, Debug)]
256	pub struct Builder {
257	config: Config,
258	#[cfg(feature = "syntax")]
259	thompson: thompson::Compiler,
260	}
261
262	impl Builder {
263	/// Create a new BoundedBacktracker builder with its default configuration.
264	pub fn new() -> Builder {
265	Builder {
266	config: Config::default(),
267	#[cfg(feature = "syntax")]
268	thompson: thompson::Compiler::new(),
269	}
270	}
271
272	/// Build a `BoundedBacktracker` from the given pattern.
273	///
274	/// If there was a problem parsing or compiling the pattern, then an error
275	/// is returned.
276	#[cfg(feature = "syntax")]
277	pub fn build(
278	&self,
279	pattern: &str,
280	) -> Result<BoundedBacktracker, BuildError> {
281	self.build_many(&[pattern])
282	}
283
284	/// Build a `BoundedBacktracker` from the given patterns.
285	#[cfg(feature = "syntax")]
286	pub fn build_many<P: AsRef<str>>(
287	&self,
288	patterns: &[P],
289	) -> Result<BoundedBacktracker, BuildError> {
290	let nfa = self.thompson.build_many(patterns)?;
291	self.build_from_nfa(nfa)
292	}
293
294	/// Build a `BoundedBacktracker` directly from its NFA.
295	///
296	/// Note that when using this method, any configuration that applies to the
297	/// construction of the NFA itself will of course be ignored, since the NFA
298	/// given here is already built.
299	pub fn build_from_nfa(
300	&self,
301	nfa: NFA,
302	) -> Result<BoundedBacktracker, BuildError> {
303	nfa.look_set_any().available().map_err(BuildError::word)?;
304	Ok(BoundedBacktracker { config: self.config.clone(), nfa })
305	}
306
307	/// Apply the given `BoundedBacktracker` configuration options to this
308	/// builder.
309	pub fn configure(&mut self, config: Config) -> &mut Builder {
310	self.config = self.config.overwrite(config);
311	self
312	}
313
314	/// Set the syntax configuration for this builder using
315	/// [`syntax::Config`](crate::util::syntax::Config).
316	///
317	/// This permits setting things like case insensitivity, Unicode and multi
318	/// line mode.
319	///
320	/// These settings only apply when constructing a `BoundedBacktracker`
321	/// directly from a pattern.
322	#[cfg(feature = "syntax")]
323	pub fn syntax(
324	&mut self,
325	config: crate::util::syntax::Config,
326	) -> &mut Builder {
327	self.thompson.syntax(config);
328	self
329	}
330
331	/// Set the Thompson NFA configuration for this builder using
332	/// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
333	///
334	/// This permits setting things like if additional time should be spent
335	/// shrinking the size of the NFA.
336	///
337	/// These settings only apply when constructing a `BoundedBacktracker`
338	/// directly from a pattern.
339	#[cfg(feature = "syntax")]
340	pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
341	self.thompson.configure(config);
342	self
343	}
344	}
345
346	/// A backtracking regex engine that bounds its execution to avoid exponential
347	/// blow-up.
348	///
349	/// This regex engine only implements leftmost-first match semantics and
350	/// only supports leftmost searches. It effectively does the same thing as a
351	/// [`PikeVM`](thompson::pikevm::PikeVM), but typically does it faster because
352	/// it doesn't have to worry about copying capturing group spans for most NFA
353	/// states. Instead, the backtracker can maintain one set of captures (provided
354	/// by the caller) and never needs to copy them. In exchange, the backtracker
355	/// bounds itself to ensure it doesn't exhibit worst case exponential time.
356	/// This results in the backtracker only being able to handle short haystacks
357	/// given reasonable memory usage.
358	///
359	/// # Searches may return an error!
360	///
361	/// By design, this backtracking regex engine is bounded. This bound is
362	/// implemented by not visiting any combination of NFA state ID and position
363	/// in a haystack more than once. Thus, the total memory required to bound
364	/// backtracking is proportional to `haystack.len() nfa.states().len()`.*
365	/// This can obviously get quite large, since large haystacks aren't terribly
366	/// uncommon. To avoid using exorbitant memory, the capacity is bounded by
367	/// a fixed limit set via [`Config::visited_capacity`]. Thus, if the total
368	/// capacity required for a particular regex and a haystack exceeds this
369	/// capacity, then the search routine will return an error.
370	///
371	/// Unlike other regex engines that may return an error at search time (like
372	/// the DFA or the hybrid NFA/DFA), there is no way to guarantee that a bounded
373	/// backtracker will work for every haystack. Therefore, this regex engine
374	/// _only_ exposes fallible search routines to avoid the footgun of panicking
375	/// when running a search on a haystack that is too big.
376	///
377	/// If one wants to use the fallible search APIs without handling the
378	/// error, the only way to guarantee an error won't occur from the
379	/// haystack length is to ensure the haystack length does not exceed
380	/// [`BoundedBacktracker::max_haystack_len`].
381	///
382	/// # Example: Unicode word boundaries
383	///
384	/// This example shows that the bounded backtracker implements Unicode word
385	/// boundaries correctly by default.
386	///
387	/// ```
388	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
389	/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
390	///
391	/// let re = BoundedBacktracker::new(r"\b\w+\b")?;
392	/// let mut cache = re.create_cache();
393	///
394	/// let mut it = re.try_find_iter(&mut cache, "Шерлок Холмс");
395	/// assert_eq!(Some(Ok(Match::must(`0`, `0`..`12`))), it.next());
396	/// assert_eq!(Some(Ok(Match::must(`0`, `13`..`23`))), it.next());
397	/// assert_eq!(None, it.next());
398	/// # Ok::<(), Box<dyn std::error::Error>>(())
399	/// ```
400	///
401	/// # Example: multiple regex patterns
402	///
403	/// The bounded backtracker supports searching for multiple patterns
404	/// simultaneously, just like other regex engines. Note though that because it
405	/// uses a backtracking strategy, this regex engine is unlikely to scale well
406	/// as more patterns are added. But then again, as more patterns are added, the
407	/// maximum haystack length allowed will also shorten (assuming the visited
408	/// capacity remains invariant).
409	///
410	/// ```
411	/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
412	///
413	/// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
414	/// let mut cache = re.create_cache();
415	///
416	/// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
417	/// assert_eq!(Some(Ok(Match::must(`0`, `0`..`3`))), it.next());
418	/// assert_eq!(Some(Ok(Match::must(`1`, `4`..`5`))), it.next());
419	/// assert_eq!(Some(Ok(Match::must(`0`, `6`..`9`))), it.next());
420	/// assert_eq!(Some(Ok(Match::must(`1`, `10`..`14`))), it.next());
421	/// assert_eq!(Some(Ok(Match::must(`1`, `15`..`16`))), it.next());
422	/// assert_eq!(Some(Ok(Match::must(`0`, `17`..`21`))), it.next());
423	/// assert_eq!(None, it.next());
424	/// # Ok::<(), Box<dyn std::error::Error>>(())
425	/// ```
426	#[derive(Clone, Debug)]
427	pub struct BoundedBacktracker {
428	config: Config,
429	nfa: NFA,
430	}
431
432	impl BoundedBacktracker {
433	/// Parse the given regular expression using the default configuration and
434	/// return the corresponding `BoundedBacktracker`.
435	///
436	/// If you want a non-default configuration, then use the [`Builder`] to
437	/// set your own configuration.
438	///
439	/// # Example
440	///
441	/// ```
442	/// use regex_automata::{
443	/// nfa::thompson::backtrack::BoundedBacktracker,
444	/// Match,
445	/// };
446	///
447	/// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
448	/// let mut cache = re.create_cache();
449	/// assert_eq!(
450	/// Some(Ok(Match::must(`0`, `3`..`14`))),
451	/// re.try_find_iter(&mut cache, "zzzfoo12345barzzz").next(),
452	/// );
453	/// # Ok::<(), Box<dyn std::error::Error>>(())
454	/// ```
455	#[cfg(feature = "syntax")]
456	pub fn new(pattern: &str) -> Result<BoundedBacktracker, BuildError> {
457	BoundedBacktracker::builder().build(pattern)
458	}
459
460	/// Like `new`, but parses multiple patterns into a single "multi regex."
461	/// This similarly uses the default regex configuration.
462	///
463	/// # Example
464	///
465	/// ```
466	/// use regex_automata::{
467	/// nfa::thompson::backtrack::BoundedBacktracker,
468	/// Match,
469	/// };
470	///
471	/// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
472	/// let mut cache = re.create_cache();
473	///
474	/// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
475	/// assert_eq!(Some(Ok(Match::must(`0`, `0`..`3`))), it.next());
476	/// assert_eq!(Some(Ok(Match::must(`1`, `4`..`5`))), it.next());
477	/// assert_eq!(Some(Ok(Match::must(`0`, `6`..`9`))), it.next());
478	/// assert_eq!(Some(Ok(Match::must(`1`, `10`..`14`))), it.next());
479	/// assert_eq!(Some(Ok(Match::must(`1`, `15`..`16`))), it.next());
480	/// assert_eq!(Some(Ok(Match::must(`0`, `17`..`21`))), it.next());
481	/// assert_eq!(None, it.next());
482	/// # Ok::<(), Box<dyn std::error::Error>>(())
483	/// ```
484	#[cfg(feature = "syntax")]
485	pub fn new_many<P: AsRef<str>>(
486	patterns: &[P],
487	) -> Result<BoundedBacktracker, BuildError> {
488	BoundedBacktracker::builder().build_many(patterns)
489	}
490
491	/// # Example
492	///
493	/// This shows how to hand assemble a regular expression via its HIR,
494	/// compile an NFA from it and build a BoundedBacktracker from the NFA.
495	///
496	/// ```
497	/// use regex_automata::{
498	/// nfa::thompson::{NFA, backtrack::BoundedBacktracker},
499	/// Match,
500	/// };
501	/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
502	///
503	/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
504	/// ClassBytesRange::new(b'0', b'9'),
505	/// ClassBytesRange::new(b'A', b'Z'),
506	/// ClassBytesRange::new(b'_', b'_'),
507	/// ClassBytesRange::new(b'a', b'z'),
508	/// ])));
509	///
510	/// let config = NFA::config().nfa_size_limit(Some(`1_000`));
511	/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
512	///
513	/// let re = BoundedBacktracker::new_from_nfa(nfa)?;
514	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
515	/// let expected = Some(Match::must(`0`, `3`..`4`));
516	/// re.try_captures(&mut cache, "!@#A#@!", &mut caps)?;
517	/// assert_eq!(expected, caps.get_match());
518	///
519	/// # Ok::<(), Box<dyn std::error::Error>>(())
520	/// ```
521	pub fn new_from_nfa(nfa: NFA) -> Result<BoundedBacktracker, BuildError> {
522	BoundedBacktracker::builder().build_from_nfa(nfa)
523	}
524
525	/// Create a new `BoundedBacktracker` that matches every input.
526	///
527	/// # Example
528	///
529	/// ```
530	/// use regex_automata::{
531	/// nfa::thompson::backtrack::BoundedBacktracker,
532	/// Match,
533	/// };
534	///
535	/// let re = BoundedBacktracker::always_match()?;
536	/// let mut cache = re.create_cache();
537	///
538	/// let expected = Some(Ok(Match::must(`0`, `0`..`0`)));
539	/// assert_eq!(expected, re.try_find_iter(&mut cache, "").next());
540	/// assert_eq!(expected, re.try_find_iter(&mut cache, "foo").next());
541	/// # Ok::<(), Box<dyn std::error::Error>>(())
542	/// ```
543	pub fn always_match() -> Result<BoundedBacktracker, BuildError> {
544	let nfa = thompson::NFA::always_match();
545	BoundedBacktracker::new_from_nfa(nfa)
546	}
547
548	/// Create a new `BoundedBacktracker` that never matches any input.
549	///
550	/// # Example
551	///
552	/// ```
553	/// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
554	///
555	/// let re = BoundedBacktracker::never_match()?;
556	/// let mut cache = re.create_cache();
557	///
558	/// assert_eq!(None, re.try_find_iter(&mut cache, "").next());
559	/// assert_eq!(None, re.try_find_iter(&mut cache, "foo").next());
560	/// # Ok::<(), Box<dyn std::error::Error>>(())
561	/// ```
562	pub fn never_match() -> Result<BoundedBacktracker, BuildError> {
563	let nfa = thompson::NFA::never_match();
564	BoundedBacktracker::new_from_nfa(nfa)
565	}
566
567	/// Return a default configuration for a `BoundedBacktracker`.
568	///
569	/// This is a convenience routine to avoid needing to import the `Config`
570	/// type when customizing the construction of a `BoundedBacktracker`.
571	///
572	/// # Example
573	///
574	/// This example shows how to disable UTF-8 mode. When UTF-8 mode is
575	/// disabled, zero-width matches that split a codepoint are allowed.
576	/// Otherwise they are never reported.
577	///
578	/// In the code below, notice that `""` is permitted to match positions
579	/// that split the encoding of a codepoint.
580	///
581	/// ```
582	/// use regex_automata::{
583	/// nfa::thompson::{self, backtrack::BoundedBacktracker},
584	/// Match,
585	/// };
586	///
587	/// let re = BoundedBacktracker::builder()
588	/// .thompson(thompson::Config::new().utf8(`false`))
589	/// .build(r"")?;
590	/// let mut cache = re.create_cache();
591	///
592	/// let haystack = "a☃z";
593	/// let mut it = re.try_find_iter(&mut cache, haystack);
594	/// assert_eq!(Some(Ok(Match::must(`0`, `0`..`0`))), it.next());
595	/// assert_eq!(Some(Ok(Match::must(`0`, `1`..`1`))), it.next());
596	/// assert_eq!(Some(Ok(Match::must(`0`, `2`..`2`))), it.next());
597	/// assert_eq!(Some(Ok(Match::must(`0`, `3`..`3`))), it.next());
598	/// assert_eq!(Some(Ok(Match::must(`0`, `4`..`4`))), it.next());
599	/// assert_eq!(Some(Ok(Match::must(`0`, `5`..`5`))), it.next());
600	/// assert_eq!(None, it.next());
601	///
602	/// # Ok::<(), Box<dyn std::error::Error>>(())
603	/// ```
604	pub fn config() -> Config {
605	Config::new()
606	}
607
608	/// Return a builder for configuring the construction of a
609	/// `BoundedBacktracker`.
610	///
611	/// This is a convenience routine to avoid needing to import the
612	/// [`Builder`] type in common cases.
613	///
614	/// # Example
615	///
616	/// This example shows how to use the builder to disable UTF-8 mode
617	/// everywhere.
618	///
619	/// ```
620	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
621	/// use regex_automata::{
622	/// nfa::thompson::{self, backtrack::BoundedBacktracker},
623	/// util::syntax,
624	/// Match,
625	/// };
626	///
627	/// let re = BoundedBacktracker::builder()
628	/// .syntax(syntax::Config::new().utf8(`false`))
629	/// .thompson(thompson::Config::new().utf8(`false`))
630	/// .build(r"foo(?-u:[^b])ar.*")?;
631	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
632	///
633	/// let haystack = b"`\xFE`foo`\xFF`arzz`\xE2\x98\xFF\n`";
634	/// let expected = Some(Match::must(`0`, `1`..`9`));
635	/// re.try_captures(&mut cache, haystack, &mut caps)?;
636	/// assert_eq!(expected, caps.get_match());
637	///
638	/// # Ok::<(), Box<dyn std::error::Error>>(())
639	/// ```
640	pub fn builder() -> Builder {
641	Builder::new()
642	}
643
644	/// Create a new cache for this regex.
645	///
646	/// The cache returned should only be used for searches for this
647	/// regex. If you want to reuse the cache for another regex, then you
648	/// must call [`Cache::reset`] with that regex (or, equivalently,
649	/// [`BoundedBacktracker::reset_cache`]).
650	pub fn create_cache(&self) -> Cache {
651	Cache::new(self)
652	}
653
654	/// Create a new empty set of capturing groups that is guaranteed to be
655	/// valid for the search APIs on this `BoundedBacktracker`.
656	///
657	/// A `Captures` value created for a specific `BoundedBacktracker` cannot
658	/// be used with any other `BoundedBacktracker`.
659	///
660	/// This is a convenience function for [`Captures::all`]. See the
661	/// [`Captures`] documentation for an explanation of its alternative
662	/// constructors that permit the `BoundedBacktracker` to do less work
663	/// during a search, and thus might make it faster.
664	pub fn create_captures(&self) -> Captures {
665	Captures::all(self.get_nfa().group_info().clone())
666	}
667
668	/// Reset the given cache such that it can be used for searching with the
669	/// this `BoundedBacktracker` (and only this `BoundedBacktracker`).
670	///
671	/// A cache reset permits reusing memory already allocated in this cache
672	/// with a different `BoundedBacktracker`.
673	///
674	/// # Example
675	///
676	/// This shows how to re-purpose a cache for use with a different
677	/// `BoundedBacktracker`.
678	///
679	/// ```
680	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
681	/// use regex_automata::{
682	/// nfa::thompson::backtrack::BoundedBacktracker,
683	/// Match,
684	/// };
685	///
686	/// let re1 = BoundedBacktracker::new(r"\w")?;
687	/// let re2 = BoundedBacktracker::new(r"\W")?;
688	///
689	/// let mut cache = re1.create_cache();
690	/// assert_eq!(
691	/// Some(Ok(Match::must(`0`, `0`..`2`))),
692	/// re1.try_find_iter(&mut cache, "Δ").next(),
693	/// );
694	///
695	/// // Using 'cache' with re2 is not allowed. It may result in panics or
696	/// // incorrect results. In order to re-purpose the cache, we must reset
697	/// // it with the BoundedBacktracker we'd like to use it with.
698	/// //
699	/// // Similarly, after this reset, using the cache with 're1' is also not
700	/// // allowed.
701	/// cache.reset(&re2);
702	/// assert_eq!(
703	/// Some(Ok(Match::must(`0`, `0`..`3`))),
704	/// re2.try_find_iter(&mut cache, "☃").next(),
705	/// );
706	///
707	/// # Ok::<(), Box<dyn std::error::Error>>(())
708	/// ```
709	pub fn reset_cache(&self, cache: &mut Cache) {
710	cache.reset(self);
711	}
712
713	/// Returns the total number of patterns compiled into this
714	/// `BoundedBacktracker`.
715	///
716	/// In the case of a `BoundedBacktracker` that contains no patterns, this
717	/// returns `0`.
718	///
719	/// # Example
720	///
721	/// This example shows the pattern length for a `BoundedBacktracker` that
722	/// never matches:
723	///
724	/// ```
725	/// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
726	///
727	/// let re = BoundedBacktracker::never_match()?;
728	/// assert_eq!(re.pattern_len(), `0`);
729	/// # Ok::<(), Box<dyn std::error::Error>>(())
730	/// ```
731	///
732	/// And another example for a `BoundedBacktracker` that matches at every
733	/// position:
734	///
735	/// ```
736	/// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
737	///
738	/// let re = BoundedBacktracker::always_match()?;
739	/// assert_eq!(re.pattern_len(), `1`);
740	/// # Ok::<(), Box<dyn std::error::Error>>(())
741	/// ```
742	///
743	/// And finally, a `BoundedBacktracker` that was constructed from multiple
744	/// patterns:
745	///
746	/// ```
747	/// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
748	///
749	/// let re = BoundedBacktracker::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
750	/// assert_eq!(re.pattern_len(), `3`);
751	/// # Ok::<(), Box<dyn std::error::Error>>(())
752	/// ```
753	pub fn pattern_len(&self) -> usize {
754	self.nfa.pattern_len()
755	}
756
757	/// Return the config for this `BoundedBacktracker`.
758	#[inline]
759	pub fn get_config(&self) -> &Config {
760	&self.config
761	}
762
763	/// Returns a reference to the underlying NFA.
764	#[inline]
765	pub fn get_nfa(&self) -> &NFA {
766	&self.nfa
767	}
768
769	/// Returns the maximum haystack length supported by this backtracker.
770	///
771	/// This routine is a function of both [`Config::visited_capacity`] and the
772	/// internal size of the backtracker's NFA.
773	///
774	/// # Example
775	///
776	/// This example shows how the maximum haystack length can vary depending
777	/// on the size of the regex itself. Note though that the specific maximum
778	/// values here are not an API guarantee. The default visited capacity is
779	/// subject to change and not covered by semver.
780	///
781	/// ```
782	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
783	/// use regex_automata::{
784	/// nfa::thompson::backtrack::BoundedBacktracker,
785	/// Match, MatchError,
786	/// };
787	///
788	/// // If you're only using ASCII, you get a big budget.
789	/// let re = BoundedBacktracker::new(r"(?-u)\w+")?;
790	/// let mut cache = re.create_cache();
791	/// assert_eq!(re.max_haystack_len(), `299_592`);
792	/// // Things work up to the max.
793	/// let mut haystack = "a".repeat(`299_592`);
794	/// let expected = Some(Ok(Match::must(`0`, `0`..`299_592`)));
795	/// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
796	/// // But you'll get an error if you provide a haystack that's too big.
797	/// // Notice that we use the 'try_find_iter' routine instead, which
798	/// // yields Result<Match, MatchError> instead of Match.
799	/// haystack.push('a');
800	/// let expected = Some(Err(MatchError::haystack_too_long(`299_593`)));
801	/// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
802	///
803	/// // Unicode inflates the size of the underlying NFA quite a bit, and
804	/// // thus means that the backtracker can only handle smaller haystacks,
805	/// // assuming that the visited capacity remains unchanged.
806	/// let re = BoundedBacktracker::new(r"\w+")?;
807	/// assert!(re.max_haystack_len() <= `7_000`);
808	/// // But we can increase the visited capacity to handle bigger haystacks!
809	/// let re = BoundedBacktracker::builder()
810	/// .configure(BoundedBacktracker::config().visited_capacity(`1`<<`20`))
811	/// .build(r"\w+")?;
812	/// assert!(re.max_haystack_len() >= `25_000`);
813	/// assert!(re.max_haystack_len() <= `28_000`);
814	/// # Ok::<(), Box<dyn std::error::Error>>(())
815	/// ```
816	#[inline]
817	pub fn max_haystack_len(&self) -> usize {
818	// The capacity given in the config is "bytes of heap memory," but the
819	// capacity we use here is "number of bits." So convert the capacity in
820	// bytes to the capacity in bits.
821	let capacity = `8` * self.get_config().get_visited_capacity();
822	let blocks = div_ceil(capacity, Visited::BLOCK_SIZE);
823	let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE);
824	// It's possible for `real_capacity` to be smaller than the number of
825	// NFA states for particularly large regexes, so we saturate towards
826	// zero.
827	(real_capacity / self.nfa.states().len()).saturating_sub(`1`)
828	}
829	}
830
831	impl BoundedBacktracker {
832	/// Returns true if and only if this regex matches the given haystack.
833	///
834	/// In the case of a backtracking regex engine, and unlike most other
835	/// regex engines in this crate, short circuiting isn't practical. However,
836	/// this routine may still be faster because it instructs backtracking to
837	/// not keep track of any capturing groups.
838	///
839	/// # Errors
840	///
841	/// This routine only errors if the search could not complete. For this
842	/// backtracking regex engine, this only occurs when the haystack length
843	/// exceeds [`BoundedBacktracker::max_haystack_len`].
844	///
845	/// When a search cannot complete, callers cannot know whether a match
846	/// exists or not.
847	///
848	/// # Example
849	///
850	/// ```
851	/// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
852	///
853	/// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
854	/// let mut cache = re.create_cache();
855	///
856	/// assert!(re.try_is_match(&mut cache, "foo12345bar")?);
857	/// assert!(!re.try_is_match(&mut cache, "foobar")?);
858	/// # Ok::<(), Box<dyn std::error::Error>>(())
859	/// ```
860	///
861	/// # Example: consistency with search APIs
862	///
863	/// `is_match` is guaranteed to return `true` whenever `find` returns a
864	/// match. This includes searches that are executed entirely within a
865	/// codepoint:
866	///
867	/// ```
868	/// use regex_automata::{
869	/// nfa::thompson::backtrack::BoundedBacktracker,
870	/// Input,
871	/// };
872	///
873	/// let re = BoundedBacktracker::new("a*")?;
874	/// let mut cache = re.create_cache();
875	///
876	/// assert!(!re.try_is_match(&mut cache, Input::new("☃").span(`1`..`2`))?);
877	/// # Ok::<(), Box<dyn std::error::Error>>(())
878	/// ```
879	///
880	/// Notice that when UTF-8 mode is disabled, then the above reports a
881	/// match because the restriction against zero-width matches that split a
882	/// codepoint has been lifted:
883	///
884	/// ```
885	/// use regex_automata::{
886	/// nfa::thompson::{backtrack::BoundedBacktracker, NFA},
887	/// Input,
888	/// };
889	///
890	/// let re = BoundedBacktracker::builder()
891	/// .thompson(NFA::config().utf8(`false`))
892	/// .build("a*")?;
893	/// let mut cache = re.create_cache();
894	///
895	/// assert!(re.try_is_match(&mut cache, Input::new("☃").span(`1`..`2`))?);
896	/// # Ok::<(), Box<dyn std::error::Error>>(())
897	/// ```
898	#[inline]
899	pub fn try_is_match<'h, I: Into<Input<'h>>>(
900	&self,
901	cache: &mut Cache,
902	input: I,
903	) -> Result<bool, MatchError> {
904	let input = input.into().earliest(`true`);
905	self.try_search_slots(cache, &input, &mut []).map(\|pid\| pid.is_some())
906	}
907
908	/// Executes a leftmost forward search and returns a `Match` if one exists.
909	///
910	/// This routine only includes the overall match span. To get
911	/// access to the individual spans of each capturing group, use
912	/// [`BoundedBacktracker::try_captures`].
913	///
914	/// # Errors
915	///
916	/// This routine only errors if the search could not complete. For this
917	/// backtracking regex engine, this only occurs when the haystack length
918	/// exceeds [`BoundedBacktracker::max_haystack_len`].
919	///
920	/// When a search cannot complete, callers cannot know whether a match
921	/// exists or not.
922	///
923	/// # Example
924	///
925	/// ```
926	/// use regex_automata::{
927	/// nfa::thompson::backtrack::BoundedBacktracker,
928	/// Match,
929	/// };
930	///
931	/// let re = BoundedBacktracker::new("foo[0-9]+")?;
932	/// let mut cache = re.create_cache();
933	/// let expected = Match::must(`0`, `0`..`8`);
934	/// assert_eq!(Some(expected), re.try_find(&mut cache, "foo12345")?);
935	///
936	/// # Ok::<(), Box<dyn std::error::Error>>(())
937	/// ```
938	#[inline]
939	pub fn try_find<'h, I: Into<Input<'h>>>(
940	&self,
941	cache: &mut Cache,
942	input: I,
943	) -> Result<Option<Match>, MatchError> {
944	let input = input.into();
945	if self.get_nfa().pattern_len() == `1` {
946	let mut slots = [None, None];
947	let pid = match self.try_search_slots(cache, &input, &mut slots)? {
948	None => return Ok(None),
949	Some(pid) => pid,
950	};
951	let start = match slots[`0`] {
952	None => return Ok(None),
953	Some(s) => s.get(),
954	};
955	let end = match slots[`1`] {
956	None => return Ok(None),
957	Some(s) => s.get(),
958	};
959	return Ok(Some(Match::new(pid, Span { start, end })));
960	}
961	let ginfo = self.get_nfa().group_info();
962	let slots_len = ginfo.implicit_slot_len();
963	let mut slots = vec![None; slots_len];
964	let pid = match self.try_search_slots(cache, &input, &mut slots)? {
965	None => return Ok(None),
966	Some(pid) => pid,
967	};
968	let start = match slots[pid.as_usize() * `2`] {
969	None => return Ok(None),
970	Some(s) => s.get(),
971	};
972	let end = match slots[pid.as_usize() * `2` + `1`] {
973	None => return Ok(None),
974	Some(s) => s.get(),
975	};
976	Ok(Some(Match::new(pid, Span { start, end })))
977	}
978
979	/// Executes a leftmost forward search and writes the spans of capturing
980	/// groups that participated in a match into the provided [`Captures`]
981	/// value. If no match was found, then [`Captures::is_match`] is guaranteed
982	/// to return `false`.
983	///
984	/// # Errors
985	///
986	/// This routine only errors if the search could not complete. For this
987	/// backtracking regex engine, this only occurs when the haystack length
988	/// exceeds [`BoundedBacktracker::max_haystack_len`].
989	///
990	/// When a search cannot complete, callers cannot know whether a match
991	/// exists or not.
992	///
993	/// # Example
994	///
995	/// ```
996	/// use regex_automata::{
997	/// nfa::thompson::backtrack::BoundedBacktracker,
998	/// Span,
999	/// };
1000	///
1001	/// let re = BoundedBacktracker::new(
1002	/// r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$",
1003	/// )?;
1004	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1005	///
1006	/// re.try_captures(&mut cache, "2010-03-14", &mut caps)?;
1007	/// assert!(caps.is_match());
1008	/// assert_eq!(Some(Span::from(`0`..`4`)), caps.get_group(`1`));
1009	/// assert_eq!(Some(Span::from(`5`..`7`)), caps.get_group(`2`));
1010	/// assert_eq!(Some(Span::from(`8`..`10`)), caps.get_group(`3`));
1011	///
1012	/// # Ok::<(), Box<dyn std::error::Error>>(())
1013	/// ```
1014	#[inline]
1015	pub fn try_captures<'h, I: Into<Input<'h>>>(
1016	&self,
1017	cache: &mut Cache,
1018	input: I,
1019	caps: &mut Captures,
1020	) -> Result<(), MatchError> {
1021	self.try_search(cache, &input.into(), caps)
1022	}
1023
1024	/// Returns an iterator over all non-overlapping leftmost matches in the
1025	/// given bytes. If no match exists, then the iterator yields no elements.
1026	///
1027	/// If the regex engine returns an error at any point, then the iterator
1028	/// will yield that error.
1029	///
1030	/// # Example
1031	///
1032	/// ```
1033	/// use regex_automata::{
1034	/// nfa::thompson::backtrack::BoundedBacktracker,
1035	/// Match, MatchError,
1036	/// };
1037	///
1038	/// let re = BoundedBacktracker::new("foo[0-9]+")?;
1039	/// let mut cache = re.create_cache();
1040	///
1041	/// let text = "foo1 foo12 foo123";
1042	/// let result: Result<Vec<Match>, MatchError> = re
1043	/// .try_find_iter(&mut cache, text)
1044	/// .collect();
1045	/// let matches = result?;
1046	/// assert_eq!(matches, vec![
1047	/// Match::must(`0`, `0`..`4`),
1048	/// Match::must(`0`, `5`..`10`),
1049	/// Match::must(`0`, `11`..`17`),
1050	/// ]);
1051	/// # Ok::<(), Box<dyn std::error::Error>>(())
1052	/// ```
1053	#[inline]
1054	pub fn try_find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
1055	&'r self,
1056	cache: &'c mut Cache,
1057	input: I,
1058	) -> TryFindMatches<'r, 'c, 'h> {
1059	let caps = Captures::matches(self.get_nfa().group_info().clone());
1060	let it = iter::Searcher::new(input.into());
1061	TryFindMatches { re: self, cache, caps, it }
1062	}
1063
1064	/// Returns an iterator over all non-overlapping `Captures` values. If no
1065	/// match exists, then the iterator yields no elements.
1066	///
1067	/// This yields the same matches as [`BoundedBacktracker::try_find_iter`],
1068	/// but it includes the spans of all capturing groups that participate in
1069	/// each match.
1070	///
1071	/// If the regex engine returns an error at any point, then the iterator
1072	/// will yield that error.
1073	///
1074	/// Tip:* See* [`util::iter::Searcher`](crate::util::iter::Searcher) for
1075	/// how to correctly iterate over all matches in a haystack while avoiding
1076	/// the creation of a new `Captures` value for every match. (Which you are
1077	/// forced to do with an `Iterator`.)
1078	///
1079	/// # Example
1080	///
1081	/// ```
1082	/// use regex_automata::{
1083	/// nfa::thompson::backtrack::BoundedBacktracker,
1084	/// Span,
1085	/// };
1086	///
1087	/// let re = BoundedBacktracker::new("foo(?P<numbers>[0-9]+)")?;
1088	/// let mut cache = re.create_cache();
1089	///
1090	/// let text = "foo1 foo12 foo123";
1091	/// let mut spans = vec![];
1092	/// for result in re.try_captures_iter(&mut cache, text) {
1093	/// let caps = result?;
1094	/// // The unwrap is OK since 'numbers' matches if the pattern matches.
1095	/// spans.push(caps.get_group_by_name("numbers").unwrap());
1096	/// }
1097	/// assert_eq!(spans, vec![
1098	/// Span::from(`3`..`4`),
1099	/// Span::from(`8`..`10`),
1100	/// Span::from(`14`..`17`),
1101	/// ]);
1102	/// # Ok::<(), Box<dyn std::error::Error>>(())
1103	/// ```
1104	#[inline]
1105	pub fn try_captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
1106	&'r self,
1107	cache: &'c mut Cache,
1108	input: I,
1109	) -> TryCapturesMatches<'r, 'c, 'h> {
1110	let caps = self.create_captures();
1111	let it = iter::Searcher::new(input.into());
1112	TryCapturesMatches { re: self, cache, caps, it }
1113	}
1114	}
1115
1116	impl BoundedBacktracker {
1117	/// Executes a leftmost forward search and writes the spans of capturing
1118	/// groups that participated in a match into the provided [`Captures`]
1119	/// value. If no match was found, then [`Captures::is_match`] is guaranteed
1120	/// to return `false`.
1121	///
1122	/// This is like [`BoundedBacktracker::try_captures`], but it accepts a
1123	/// concrete `&Input` instead of an `Into<Input>`.
1124	///
1125	/// # Errors
1126	///
1127	/// This routine only errors if the search could not complete. For this
1128	/// backtracking regex engine, this only occurs when the haystack length
1129	/// exceeds [`BoundedBacktracker::max_haystack_len`].
1130	///
1131	/// When a search cannot complete, callers cannot know whether a match
1132	/// exists or not.
1133	///
1134	/// # Example: specific pattern search
1135	///
1136	/// This example shows how to build a multi bounded backtracker that
1137	/// permits searching for specific patterns.
1138	///
1139	/// ```
1140	/// use regex_automata::{
1141	/// nfa::thompson::backtrack::BoundedBacktracker,
1142	/// Anchored, Input, Match, PatternID,
1143	/// };
1144	///
1145	/// let re = BoundedBacktracker::new_many(&[
1146	/// "[a-z0-9]{6}",
1147	/// "[a-z][a-z0-9]{5}",
1148	/// ])?;
1149	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1150	/// let haystack = "foo123";
1151	///
1152	/// // Since we are using the default leftmost-first match and both
1153	/// // patterns match at the same starting position, only the first pattern
1154	/// // will be returned in this case when doing a search for any of the
1155	/// // patterns.
1156	/// let expected = Some(Match::must(`0`, `0`..`6`));
1157	/// re.try_search(&mut cache, &Input::new(haystack), &mut caps)?;
1158	/// assert_eq!(expected, caps.get_match());
1159	///
1160	/// // But if we want to check whether some other pattern matches, then we
1161	/// // can provide its pattern ID.
1162	/// let expected = Some(Match::must(`1`, `0`..`6`));
1163	/// let input = Input::new(haystack)
1164	/// .anchored(Anchored::Pattern(PatternID::must(`1`)));
1165	/// re.try_search(&mut cache, &input, &mut caps)?;
1166	/// assert_eq!(expected, caps.get_match());
1167	///
1168	/// # Ok::<(), Box<dyn std::error::Error>>(())
1169	/// ```
1170	///
1171	/// # Example: specifying the bounds of a search
1172	///
1173	/// This example shows how providing the bounds of a search can produce
1174	/// different results than simply sub-slicing the haystack.
1175	///
1176	/// ```
1177	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1178	/// use regex_automata::{
1179	/// nfa::thompson::backtrack::BoundedBacktracker,
1180	/// Match, Input,
1181	/// };
1182	///
1183	/// let re = BoundedBacktracker::new(r"\b[0-9]{3}\b")?;
1184	/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1185	/// let haystack = "foo123bar";
1186	///
1187	/// // Since we sub-slice the haystack, the search doesn't know about
1188	/// // the larger context and assumes that `123` is surrounded by word
1189	/// // boundaries. And of course, the match position is reported relative
1190	/// // to the sub-slice as well, which means we get `0..3` instead of
1191	/// // `3..6`.
1192	/// let expected = Some(Match::must(`0`, `0`..`3`));
1193	/// re.try_search(&mut cache, &Input::new(&haystack[`3`..`6`]), &mut caps)?;
1194	/// assert_eq!(expected, caps.get_match());
1195	///
1196	/// // But if we provide the bounds of the search within the context of the
1197	/// // entire haystack, then the search can take the surrounding context
1198	/// // into account. (And if we did find a match, it would be reported
1199	/// // as a valid offset into `haystack` instead of its sub-slice.)
1200	/// let expected = None;
1201	/// re.try_search(
1202	/// &mut cache, &Input::new(haystack).range(`3`..`6`), &mut caps,
1203	/// )?;
1204	/// assert_eq!(expected, caps.get_match());
1205	///
1206	/// # Ok::<(), Box<dyn std::error::Error>>(())
1207	/// ```
1208	#[inline]
1209	pub fn try_search(
1210	&self,
1211	cache: &mut Cache,
1212	input: &Input<'_>,
1213	caps: &mut Captures,
1214	) -> Result<(), MatchError> {
1215	caps.set_pattern(None);
1216	let pid = self.try_search_slots(cache, input, caps.slots_mut())?;
1217	caps.set_pattern(pid);
1218	Ok(())
1219	}
1220
1221	/// Executes a leftmost forward search and writes the spans of capturing
1222	/// groups that participated in a match into the provided `slots`, and
1223	/// returns the matching pattern ID. The contents of the slots for patterns
1224	/// other than the matching pattern are unspecified. If no match was found,
1225	/// then `None` is returned and the contents of all `slots` is unspecified.
1226	///
1227	/// This is like [`BoundedBacktracker::try_search`], but it accepts a raw
1228	/// slots slice instead of a `Captures` value. This is useful in contexts
1229	/// where you don't want or need to allocate a `Captures`.
1230	///
1231	/// It is legal to pass _any_ number of slots to this routine. If the regex
1232	/// engine would otherwise write a slot offset that doesn't fit in the
1233	/// provided slice, then it is simply skipped. In general though, there are
1234	/// usually three slice lengths you might want to use:
1235	///
1236	/// An empty slice, if you only care about which pattern matched.*
1237	/// A slice with*
1238	/// [`pattern_len() 2`](crate::nfa::thompson::NFA::pattern_len)*
1239	/// slots, if you only care about the overall match spans for each matching
1240	/// pattern.
1241	/// A slice with*
1242	/// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
1243	/// permits recording match offsets for every capturing group in every
1244	/// pattern.
1245	///
1246	/// # Errors
1247	///
1248	/// This routine only errors if the search could not complete. For this
1249	/// backtracking regex engine, this only occurs when the haystack length
1250	/// exceeds [`BoundedBacktracker::max_haystack_len`].
1251	///
1252	/// When a search cannot complete, callers cannot know whether a match
1253	/// exists or not.
1254	///
1255	/// # Example
1256	///
1257	/// This example shows how to find the overall match offsets in a
1258	/// multi-pattern search without allocating a `Captures` value. Indeed, we
1259	/// can put our slots right on the stack.
1260	///
1261	/// ```
1262	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1263	/// use regex_automata::{
1264	/// nfa::thompson::backtrack::BoundedBacktracker,
1265	/// PatternID, Input,
1266	/// };
1267	///
1268	/// let re = BoundedBacktracker::new_many(&[
1269	/// r"\pL+",
1270	/// r"\d+",
1271	/// ])?;
1272	/// let mut cache = re.create_cache();
1273	/// let input = Input::new("!@#123");
1274	///
1275	/// // We only care about the overall match offsets here, so we just
1276	/// // allocate two slots for each pattern. Each slot records the start
1277	/// // and end of the match.
1278	/// let mut slots = [None; `4`];
1279	/// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?;
1280	/// assert_eq!(Some(PatternID::must(`1`)), pid);
1281	///
1282	/// // The overall match offsets are always at 'pid 2' and 'pid * 2 + 1'.*
1283	/// // See 'GroupInfo' for more details on the mapping between groups and
1284	/// // slot indices.
1285	/// let slot_start = pid.unwrap().as_usize() * `2`;
1286	/// let slot_end = slot_start + `1`;
1287	/// assert_eq!(Some(`3`), slots[slot_start].map(\|s\| s.get()));
1288	/// assert_eq!(Some(`6`), slots[slot_end].map(\|s\| s.get()));
1289	///
1290	/// # Ok::<(), Box<dyn std::error::Error>>(())
1291	/// ```
1292	#[inline]
1293	pub fn try_search_slots(
1294	&self,
1295	cache: &mut Cache,
1296	input: &Input<'_>,
1297	slots: &mut [Option<NonMaxUsize>],
1298	) -> Result<Option<PatternID>, MatchError> {
1299	let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1300	if !utf8empty {
1301	let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1302	return Ok(maybe_hm.map(\|hm\| hm.pattern()));
1303	}
1304	// See PikeVM::try_search_slots for why we do this.
1305	let min = self.get_nfa().group_info().implicit_slot_len();
1306	if slots.len() >= min {
1307	let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1308	return Ok(maybe_hm.map(\|hm\| hm.pattern()));
1309	}
1310	if self.get_nfa().pattern_len() == `1` {
1311	let mut enough = [None, None];
1312	let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1313	// This is OK because we know `enough_slots` is strictly bigger
1314	// than `slots`, otherwise this special case isn't reached.
1315	slots.copy_from_slice(&enough[..slots.len()]);
1316	return Ok(got.map(\|hm\| hm.pattern()));
1317	}
1318	let mut enough = vec![None; min];
1319	let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1320	// This is OK because we know `enough_slots` is strictly bigger than
1321	// `slots`, otherwise this special case isn't reached.
1322	slots.copy_from_slice(&enough[..slots.len()]);
1323	Ok(got.map(\|hm\| hm.pattern()))
1324	}
1325
1326	/// This is the actual implementation of `try_search_slots_imp` that
1327	/// doesn't account for the special case when 1) the NFA has UTF-8 mode
1328	/// enabled, 2) the NFA can match the empty string and 3) the caller has
1329	/// provided an insufficient number of slots to record match offsets.
1330	#[inline(never)]
1331	fn try_search_slots_imp(
1332	&self,
1333	cache: &mut Cache,
1334	input: &Input<'_>,
1335	slots: &mut [Option<NonMaxUsize>],
1336	) -> Result<Option<HalfMatch>, MatchError> {
1337	let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1338	let hm = match self.search_imp(cache, input, slots)? {
1339	None => return Ok(None),
1340	Some(hm) if !utf8empty => return Ok(Some(hm)),
1341	Some(hm) => hm,
1342	};
1343	empty::skip_splits_fwd(input, hm, hm.offset(), \|input\| {
1344	Ok(self
1345	.search_imp(cache, input, slots)?
1346	.map(\|hm\| (hm, hm.offset())))
1347	})
1348	}
1349
1350	/// The implementation of standard leftmost backtracking search.
1351	///
1352	/// Capturing group spans are written to 'caps', but only if requested.
1353	/// 'caps' can be one of three things: 1) totally empty, in which case, we
1354	/// only report the pattern that matched or 2) only has slots for recording
1355	/// the overall match offsets for any pattern or 3) has all slots available
1356	/// for recording the spans of any groups participating in a match.
1357	fn search_imp(
1358	&self,
1359	cache: &mut Cache,
1360	input: &Input<'_>,
1361	slots: &mut [Option<NonMaxUsize>],
1362	) -> Result<Option<HalfMatch>, MatchError> {
1363	// Unlike in the PikeVM, we write our capturing group spans directly
1364	// into the caller's captures groups. So we have to make sure we're
1365	// starting with a blank slate first. In the PikeVM, we avoid this
1366	// by construction: the spans that are copied to every slot in the
1367	// 'Captures' value already account for presence/absence. In this
1368	// backtracker, we write directly into the caller provided slots, where
1369	// as in the PikeVM, we write into scratch space first and only copy
1370	// them to the caller provided slots when a match is found.
1371	for slot in slots.iter_mut() {
1372	*slot = None;
1373	}
1374	cache.setup_search(&self, input)?;
1375	if input.is_done() {
1376	return Ok(None);
1377	}
1378	let (anchored, start_id) = match input.get_anchored() {
1379	// Only way we're unanchored is if both the caller asked for an
1380	// unanchored search and* the pattern is itself not anchored.*
1381	Anchored::No => (
1382	self.nfa.is_always_start_anchored(),
1383	// We always use the anchored starting state here, even if
1384	// doing an unanchored search. The "unanchored" part of it is
1385	// implemented in the loop below, by simply trying the next
1386	// byte offset if the previous backtracking exploration failed.
1387	self.nfa.start_anchored(),
1388	),
1389	Anchored::Yes => (`true`, self.nfa.start_anchored()),
1390	Anchored::Pattern(pid) => match self.nfa.start_pattern(pid) {
1391	None => return Ok(None),
1392	Some(sid) => (`true`, sid),
1393	},
1394	};
1395	if anchored {
1396	let at = input.start();
1397	return Ok(self.backtrack(cache, input, at, start_id, slots));
1398	}
1399	let pre = self.get_config().get_prefilter();
1400	let mut at = input.start();
1401	while at <= input.end() {
1402	if let Some(ref pre) = pre {
1403	let span = Span::from(at..input.end());
1404	match pre.find(input.haystack(), span) {
1405	None => break,
1406	Some(ref span) => at = span.start,
1407	}
1408	}
1409	if let Some(hm) = self.backtrack(cache, input, at, start_id, slots)
1410	{
1411	return Ok(Some(hm));
1412	}
1413	at += `1`;
1414	}
1415	Ok(None)
1416	}
1417
1418	/// Look for a match starting at `at` in `input` and write the matching
1419	/// pattern ID and group spans to `caps`. The search uses `start_id` as its
1420	/// starting state in the underlying NFA.
1421	///
1422	/// If no match was found, then the caller should increment `at` and try
1423	/// at the next position.
1424	#[cfg_attr(feature = "perf-inline", inline(always))]
1425	fn backtrack(
1426	&self,
1427	cache: &mut Cache,
1428	input: &Input<'_>,
1429	at: usize,
1430	start_id: StateID,
1431	slots: &mut [Option<NonMaxUsize>],
1432	) -> Option<HalfMatch> {
1433	cache.stack.push(Frame::Step { sid: start_id, at });
1434	while let Some(frame) = cache.stack.pop() {
1435	match frame {
1436	Frame::Step { sid, at } => {
1437	if let Some(hm) = self.step(cache, input, sid, at, slots) {
1438	return Some(hm);
1439	}
1440	}
1441	Frame::RestoreCapture { slot, offset } => {
1442	slots[slot] = offset;
1443	}
1444	}
1445	}
1446	None
1447	}
1448
1449	// LAMENTATION: The actual backtracking search is implemented in about
1450	// 75 lines below. Yet this file is over 2,000 lines long. What have I
1451	// done?
1452
1453	/// Execute a "step" in the backtracing algorithm.
1454	///
1455	/// A "step" is somewhat of a misnomer, because this routine keeps going
1456	/// until it either runs out of things to try or fins a match. In the
1457	/// former case, it may have pushed some things on to the backtracking
1458	/// stack, in which case, those will be tried next as part of the
1459	/// 'backtrack' routine above.
1460	#[cfg_attr(feature = "perf-inline", inline(always))]
1461	fn step(
1462	&self,
1463	cache: &mut Cache,
1464	input: &Input<'_>,
1465	mut sid: StateID,
1466	mut at: usize,
1467	slots: &mut [Option<NonMaxUsize>],
1468	) -> Option<HalfMatch> {
1469	loop {
1470	if !cache.visited.insert(sid, at - input.start()) {
1471	return None;
1472	}
1473	match *self.nfa.state(sid) {
1474	State::ByteRange { ref trans } => {
1475	// Why do we need this? Unlike other regex engines in this
1476	// crate, the backtracker can steam roll ahead in the
1477	// haystack outside of the main loop over the bytes in the
1478	// haystack. While 'trans.matches()' below handles the case
1479	// of 'at' being out of bounds of 'input.haystack()', we
1480	// also need to handle the case of 'at' going out of bounds
1481	// of the span the caller asked to search.
1482	//
1483	// We should perhaps make the 'trans.matches()' API accept
1484	// an '&Input' instead of a '&[u8]'. Or at least, add a new
1485	// API that does it.
1486	if at >= input.end() {
1487	return None;
1488	}
1489	if !trans.matches(input.haystack(), at) {
1490	return None;
1491	}
1492	sid = trans.next;
1493	at += `1`;
1494	}
1495	State::Sparse(ref sparse) => {
1496	if at >= input.end() {
1497	return None;
1498	}
1499	sid = sparse.matches(input.haystack(), at)?;
1500	at += `1`;
1501	}
1502	State::Dense(ref dense) => {
1503	if at >= input.end() {
1504	return None;
1505	}
1506	sid = dense.matches(input.haystack(), at)?;
1507	at += `1`;
1508	}
1509	State::Look { look, next } => {
1510	// OK because we don't permit building a searcher with a
1511	// Unicode word boundary if the requisite Unicode data is
1512	// unavailable.
1513	if !self.nfa.look_matcher().matches_inline(
1514	look,
1515	input.haystack(),
1516	at,
1517	) {
1518	return None;
1519	}
1520	sid = next;
1521	}
1522	State::Union { ref alternates } => {
1523	sid = match alternates.get(`0`) {
1524	None => return None,
1525	Some(&sid) => sid,
1526	};
1527	cache.stack.extend(
1528	alternates[`1`..]
1529	.iter()
1530	.copied()
1531	.rev()
1532	.map(\|sid\| Frame::Step { sid, at }),
1533	);
1534	}
1535	State::BinaryUnion { alt1, alt2 } => {
1536	sid = alt1;
1537	cache.stack.push(Frame::Step { sid: alt2, at });
1538	}
1539	State::Capture { next, slot, .. } => {
1540	if slot.as_usize() < slots.len() {
1541	cache.stack.push(Frame::RestoreCapture {
1542	slot,
1543	offset: slots[slot],
1544	});
1545	slots[slot] = NonMaxUsize::new(at);
1546	}
1547	sid = next;
1548	}
1549	State::Fail => return None,
1550	State::Match { pattern_id } => {
1551	return Some(HalfMatch::new(pattern_id, at));
1552	}
1553	}
1554	}
1555	}
1556	}
1557
1558	/// An iterator over all non-overlapping matches for a fallible search.
1559	///
1560	/// The iterator yields a `Result<Match, MatchError` value until no more
1561	/// matches could be found.
1562	///
1563	/// The lifetime parameters are as follows:
1564	///
1565	/// `'r` represents the lifetime of the BoundedBacktracker.*
1566	/// `'c` represents the lifetime of the BoundedBacktracker's cache.*
1567	/// `'h` represents the lifetime of the haystack being searched.*
1568	///
1569	/// This iterator can be created with the [`BoundedBacktracker::try_find_iter`]
1570	/// method.
1571	#[derive(Debug)]
1572	pub struct TryFindMatches<'r, 'c, 'h> {
1573	re: &'r BoundedBacktracker,
1574	cache: &'c mut Cache,
1575	caps: Captures,
1576	it: iter::Searcher<'h>,
1577	}
1578
1579	impl<'r, 'c, 'h> Iterator for TryFindMatches<'r, 'c, 'h> {
1580	type Item = Result<Match, MatchError>;
1581
1582	#[inline]
1583	fn next(&mut self) -> Option<Result<Match, MatchError>> {
1584	// Splitting 'self' apart seems necessary to appease borrowck.
1585	let TryFindMatches { re, ref mut cache, ref mut caps, ref mut it } =
1586	*self;
1587	it.try_advance(\|input\| {
1588	re.try_search(cache, input, caps)?;
1589	Ok(caps.get_match())
1590	})
1591	.transpose()
1592	}
1593	}
1594
1595	/// An iterator over all non-overlapping leftmost matches, with their capturing
1596	/// groups, for a fallible search.
1597	///
1598	/// The iterator yields a `Result<Captures, MatchError>` value until no more
1599	/// matches could be found.
1600	///
1601	/// The lifetime parameters are as follows:
1602	///
1603	/// `'r` represents the lifetime of the BoundedBacktracker.*
1604	/// `'c` represents the lifetime of the BoundedBacktracker's cache.*
1605	/// `'h` represents the lifetime of the haystack being searched.*
1606	///
1607	/// This iterator can be created with the
1608	/// [`BoundedBacktracker::try_captures_iter`] method.
1609	#[derive(Debug)]
1610	pub struct TryCapturesMatches<'r, 'c, 'h> {
1611	re: &'r BoundedBacktracker,
1612	cache: &'c mut Cache,
1613	caps: Captures,
1614	it: iter::Searcher<'h>,
1615	}
1616
1617	impl<'r, 'c, 'h> Iterator for TryCapturesMatches<'r, 'c, 'h> {
1618	type Item = Result<Captures, MatchError>;
1619
1620	#[inline]
1621	fn next(&mut self) -> Option<Result<Captures, MatchError>> {
1622	// Splitting 'self' apart seems necessary to appease borrowck.
1623	let TryCapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
1624	*self;
1625	let _ = it
1626	.try_advance(\|input\| {
1627	re.try_search(cache, input, caps)?;
1628	Ok(caps.get_match())
1629	})
1630	.transpose()?;
1631	if caps.is_match() {
1632	Some(Ok(caps.clone()))
1633	} else {
1634	None
1635	}
1636	}
1637	}
1638
1639	/// A cache represents mutable state that a [`BoundedBacktracker`] requires
1640	/// during a search.
1641	///
1642	/// For a given [`BoundedBacktracker`], its corresponding cache may be created
1643	/// either via [`BoundedBacktracker::create_cache`], or via [`Cache::new`].
1644	/// They are equivalent in every way, except the former does not require
1645	/// explicitly importing `Cache`.
1646	///
1647	/// A particular `Cache` is coupled with the [`BoundedBacktracker`] from which
1648	/// it was created. It may only be used with that `BoundedBacktracker`. A cache
1649	/// and its allocations may be re-purposed via [`Cache::reset`], in which case,
1650	/// it can only be used with the new `BoundedBacktracker` (and not the old
1651	/// one).
1652	#[derive(Clone, Debug)]
1653	pub struct Cache {
1654	/// Stack used on the heap for doing backtracking instead of the
1655	/// traditional recursive approach. We don't want recursion because then
1656	/// we're likely to hit a stack overflow for bigger regexes.
1657	stack: Vec<Frame>,
1658	/// The set of (StateID, HaystackOffset) pairs that have been visited
1659	/// by the backtracker within a single search. If such a pair has been
1660	/// visited, then we avoid doing the work for that pair again. This is
1661	/// what "bounds" the backtracking and prevents it from having worst case
1662	/// exponential time.
1663	visited: Visited,
1664	}
1665
1666	impl Cache {
1667	/// Create a new [`BoundedBacktracker`] cache.
1668	///
1669	/// A potentially more convenient routine to create a cache is
1670	/// [`BoundedBacktracker::create_cache`], as it does not require also
1671	/// importing the `Cache` type.
1672	///
1673	/// If you want to reuse the returned `Cache` with some other
1674	/// `BoundedBacktracker`, then you must call [`Cache::reset`] with the
1675	/// desired `BoundedBacktracker`.
1676	pub fn new(re: &BoundedBacktracker) -> Cache {
1677	Cache { stack: vec![], visited: Visited::new(re) }
1678	}
1679
1680	/// Reset this cache such that it can be used for searching with different
1681	/// [`BoundedBacktracker`].
1682	///
1683	/// A cache reset permits reusing memory already allocated in this cache
1684	/// with a different `BoundedBacktracker`.
1685	///
1686	/// # Example
1687	///
1688	/// This shows how to re-purpose a cache for use with a different
1689	/// `BoundedBacktracker`.
1690	///
1691	/// ```
1692	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1693	/// use regex_automata::{
1694	/// nfa::thompson::backtrack::BoundedBacktracker,
1695	/// Match,
1696	/// };
1697	///
1698	/// let re1 = BoundedBacktracker::new(r"\w")?;
1699	/// let re2 = BoundedBacktracker::new(r"\W")?;
1700	///
1701	/// let mut cache = re1.create_cache();
1702	/// assert_eq!(
1703	/// Some(Ok(Match::must(`0`, `0`..`2`))),
1704	/// re1.try_find_iter(&mut cache, "Δ").next(),
1705	/// );
1706	///
1707	/// // Using 'cache' with re2 is not allowed. It may result in panics or
1708	/// // incorrect results. In order to re-purpose the cache, we must reset
1709	/// // it with the BoundedBacktracker we'd like to use it with.
1710	/// //
1711	/// // Similarly, after this reset, using the cache with 're1' is also not
1712	/// // allowed.
1713	/// cache.reset(&re2);
1714	/// assert_eq!(
1715	/// Some(Ok(Match::must(`0`, `0`..`3`))),
1716	/// re2.try_find_iter(&mut cache, "☃").next(),
1717	/// );
1718	///
1719	/// # Ok::<(), Box<dyn std::error::Error>>(())
1720	/// ```
1721	pub fn reset(&mut self, re: &BoundedBacktracker) {
1722	self.visited.reset(re);
1723	}
1724
1725	/// Returns the heap memory usage, in bytes, of this cache.
1726	///
1727	/// This does not* include the stack size used up by this cache. To*
1728	/// compute that, use `std::mem::size_of::<Cache>()`.
1729	pub fn memory_usage(&self) -> usize {
1730	self.stack.len() * core::mem::size_of::<Frame>()
1731	+ self.visited.memory_usage()
1732	}
1733
1734	/// Clears this cache. This should be called at the start of every search
1735	/// to ensure we start with a clean slate.
1736	///
1737	/// This also sets the length of the capturing groups used in the current
1738	/// search. This permits an optimization where by 'SlotTable::for_state'
1739	/// only returns the number of slots equivalent to the number of slots
1740	/// given in the 'Captures' value. This may be less than the total number
1741	/// of possible slots, e.g., when one only wants to track overall match
1742	/// offsets. This in turn permits less copying of capturing group spans
1743	/// in the BoundedBacktracker.
1744	fn setup_search(
1745	&mut self,
1746	re: &BoundedBacktracker,
1747	input: &Input<'_>,
1748	) -> Result<(), MatchError> {
1749	self.stack.clear();
1750	self.visited.setup_search(re, input)?;
1751	Ok(())
1752	}
1753	}
1754
1755	/// Represents a stack frame on the heap while doing backtracking.
1756	///
1757	/// Instead of using explicit recursion for backtracking, we use a stack on
1758	/// the heap to keep track of things that we want to explore if the current
1759	/// backtracking branch turns out to not lead to a match.
1760	#[derive(Clone, Debug)]
1761	enum Frame {
1762	/// Look for a match starting at `sid` and the given position in the
1763	/// haystack.
1764	Step { sid: StateID, at: usize },
1765	/// Reset the given `slot` to the given `offset` (which might be `None`).
1766	/// This effectively gives a "scope" to capturing groups, such that an
1767	/// offset for a particular group only gets returned if the match goes
1768	/// through that capturing group. If backtracking ends up going down a
1769	/// different branch that results in a different offset (or perhaps none at
1770	/// all), then this "restore capture" frame will cause the offset to get
1771	/// reset.
1772	RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
1773	}
1774
1775	/// A bitset that keeps track of whether a particular (StateID, offset) has
1776	/// been considered during backtracking. If it has already been visited, then
1777	/// backtracking skips it. This is what gives backtracking its "bound."
1778	#[derive(Clone, Debug)]
1779	struct Visited {
1780	/// The actual underlying bitset. Each element in the bitset corresponds
1781	/// to a particular (StateID, offset) pair. States correspond to the rows
1782	/// and the offsets correspond to the columns.
1783	///
1784	/// If our underlying NFA has N states and the haystack we're searching
1785	/// has M bytes, then we have N(M+1) entries in our bitset table. The*
1786	/// M+1 occurs because our matches are delayed by one byte (to support
1787	/// look-around), and so we need to handle the end position itself rather
1788	/// than stopping just before the end. (If there is no end position, then
1789	/// it's treated as "end-of-input," which is matched by things like '$'.)
1790	///
1791	/// Given BITS=N(M+1), we wind up with div_ceil(BITS, sizeof(usize))*
1792	/// blocks.
1793	///
1794	/// We use 'usize' to represent our blocks because it makes some of the
1795	/// arithmetic in 'insert' a bit nicer. For example, if we used 'u32' for
1796	/// our block, we'd either need to cast u32s to usizes or usizes to u32s.
1797	bitset: Vec<usize>,
1798	/// The stride represents one plus length of the haystack we're searching
1799	/// (as described above). The stride must be initialized for each search.
1800	stride: usize,
1801	}
1802
1803	impl Visited {
1804	/// The size of each block, in bits.
1805	const BLOCK_SIZE: usize = `8` * core::mem::size_of::<usize>();
1806
1807	/// Create a new visited set for the given backtracker.
1808	///
1809	/// The set is ready to use, but must be setup at the beginning of each
1810	/// search by calling `setup_search`.
1811	fn new(re: &BoundedBacktracker) -> Visited {
1812	let mut visited = Visited { bitset: vec![], stride: `0` };
1813	visited.reset(re);
1814	visited
1815	}
1816
1817	/// Insert the given (StateID, offset) pair into this set. If it already
1818	/// exists, then this is a no-op and it returns false. Otherwise this
1819	/// returns true.
1820	fn insert(&mut self, sid: StateID, at: usize) -> bool {
1821	let table_index = sid.as_usize() * self.stride + at;
1822	let block_index = table_index / Visited::BLOCK_SIZE;
1823	let bit = table_index % Visited::BLOCK_SIZE;
1824	let block_with_bit = `1` << bit;
1825	if self.bitset[block_index] & block_with_bit != `0` {
1826	return `false`;
1827	}
1828	self.bitset[block_index] \|= block_with_bit;
1829	`true`
1830	}
1831
1832	/// Reset this visited set to work with the given bounded backtracker.
1833	fn reset(&mut self, _: &BoundedBacktracker) {
1834	self.bitset.truncate(`0`);
1835	}
1836
1837	/// Setup this visited set to work for a search using the given NFA
1838	/// and input configuration. The NFA must be the same NFA used by the
1839	/// BoundedBacktracker given to Visited::reset. Failing to call this might
1840	/// result in panics or silently incorrect search behavior.
1841	fn setup_search(
1842	&mut self,
1843	re: &BoundedBacktracker,
1844	input: &Input<'_>,
1845	) -> Result<(), MatchError> {
1846	// Our haystack length is only the length of the span of the entire
1847	// haystack that we'll be searching.
1848	let haylen = input.get_span().len();
1849	let err = \|\| MatchError::haystack_too_long(haylen);
1850	// Our stride is one more than the length of the input because our main
1851	// search loop includes the position at input.end(). (And it does this
1852	// because matches are delayed by one byte to account for look-around.)
1853	self.stride = haylen + `1`;
1854	let needed_capacity =
1855	match re.get_nfa().states().len().checked_mul(self.stride) {
1856	None => return Err(err()),
1857	Some(capacity) => capacity,
1858	};
1859	let max_capacity = `8` * re.get_config().get_visited_capacity();
1860	if needed_capacity > max_capacity {
1861	return Err(err());
1862	}
1863	let needed_blocks = div_ceil(needed_capacity, Visited::BLOCK_SIZE);
1864	self.bitset.truncate(needed_blocks);
1865	for block in self.bitset.iter_mut() {
1866	*block = `0`;
1867	}
1868	if needed_blocks > self.bitset.len() {
1869	self.bitset.resize(needed_blocks, `0`);
1870	}
1871	Ok(())
1872	}
1873
1874	/// Return the heap memory usage, in bytes, of this visited set.
1875	fn memory_usage(&self) -> usize {
1876	self.bitset.len() * core::mem::size_of::<usize>()
1877	}
1878	}
1879
1880	/// Integer division, but rounds up instead of down.
1881	fn div_ceil(lhs: usize, rhs: usize) -> usize {
1882	if lhs % rhs == `0` {
1883	lhs / rhs
1884	} else {
1885	(lhs / rhs) + `1`
1886	}
1887	}
1888
1889	#[cfg(test)]
1890	mod tests {
1891	use super::*;
1892
1893	// This is a regression test for the maximum haystack length computation.
1894	// Previously, it assumed that the total capacity of the backtracker's
1895	// bitset would always be greater than the number of NFA states. But there
1896	// is of course no guarantee that this is true. This regression test
1897	// ensures that not only does `max_haystack_len` not panic, but that it
1898	// should return `0`.
1899	#[cfg(feature = "syntax")]
1900	#[test]
1901	fn max_haystack_len_overflow() {
1902	let re = BoundedBacktracker::builder()
1903	.configure(BoundedBacktracker::config().visited_capacity(`10`))
1904	.build(r"[0-9A-Za-z]{100}")
1905	.unwrap();
1906	assert_eq!(`0`, re.max_haystack_len());
1907	}
1908	}
1909