exec.rs source code [crates/regex-1.8.4/src/exec.rs]

1	use std::cell::RefCell;
2	use std::collections::HashMap;
3	use std::panic::AssertUnwindSafe;
4	use std::sync::Arc;
5
6	#[cfg(feature = "perf-literal")]
7	use aho_corasick::{AhoCorasick, MatchKind};
8	use regex_syntax::hir::literal;
9	use regex_syntax::hir::{Hir, Look};
10	use regex_syntax::ParserBuilder;
11
12	use crate::backtrack;
13	use crate::compile::Compiler;
14	#[cfg(feature = "perf-dfa")]
15	use crate::dfa;
16	use crate::error::Error;
17	use crate::input::{ByteInput, CharInput};
18	use crate::literal::LiteralSearcher;
19	use crate::pikevm;
20	use crate::pool::{Pool, PoolGuard};
21	use crate::prog::Program;
22	use crate::re_builder::RegexOptions;
23	use crate::re_bytes;
24	use crate::re_set;
25	use crate::re_trait::{Locations, RegularExpression, Slot};
26	use crate::re_unicode;
27	use crate::utf8::next_utf8;
28
29	/// `Exec` manages the execution of a regular expression.
30	///
31	/// In particular, this manages the various compiled forms of a single regular
32	/// expression and the choice of which matching engine to use to execute a
33	/// regular expression.
34	#[derive(Debug)]
35	pub struct Exec {
36	/// All read only state.
37	ro: Arc<ExecReadOnly>,
38	/// A pool of reusable values for the various matching engines.
39	///
40	/// Note that boxing this value is not strictly necessary, but it is an
41	/// easy way to ensure that T does not bloat the stack sized used by a pool
42	/// in the case where T is big. And this turns out to be the case at the
43	/// time of writing for regex's use of this pool. At the time of writing,
44	/// the size of a Regex on the stack is 856 bytes. Boxing this value
45	/// reduces that size to 16 bytes.
46	pool: Box<Pool<ProgramCache>>,
47	}
48
49	/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
50	/// means it is no longer Sync, but we can now avoid the overhead of
51	/// synchronization to fetch the cache.
52	#[derive(Debug)]
53	pub struct ExecNoSync<'c> {
54	/// All read only state.
55	ro: &'c Arc<ExecReadOnly>,
56	/// Caches for the various matching engines.
57	cache: PoolGuard<'c, ProgramCache>,
58	}
59
60	/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
61	#[derive(Debug)]
62	pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
63
64	/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
65	/// state is determined at compile time and never changes during search.
66	#[derive(Debug)]
67	struct ExecReadOnly {
68	/// The original regular expressions given by the caller to compile.
69	res: Vec<String>,
70	/// A compiled program that is used in the NFA simulation and backtracking.
71	/// It can be byte-based or Unicode codepoint based.
72	///
73	/// N.B. It is not possibly to make this byte-based from the public API.
74	/// It is only used for testing byte based programs in the NFA simulations.
75	nfa: Program,
76	/// A compiled byte based program for DFA execution. This is only used
77	/// if a DFA can be executed. (Currently, only word boundary assertions are
78	/// not supported.) Note that this program contains an embedded `.?`*
79	/// preceding the first capture group, unless the regex is anchored at the
80	/// beginning.
81	#[allow(dead_code)]
82	dfa: Program,
83	/// The same as above, except the program is reversed (and there is no
84	/// preceding `.?`). This is used by the DFA to find the starting location*
85	/// of matches.
86	#[allow(dead_code)]
87	dfa_reverse: Program,
88	/// A set of suffix literals extracted from the regex.
89	///
90	/// Prefix literals are stored on the `Program`, since they are used inside
91	/// the matching engines.
92	#[allow(dead_code)]
93	suffixes: LiteralSearcher,
94	/// An Aho-Corasick automaton with leftmost-first match semantics.
95	///
96	/// This is only set when the entire regex is a simple unanchored
97	/// alternation of literals. We could probably use it more circumstances,
98	/// but this is already hacky enough in this architecture.
99	///
100	/// N.B. We use u32 as a state ID representation under the assumption that
101	/// if we were to exhaust the ID space, we probably would have long
102	/// surpassed the compilation size limit.
103	#[cfg(feature = "perf-literal")]
104	ac: Option<AhoCorasick>,
105	/// match_type encodes as much upfront knowledge about how we're going to
106	/// execute a search as possible.
107	match_type: MatchType,
108	}
109
110	/// Facilitates the construction of an executor by exposing various knobs
111	/// to control how a regex is executed and what kinds of resources it's
112	/// permitted to use.
113	// `ExecBuilder` is only public via the `internal` module, so avoid deriving
114	// `Debug`.
115	#[allow(missing_debug_implementations)]
116	pub struct ExecBuilder {
117	options: RegexOptions,
118	match_type: Option<MatchType>,
119	bytes: bool,
120	only_utf8: bool,
121	}
122
123	/// Parsed represents a set of parsed regular expressions and their detected
124	/// literals.
125	struct Parsed {
126	exprs: Vec<Hir>,
127	prefixes: literal::Seq,
128	suffixes: literal::Seq,
129	bytes: bool,
130	}
131
132	impl ExecBuilder {
133	/// Create a regex execution builder.
134	///
135	/// This uses default settings for everything except the regex itself,
136	/// which must be provided. Further knobs can be set by calling methods,
137	/// and then finally, `build` to actually create the executor.
138	pub fn new(re: &str) -> Self {
139	Self::new_many(&[re])
140	}
141
142	/// Like new, but compiles the union of the given regular expressions.
143	///
144	/// Note that when compiling 2 or more regular expressions, capture groups
145	/// are completely unsupported. (This means both `find` and `captures`
146	/// won't work.)
147	pub fn new_many<I, S>(res: I) -> Self
148	where
149	S: AsRef<str>,
150	I: IntoIterator<Item = S>,
151	{
152	let mut opts = RegexOptions::default();
153	opts.pats = res.into_iter().map(\|s\| s.as_ref().to_owned()).collect();
154	Self::new_options(opts)
155	}
156
157	/// Create a regex execution builder.
158	pub fn new_options(opts: RegexOptions) -> Self {
159	ExecBuilder {
160	options: opts,
161	match_type: None,
162	bytes: `false`,
163	only_utf8: `true`,
164	}
165	}
166
167	/// Set the matching engine to be automatically determined.
168	///
169	/// This is the default state and will apply whatever optimizations are
170	/// possible, such as running a DFA.
171	///
172	/// This overrides whatever was previously set via the `nfa` or
173	/// `bounded_backtracking` methods.
174	pub fn automatic(mut self) -> Self {
175	self.match_type = None;
176	self
177	}
178
179	/// Sets the matching engine to use the NFA algorithm no matter what
180	/// optimizations are possible.
181	///
182	/// This overrides whatever was previously set via the `automatic` or
183	/// `bounded_backtracking` methods.
184	pub fn nfa(mut self) -> Self {
185	self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
186	self
187	}
188
189	/// Sets the matching engine to use a bounded backtracking engine no
190	/// matter what optimizations are possible.
191	///
192	/// One must use this with care, since the bounded backtracking engine
193	/// uses memory proportion to `len(regex) len(text)`.*
194	///
195	/// This overrides whatever was previously set via the `automatic` or
196	/// `nfa` methods.
197	pub fn bounded_backtracking(mut self) -> Self {
198	self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
199	self
200	}
201
202	/// Compiles byte based programs for use with the NFA matching engines.
203	///
204	/// By default, the NFA engines match on Unicode scalar values. They can
205	/// be made to use byte based programs instead. In general, the byte based
206	/// programs are slower because of a less efficient encoding of character
207	/// classes.
208	///
209	/// Note that this does not impact DFA matching engines, which always
210	/// execute on bytes.
211	pub fn bytes(mut self, yes: bool) -> Self {
212	self.bytes = yes;
213	self
214	}
215
216	/// When disabled, the program compiled may match arbitrary bytes.
217	///
218	/// When enabled (the default), all compiled programs exclusively match
219	/// valid UTF-8 bytes.
220	pub fn only_utf8(mut self, yes: bool) -> Self {
221	self.only_utf8 = yes;
222	self
223	}
224
225	/// Set the Unicode flag.
226	pub fn unicode(mut self, yes: bool) -> Self {
227	self.options.unicode = yes;
228	self
229	}
230
231	/// Parse the current set of patterns into their AST and extract literals.
232	fn parse(&self) -> Result<Parsed, Error> {
233	let mut exprs = Vec::with_capacity(self.options.pats.len());
234	let mut prefixes = Some(literal::Seq::empty());
235	let mut suffixes = Some(literal::Seq::empty());
236	let mut bytes = `false`;
237	let is_set = self.options.pats.len() > `1`;
238	// If we're compiling a regex set and that set has any anchored
239	// expressions, then disable all literal optimizations.
240	for pat in &self.options.pats {
241	let mut parser = ParserBuilder::new()
242	.octal(self.options.octal)
243	.case_insensitive(self.options.case_insensitive)
244	.multi_line(self.options.multi_line)
245	.dot_matches_new_line(self.options.dot_matches_new_line)
246	.swap_greed(self.options.swap_greed)
247	.ignore_whitespace(self.options.ignore_whitespace)
248	.unicode(self.options.unicode)
249	.utf8(self.only_utf8)
250	.nest_limit(self.options.nest_limit)
251	.build();
252	let expr =
253	parser.parse(pat).map_err(\|e\| Error::Syntax(e.to_string()))?;
254	let props = expr.properties();
255	// This used to just check whether the HIR matched valid UTF-8
256	// or not, but in regex-syntax 0.7, we changed our definition of
257	// "matches valid UTF-8" to exclude zero-width matches. And in
258	// particular, previously, we considered WordAsciiNegate (that
259	// is '(?-u:\B)') to be capable of matching invalid UTF-8. Our
260	// matcher engines were built under this assumption and fixing
261	// them is not worth it with the imminent plan to switch over to
262	// regex-automata. So for now, we retain the previous behavior by
263	// just explicitly treating the presence of a negated ASCII word
264	// boundary as forcing use to use a byte oriented automaton.
265	bytes = bytes
266	\|\| !props.is_utf8()
267	\|\| props.look_set().contains(Look::WordAsciiNegate);
268
269	if cfg!(feature = "perf-literal") {
270	if !props.look_set_prefix().contains(Look::Start)
271	&& props.look_set().contains(Look::Start)
272	{
273	// Partial anchors unfortunately make it hard to use
274	// prefixes, so disable them.
275	prefixes = None;
276	} else if is_set
277	&& props.look_set_prefix_any().contains(Look::Start)
278	{
279	// Regex sets with anchors do not go well with literal
280	// optimizations.
281	prefixes = None;
282	} else if props.look_set_prefix_any().contains_word() {
283	// The new literal extractor ignores look-around while
284	// the old one refused to extract prefixes from regexes
285	// that began with a \b. These old creaky regex internals
286	// can't deal with it, so we drop it.
287	prefixes = None;
288	} else if props.look_set_prefix_any().contains(Look::StartLF) {
289	// Similar to the reasoning for word boundaries, this old
290	// regex engine can't handle literal prefixes with '(?m:^)'
291	// at the beginning of a regex.
292	prefixes = None;
293	}
294
295	if !props.look_set_suffix().contains(Look::End)
296	&& props.look_set().contains(Look::End)
297	{
298	// Partial anchors unfortunately make it hard to use
299	// suffixes, so disable them.
300	suffixes = None;
301	} else if is_set
302	&& props.look_set_suffix_any().contains(Look::End)
303	{
304	// Regex sets with anchors do not go well with literal
305	// optimizations.
306	suffixes = None;
307	} else if props.look_set_suffix_any().contains_word() {
308	// See the prefix case for reasoning here.
309	suffixes = None;
310	} else if props.look_set_suffix_any().contains(Look::EndLF) {
311	// See the prefix case for reasoning here.
312	suffixes = None;
313	}
314
315	let (mut pres, mut suffs) =
316	if prefixes.is_none() && suffixes.is_none() {
317	(literal::Seq::infinite(), literal::Seq::infinite())
318	} else {
319	literal_analysis(&expr)
320	};
321	// These old creaky regex internals can't handle cases where
322	// the literal sequences are exact but there are look-around
323	// assertions. So we make sure the sequences are inexact if
324	// there are look-around assertions anywhere. This forces the
325	// regex engines to run instead of assuming that a literal
326	// match implies an overall match.
327	if !props.look_set().is_empty() {
328	pres.make_inexact();
329	suffs.make_inexact();
330	}
331	prefixes = prefixes.and_then(\|mut prefixes\| {
332	prefixes.union(&mut pres);
333	Some(prefixes)
334	});
335	suffixes = suffixes.and_then(\|mut suffixes\| {
336	suffixes.union(&mut suffs);
337	Some(suffixes)
338	});
339	}
340	exprs.push(expr);
341	}
342	Ok(Parsed {
343	exprs,
344	prefixes: prefixes.unwrap_or_else(literal::Seq::empty),
345	suffixes: suffixes.unwrap_or_else(literal::Seq::empty),
346	bytes,
347	})
348	}
349
350	/// Build an executor that can run a regular expression.
351	pub fn build(self) -> Result<Exec, Error> {
352	// Special case when we have no patterns to compile.
353	// This can happen when compiling a regex set.
354	if self.options.pats.is_empty() {
355	let ro = Arc::new(ExecReadOnly {
356	res: vec![],
357	nfa: Program::new(),
358	dfa: Program::new(),
359	dfa_reverse: Program::new(),
360	suffixes: LiteralSearcher::empty(),
361	#[cfg(feature = "perf-literal")]
362	ac: None,
363	match_type: MatchType::Nothing,
364	});
365	let pool = ExecReadOnly::new_pool(&ro);
366	return Ok(Exec { ro, pool });
367	}
368	let parsed = self.parse()?;
369	let mut nfa = Compiler::new()
370	.size_limit(self.options.size_limit)
371	.bytes(self.bytes \|\| parsed.bytes)
372	.only_utf8(self.only_utf8)
373	.compile(&parsed.exprs)?;
374	let mut dfa = Compiler::new()
375	.size_limit(self.options.size_limit)
376	.dfa(`true`)
377	.only_utf8(self.only_utf8)
378	.compile(&parsed.exprs)?;
379	let mut dfa_reverse = Compiler::new()
380	.size_limit(self.options.size_limit)
381	.dfa(`true`)
382	.only_utf8(self.only_utf8)
383	.reverse(`true`)
384	.compile(&parsed.exprs)?;
385
386	#[cfg(feature = "perf-literal")]
387	let ac = self.build_aho_corasick(&parsed);
388	nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
389	dfa.prefixes = nfa.prefixes.clone();
390	dfa.dfa_size_limit = self.options.dfa_size_limit;
391	dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
392
393	let mut ro = ExecReadOnly {
394	res: self.options.pats,
395	nfa,
396	dfa,
397	dfa_reverse,
398	suffixes: LiteralSearcher::suffixes(parsed.suffixes),
399	#[cfg(feature = "perf-literal")]
400	ac,
401	match_type: MatchType::Nothing,
402	};
403	ro.match_type = ro.choose_match_type(self.match_type);
404
405	let ro = Arc::new(ro);
406	let pool = ExecReadOnly::new_pool(&ro);
407	Ok(Exec { ro, pool })
408	}
409
410	#[cfg(feature = "perf-literal")]
411	fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick> {
412	if parsed.exprs.len() != `1` {
413	return None;
414	}
415	let lits = match alternation_literals(&parsed.exprs[`0`]) {
416	None => return None,
417	Some(lits) => lits,
418	};
419	// If we have a small number of literals, then let Teddy handle
420	// things (see literal/mod.rs).
421	if lits.len() <= `32` {
422	return None;
423	}
424	Some(
425	AhoCorasick::builder()
426	.match_kind(MatchKind::LeftmostFirst)
427	.build(&lits)
428	// This should never happen because we'd long exceed the
429	// compilation limit for regexes first.
430	.expect("AC automaton too big"),
431	)
432	}
433	}
434
435	impl<'c> RegularExpression for ExecNoSyncStr<'c> {
436	type Text = str;
437
438	fn slots_len(&self) -> usize {
439	self.0.slots_len()
440	}
441
442	fn next_after_empty(&self, text: &str, i: usize) -> usize {
443	next_utf8(text.as_bytes(), i)
444	}
445
446	#[cfg_attr(feature = "perf-inline", inline(always))]
447	fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
448	self.0.shortest_match_at(text.as_bytes(), start)
449	}
450
451	#[cfg_attr(feature = "perf-inline", inline(always))]
452	fn is_match_at(&self, text: &str, start: usize) -> bool {
453	self.0.is_match_at(text.as_bytes(), start)
454	}
455
456	#[cfg_attr(feature = "perf-inline", inline(always))]
457	fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
458	self.0.find_at(text.as_bytes(), start)
459	}
460
461	#[cfg_attr(feature = "perf-inline", inline(always))]
462	fn captures_read_at(
463	&self,
464	locs: &mut Locations,
465	text: &str,
466	start: usize,
467	) -> Option<(usize, usize)> {
468	self.0.captures_read_at(locs, text.as_bytes(), start)
469	}
470	}
471
472	impl<'c> RegularExpression for ExecNoSync<'c> {
473	type Text = [u8];
474
475	/// Returns the number of capture slots in the regular expression. (There
476	/// are two slots for every capture group, corresponding to possibly empty
477	/// start and end locations of the capture.)
478	fn slots_len(&self) -> usize {
479	self.ro.nfa.captures.len() * `2`
480	}
481
482	fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
483	i + `1`
484	}
485
486	/// Returns the end of a match location, possibly occurring before the
487	/// end location of the correct leftmost-first match.
488	#[cfg_attr(feature = "perf-inline", inline(always))]
489	fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
490	if !self.is_anchor_end_match(text) {
491	return None;
492	}
493	match self.ro.match_type {
494	#[cfg(feature = "perf-literal")]
495	MatchType::Literal(ty) => {
496	self.find_literals(ty, text, start).map(\|(_, e)\| e)
497	}
498	#[cfg(feature = "perf-dfa")]
499	MatchType::Dfa \| MatchType::DfaMany => {
500	match self.shortest_dfa(text, start) {
501	dfa::Result::Match(end) => Some(end),
502	dfa::Result::NoMatch(_) => None,
503	dfa::Result::Quit => self.shortest_nfa(text, start),
504	}
505	}
506	#[cfg(feature = "perf-dfa")]
507	MatchType::DfaAnchoredReverse => {
508	match dfa::Fsm::reverse(
509	&self.ro.dfa_reverse,
510	self.cache.value(),
511	`true`,
512	&text[start..],
513	text.len() - start,
514	) {
515	dfa::Result::Match(_) => Some(text.len()),
516	dfa::Result::NoMatch(_) => None,
517	dfa::Result::Quit => self.shortest_nfa(text, start),
518	}
519	}
520	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
521	MatchType::DfaSuffix => {
522	match self.shortest_dfa_reverse_suffix(text, start) {
523	dfa::Result::Match(e) => Some(e),
524	dfa::Result::NoMatch(_) => None,
525	dfa::Result::Quit => self.shortest_nfa(text, start),
526	}
527	}
528	MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
529	MatchType::Nothing => None,
530	}
531	}
532
533	/// Returns true if and only if the regex matches text.
534	///
535	/// For single regular expressions, this is equivalent to calling
536	/// shortest_match(...).is_some().
537	#[cfg_attr(feature = "perf-inline", inline(always))]
538	fn is_match_at(&self, text: &[u8], start: usize) -> bool {
539	if !self.is_anchor_end_match(text) {
540	return `false`;
541	}
542	// We need to do this dance because shortest_match relies on the NFA
543	// filling in captures[1], but a RegexSet has no captures. In other
544	// words, a RegexSet can't (currently) use shortest_match. ---AG
545	match self.ro.match_type {
546	#[cfg(feature = "perf-literal")]
547	MatchType::Literal(ty) => {
548	self.find_literals(ty, text, start).is_some()
549	}
550	#[cfg(feature = "perf-dfa")]
551	MatchType::Dfa \| MatchType::DfaMany => {
552	match self.shortest_dfa(text, start) {
553	dfa::Result::Match(_) => `true`,
554	dfa::Result::NoMatch(_) => `false`,
555	dfa::Result::Quit => self.match_nfa(text, start),
556	}
557	}
558	#[cfg(feature = "perf-dfa")]
559	MatchType::DfaAnchoredReverse => {
560	match dfa::Fsm::reverse(
561	&self.ro.dfa_reverse,
562	self.cache.value(),
563	`true`,
564	&text[start..],
565	text.len() - start,
566	) {
567	dfa::Result::Match(_) => `true`,
568	dfa::Result::NoMatch(_) => `false`,
569	dfa::Result::Quit => self.match_nfa(text, start),
570	}
571	}
572	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
573	MatchType::DfaSuffix => {
574	match self.shortest_dfa_reverse_suffix(text, start) {
575	dfa::Result::Match(_) => `true`,
576	dfa::Result::NoMatch(_) => `false`,
577	dfa::Result::Quit => self.match_nfa(text, start),
578	}
579	}
580	MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
581	MatchType::Nothing => `false`,
582	}
583	}
584
585	/// Finds the start and end location of the leftmost-first match, starting
586	/// at the given location.
587	#[cfg_attr(feature = "perf-inline", inline(always))]
588	fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
589	if !self.is_anchor_end_match(text) {
590	return None;
591	}
592	match self.ro.match_type {
593	#[cfg(feature = "perf-literal")]
594	MatchType::Literal(ty) => self.find_literals(ty, text, start),
595	#[cfg(feature = "perf-dfa")]
596	MatchType::Dfa => match self.find_dfa_forward(text, start) {
597	dfa::Result::Match((s, e)) => Some((s, e)),
598	dfa::Result::NoMatch(_) => None,
599	dfa::Result::Quit => {
600	self.find_nfa(MatchNfaType::Auto, text, start)
601	}
602	},
603	#[cfg(feature = "perf-dfa")]
604	MatchType::DfaAnchoredReverse => {
605	match self.find_dfa_anchored_reverse(text, start) {
606	dfa::Result::Match((s, e)) => Some((s, e)),
607	dfa::Result::NoMatch(_) => None,
608	dfa::Result::Quit => {
609	self.find_nfa(MatchNfaType::Auto, text, start)
610	}
611	}
612	}
613	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
614	MatchType::DfaSuffix => {
615	match self.find_dfa_reverse_suffix(text, start) {
616	dfa::Result::Match((s, e)) => Some((s, e)),
617	dfa::Result::NoMatch(_) => None,
618	dfa::Result::Quit => {
619	self.find_nfa(MatchNfaType::Auto, text, start)
620	}
621	}
622	}
623	MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
624	MatchType::Nothing => None,
625	#[cfg(feature = "perf-dfa")]
626	MatchType::DfaMany => {
627	unreachable!("BUG: RegexSet cannot be used with find")
628	}
629	}
630	}
631
632	/// Finds the start and end location of the leftmost-first match and also
633	/// fills in all matching capture groups.
634	///
635	/// The number of capture slots given should be equal to the total number
636	/// of capture slots in the compiled program.
637	///
638	/// Note that the first two slots always correspond to the start and end
639	/// locations of the overall match.
640	fn captures_read_at(
641	&self,
642	locs: &mut Locations,
643	text: &[u8],
644	start: usize,
645	) -> Option<(usize, usize)> {
646	let slots = locs.as_slots();
647	for slot in slots.iter_mut() {
648	*slot = None;
649	}
650	// If the caller unnecessarily uses this, then we try to save them
651	// from themselves.
652	match slots.len() {
653	`0` => return self.find_at(text, start),
654	`2` => {
655	return self.find_at(text, start).map(\|(s, e)\| {
656	slots[`0`] = Some(s);
657	slots[`1`] = Some(e);
658	(s, e)
659	});
660	}
661	_ => {} // fallthrough
662	}
663	if !self.is_anchor_end_match(text) {
664	return None;
665	}
666	match self.ro.match_type {
667	#[cfg(feature = "perf-literal")]
668	MatchType::Literal(ty) => {
669	self.find_literals(ty, text, start).and_then(\|(s, e)\| {
670	self.captures_nfa_type(
671	MatchNfaType::Auto,
672	slots,
673	text,
674	s,
675	e,
676	)
677	})
678	}
679	#[cfg(feature = "perf-dfa")]
680	MatchType::Dfa => {
681	if self.ro.nfa.is_anchored_start {
682	self.captures_nfa(slots, text, start)
683	} else {
684	match self.find_dfa_forward(text, start) {
685	dfa::Result::Match((s, e)) => self.captures_nfa_type(
686	MatchNfaType::Auto,
687	slots,
688	text,
689	s,
690	e,
691	),
692	dfa::Result::NoMatch(_) => None,
693	dfa::Result::Quit => {
694	self.captures_nfa(slots, text, start)
695	}
696	}
697	}
698	}
699	#[cfg(feature = "perf-dfa")]
700	MatchType::DfaAnchoredReverse => {
701	match self.find_dfa_anchored_reverse(text, start) {
702	dfa::Result::Match((s, e)) => self.captures_nfa_type(
703	MatchNfaType::Auto,
704	slots,
705	text,
706	s,
707	e,
708	),
709	dfa::Result::NoMatch(_) => None,
710	dfa::Result::Quit => self.captures_nfa(slots, text, start),
711	}
712	}
713	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
714	MatchType::DfaSuffix => {
715	match self.find_dfa_reverse_suffix(text, start) {
716	dfa::Result::Match((s, e)) => self.captures_nfa_type(
717	MatchNfaType::Auto,
718	slots,
719	text,
720	s,
721	e,
722	),
723	dfa::Result::NoMatch(_) => None,
724	dfa::Result::Quit => self.captures_nfa(slots, text, start),
725	}
726	}
727	MatchType::Nfa(ty) => {
728	self.captures_nfa_type(ty, slots, text, start, text.len())
729	}
730	MatchType::Nothing => None,
731	#[cfg(feature = "perf-dfa")]
732	MatchType::DfaMany => {
733	unreachable!("BUG: RegexSet cannot be used with captures")
734	}
735	}
736	}
737	}
738
739	impl<'c> ExecNoSync<'c> {
740	/// Finds the leftmost-first match using only literal search.
741	#[cfg(feature = "perf-literal")]
742	#[cfg_attr(feature = "perf-inline", inline(always))]
743	fn find_literals(
744	&self,
745	ty: MatchLiteralType,
746	text: &[u8],
747	start: usize,
748	) -> Option<(usize, usize)> {
749	use self::MatchLiteralType::*;
750	match ty {
751	Unanchored => {
752	let lits = &self.ro.nfa.prefixes;
753	lits.find(&text[start..]).map(\|(s, e)\| (start + s, start + e))
754	}
755	AnchoredStart => {
756	let lits = &self.ro.nfa.prefixes;
757	if start == `0` \|\| !self.ro.nfa.is_anchored_start {
758	lits.find_start(&text[start..])
759	.map(\|(s, e)\| (start + s, start + e))
760	} else {
761	None
762	}
763	}
764	AnchoredEnd => {
765	let lits = &self.ro.suffixes;
766	lits.find_end(&text[start..])
767	.map(\|(s, e)\| (start + s, start + e))
768	}
769	AhoCorasick => self
770	.ro
771	.ac
772	.as_ref()
773	.unwrap()
774	.find(&text[start..])
775	.map(\|m\| (start + m.start(), start + m.end())),
776	}
777	}
778
779	/// Finds the leftmost-first match (start and end) using only the DFA.
780	///
781	/// If the result returned indicates that the DFA quit, then another
782	/// matching engine should be used.
783	#[cfg(feature = "perf-dfa")]
784	#[cfg_attr(feature = "perf-inline", inline(always))]
785	fn find_dfa_forward(
786	&self,
787	text: &[u8],
788	start: usize,
789	) -> dfa::Result<(usize, usize)> {
790	use crate::dfa::Result::*;
791	let end = match dfa::Fsm::forward(
792	&self.ro.dfa,
793	self.cache.value(),
794	`false`,
795	text,
796	start,
797	) {
798	NoMatch(i) => return NoMatch(i),
799	Quit => return Quit,
800	Match(end) if start == end => return Match((start, start)),
801	Match(end) => end,
802	};
803	// Now run the DFA in reverse to find the start of the match.
804	match dfa::Fsm::reverse(
805	&self.ro.dfa_reverse,
806	self.cache.value(),
807	`false`,
808	&text[start..],
809	end - start,
810	) {
811	Match(s) => Match((start + s, end)),
812	NoMatch(i) => NoMatch(i),
813	Quit => Quit,
814	}
815	}
816
817	/// Finds the leftmost-first match (start and end) using only the DFA,
818	/// but assumes the regex is anchored at the end and therefore starts at
819	/// the end of the regex and matches in reverse.
820	///
821	/// If the result returned indicates that the DFA quit, then another
822	/// matching engine should be used.
823	#[cfg(feature = "perf-dfa")]
824	#[cfg_attr(feature = "perf-inline", inline(always))]
825	fn find_dfa_anchored_reverse(
826	&self,
827	text: &[u8],
828	start: usize,
829	) -> dfa::Result<(usize, usize)> {
830	use crate::dfa::Result::*;
831	match dfa::Fsm::reverse(
832	&self.ro.dfa_reverse,
833	self.cache.value(),
834	`false`,
835	&text[start..],
836	text.len() - start,
837	) {
838	Match(s) => Match((start + s, text.len())),
839	NoMatch(i) => NoMatch(i),
840	Quit => Quit,
841	}
842	}
843
844	/// Finds the end of the shortest match using only the DFA.
845	#[cfg(feature = "perf-dfa")]
846	#[cfg_attr(feature = "perf-inline", inline(always))]
847	fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
848	dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), `true`, text, start)
849	}
850
851	/// Finds the end of the shortest match using only the DFA by scanning for
852	/// suffix literals.
853	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
854	#[cfg_attr(feature = "perf-inline", inline(always))]
855	fn shortest_dfa_reverse_suffix(
856	&self,
857	text: &[u8],
858	start: usize,
859	) -> dfa::Result<usize> {
860	match self.exec_dfa_reverse_suffix(text, start) {
861	None => self.shortest_dfa(text, start),
862	Some(r) => r.map(\|(_, end)\| end),
863	}
864	}
865
866	/// Finds the end of the shortest match using only the DFA by scanning for
867	/// suffix literals. It also reports the start of the match.
868	///
869	/// Note that if None is returned, then the optimization gave up to avoid
870	/// worst case quadratic behavior. A forward scanning DFA should be tried
871	/// next.
872	///
873	/// If a match is returned and the full leftmost-first match is desired,
874	/// then a forward scan starting from the beginning of the match must be
875	/// done.
876	///
877	/// If the result returned indicates that the DFA quit, then another
878	/// matching engine should be used.
879	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
880	#[cfg_attr(feature = "perf-inline", inline(always))]
881	fn exec_dfa_reverse_suffix(
882	&self,
883	text: &[u8],
884	original_start: usize,
885	) -> Option<dfa::Result<(usize, usize)>> {
886	use crate::dfa::Result::*;
887
888	let lcs = self.ro.suffixes.lcs();
889	debug_assert!(lcs.len() >= `1`);
890	let mut start = original_start;
891	let mut end = start;
892	let mut last_literal = start;
893	while end <= text.len() {
894	last_literal += match lcs.find(&text[last_literal..]) {
895	None => return Some(NoMatch(text.len())),
896	Some(i) => i,
897	};
898	end = last_literal + lcs.len();
899	match dfa::Fsm::reverse(
900	&self.ro.dfa_reverse,
901	self.cache.value(),
902	`false`,
903	&text[start..end],
904	end - start,
905	) {
906	Match(`0`) \| NoMatch(`0`) => return None,
907	Match(i) => return Some(Match((start + i, end))),
908	NoMatch(i) => {
909	start += i;
910	last_literal += `1`;
911	continue;
912	}
913	Quit => return Some(Quit),
914	};
915	}
916	Some(NoMatch(text.len()))
917	}
918
919	/// Finds the leftmost-first match (start and end) using only the DFA
920	/// by scanning for suffix literals.
921	///
922	/// If the result returned indicates that the DFA quit, then another
923	/// matching engine should be used.
924	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
925	#[cfg_attr(feature = "perf-inline", inline(always))]
926	fn find_dfa_reverse_suffix(
927	&self,
928	text: &[u8],
929	start: usize,
930	) -> dfa::Result<(usize, usize)> {
931	use crate::dfa::Result::*;
932
933	let match_start = match self.exec_dfa_reverse_suffix(text, start) {
934	None => return self.find_dfa_forward(text, start),
935	Some(Match((start, _))) => start,
936	Some(r) => return r,
937	};
938	// At this point, we've found a match. The only way to quit now
939	// without a match is if the DFA gives up (seems unlikely).
940	//
941	// Now run the DFA forwards to find the proper end of the match.
942	// (The suffix literal match can only indicate the earliest
943	// possible end location, which may appear before the end of the
944	// leftmost-first match.)
945	match dfa::Fsm::forward(
946	&self.ro.dfa,
947	self.cache.value(),
948	`false`,
949	text,
950	match_start,
951	) {
952	NoMatch(_) => panic!("BUG: reverse match implies forward match"),
953	Quit => Quit,
954	Match(e) => Match((match_start, e)),
955	}
956	}
957
958	/// Executes the NFA engine to return whether there is a match or not.
959	///
960	/// Ideally, we could use shortest_nfa(...).is_some() and get the same
961	/// performance characteristics, but regex sets don't have captures, which
962	/// shortest_nfa depends on.
963	#[cfg(feature = "perf-dfa")]
964	fn match_nfa(&self, text: &[u8], start: usize) -> bool {
965	self.match_nfa_type(MatchNfaType::Auto, text, start)
966	}
967
968	/// Like match_nfa, but allows specification of the type of NFA engine.
969	fn match_nfa_type(
970	&self,
971	ty: MatchNfaType,
972	text: &[u8],
973	start: usize,
974	) -> bool {
975	self.exec_nfa(
976	ty,
977	&mut [`false`],
978	&mut [],
979	`true`,
980	`false`,
981	text,
982	start,
983	text.len(),
984	)
985	}
986
987	/// Finds the shortest match using an NFA.
988	#[cfg(feature = "perf-dfa")]
989	fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
990	self.shortest_nfa_type(MatchNfaType::Auto, text, start)
991	}
992
993	/// Like shortest_nfa, but allows specification of the type of NFA engine.
994	fn shortest_nfa_type(
995	&self,
996	ty: MatchNfaType,
997	text: &[u8],
998	start: usize,
999	) -> Option<usize> {
1000	let mut slots = [None, None];
1001	if self.exec_nfa(
1002	ty,
1003	&mut [`false`],
1004	&mut slots,
1005	`true`,
1006	`true`,
1007	text,
1008	start,
1009	text.len(),
1010	) {
1011	slots[`1`]
1012	} else {
1013	None
1014	}
1015	}
1016
1017	/// Like find, but executes an NFA engine.
1018	fn find_nfa(
1019	&self,
1020	ty: MatchNfaType,
1021	text: &[u8],
1022	start: usize,
1023	) -> Option<(usize, usize)> {
1024	let mut slots = [None, None];
1025	if self.exec_nfa(
1026	ty,
1027	&mut [`false`],
1028	&mut slots,
1029	`false`,
1030	`false`,
1031	text,
1032	start,
1033	text.len(),
1034	) {
1035	match (slots[`0`], slots[`1`]) {
1036	(Some(s), Some(e)) => Some((s, e)),
1037	_ => None,
1038	}
1039	} else {
1040	None
1041	}
1042	}
1043
1044	/// Like find_nfa, but fills in captures.
1045	///
1046	/// `slots` should have length equal to `2 nfa.captures.len()`.*
1047	#[cfg(feature = "perf-dfa")]
1048	fn captures_nfa(
1049	&self,
1050	slots: &mut [Slot],
1051	text: &[u8],
1052	start: usize,
1053	) -> Option<(usize, usize)> {
1054	self.captures_nfa_type(
1055	MatchNfaType::Auto,
1056	slots,
1057	text,
1058	start,
1059	text.len(),
1060	)
1061	}
1062
1063	/// Like captures_nfa, but allows specification of type of NFA engine.
1064	fn captures_nfa_type(
1065	&self,
1066	ty: MatchNfaType,
1067	slots: &mut [Slot],
1068	text: &[u8],
1069	start: usize,
1070	end: usize,
1071	) -> Option<(usize, usize)> {
1072	if self.exec_nfa(
1073	ty,
1074	&mut [`false`],
1075	slots,
1076	`false`,
1077	`false`,
1078	text,
1079	start,
1080	end,
1081	) {
1082	match (slots[`0`], slots[`1`]) {
1083	(Some(s), Some(e)) => Some((s, e)),
1084	_ => None,
1085	}
1086	} else {
1087	None
1088	}
1089	}
1090
1091	fn exec_nfa(
1092	&self,
1093	mut ty: MatchNfaType,
1094	matches: &mut [bool],
1095	slots: &mut [Slot],
1096	quit_after_match: bool,
1097	quit_after_match_with_pos: bool,
1098	text: &[u8],
1099	start: usize,
1100	end: usize,
1101	) -> bool {
1102	use self::MatchNfaType::*;
1103	if let Auto = ty {
1104	if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1105	ty = Backtrack;
1106	} else {
1107	ty = PikeVM;
1108	}
1109	}
1110	// The backtracker can't return the shortest match position as it is
1111	// implemented today. So if someone calls `shortest_match` and we need
1112	// to run an NFA, then use the PikeVM.
1113	if quit_after_match_with_pos \|\| ty == PikeVM {
1114	self.exec_pikevm(
1115	matches,
1116	slots,
1117	quit_after_match,
1118	text,
1119	start,
1120	end,
1121	)
1122	} else {
1123	self.exec_backtrack(matches, slots, text, start, end)
1124	}
1125	}
1126
1127	/// Always run the NFA algorithm.
1128	fn exec_pikevm(
1129	&self,
1130	matches: &mut [bool],
1131	slots: &mut [Slot],
1132	quit_after_match: bool,
1133	text: &[u8],
1134	start: usize,
1135	end: usize,
1136	) -> bool {
1137	if self.ro.nfa.uses_bytes() {
1138	pikevm::Fsm::exec(
1139	&self.ro.nfa,
1140	self.cache.value(),
1141	matches,
1142	slots,
1143	quit_after_match,
1144	ByteInput::new(text, self.ro.nfa.only_utf8),
1145	start,
1146	end,
1147	)
1148	} else {
1149	pikevm::Fsm::exec(
1150	&self.ro.nfa,
1151	self.cache.value(),
1152	matches,
1153	slots,
1154	quit_after_match,
1155	CharInput::new(text),
1156	start,
1157	end,
1158	)
1159	}
1160	}
1161
1162	/// Always runs the NFA using bounded backtracking.
1163	fn exec_backtrack(
1164	&self,
1165	matches: &mut [bool],
1166	slots: &mut [Slot],
1167	text: &[u8],
1168	start: usize,
1169	end: usize,
1170	) -> bool {
1171	if self.ro.nfa.uses_bytes() {
1172	backtrack::Bounded::exec(
1173	&self.ro.nfa,
1174	self.cache.value(),
1175	matches,
1176	slots,
1177	ByteInput::new(text, self.ro.nfa.only_utf8),
1178	start,
1179	end,
1180	)
1181	} else {
1182	backtrack::Bounded::exec(
1183	&self.ro.nfa,
1184	self.cache.value(),
1185	matches,
1186	slots,
1187	CharInput::new(text),
1188	start,
1189	end,
1190	)
1191	}
1192	}
1193
1194	/// Finds which regular expressions match the given text.
1195	///
1196	/// `matches` should have length equal to the number of regexes being
1197	/// searched.
1198	///
1199	/// This is only useful when one wants to know which regexes in a set
1200	/// match some text.
1201	pub fn many_matches_at(
1202	&self,
1203	matches: &mut [bool],
1204	text: &[u8],
1205	start: usize,
1206	) -> bool {
1207	use self::MatchType::*;
1208	if !self.is_anchor_end_match(text) {
1209	return `false`;
1210	}
1211	match self.ro.match_type {
1212	#[cfg(feature = "perf-literal")]
1213	Literal(ty) => {
1214	debug_assert_eq!(matches.len(), `1`);
1215	matches[`0`] = self.find_literals(ty, text, start).is_some();
1216	matches[`0`]
1217	}
1218	#[cfg(feature = "perf-dfa")]
1219	Dfa \| DfaAnchoredReverse \| DfaMany => {
1220	match dfa::Fsm::forward_many(
1221	&self.ro.dfa,
1222	self.cache.value(),
1223	matches,
1224	text,
1225	start,
1226	) {
1227	dfa::Result::Match(_) => `true`,
1228	dfa::Result::NoMatch(_) => `false`,
1229	dfa::Result::Quit => self.exec_nfa(
1230	MatchNfaType::Auto,
1231	matches,
1232	&mut [],
1233	`false`,
1234	`false`,
1235	text,
1236	start,
1237	text.len(),
1238	),
1239	}
1240	}
1241	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1242	DfaSuffix => {
1243	match dfa::Fsm::forward_many(
1244	&self.ro.dfa,
1245	self.cache.value(),
1246	matches,
1247	text,
1248	start,
1249	) {
1250	dfa::Result::Match(_) => `true`,
1251	dfa::Result::NoMatch(_) => `false`,
1252	dfa::Result::Quit => self.exec_nfa(
1253	MatchNfaType::Auto,
1254	matches,
1255	&mut [],
1256	`false`,
1257	`false`,
1258	text,
1259	start,
1260	text.len(),
1261	),
1262	}
1263	}
1264	Nfa(ty) => self.exec_nfa(
1265	ty,
1266	matches,
1267	&mut [],
1268	`false`,
1269	`false`,
1270	text,
1271	start,
1272	text.len(),
1273	),
1274	Nothing => `false`,
1275	}
1276	}
1277
1278	#[cfg_attr(feature = "perf-inline", inline(always))]
1279	fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1280	#[cfg(not(feature = "perf-literal"))]
1281	fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1282	`true`
1283	}
1284
1285	#[cfg(feature = "perf-literal")]
1286	fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1287	// Only do this check if the haystack is big (>1MB).
1288	if text.len() > (`1` << `20`) && ro.nfa.is_anchored_end {
1289	let lcs = ro.suffixes.lcs();
1290	if lcs.len() >= `1` && !lcs.is_suffix(text) {
1291	return `false`;
1292	}
1293	}
1294	`true`
1295	}
1296
1297	imp(&self.ro, text)
1298	}
1299
1300	pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1301	&self.ro.nfa.capture_name_idx
1302	}
1303	}
1304
1305	impl<'c> ExecNoSyncStr<'c> {
1306	pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1307	self.0.capture_name_idx()
1308	}
1309	}
1310
1311	impl Exec {
1312	/// Get a searcher that isn't Sync.
1313	#[cfg_attr(feature = "perf-inline", inline(always))]
1314	pub fn searcher(&self) -> ExecNoSync<'_> {
1315	ExecNoSync {
1316	ro: &self.ro, // a clone is too expensive here! (and not needed)
1317	cache: self.pool.get(),
1318	}
1319	}
1320
1321	/// Get a searcher that isn't Sync and can match on &str.
1322	#[cfg_attr(feature = "perf-inline", inline(always))]
1323	pub fn searcher_str(&self) -> ExecNoSyncStr<'_> {
1324	ExecNoSyncStr(self.searcher())
1325	}
1326
1327	/// Build a Regex from this executor.
1328	pub fn into_regex(self) -> re_unicode::Regex {
1329	re_unicode::Regex::from(self)
1330	}
1331
1332	/// Build a RegexSet from this executor.
1333	pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1334	re_set::unicode::RegexSet::from(self)
1335	}
1336
1337	/// Build a Regex from this executor that can match arbitrary bytes.
1338	pub fn into_byte_regex(self) -> re_bytes::Regex {
1339	re_bytes::Regex::from(self)
1340	}
1341
1342	/// Build a RegexSet from this executor that can match arbitrary bytes.
1343	pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1344	re_set::bytes::RegexSet::from(self)
1345	}
1346
1347	/// The original regular expressions given by the caller that were
1348	/// compiled.
1349	pub fn regex_strings(&self) -> &[String] {
1350	&self.ro.res
1351	}
1352
1353	/// Return a slice of capture names.
1354	///
1355	/// Any capture that isn't named is None.
1356	pub fn capture_names(&self) -> &[Option<String>] {
1357	&self.ro.nfa.captures
1358	}
1359
1360	/// Return a reference to named groups mapping (from group name to
1361	/// group position).
1362	pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1363	&self.ro.nfa.capture_name_idx
1364	}
1365
1366	/// If the number of capture groups in every match is always the same, then
1367	/// return that number. Otherwise return `None`.
1368	pub fn static_captures_len(&self) -> Option<usize> {
1369	self.ro.nfa.static_captures_len
1370	}
1371	}
1372
1373	impl Clone for Exec {
1374	fn clone(&self) -> Exec {
1375	let pool: Box>> = ExecReadOnly::new_pool(&self.ro);
1376	Exec { ro: self.ro.clone(), pool }
1377	}
1378	}
1379
1380	impl ExecReadOnly {
1381	fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1382	if let Some(MatchType::Nfa(_)) = hint {
1383	return hint.unwrap();
1384	}
1385	// If the NFA is empty, then we'll never match anything.
1386	if self.nfa.insts.is_empty() {
1387	return MatchType::Nothing;
1388	}
1389	if let Some(literalty) = self.choose_literal_match_type() {
1390	return literalty;
1391	}
1392	if let Some(dfaty) = self.choose_dfa_match_type() {
1393	return dfaty;
1394	}
1395	// We're so totally hosed.
1396	MatchType::Nfa(MatchNfaType::Auto)
1397	}
1398
1399	/// If a plain literal scan can be used, then a corresponding literal
1400	/// search type is returned.
1401	fn choose_literal_match_type(&self) -> Option<MatchType> {
1402	#[cfg(not(feature = "perf-literal"))]
1403	fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1404	None
1405	}
1406
1407	#[cfg(feature = "perf-literal")]
1408	fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1409	// If our set of prefixes is complete, then we can use it to find
1410	// a match in lieu of a regex engine. This doesn't quite work well
1411	// in the presence of multiple regexes, so only do it when there's
1412	// one.
1413	//
1414	// TODO(burntsushi): Also, don't try to match literals if the regex
1415	// is partially anchored. We could technically do it, but we'd need
1416	// to create two sets of literals: all of them and then the subset
1417	// that aren't anchored. We would then only search for all of them
1418	// when at the beginning of the input and use the subset in all
1419	// other cases.
1420	if ro.res.len() != `1` {
1421	return None;
1422	}
1423	if ro.ac.is_some() {
1424	return Some(MatchType::Literal(
1425	MatchLiteralType::AhoCorasick,
1426	));
1427	}
1428	if ro.nfa.prefixes.complete() {
1429	return if ro.nfa.is_anchored_start {
1430	Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1431	} else {
1432	Some(MatchType::Literal(MatchLiteralType::Unanchored))
1433	};
1434	}
1435	if ro.suffixes.complete() {
1436	return if ro.nfa.is_anchored_end {
1437	Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1438	} else {
1439	// This case shouldn't happen. When the regex isn't
1440	// anchored, then complete prefixes should imply complete
1441	// suffixes.
1442	//
1443	// The above is wrong! This case can happen. While
1444	// complete prefixes should imply complete suffixes
1445	// here, that doesn't necessarily mean we have a useful
1446	// prefix matcher! It could be the case that the literal
1447	// searcher decided the prefixes---even though they are
1448	// "complete"---weren't good enough and thus created an
1449	// empty matcher. If that happens and we return Unanchored
1450	// here, then we'll end up using that matcher, which is
1451	// very bad because it matches at every position. So...
1452	// return None.
1453	None
1454	};
1455	}
1456	None
1457	}
1458
1459	imp(self)
1460	}
1461
1462	/// If a DFA scan can be used, then choose the appropriate DFA strategy.
1463	fn choose_dfa_match_type(&self) -> Option<MatchType> {
1464	#[cfg(not(feature = "perf-dfa"))]
1465	fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1466	None
1467	}
1468
1469	#[cfg(feature = "perf-dfa")]
1470	fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1471	if !dfa::can_exec(&ro.dfa) {
1472	return None;
1473	}
1474	// Regex sets require a slightly specialized path.
1475	if ro.res.len() >= `2` {
1476	return Some(MatchType::DfaMany);
1477	}
1478	// If the regex is anchored at the end but not the start, then
1479	// just match in reverse from the end of the haystack.
1480	if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1481	return Some(MatchType::DfaAnchoredReverse);
1482	}
1483	#[cfg(feature = "perf-literal")]
1484	{
1485	// If there's a longish suffix literal, then it might be faster
1486	// to look for that first.
1487	if ro.should_suffix_scan() {
1488	return Some(MatchType::DfaSuffix);
1489	}
1490	}
1491	// Fall back to your garden variety forward searching lazy DFA.
1492	Some(MatchType::Dfa)
1493	}
1494
1495	imp(self)
1496	}
1497
1498	/// Returns true if the program is amenable to suffix scanning.
1499	///
1500	/// When this is true, as a heuristic, we assume it is OK to quickly scan
1501	/// for suffix literals and then do a reverse* DFA match from any matches*
1502	/// produced by the literal scan. (And then followed by a forward DFA
1503	/// search, since the previously found suffix literal maybe not actually be
1504	/// the end of a match.)
1505	///
1506	/// This is a bit of a specialized optimization, but can result in pretty
1507	/// big performance wins if 1) there are no prefix literals and 2) the
1508	/// suffix literals are pretty rare in the text. (1) is obviously easy to
1509	/// account for but (2) is harder. As a proxy, we assume that longer
1510	/// strings are generally rarer, so we only enable this optimization when
1511	/// we have a meaty suffix.
1512	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1513	fn should_suffix_scan(&self) -> bool {
1514	if self.suffixes.is_empty() {
1515	return `false`;
1516	}
1517	let lcs_len = self.suffixes.lcs().char_len();
1518	lcs_len >= `3` && lcs_len > self.dfa.prefixes.lcp().char_len()
1519	}
1520
1521	fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
1522	let ro = ro.clone();
1523	Box::new(Pool::new(Box::new(move \|\| {
1524	AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
1525	})))
1526	}
1527	}
1528
1529	#[derive(Clone, Copy, Debug)]
1530	enum MatchType {
1531	/// A single or multiple literal search. This is only used when the regex
1532	/// can be decomposed into a literal search.
1533	#[cfg(feature = "perf-literal")]
1534	Literal(MatchLiteralType),
1535	/// A normal DFA search.
1536	#[cfg(feature = "perf-dfa")]
1537	Dfa,
1538	/// A reverse DFA search starting from the end of a haystack.
1539	#[cfg(feature = "perf-dfa")]
1540	DfaAnchoredReverse,
1541	/// A reverse DFA search with suffix literal scanning.
1542	#[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1543	DfaSuffix,
1544	/// Use the DFA on two or more regular expressions.
1545	#[cfg(feature = "perf-dfa")]
1546	DfaMany,
1547	/// An NFA variant.
1548	Nfa(MatchNfaType),
1549	/// No match is ever possible, so don't ever try to search.
1550	Nothing,
1551	}
1552
1553	#[derive(Clone, Copy, Debug)]
1554	#[cfg(feature = "perf-literal")]
1555	enum MatchLiteralType {
1556	/// Match literals anywhere in text.
1557	Unanchored,
1558	/// Match literals only at the start of text.
1559	AnchoredStart,
1560	/// Match literals only at the end of text.
1561	AnchoredEnd,
1562	/// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1563	/// ExecReadOnly.
1564	AhoCorasick,
1565	}
1566
1567	#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1568	enum MatchNfaType {
1569	/// Choose between Backtrack and PikeVM.
1570	Auto,
1571	/// NFA bounded backtracking.
1572	///
1573	/// (This is only set by tests, since it never makes sense to always want
1574	/// backtracking.)
1575	Backtrack,
1576	/// The Pike VM.
1577	///
1578	/// (This is only set by tests, since it never makes sense to always want
1579	/// the Pike VM.)
1580	PikeVM,
1581	}
1582
1583	/// `ProgramCache` maintains reusable allocations for each matching engine
1584	/// available to a particular program.
1585	///
1586	/// We declare this as unwind safe since it's a cache that's only used for
1587	/// performance purposes. If a panic occurs, it is (or should be) always safe
1588	/// to continue using the same regex object.
1589	pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
1590
1591	#[derive(Debug)]
1592	pub struct ProgramCacheInner {
1593	pub pikevm: pikevm::Cache,
1594	pub backtrack: backtrack::Cache,
1595	#[cfg(feature = "perf-dfa")]
1596	pub dfa: dfa::Cache,
1597	#[cfg(feature = "perf-dfa")]
1598	pub dfa_reverse: dfa::Cache,
1599	}
1600
1601	impl ProgramCacheInner {
1602	fn new(ro: &ExecReadOnly) -> Self {
1603	ProgramCacheInner {
1604	pikevm: pikevm::Cache::new(&ro.nfa),
1605	backtrack: backtrack::Cache::new(&ro.nfa),
1606	#[cfg(feature = "perf-dfa")]
1607	dfa: dfa::Cache::new(&ro.dfa),
1608	#[cfg(feature = "perf-dfa")]
1609	dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1610	}
1611	}
1612	}
1613
1614	/// Alternation literals checks if the given HIR is a simple alternation of
1615	/// literals, and if so, returns them. Otherwise, this returns None.
1616	#[cfg(feature = "perf-literal")]
1617	fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1618	use regex_syntax::hir::{HirKind, Literal};
1619
1620	// This is pretty hacky, but basically, if `is_alternation_literal` is
1621	// true, then we can make several assumptions about the structure of our
1622	// HIR. This is what justifies the `unreachable!` statements below.
1623	//
1624	// This code should be refactored once we overhaul this crate's
1625	// optimization pipeline, because this is a terribly inflexible way to go
1626	// about things.
1627
1628	if !expr.properties().is_alternation_literal() {
1629	return None;
1630	}
1631	let alts = match *expr.kind() {
1632	HirKind::Alternation(ref alts) => alts,
1633	_ => return None, // one literal isn't worth it
1634	};
1635
1636	let mut lits = vec![];
1637	for alt in alts {
1638	let mut lit = vec![];
1639	match *alt.kind() {
1640	HirKind::Literal(Literal(ref bytes)) => {
1641	lit.extend_from_slice(bytes)
1642	}
1643	HirKind::Concat(ref exprs) => {
1644	for e in exprs {
1645	match *e.kind() {
1646	HirKind::Literal(Literal(ref bytes)) => {
1647	lit.extend_from_slice(bytes);
1648	}
1649	_ => unreachable!("expected literal, got {:?}", e),
1650	}
1651	}
1652	}
1653	_ => unreachable!("expected literal or concat, got {:?}", alt),
1654	}
1655	lits.push(lit);
1656	}
1657	Some(lits)
1658	}
1659
1660	#[cfg(not(feature = "perf-literal"))]
1661	fn literal_analysis(_: &Hir) -> (literal::Seq, literal::Seq) {
1662	(literal::Seq::infinite(), literal::Seq::infinite())
1663	}
1664
1665	#[cfg(feature = "perf-literal")]
1666	fn literal_analysis(expr: &Hir) -> (literal::Seq, literal::Seq) {
1667	const ATTEMPTS: [(usize, usize); `3`] = [(`5`, `50`), (`4`, `30`), (`3`, `20`)];
1668
1669	let mut prefixes = literal::Extractor::new()
1670	.kind(literal::ExtractKind::Prefix)
1671	.extract(expr);
1672	for (keep, limit) in ATTEMPTS {
1673	let len = match prefixes.len() {
1674	None => break,
1675	Some(len) => len,
1676	};
1677	if len <= limit {
1678	break;
1679	}
1680	prefixes.keep_first_bytes(keep);
1681	prefixes.minimize_by_preference();
1682	}
1683
1684	let mut suffixes = literal::Extractor::new()
1685	.kind(literal::ExtractKind::Suffix)
1686	.extract(expr);
1687	for (keep, limit) in ATTEMPTS {
1688	let len = match suffixes.len() {
1689	None => break,
1690	Some(len) => len,
1691	};
1692	if len <= limit {
1693	break;
1694	}
1695	suffixes.keep_last_bytes(keep);
1696	suffixes.minimize_by_preference();
1697	}
1698
1699	(prefixes, suffixes)
1700	}
1701
1702	#[cfg(test)]
1703	mod test {
1704	#[test]
1705	fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1706	use crate::internal::ExecBuilder;
1707
1708	let backtrack_bytes_re = ExecBuilder::new("^S")
1709	.bounded_backtracking()
1710	.only_utf8(`false`)
1711	.build()
1712	.map(\|exec\| exec.into_byte_regex())
1713	.map_err(\|err\| format!("{}", err))
1714	.unwrap();
1715
1716	let default_bytes_re = ExecBuilder::new("^S")
1717	.only_utf8(`false`)
1718	.build()
1719	.map(\|exec\| exec.into_byte_regex())
1720	.map_err(\|err\| format!("{}", err))
1721	.unwrap();
1722
1723	let input = vec![`83`, `83`];
1724
1725	let s1 = backtrack_bytes_re.split(&input);
1726	let s2 = default_bytes_re.split(&input);
1727	for (chunk1, chunk2) in s1.zip(s2) {
1728	assert_eq!(chunk1, chunk2);
1729	}
1730	}
1731
1732	#[test]
1733	fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1734	use crate::internal::ExecBuilder;
1735
1736	let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1737	.bounded_backtracking()
1738	.bytes(`true`)
1739	.build()
1740	.map(\|exec\| exec.into_regex())
1741	.map_err(\|err\| format!("{}", err))
1742	.unwrap();
1743
1744	let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1745	.bytes(`true`)
1746	.build()
1747	.map(\|exec\| exec.into_regex())
1748	.map_err(\|err\| format!("{}", err))
1749	.unwrap();
1750
1751	let input = "**";
1752
1753	let s1 = backtrack_bytes_re.split(input);
1754	let s2 = default_bytes_re.split(input);
1755	for (chunk1, chunk2) in s1.zip(s2) {
1756	assert_eq!(chunk1, chunk2);
1757	}
1758	}
1759	}
1760