1use std::cell::RefCell;
2use std::collections::HashMap;
3use std::panic::AssertUnwindSafe;
4use std::sync::Arc;
5
6#[cfg(feature = "perf-literal")]
7use aho_corasick::{AhoCorasick, MatchKind};
8use regex_syntax::hir::literal;
9use regex_syntax::hir::{Hir, Look};
10use regex_syntax::ParserBuilder;
11
12use crate::backtrack;
13use crate::compile::Compiler;
14#[cfg(feature = "perf-dfa")]
15use crate::dfa;
16use crate::error::Error;
17use crate::input::{ByteInput, CharInput};
18use crate::literal::LiteralSearcher;
19use crate::pikevm;
20use crate::pool::{Pool, PoolGuard};
21use crate::prog::Program;
22use crate::re_builder::RegexOptions;
23use crate::re_bytes;
24use crate::re_set;
25use crate::re_trait::{Locations, RegularExpression, Slot};
26use crate::re_unicode;
27use crate::utf8::next_utf8;
28
29/// `Exec` manages the execution of a regular expression.
30///
31/// In particular, this manages the various compiled forms of a single regular
32/// expression and the choice of which matching engine to use to execute a
33/// regular expression.
34#[derive(Debug)]
35pub struct Exec {
36 /// All read only state.
37 ro: Arc<ExecReadOnly>,
38 /// A pool of reusable values for the various matching engines.
39 ///
40 /// Note that boxing this value is not strictly necessary, but it is an
41 /// easy way to ensure that T does not bloat the stack sized used by a pool
42 /// in the case where T is big. And this turns out to be the case at the
43 /// time of writing for regex's use of this pool. At the time of writing,
44 /// the size of a Regex on the stack is 856 bytes. Boxing this value
45 /// reduces that size to 16 bytes.
46 pool: Box<Pool<ProgramCache>>,
47}
48
49/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
50/// means it is no longer Sync, but we can now avoid the overhead of
51/// synchronization to fetch the cache.
52#[derive(Debug)]
53pub struct ExecNoSync<'c> {
54 /// All read only state.
55 ro: &'c Arc<ExecReadOnly>,
56 /// Caches for the various matching engines.
57 cache: PoolGuard<'c, ProgramCache>,
58}
59
60/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
61#[derive(Debug)]
62pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
63
64/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
65/// state is determined at compile time and never changes during search.
66#[derive(Debug)]
67struct ExecReadOnly {
68 /// The original regular expressions given by the caller to compile.
69 res: Vec<String>,
70 /// A compiled program that is used in the NFA simulation and backtracking.
71 /// It can be byte-based or Unicode codepoint based.
72 ///
73 /// N.B. It is not possibly to make this byte-based from the public API.
74 /// It is only used for testing byte based programs in the NFA simulations.
75 nfa: Program,
76 /// A compiled byte based program for DFA execution. This is only used
77 /// if a DFA can be executed. (Currently, only word boundary assertions are
78 /// not supported.) Note that this program contains an embedded `.*?`
79 /// preceding the first capture group, unless the regex is anchored at the
80 /// beginning.
81 #[allow(dead_code)]
82 dfa: Program,
83 /// The same as above, except the program is reversed (and there is no
84 /// preceding `.*?`). This is used by the DFA to find the starting location
85 /// of matches.
86 #[allow(dead_code)]
87 dfa_reverse: Program,
88 /// A set of suffix literals extracted from the regex.
89 ///
90 /// Prefix literals are stored on the `Program`, since they are used inside
91 /// the matching engines.
92 #[allow(dead_code)]
93 suffixes: LiteralSearcher,
94 /// An Aho-Corasick automaton with leftmost-first match semantics.
95 ///
96 /// This is only set when the entire regex is a simple unanchored
97 /// alternation of literals. We could probably use it more circumstances,
98 /// but this is already hacky enough in this architecture.
99 ///
100 /// N.B. We use u32 as a state ID representation under the assumption that
101 /// if we were to exhaust the ID space, we probably would have long
102 /// surpassed the compilation size limit.
103 #[cfg(feature = "perf-literal")]
104 ac: Option<AhoCorasick>,
105 /// match_type encodes as much upfront knowledge about how we're going to
106 /// execute a search as possible.
107 match_type: MatchType,
108}
109
110/// Facilitates the construction of an executor by exposing various knobs
111/// to control how a regex is executed and what kinds of resources it's
112/// permitted to use.
113// `ExecBuilder` is only public via the `internal` module, so avoid deriving
114// `Debug`.
115#[allow(missing_debug_implementations)]
116pub struct ExecBuilder {
117 options: RegexOptions,
118 match_type: Option<MatchType>,
119 bytes: bool,
120 only_utf8: bool,
121}
122
123/// Parsed represents a set of parsed regular expressions and their detected
124/// literals.
125struct Parsed {
126 exprs: Vec<Hir>,
127 prefixes: literal::Seq,
128 suffixes: literal::Seq,
129 bytes: bool,
130}
131
132impl ExecBuilder {
133 /// Create a regex execution builder.
134 ///
135 /// This uses default settings for everything except the regex itself,
136 /// which must be provided. Further knobs can be set by calling methods,
137 /// and then finally, `build` to actually create the executor.
138 pub fn new(re: &str) -> Self {
139 Self::new_many(&[re])
140 }
141
142 /// Like new, but compiles the union of the given regular expressions.
143 ///
144 /// Note that when compiling 2 or more regular expressions, capture groups
145 /// are completely unsupported. (This means both `find` and `captures`
146 /// won't work.)
147 pub fn new_many<I, S>(res: I) -> Self
148 where
149 S: AsRef<str>,
150 I: IntoIterator<Item = S>,
151 {
152 let mut opts = RegexOptions::default();
153 opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
154 Self::new_options(opts)
155 }
156
157 /// Create a regex execution builder.
158 pub fn new_options(opts: RegexOptions) -> Self {
159 ExecBuilder {
160 options: opts,
161 match_type: None,
162 bytes: false,
163 only_utf8: true,
164 }
165 }
166
167 /// Set the matching engine to be automatically determined.
168 ///
169 /// This is the default state and will apply whatever optimizations are
170 /// possible, such as running a DFA.
171 ///
172 /// This overrides whatever was previously set via the `nfa` or
173 /// `bounded_backtracking` methods.
174 pub fn automatic(mut self) -> Self {
175 self.match_type = None;
176 self
177 }
178
179 /// Sets the matching engine to use the NFA algorithm no matter what
180 /// optimizations are possible.
181 ///
182 /// This overrides whatever was previously set via the `automatic` or
183 /// `bounded_backtracking` methods.
184 pub fn nfa(mut self) -> Self {
185 self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
186 self
187 }
188
189 /// Sets the matching engine to use a bounded backtracking engine no
190 /// matter what optimizations are possible.
191 ///
192 /// One must use this with care, since the bounded backtracking engine
193 /// uses memory proportion to `len(regex) * len(text)`.
194 ///
195 /// This overrides whatever was previously set via the `automatic` or
196 /// `nfa` methods.
197 pub fn bounded_backtracking(mut self) -> Self {
198 self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
199 self
200 }
201
202 /// Compiles byte based programs for use with the NFA matching engines.
203 ///
204 /// By default, the NFA engines match on Unicode scalar values. They can
205 /// be made to use byte based programs instead. In general, the byte based
206 /// programs are slower because of a less efficient encoding of character
207 /// classes.
208 ///
209 /// Note that this does not impact DFA matching engines, which always
210 /// execute on bytes.
211 pub fn bytes(mut self, yes: bool) -> Self {
212 self.bytes = yes;
213 self
214 }
215
216 /// When disabled, the program compiled may match arbitrary bytes.
217 ///
218 /// When enabled (the default), all compiled programs exclusively match
219 /// valid UTF-8 bytes.
220 pub fn only_utf8(mut self, yes: bool) -> Self {
221 self.only_utf8 = yes;
222 self
223 }
224
225 /// Set the Unicode flag.
226 pub fn unicode(mut self, yes: bool) -> Self {
227 self.options.unicode = yes;
228 self
229 }
230
231 /// Parse the current set of patterns into their AST and extract literals.
232 fn parse(&self) -> Result<Parsed, Error> {
233 let mut exprs = Vec::with_capacity(self.options.pats.len());
234 let mut prefixes = Some(literal::Seq::empty());
235 let mut suffixes = Some(literal::Seq::empty());
236 let mut bytes = false;
237 let is_set = self.options.pats.len() > 1;
238 // If we're compiling a regex set and that set has any anchored
239 // expressions, then disable all literal optimizations.
240 for pat in &self.options.pats {
241 let mut parser = ParserBuilder::new()
242 .octal(self.options.octal)
243 .case_insensitive(self.options.case_insensitive)
244 .multi_line(self.options.multi_line)
245 .dot_matches_new_line(self.options.dot_matches_new_line)
246 .swap_greed(self.options.swap_greed)
247 .ignore_whitespace(self.options.ignore_whitespace)
248 .unicode(self.options.unicode)
249 .utf8(self.only_utf8)
250 .nest_limit(self.options.nest_limit)
251 .build();
252 let expr =
253 parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
254 let props = expr.properties();
255 // This used to just check whether the HIR matched valid UTF-8
256 // or not, but in regex-syntax 0.7, we changed our definition of
257 // "matches valid UTF-8" to exclude zero-width matches. And in
258 // particular, previously, we considered WordAsciiNegate (that
259 // is '(?-u:\B)') to be capable of matching invalid UTF-8. Our
260 // matcher engines were built under this assumption and fixing
261 // them is not worth it with the imminent plan to switch over to
262 // regex-automata. So for now, we retain the previous behavior by
263 // just explicitly treating the presence of a negated ASCII word
264 // boundary as forcing use to use a byte oriented automaton.
265 bytes = bytes
266 || !props.is_utf8()
267 || props.look_set().contains(Look::WordAsciiNegate);
268
269 if cfg!(feature = "perf-literal") {
270 if !props.look_set_prefix().contains(Look::Start)
271 && props.look_set().contains(Look::Start)
272 {
273 // Partial anchors unfortunately make it hard to use
274 // prefixes, so disable them.
275 prefixes = None;
276 } else if is_set
277 && props.look_set_prefix_any().contains(Look::Start)
278 {
279 // Regex sets with anchors do not go well with literal
280 // optimizations.
281 prefixes = None;
282 } else if props.look_set_prefix_any().contains_word() {
283 // The new literal extractor ignores look-around while
284 // the old one refused to extract prefixes from regexes
285 // that began with a \b. These old creaky regex internals
286 // can't deal with it, so we drop it.
287 prefixes = None;
288 } else if props.look_set_prefix_any().contains(Look::StartLF) {
289 // Similar to the reasoning for word boundaries, this old
290 // regex engine can't handle literal prefixes with '(?m:^)'
291 // at the beginning of a regex.
292 prefixes = None;
293 }
294
295 if !props.look_set_suffix().contains(Look::End)
296 && props.look_set().contains(Look::End)
297 {
298 // Partial anchors unfortunately make it hard to use
299 // suffixes, so disable them.
300 suffixes = None;
301 } else if is_set
302 && props.look_set_suffix_any().contains(Look::End)
303 {
304 // Regex sets with anchors do not go well with literal
305 // optimizations.
306 suffixes = None;
307 } else if props.look_set_suffix_any().contains_word() {
308 // See the prefix case for reasoning here.
309 suffixes = None;
310 } else if props.look_set_suffix_any().contains(Look::EndLF) {
311 // See the prefix case for reasoning here.
312 suffixes = None;
313 }
314
315 let (mut pres, mut suffs) =
316 if prefixes.is_none() && suffixes.is_none() {
317 (literal::Seq::infinite(), literal::Seq::infinite())
318 } else {
319 literal_analysis(&expr)
320 };
321 // These old creaky regex internals can't handle cases where
322 // the literal sequences are exact but there are look-around
323 // assertions. So we make sure the sequences are inexact if
324 // there are look-around assertions anywhere. This forces the
325 // regex engines to run instead of assuming that a literal
326 // match implies an overall match.
327 if !props.look_set().is_empty() {
328 pres.make_inexact();
329 suffs.make_inexact();
330 }
331 prefixes = prefixes.and_then(|mut prefixes| {
332 prefixes.union(&mut pres);
333 Some(prefixes)
334 });
335 suffixes = suffixes.and_then(|mut suffixes| {
336 suffixes.union(&mut suffs);
337 Some(suffixes)
338 });
339 }
340 exprs.push(expr);
341 }
342 Ok(Parsed {
343 exprs,
344 prefixes: prefixes.unwrap_or_else(literal::Seq::empty),
345 suffixes: suffixes.unwrap_or_else(literal::Seq::empty),
346 bytes,
347 })
348 }
349
350 /// Build an executor that can run a regular expression.
351 pub fn build(self) -> Result<Exec, Error> {
352 // Special case when we have no patterns to compile.
353 // This can happen when compiling a regex set.
354 if self.options.pats.is_empty() {
355 let ro = Arc::new(ExecReadOnly {
356 res: vec![],
357 nfa: Program::new(),
358 dfa: Program::new(),
359 dfa_reverse: Program::new(),
360 suffixes: LiteralSearcher::empty(),
361 #[cfg(feature = "perf-literal")]
362 ac: None,
363 match_type: MatchType::Nothing,
364 });
365 let pool = ExecReadOnly::new_pool(&ro);
366 return Ok(Exec { ro, pool });
367 }
368 let parsed = self.parse()?;
369 let mut nfa = Compiler::new()
370 .size_limit(self.options.size_limit)
371 .bytes(self.bytes || parsed.bytes)
372 .only_utf8(self.only_utf8)
373 .compile(&parsed.exprs)?;
374 let mut dfa = Compiler::new()
375 .size_limit(self.options.size_limit)
376 .dfa(true)
377 .only_utf8(self.only_utf8)
378 .compile(&parsed.exprs)?;
379 let mut dfa_reverse = Compiler::new()
380 .size_limit(self.options.size_limit)
381 .dfa(true)
382 .only_utf8(self.only_utf8)
383 .reverse(true)
384 .compile(&parsed.exprs)?;
385
386 #[cfg(feature = "perf-literal")]
387 let ac = self.build_aho_corasick(&parsed);
388 nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
389 dfa.prefixes = nfa.prefixes.clone();
390 dfa.dfa_size_limit = self.options.dfa_size_limit;
391 dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
392
393 let mut ro = ExecReadOnly {
394 res: self.options.pats,
395 nfa,
396 dfa,
397 dfa_reverse,
398 suffixes: LiteralSearcher::suffixes(parsed.suffixes),
399 #[cfg(feature = "perf-literal")]
400 ac,
401 match_type: MatchType::Nothing,
402 };
403 ro.match_type = ro.choose_match_type(self.match_type);
404
405 let ro = Arc::new(ro);
406 let pool = ExecReadOnly::new_pool(&ro);
407 Ok(Exec { ro, pool })
408 }
409
410 #[cfg(feature = "perf-literal")]
411 fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick> {
412 if parsed.exprs.len() != 1 {
413 return None;
414 }
415 let lits = match alternation_literals(&parsed.exprs[0]) {
416 None => return None,
417 Some(lits) => lits,
418 };
419 // If we have a small number of literals, then let Teddy handle
420 // things (see literal/mod.rs).
421 if lits.len() <= 32 {
422 return None;
423 }
424 Some(
425 AhoCorasick::builder()
426 .match_kind(MatchKind::LeftmostFirst)
427 .build(&lits)
428 // This should never happen because we'd long exceed the
429 // compilation limit for regexes first.
430 .expect("AC automaton too big"),
431 )
432 }
433}
434
435impl<'c> RegularExpression for ExecNoSyncStr<'c> {
436 type Text = str;
437
438 fn slots_len(&self) -> usize {
439 self.0.slots_len()
440 }
441
442 fn next_after_empty(&self, text: &str, i: usize) -> usize {
443 next_utf8(text.as_bytes(), i)
444 }
445
446 #[cfg_attr(feature = "perf-inline", inline(always))]
447 fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
448 self.0.shortest_match_at(text.as_bytes(), start)
449 }
450
451 #[cfg_attr(feature = "perf-inline", inline(always))]
452 fn is_match_at(&self, text: &str, start: usize) -> bool {
453 self.0.is_match_at(text.as_bytes(), start)
454 }
455
456 #[cfg_attr(feature = "perf-inline", inline(always))]
457 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
458 self.0.find_at(text.as_bytes(), start)
459 }
460
461 #[cfg_attr(feature = "perf-inline", inline(always))]
462 fn captures_read_at(
463 &self,
464 locs: &mut Locations,
465 text: &str,
466 start: usize,
467 ) -> Option<(usize, usize)> {
468 self.0.captures_read_at(locs, text.as_bytes(), start)
469 }
470}
471
472impl<'c> RegularExpression for ExecNoSync<'c> {
473 type Text = [u8];
474
475 /// Returns the number of capture slots in the regular expression. (There
476 /// are two slots for every capture group, corresponding to possibly empty
477 /// start and end locations of the capture.)
478 fn slots_len(&self) -> usize {
479 self.ro.nfa.captures.len() * 2
480 }
481
482 fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
483 i + 1
484 }
485
486 /// Returns the end of a match location, possibly occurring before the
487 /// end location of the correct leftmost-first match.
488 #[cfg_attr(feature = "perf-inline", inline(always))]
489 fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
490 if !self.is_anchor_end_match(text) {
491 return None;
492 }
493 match self.ro.match_type {
494 #[cfg(feature = "perf-literal")]
495 MatchType::Literal(ty) => {
496 self.find_literals(ty, text, start).map(|(_, e)| e)
497 }
498 #[cfg(feature = "perf-dfa")]
499 MatchType::Dfa | MatchType::DfaMany => {
500 match self.shortest_dfa(text, start) {
501 dfa::Result::Match(end) => Some(end),
502 dfa::Result::NoMatch(_) => None,
503 dfa::Result::Quit => self.shortest_nfa(text, start),
504 }
505 }
506 #[cfg(feature = "perf-dfa")]
507 MatchType::DfaAnchoredReverse => {
508 match dfa::Fsm::reverse(
509 &self.ro.dfa_reverse,
510 self.cache.value(),
511 true,
512 &text[start..],
513 text.len() - start,
514 ) {
515 dfa::Result::Match(_) => Some(text.len()),
516 dfa::Result::NoMatch(_) => None,
517 dfa::Result::Quit => self.shortest_nfa(text, start),
518 }
519 }
520 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
521 MatchType::DfaSuffix => {
522 match self.shortest_dfa_reverse_suffix(text, start) {
523 dfa::Result::Match(e) => Some(e),
524 dfa::Result::NoMatch(_) => None,
525 dfa::Result::Quit => self.shortest_nfa(text, start),
526 }
527 }
528 MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
529 MatchType::Nothing => None,
530 }
531 }
532
533 /// Returns true if and only if the regex matches text.
534 ///
535 /// For single regular expressions, this is equivalent to calling
536 /// shortest_match(...).is_some().
537 #[cfg_attr(feature = "perf-inline", inline(always))]
538 fn is_match_at(&self, text: &[u8], start: usize) -> bool {
539 if !self.is_anchor_end_match(text) {
540 return false;
541 }
542 // We need to do this dance because shortest_match relies on the NFA
543 // filling in captures[1], but a RegexSet has no captures. In other
544 // words, a RegexSet can't (currently) use shortest_match. ---AG
545 match self.ro.match_type {
546 #[cfg(feature = "perf-literal")]
547 MatchType::Literal(ty) => {
548 self.find_literals(ty, text, start).is_some()
549 }
550 #[cfg(feature = "perf-dfa")]
551 MatchType::Dfa | MatchType::DfaMany => {
552 match self.shortest_dfa(text, start) {
553 dfa::Result::Match(_) => true,
554 dfa::Result::NoMatch(_) => false,
555 dfa::Result::Quit => self.match_nfa(text, start),
556 }
557 }
558 #[cfg(feature = "perf-dfa")]
559 MatchType::DfaAnchoredReverse => {
560 match dfa::Fsm::reverse(
561 &self.ro.dfa_reverse,
562 self.cache.value(),
563 true,
564 &text[start..],
565 text.len() - start,
566 ) {
567 dfa::Result::Match(_) => true,
568 dfa::Result::NoMatch(_) => false,
569 dfa::Result::Quit => self.match_nfa(text, start),
570 }
571 }
572 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
573 MatchType::DfaSuffix => {
574 match self.shortest_dfa_reverse_suffix(text, start) {
575 dfa::Result::Match(_) => true,
576 dfa::Result::NoMatch(_) => false,
577 dfa::Result::Quit => self.match_nfa(text, start),
578 }
579 }
580 MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
581 MatchType::Nothing => false,
582 }
583 }
584
585 /// Finds the start and end location of the leftmost-first match, starting
586 /// at the given location.
587 #[cfg_attr(feature = "perf-inline", inline(always))]
588 fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
589 if !self.is_anchor_end_match(text) {
590 return None;
591 }
592 match self.ro.match_type {
593 #[cfg(feature = "perf-literal")]
594 MatchType::Literal(ty) => self.find_literals(ty, text, start),
595 #[cfg(feature = "perf-dfa")]
596 MatchType::Dfa => match self.find_dfa_forward(text, start) {
597 dfa::Result::Match((s, e)) => Some((s, e)),
598 dfa::Result::NoMatch(_) => None,
599 dfa::Result::Quit => {
600 self.find_nfa(MatchNfaType::Auto, text, start)
601 }
602 },
603 #[cfg(feature = "perf-dfa")]
604 MatchType::DfaAnchoredReverse => {
605 match self.find_dfa_anchored_reverse(text, start) {
606 dfa::Result::Match((s, e)) => Some((s, e)),
607 dfa::Result::NoMatch(_) => None,
608 dfa::Result::Quit => {
609 self.find_nfa(MatchNfaType::Auto, text, start)
610 }
611 }
612 }
613 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
614 MatchType::DfaSuffix => {
615 match self.find_dfa_reverse_suffix(text, start) {
616 dfa::Result::Match((s, e)) => Some((s, e)),
617 dfa::Result::NoMatch(_) => None,
618 dfa::Result::Quit => {
619 self.find_nfa(MatchNfaType::Auto, text, start)
620 }
621 }
622 }
623 MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
624 MatchType::Nothing => None,
625 #[cfg(feature = "perf-dfa")]
626 MatchType::DfaMany => {
627 unreachable!("BUG: RegexSet cannot be used with find")
628 }
629 }
630 }
631
632 /// Finds the start and end location of the leftmost-first match and also
633 /// fills in all matching capture groups.
634 ///
635 /// The number of capture slots given should be equal to the total number
636 /// of capture slots in the compiled program.
637 ///
638 /// Note that the first two slots always correspond to the start and end
639 /// locations of the overall match.
640 fn captures_read_at(
641 &self,
642 locs: &mut Locations,
643 text: &[u8],
644 start: usize,
645 ) -> Option<(usize, usize)> {
646 let slots = locs.as_slots();
647 for slot in slots.iter_mut() {
648 *slot = None;
649 }
650 // If the caller unnecessarily uses this, then we try to save them
651 // from themselves.
652 match slots.len() {
653 0 => return self.find_at(text, start),
654 2 => {
655 return self.find_at(text, start).map(|(s, e)| {
656 slots[0] = Some(s);
657 slots[1] = Some(e);
658 (s, e)
659 });
660 }
661 _ => {} // fallthrough
662 }
663 if !self.is_anchor_end_match(text) {
664 return None;
665 }
666 match self.ro.match_type {
667 #[cfg(feature = "perf-literal")]
668 MatchType::Literal(ty) => {
669 self.find_literals(ty, text, start).and_then(|(s, e)| {
670 self.captures_nfa_type(
671 MatchNfaType::Auto,
672 slots,
673 text,
674 s,
675 e,
676 )
677 })
678 }
679 #[cfg(feature = "perf-dfa")]
680 MatchType::Dfa => {
681 if self.ro.nfa.is_anchored_start {
682 self.captures_nfa(slots, text, start)
683 } else {
684 match self.find_dfa_forward(text, start) {
685 dfa::Result::Match((s, e)) => self.captures_nfa_type(
686 MatchNfaType::Auto,
687 slots,
688 text,
689 s,
690 e,
691 ),
692 dfa::Result::NoMatch(_) => None,
693 dfa::Result::Quit => {
694 self.captures_nfa(slots, text, start)
695 }
696 }
697 }
698 }
699 #[cfg(feature = "perf-dfa")]
700 MatchType::DfaAnchoredReverse => {
701 match self.find_dfa_anchored_reverse(text, start) {
702 dfa::Result::Match((s, e)) => self.captures_nfa_type(
703 MatchNfaType::Auto,
704 slots,
705 text,
706 s,
707 e,
708 ),
709 dfa::Result::NoMatch(_) => None,
710 dfa::Result::Quit => self.captures_nfa(slots, text, start),
711 }
712 }
713 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
714 MatchType::DfaSuffix => {
715 match self.find_dfa_reverse_suffix(text, start) {
716 dfa::Result::Match((s, e)) => self.captures_nfa_type(
717 MatchNfaType::Auto,
718 slots,
719 text,
720 s,
721 e,
722 ),
723 dfa::Result::NoMatch(_) => None,
724 dfa::Result::Quit => self.captures_nfa(slots, text, start),
725 }
726 }
727 MatchType::Nfa(ty) => {
728 self.captures_nfa_type(ty, slots, text, start, text.len())
729 }
730 MatchType::Nothing => None,
731 #[cfg(feature = "perf-dfa")]
732 MatchType::DfaMany => {
733 unreachable!("BUG: RegexSet cannot be used with captures")
734 }
735 }
736 }
737}
738
739impl<'c> ExecNoSync<'c> {
740 /// Finds the leftmost-first match using only literal search.
741 #[cfg(feature = "perf-literal")]
742 #[cfg_attr(feature = "perf-inline", inline(always))]
743 fn find_literals(
744 &self,
745 ty: MatchLiteralType,
746 text: &[u8],
747 start: usize,
748 ) -> Option<(usize, usize)> {
749 use self::MatchLiteralType::*;
750 match ty {
751 Unanchored => {
752 let lits = &self.ro.nfa.prefixes;
753 lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
754 }
755 AnchoredStart => {
756 let lits = &self.ro.nfa.prefixes;
757 if start == 0 || !self.ro.nfa.is_anchored_start {
758 lits.find_start(&text[start..])
759 .map(|(s, e)| (start + s, start + e))
760 } else {
761 None
762 }
763 }
764 AnchoredEnd => {
765 let lits = &self.ro.suffixes;
766 lits.find_end(&text[start..])
767 .map(|(s, e)| (start + s, start + e))
768 }
769 AhoCorasick => self
770 .ro
771 .ac
772 .as_ref()
773 .unwrap()
774 .find(&text[start..])
775 .map(|m| (start + m.start(), start + m.end())),
776 }
777 }
778
779 /// Finds the leftmost-first match (start and end) using only the DFA.
780 ///
781 /// If the result returned indicates that the DFA quit, then another
782 /// matching engine should be used.
783 #[cfg(feature = "perf-dfa")]
784 #[cfg_attr(feature = "perf-inline", inline(always))]
785 fn find_dfa_forward(
786 &self,
787 text: &[u8],
788 start: usize,
789 ) -> dfa::Result<(usize, usize)> {
790 use crate::dfa::Result::*;
791 let end = match dfa::Fsm::forward(
792 &self.ro.dfa,
793 self.cache.value(),
794 false,
795 text,
796 start,
797 ) {
798 NoMatch(i) => return NoMatch(i),
799 Quit => return Quit,
800 Match(end) if start == end => return Match((start, start)),
801 Match(end) => end,
802 };
803 // Now run the DFA in reverse to find the start of the match.
804 match dfa::Fsm::reverse(
805 &self.ro.dfa_reverse,
806 self.cache.value(),
807 false,
808 &text[start..],
809 end - start,
810 ) {
811 Match(s) => Match((start + s, end)),
812 NoMatch(i) => NoMatch(i),
813 Quit => Quit,
814 }
815 }
816
817 /// Finds the leftmost-first match (start and end) using only the DFA,
818 /// but assumes the regex is anchored at the end and therefore starts at
819 /// the end of the regex and matches in reverse.
820 ///
821 /// If the result returned indicates that the DFA quit, then another
822 /// matching engine should be used.
823 #[cfg(feature = "perf-dfa")]
824 #[cfg_attr(feature = "perf-inline", inline(always))]
825 fn find_dfa_anchored_reverse(
826 &self,
827 text: &[u8],
828 start: usize,
829 ) -> dfa::Result<(usize, usize)> {
830 use crate::dfa::Result::*;
831 match dfa::Fsm::reverse(
832 &self.ro.dfa_reverse,
833 self.cache.value(),
834 false,
835 &text[start..],
836 text.len() - start,
837 ) {
838 Match(s) => Match((start + s, text.len())),
839 NoMatch(i) => NoMatch(i),
840 Quit => Quit,
841 }
842 }
843
844 /// Finds the end of the shortest match using only the DFA.
845 #[cfg(feature = "perf-dfa")]
846 #[cfg_attr(feature = "perf-inline", inline(always))]
847 fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
848 dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
849 }
850
851 /// Finds the end of the shortest match using only the DFA by scanning for
852 /// suffix literals.
853 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
854 #[cfg_attr(feature = "perf-inline", inline(always))]
855 fn shortest_dfa_reverse_suffix(
856 &self,
857 text: &[u8],
858 start: usize,
859 ) -> dfa::Result<usize> {
860 match self.exec_dfa_reverse_suffix(text, start) {
861 None => self.shortest_dfa(text, start),
862 Some(r) => r.map(|(_, end)| end),
863 }
864 }
865
866 /// Finds the end of the shortest match using only the DFA by scanning for
867 /// suffix literals. It also reports the start of the match.
868 ///
869 /// Note that if None is returned, then the optimization gave up to avoid
870 /// worst case quadratic behavior. A forward scanning DFA should be tried
871 /// next.
872 ///
873 /// If a match is returned and the full leftmost-first match is desired,
874 /// then a forward scan starting from the beginning of the match must be
875 /// done.
876 ///
877 /// If the result returned indicates that the DFA quit, then another
878 /// matching engine should be used.
879 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
880 #[cfg_attr(feature = "perf-inline", inline(always))]
881 fn exec_dfa_reverse_suffix(
882 &self,
883 text: &[u8],
884 original_start: usize,
885 ) -> Option<dfa::Result<(usize, usize)>> {
886 use crate::dfa::Result::*;
887
888 let lcs = self.ro.suffixes.lcs();
889 debug_assert!(lcs.len() >= 1);
890 let mut start = original_start;
891 let mut end = start;
892 let mut last_literal = start;
893 while end <= text.len() {
894 last_literal += match lcs.find(&text[last_literal..]) {
895 None => return Some(NoMatch(text.len())),
896 Some(i) => i,
897 };
898 end = last_literal + lcs.len();
899 match dfa::Fsm::reverse(
900 &self.ro.dfa_reverse,
901 self.cache.value(),
902 false,
903 &text[start..end],
904 end - start,
905 ) {
906 Match(0) | NoMatch(0) => return None,
907 Match(i) => return Some(Match((start + i, end))),
908 NoMatch(i) => {
909 start += i;
910 last_literal += 1;
911 continue;
912 }
913 Quit => return Some(Quit),
914 };
915 }
916 Some(NoMatch(text.len()))
917 }
918
919 /// Finds the leftmost-first match (start and end) using only the DFA
920 /// by scanning for suffix literals.
921 ///
922 /// If the result returned indicates that the DFA quit, then another
923 /// matching engine should be used.
924 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
925 #[cfg_attr(feature = "perf-inline", inline(always))]
926 fn find_dfa_reverse_suffix(
927 &self,
928 text: &[u8],
929 start: usize,
930 ) -> dfa::Result<(usize, usize)> {
931 use crate::dfa::Result::*;
932
933 let match_start = match self.exec_dfa_reverse_suffix(text, start) {
934 None => return self.find_dfa_forward(text, start),
935 Some(Match((start, _))) => start,
936 Some(r) => return r,
937 };
938 // At this point, we've found a match. The only way to quit now
939 // without a match is if the DFA gives up (seems unlikely).
940 //
941 // Now run the DFA forwards to find the proper end of the match.
942 // (The suffix literal match can only indicate the earliest
943 // possible end location, which may appear before the end of the
944 // leftmost-first match.)
945 match dfa::Fsm::forward(
946 &self.ro.dfa,
947 self.cache.value(),
948 false,
949 text,
950 match_start,
951 ) {
952 NoMatch(_) => panic!("BUG: reverse match implies forward match"),
953 Quit => Quit,
954 Match(e) => Match((match_start, e)),
955 }
956 }
957
958 /// Executes the NFA engine to return whether there is a match or not.
959 ///
960 /// Ideally, we could use shortest_nfa(...).is_some() and get the same
961 /// performance characteristics, but regex sets don't have captures, which
962 /// shortest_nfa depends on.
963 #[cfg(feature = "perf-dfa")]
964 fn match_nfa(&self, text: &[u8], start: usize) -> bool {
965 self.match_nfa_type(MatchNfaType::Auto, text, start)
966 }
967
968 /// Like match_nfa, but allows specification of the type of NFA engine.
969 fn match_nfa_type(
970 &self,
971 ty: MatchNfaType,
972 text: &[u8],
973 start: usize,
974 ) -> bool {
975 self.exec_nfa(
976 ty,
977 &mut [false],
978 &mut [],
979 true,
980 false,
981 text,
982 start,
983 text.len(),
984 )
985 }
986
987 /// Finds the shortest match using an NFA.
988 #[cfg(feature = "perf-dfa")]
989 fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
990 self.shortest_nfa_type(MatchNfaType::Auto, text, start)
991 }
992
993 /// Like shortest_nfa, but allows specification of the type of NFA engine.
994 fn shortest_nfa_type(
995 &self,
996 ty: MatchNfaType,
997 text: &[u8],
998 start: usize,
999 ) -> Option<usize> {
1000 let mut slots = [None, None];
1001 if self.exec_nfa(
1002 ty,
1003 &mut [false],
1004 &mut slots,
1005 true,
1006 true,
1007 text,
1008 start,
1009 text.len(),
1010 ) {
1011 slots[1]
1012 } else {
1013 None
1014 }
1015 }
1016
1017 /// Like find, but executes an NFA engine.
1018 fn find_nfa(
1019 &self,
1020 ty: MatchNfaType,
1021 text: &[u8],
1022 start: usize,
1023 ) -> Option<(usize, usize)> {
1024 let mut slots = [None, None];
1025 if self.exec_nfa(
1026 ty,
1027 &mut [false],
1028 &mut slots,
1029 false,
1030 false,
1031 text,
1032 start,
1033 text.len(),
1034 ) {
1035 match (slots[0], slots[1]) {
1036 (Some(s), Some(e)) => Some((s, e)),
1037 _ => None,
1038 }
1039 } else {
1040 None
1041 }
1042 }
1043
1044 /// Like find_nfa, but fills in captures.
1045 ///
1046 /// `slots` should have length equal to `2 * nfa.captures.len()`.
1047 #[cfg(feature = "perf-dfa")]
1048 fn captures_nfa(
1049 &self,
1050 slots: &mut [Slot],
1051 text: &[u8],
1052 start: usize,
1053 ) -> Option<(usize, usize)> {
1054 self.captures_nfa_type(
1055 MatchNfaType::Auto,
1056 slots,
1057 text,
1058 start,
1059 text.len(),
1060 )
1061 }
1062
1063 /// Like captures_nfa, but allows specification of type of NFA engine.
1064 fn captures_nfa_type(
1065 &self,
1066 ty: MatchNfaType,
1067 slots: &mut [Slot],
1068 text: &[u8],
1069 start: usize,
1070 end: usize,
1071 ) -> Option<(usize, usize)> {
1072 if self.exec_nfa(
1073 ty,
1074 &mut [false],
1075 slots,
1076 false,
1077 false,
1078 text,
1079 start,
1080 end,
1081 ) {
1082 match (slots[0], slots[1]) {
1083 (Some(s), Some(e)) => Some((s, e)),
1084 _ => None,
1085 }
1086 } else {
1087 None
1088 }
1089 }
1090
1091 fn exec_nfa(
1092 &self,
1093 mut ty: MatchNfaType,
1094 matches: &mut [bool],
1095 slots: &mut [Slot],
1096 quit_after_match: bool,
1097 quit_after_match_with_pos: bool,
1098 text: &[u8],
1099 start: usize,
1100 end: usize,
1101 ) -> bool {
1102 use self::MatchNfaType::*;
1103 if let Auto = ty {
1104 if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1105 ty = Backtrack;
1106 } else {
1107 ty = PikeVM;
1108 }
1109 }
1110 // The backtracker can't return the shortest match position as it is
1111 // implemented today. So if someone calls `shortest_match` and we need
1112 // to run an NFA, then use the PikeVM.
1113 if quit_after_match_with_pos || ty == PikeVM {
1114 self.exec_pikevm(
1115 matches,
1116 slots,
1117 quit_after_match,
1118 text,
1119 start,
1120 end,
1121 )
1122 } else {
1123 self.exec_backtrack(matches, slots, text, start, end)
1124 }
1125 }
1126
1127 /// Always run the NFA algorithm.
1128 fn exec_pikevm(
1129 &self,
1130 matches: &mut [bool],
1131 slots: &mut [Slot],
1132 quit_after_match: bool,
1133 text: &[u8],
1134 start: usize,
1135 end: usize,
1136 ) -> bool {
1137 if self.ro.nfa.uses_bytes() {
1138 pikevm::Fsm::exec(
1139 &self.ro.nfa,
1140 self.cache.value(),
1141 matches,
1142 slots,
1143 quit_after_match,
1144 ByteInput::new(text, self.ro.nfa.only_utf8),
1145 start,
1146 end,
1147 )
1148 } else {
1149 pikevm::Fsm::exec(
1150 &self.ro.nfa,
1151 self.cache.value(),
1152 matches,
1153 slots,
1154 quit_after_match,
1155 CharInput::new(text),
1156 start,
1157 end,
1158 )
1159 }
1160 }
1161
1162 /// Always runs the NFA using bounded backtracking.
1163 fn exec_backtrack(
1164 &self,
1165 matches: &mut [bool],
1166 slots: &mut [Slot],
1167 text: &[u8],
1168 start: usize,
1169 end: usize,
1170 ) -> bool {
1171 if self.ro.nfa.uses_bytes() {
1172 backtrack::Bounded::exec(
1173 &self.ro.nfa,
1174 self.cache.value(),
1175 matches,
1176 slots,
1177 ByteInput::new(text, self.ro.nfa.only_utf8),
1178 start,
1179 end,
1180 )
1181 } else {
1182 backtrack::Bounded::exec(
1183 &self.ro.nfa,
1184 self.cache.value(),
1185 matches,
1186 slots,
1187 CharInput::new(text),
1188 start,
1189 end,
1190 )
1191 }
1192 }
1193
1194 /// Finds which regular expressions match the given text.
1195 ///
1196 /// `matches` should have length equal to the number of regexes being
1197 /// searched.
1198 ///
1199 /// This is only useful when one wants to know which regexes in a set
1200 /// match some text.
1201 pub fn many_matches_at(
1202 &self,
1203 matches: &mut [bool],
1204 text: &[u8],
1205 start: usize,
1206 ) -> bool {
1207 use self::MatchType::*;
1208 if !self.is_anchor_end_match(text) {
1209 return false;
1210 }
1211 match self.ro.match_type {
1212 #[cfg(feature = "perf-literal")]
1213 Literal(ty) => {
1214 debug_assert_eq!(matches.len(), 1);
1215 matches[0] = self.find_literals(ty, text, start).is_some();
1216 matches[0]
1217 }
1218 #[cfg(feature = "perf-dfa")]
1219 Dfa | DfaAnchoredReverse | DfaMany => {
1220 match dfa::Fsm::forward_many(
1221 &self.ro.dfa,
1222 self.cache.value(),
1223 matches,
1224 text,
1225 start,
1226 ) {
1227 dfa::Result::Match(_) => true,
1228 dfa::Result::NoMatch(_) => false,
1229 dfa::Result::Quit => self.exec_nfa(
1230 MatchNfaType::Auto,
1231 matches,
1232 &mut [],
1233 false,
1234 false,
1235 text,
1236 start,
1237 text.len(),
1238 ),
1239 }
1240 }
1241 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1242 DfaSuffix => {
1243 match dfa::Fsm::forward_many(
1244 &self.ro.dfa,
1245 self.cache.value(),
1246 matches,
1247 text,
1248 start,
1249 ) {
1250 dfa::Result::Match(_) => true,
1251 dfa::Result::NoMatch(_) => false,
1252 dfa::Result::Quit => self.exec_nfa(
1253 MatchNfaType::Auto,
1254 matches,
1255 &mut [],
1256 false,
1257 false,
1258 text,
1259 start,
1260 text.len(),
1261 ),
1262 }
1263 }
1264 Nfa(ty) => self.exec_nfa(
1265 ty,
1266 matches,
1267 &mut [],
1268 false,
1269 false,
1270 text,
1271 start,
1272 text.len(),
1273 ),
1274 Nothing => false,
1275 }
1276 }
1277
1278 #[cfg_attr(feature = "perf-inline", inline(always))]
1279 fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1280 #[cfg(not(feature = "perf-literal"))]
1281 fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1282 true
1283 }
1284
1285 #[cfg(feature = "perf-literal")]
1286 fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1287 // Only do this check if the haystack is big (>1MB).
1288 if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
1289 let lcs = ro.suffixes.lcs();
1290 if lcs.len() >= 1 && !lcs.is_suffix(text) {
1291 return false;
1292 }
1293 }
1294 true
1295 }
1296
1297 imp(&self.ro, text)
1298 }
1299
1300 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1301 &self.ro.nfa.capture_name_idx
1302 }
1303}
1304
1305impl<'c> ExecNoSyncStr<'c> {
1306 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1307 self.0.capture_name_idx()
1308 }
1309}
1310
1311impl Exec {
1312 /// Get a searcher that isn't Sync.
1313 #[cfg_attr(feature = "perf-inline", inline(always))]
1314 pub fn searcher(&self) -> ExecNoSync<'_> {
1315 ExecNoSync {
1316 ro: &self.ro, // a clone is too expensive here! (and not needed)
1317 cache: self.pool.get(),
1318 }
1319 }
1320
1321 /// Get a searcher that isn't Sync and can match on &str.
1322 #[cfg_attr(feature = "perf-inline", inline(always))]
1323 pub fn searcher_str(&self) -> ExecNoSyncStr<'_> {
1324 ExecNoSyncStr(self.searcher())
1325 }
1326
1327 /// Build a Regex from this executor.
1328 pub fn into_regex(self) -> re_unicode::Regex {
1329 re_unicode::Regex::from(self)
1330 }
1331
1332 /// Build a RegexSet from this executor.
1333 pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1334 re_set::unicode::RegexSet::from(self)
1335 }
1336
1337 /// Build a Regex from this executor that can match arbitrary bytes.
1338 pub fn into_byte_regex(self) -> re_bytes::Regex {
1339 re_bytes::Regex::from(self)
1340 }
1341
1342 /// Build a RegexSet from this executor that can match arbitrary bytes.
1343 pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1344 re_set::bytes::RegexSet::from(self)
1345 }
1346
1347 /// The original regular expressions given by the caller that were
1348 /// compiled.
1349 pub fn regex_strings(&self) -> &[String] {
1350 &self.ro.res
1351 }
1352
1353 /// Return a slice of capture names.
1354 ///
1355 /// Any capture that isn't named is None.
1356 pub fn capture_names(&self) -> &[Option<String>] {
1357 &self.ro.nfa.captures
1358 }
1359
1360 /// Return a reference to named groups mapping (from group name to
1361 /// group position).
1362 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1363 &self.ro.nfa.capture_name_idx
1364 }
1365
1366 /// If the number of capture groups in every match is always the same, then
1367 /// return that number. Otherwise return `None`.
1368 pub fn static_captures_len(&self) -> Option<usize> {
1369 self.ro.nfa.static_captures_len
1370 }
1371}
1372
1373impl Clone for Exec {
1374 fn clone(&self) -> Exec {
1375 let pool: Box>> = ExecReadOnly::new_pool(&self.ro);
1376 Exec { ro: self.ro.clone(), pool }
1377 }
1378}
1379
1380impl ExecReadOnly {
1381 fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1382 if let Some(MatchType::Nfa(_)) = hint {
1383 return hint.unwrap();
1384 }
1385 // If the NFA is empty, then we'll never match anything.
1386 if self.nfa.insts.is_empty() {
1387 return MatchType::Nothing;
1388 }
1389 if let Some(literalty) = self.choose_literal_match_type() {
1390 return literalty;
1391 }
1392 if let Some(dfaty) = self.choose_dfa_match_type() {
1393 return dfaty;
1394 }
1395 // We're so totally hosed.
1396 MatchType::Nfa(MatchNfaType::Auto)
1397 }
1398
1399 /// If a plain literal scan can be used, then a corresponding literal
1400 /// search type is returned.
1401 fn choose_literal_match_type(&self) -> Option<MatchType> {
1402 #[cfg(not(feature = "perf-literal"))]
1403 fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1404 None
1405 }
1406
1407 #[cfg(feature = "perf-literal")]
1408 fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1409 // If our set of prefixes is complete, then we can use it to find
1410 // a match in lieu of a regex engine. This doesn't quite work well
1411 // in the presence of multiple regexes, so only do it when there's
1412 // one.
1413 //
1414 // TODO(burntsushi): Also, don't try to match literals if the regex
1415 // is partially anchored. We could technically do it, but we'd need
1416 // to create two sets of literals: all of them and then the subset
1417 // that aren't anchored. We would then only search for all of them
1418 // when at the beginning of the input and use the subset in all
1419 // other cases.
1420 if ro.res.len() != 1 {
1421 return None;
1422 }
1423 if ro.ac.is_some() {
1424 return Some(MatchType::Literal(
1425 MatchLiteralType::AhoCorasick,
1426 ));
1427 }
1428 if ro.nfa.prefixes.complete() {
1429 return if ro.nfa.is_anchored_start {
1430 Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1431 } else {
1432 Some(MatchType::Literal(MatchLiteralType::Unanchored))
1433 };
1434 }
1435 if ro.suffixes.complete() {
1436 return if ro.nfa.is_anchored_end {
1437 Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1438 } else {
1439 // This case shouldn't happen. When the regex isn't
1440 // anchored, then complete prefixes should imply complete
1441 // suffixes.
1442 //
1443 // The above is wrong! This case can happen. While
1444 // complete prefixes should imply complete suffixes
1445 // here, that doesn't necessarily mean we have a useful
1446 // prefix matcher! It could be the case that the literal
1447 // searcher decided the prefixes---even though they are
1448 // "complete"---weren't good enough and thus created an
1449 // empty matcher. If that happens and we return Unanchored
1450 // here, then we'll end up using that matcher, which is
1451 // very bad because it matches at every position. So...
1452 // return None.
1453 None
1454 };
1455 }
1456 None
1457 }
1458
1459 imp(self)
1460 }
1461
1462 /// If a DFA scan can be used, then choose the appropriate DFA strategy.
1463 fn choose_dfa_match_type(&self) -> Option<MatchType> {
1464 #[cfg(not(feature = "perf-dfa"))]
1465 fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1466 None
1467 }
1468
1469 #[cfg(feature = "perf-dfa")]
1470 fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1471 if !dfa::can_exec(&ro.dfa) {
1472 return None;
1473 }
1474 // Regex sets require a slightly specialized path.
1475 if ro.res.len() >= 2 {
1476 return Some(MatchType::DfaMany);
1477 }
1478 // If the regex is anchored at the end but not the start, then
1479 // just match in reverse from the end of the haystack.
1480 if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1481 return Some(MatchType::DfaAnchoredReverse);
1482 }
1483 #[cfg(feature = "perf-literal")]
1484 {
1485 // If there's a longish suffix literal, then it might be faster
1486 // to look for that first.
1487 if ro.should_suffix_scan() {
1488 return Some(MatchType::DfaSuffix);
1489 }
1490 }
1491 // Fall back to your garden variety forward searching lazy DFA.
1492 Some(MatchType::Dfa)
1493 }
1494
1495 imp(self)
1496 }
1497
1498 /// Returns true if the program is amenable to suffix scanning.
1499 ///
1500 /// When this is true, as a heuristic, we assume it is OK to quickly scan
1501 /// for suffix literals and then do a *reverse* DFA match from any matches
1502 /// produced by the literal scan. (And then followed by a forward DFA
1503 /// search, since the previously found suffix literal maybe not actually be
1504 /// the end of a match.)
1505 ///
1506 /// This is a bit of a specialized optimization, but can result in pretty
1507 /// big performance wins if 1) there are no prefix literals and 2) the
1508 /// suffix literals are pretty rare in the text. (1) is obviously easy to
1509 /// account for but (2) is harder. As a proxy, we assume that longer
1510 /// strings are generally rarer, so we only enable this optimization when
1511 /// we have a meaty suffix.
1512 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1513 fn should_suffix_scan(&self) -> bool {
1514 if self.suffixes.is_empty() {
1515 return false;
1516 }
1517 let lcs_len = self.suffixes.lcs().char_len();
1518 lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1519 }
1520
1521 fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
1522 let ro = ro.clone();
1523 Box::new(Pool::new(Box::new(move || {
1524 AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
1525 })))
1526 }
1527}
1528
1529#[derive(Clone, Copy, Debug)]
1530enum MatchType {
1531 /// A single or multiple literal search. This is only used when the regex
1532 /// can be decomposed into a literal search.
1533 #[cfg(feature = "perf-literal")]
1534 Literal(MatchLiteralType),
1535 /// A normal DFA search.
1536 #[cfg(feature = "perf-dfa")]
1537 Dfa,
1538 /// A reverse DFA search starting from the end of a haystack.
1539 #[cfg(feature = "perf-dfa")]
1540 DfaAnchoredReverse,
1541 /// A reverse DFA search with suffix literal scanning.
1542 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1543 DfaSuffix,
1544 /// Use the DFA on two or more regular expressions.
1545 #[cfg(feature = "perf-dfa")]
1546 DfaMany,
1547 /// An NFA variant.
1548 Nfa(MatchNfaType),
1549 /// No match is ever possible, so don't ever try to search.
1550 Nothing,
1551}
1552
1553#[derive(Clone, Copy, Debug)]
1554#[cfg(feature = "perf-literal")]
1555enum MatchLiteralType {
1556 /// Match literals anywhere in text.
1557 Unanchored,
1558 /// Match literals only at the start of text.
1559 AnchoredStart,
1560 /// Match literals only at the end of text.
1561 AnchoredEnd,
1562 /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1563 /// ExecReadOnly.
1564 AhoCorasick,
1565}
1566
1567#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1568enum MatchNfaType {
1569 /// Choose between Backtrack and PikeVM.
1570 Auto,
1571 /// NFA bounded backtracking.
1572 ///
1573 /// (This is only set by tests, since it never makes sense to always want
1574 /// backtracking.)
1575 Backtrack,
1576 /// The Pike VM.
1577 ///
1578 /// (This is only set by tests, since it never makes sense to always want
1579 /// the Pike VM.)
1580 PikeVM,
1581}
1582
1583/// `ProgramCache` maintains reusable allocations for each matching engine
1584/// available to a particular program.
1585///
1586/// We declare this as unwind safe since it's a cache that's only used for
1587/// performance purposes. If a panic occurs, it is (or should be) always safe
1588/// to continue using the same regex object.
1589pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
1590
1591#[derive(Debug)]
1592pub struct ProgramCacheInner {
1593 pub pikevm: pikevm::Cache,
1594 pub backtrack: backtrack::Cache,
1595 #[cfg(feature = "perf-dfa")]
1596 pub dfa: dfa::Cache,
1597 #[cfg(feature = "perf-dfa")]
1598 pub dfa_reverse: dfa::Cache,
1599}
1600
1601impl ProgramCacheInner {
1602 fn new(ro: &ExecReadOnly) -> Self {
1603 ProgramCacheInner {
1604 pikevm: pikevm::Cache::new(&ro.nfa),
1605 backtrack: backtrack::Cache::new(&ro.nfa),
1606 #[cfg(feature = "perf-dfa")]
1607 dfa: dfa::Cache::new(&ro.dfa),
1608 #[cfg(feature = "perf-dfa")]
1609 dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1610 }
1611 }
1612}
1613
1614/// Alternation literals checks if the given HIR is a simple alternation of
1615/// literals, and if so, returns them. Otherwise, this returns None.
1616#[cfg(feature = "perf-literal")]
1617fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1618 use regex_syntax::hir::{HirKind, Literal};
1619
1620 // This is pretty hacky, but basically, if `is_alternation_literal` is
1621 // true, then we can make several assumptions about the structure of our
1622 // HIR. This is what justifies the `unreachable!` statements below.
1623 //
1624 // This code should be refactored once we overhaul this crate's
1625 // optimization pipeline, because this is a terribly inflexible way to go
1626 // about things.
1627
1628 if !expr.properties().is_alternation_literal() {
1629 return None;
1630 }
1631 let alts = match *expr.kind() {
1632 HirKind::Alternation(ref alts) => alts,
1633 _ => return None, // one literal isn't worth it
1634 };
1635
1636 let mut lits = vec![];
1637 for alt in alts {
1638 let mut lit = vec![];
1639 match *alt.kind() {
1640 HirKind::Literal(Literal(ref bytes)) => {
1641 lit.extend_from_slice(bytes)
1642 }
1643 HirKind::Concat(ref exprs) => {
1644 for e in exprs {
1645 match *e.kind() {
1646 HirKind::Literal(Literal(ref bytes)) => {
1647 lit.extend_from_slice(bytes);
1648 }
1649 _ => unreachable!("expected literal, got {:?}", e),
1650 }
1651 }
1652 }
1653 _ => unreachable!("expected literal or concat, got {:?}", alt),
1654 }
1655 lits.push(lit);
1656 }
1657 Some(lits)
1658}
1659
1660#[cfg(not(feature = "perf-literal"))]
1661fn literal_analysis(_: &Hir) -> (literal::Seq, literal::Seq) {
1662 (literal::Seq::infinite(), literal::Seq::infinite())
1663}
1664
1665#[cfg(feature = "perf-literal")]
1666fn literal_analysis(expr: &Hir) -> (literal::Seq, literal::Seq) {
1667 const ATTEMPTS: [(usize, usize); 3] = [(5, 50), (4, 30), (3, 20)];
1668
1669 let mut prefixes = literal::Extractor::new()
1670 .kind(literal::ExtractKind::Prefix)
1671 .extract(expr);
1672 for (keep, limit) in ATTEMPTS {
1673 let len = match prefixes.len() {
1674 None => break,
1675 Some(len) => len,
1676 };
1677 if len <= limit {
1678 break;
1679 }
1680 prefixes.keep_first_bytes(keep);
1681 prefixes.minimize_by_preference();
1682 }
1683
1684 let mut suffixes = literal::Extractor::new()
1685 .kind(literal::ExtractKind::Suffix)
1686 .extract(expr);
1687 for (keep, limit) in ATTEMPTS {
1688 let len = match suffixes.len() {
1689 None => break,
1690 Some(len) => len,
1691 };
1692 if len <= limit {
1693 break;
1694 }
1695 suffixes.keep_last_bytes(keep);
1696 suffixes.minimize_by_preference();
1697 }
1698
1699 (prefixes, suffixes)
1700}
1701
1702#[cfg(test)]
1703mod test {
1704 #[test]
1705 fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1706 use crate::internal::ExecBuilder;
1707
1708 let backtrack_bytes_re = ExecBuilder::new("^S")
1709 .bounded_backtracking()
1710 .only_utf8(false)
1711 .build()
1712 .map(|exec| exec.into_byte_regex())
1713 .map_err(|err| format!("{}", err))
1714 .unwrap();
1715
1716 let default_bytes_re = ExecBuilder::new("^S")
1717 .only_utf8(false)
1718 .build()
1719 .map(|exec| exec.into_byte_regex())
1720 .map_err(|err| format!("{}", err))
1721 .unwrap();
1722
1723 let input = vec![83, 83];
1724
1725 let s1 = backtrack_bytes_re.split(&input);
1726 let s2 = default_bytes_re.split(&input);
1727 for (chunk1, chunk2) in s1.zip(s2) {
1728 assert_eq!(chunk1, chunk2);
1729 }
1730 }
1731
1732 #[test]
1733 fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1734 use crate::internal::ExecBuilder;
1735
1736 let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1737 .bounded_backtracking()
1738 .bytes(true)
1739 .build()
1740 .map(|exec| exec.into_regex())
1741 .map_err(|err| format!("{}", err))
1742 .unwrap();
1743
1744 let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1745 .bytes(true)
1746 .build()
1747 .map(|exec| exec.into_regex())
1748 .map_err(|err| format!("{}", err))
1749 .unwrap();
1750
1751 let input = "**";
1752
1753 let s1 = backtrack_bytes_re.split(input);
1754 let s2 = default_bytes_re.split(input);
1755 for (chunk1, chunk2) in s1.zip(s2) {
1756 assert_eq!(chunk1, chunk2);
1757 }
1758 }
1759}
1760