use core::{
    fmt::Debug,
    panic::{RefUnwindSafe, UnwindSafe},
};

use alloc::sync::Arc;

use regex_syntax::hir::{literal, Hir};

use crate::{
    meta::{
        error::{BuildError, RetryError, RetryFailError, RetryQuadraticError},
        regex::{Cache, RegexInfo},
        reverse_inner, wrappers,
    },
    nfa::thompson::{self, WhichCaptures, NFA},
    util::{
        captures::{Captures, GroupInfo},
        look::LookMatcher,
        prefilter::{self, Prefilter, PrefilterI},
        primitives::{NonMaxUsize, PatternID},
        search::{Anchored, HalfMatch, Input, Match, MatchKind, PatternSet},
    },
};

/// A trait that represents a single meta strategy. Its main utility is in
/// providing a way to do dynamic dispatch over a few choices.
///
/// Why dynamic dispatch? I actually don't have a super compelling reason, and
/// importantly, I have not benchmarked it with the main alternative: an enum.
/// I went with dynamic dispatch initially because the regex engine search code
/// really can't be inlined into caller code in most cases because it's just
/// too big. In other words, it is already expected that every regex search
/// will entail at least the cost of a function call.
///
/// I do wonder whether using enums would result in better codegen overall
/// though. It's a worthwhile experiment to try. Probably the most interesting
/// benchmark to run in such a case would be one with a high match count. That
/// is, a benchmark to test the overall latency of a search call.
pub(super) trait Strategy:
    Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static
{
    fn group_info(&self) -> &GroupInfo;

    fn create_cache(&self) -> Cache;

    fn reset_cache(&self, cache: &mut Cache);

    fn is_accelerated(&self) -> bool;

    fn memory_usage(&self) -> usize;

    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match>;

    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch>;

    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool;

    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID>;

    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    );
}

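// Selects the strategy to use for the given regexes. The general order of
// preference below is: try to bypass the regex engine entirely with a literal
// searcher (`Pre`), otherwise build the `Core` engines and then try to layer
// the reverse anchored, reverse suffix and reverse inner optimizations on top
// of them, in that order. (This is just a summary of the routine below; see
// the inline comments for the precise conditions governing each case.)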
pub(super) fn new(
    info: &RegexInfo,
    hirs: &[&Hir],
) -> Result<Arc<dyn Strategy>, BuildError> {
    // At this point, we're committed to a regex engine of some kind. So pull
    // out a prefilter if we can, which will feed to each of the constituent
    // regex engines.
    let pre = if info.is_always_anchored_start() {
        // PERF: I'm not sure we necessarily want to do this... We may want to
        // run a prefilter for quickly rejecting in some cases. The problem
        // is that anchored searches overlap quite a bit with the use case
        // of "run a regex on every line to extract data." In that case, the
        // regex always matches, so running a prefilter doesn't really help us
        // there. The main place where a prefilter helps in an anchored search
        // is if the anchored search is not expected to match frequently. That
        // is, the prefilter gives us a way to possibly reject a haystack very
        // quickly.
        //
        // Maybe we should use a prefilter, but only for longer haystacks?
        // Or maybe we should only use a prefilter when we think it's "fast"?
        //
        // Interestingly, I think we currently lack the infrastructure for
        // disabling a prefilter based on haystack length. That would probably
        // need to be a new 'Input' option. (Interestingly, an 'Input' used to
        // carry a 'Prefilter' with it, but I moved away from that.)
        debug!("skipping literal extraction since regex is anchored");
        None
    } else if let Some(pre) = info.config().get_prefilter() {
        debug!(
            "skipping literal extraction since the caller provided a prefilter"
        );
        Some(pre.clone())
    } else if info.config().get_auto_prefilter() {
        let kind = info.config().get_match_kind();
        let prefixes = crate::util::prefilter::prefixes(kind, hirs);
        // If we can build a full `Strategy` from just the extracted prefixes,
        // then we can short-circuit and avoid building a regex engine at all.
        if let Some(pre) = Pre::from_prefixes(info, &prefixes) {
            debug!(
                "found that the regex can be broken down to a literal \
                 search, avoiding the regex engine entirely",
            );
            return Ok(pre);
        }
        // This now attempts another short-circuit of the regex engine: if we
        // have a huge alternation of just plain literals, then we can just use
        // Aho-Corasick for that and avoid the regex engine entirely.
        //
        // You might think this case would just be handled by
        // `Pre::from_prefixes`, but that technique relies on heuristic literal
        // extraction from the corresponding `Hir`. That works, but part of
        // that heuristic limits the size and number of literals returned. This
        // case will specifically handle patterns with very large alternations.
        //
        // One wonders if we should just roll this into our heuristic literal
        // extraction, and then I think this case could disappear entirely.
        if let Some(pre) = Pre::from_alternation_literals(info, hirs) {
            debug!(
                "found plain alternation of literals, \
                 avoiding regex engine entirely and using Aho-Corasick"
            );
            return Ok(pre);
        }
        prefixes.literals().and_then(|strings| {
            debug!(
                "creating prefilter from {} literals: {:?}",
                strings.len(),
                strings,
            );
            Prefilter::new(kind, strings)
        })
    } else {
        debug!("skipping literal extraction since prefilters were disabled");
        None
    };
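    // To summarize the cases above with a few illustrative examples: a
    // pattern like 'foo|bar|quux' may already have been turned into a 'Pre'
    // strategy and returned, a plain alternation of very many literals may
    // have been handed off to Aho-Corasick, and otherwise 'pre' now holds an
    // optional prefilter (built from the extracted prefixes or provided by
    // the caller) that the engines built below will use.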
    let mut core = Core::new(info.clone(), pre.clone(), hirs)?;
    // Now that we have our core regex engines built, there are a few cases
    // where we can do a little bit better than just a normal "search forward
    // and maybe use a prefilter when in a start state." However, these cases
    // may not always work or otherwise build on top of the Core searcher.
    // For example, the reverse anchored optimization seems like it might
    // always work, but only the DFAs support reverse searching and the DFAs
    // might give up or quit for reasons. If we had, e.g., a PikeVM that
    // supported reverse searching, then we could avoid building a full Core
    // engine for this case.
    core = match ReverseAnchored::new(core) {
        Err(core) => core,
        Ok(ra) => {
            debug!("using reverse anchored strategy");
            return Ok(Arc::new(ra));
        }
    };
    core = match ReverseSuffix::new(core, hirs) {
        Err(core) => core,
        Ok(rs) => {
            debug!("using reverse suffix strategy");
            return Ok(Arc::new(rs));
        }
    };
    core = match ReverseInner::new(core, hirs) {
        Err(core) => core,
        Ok(ri) => {
            debug!("using reverse inner strategy");
            return Ok(Arc::new(ri));
        }
    };
    debug!("using core strategy");
    Ok(Arc::new(core))
}

#[derive(Clone, Debug)]
struct Pre<P> {
    pre: P,
    group_info: GroupInfo,
}

impl<P: PrefilterI> Pre<P> {
    fn new(pre: P) -> Arc<dyn Strategy> {
        // The only thing we support when we use prefilters directly as a
        // strategy is the start and end of the overall match for a single
        // pattern. In other words, exactly one implicit capturing group. Which
        // is exactly what we use here for a GroupInfo.
        let group_info = GroupInfo::new([[None::<&str>]]).unwrap();
        Arc::new(Pre { pre, group_info })
    }
}

// This is a little weird, but we don't actually care about the type parameter
// here because we're selecting which underlying prefilter to use. So we just
// define it on an arbitrary type.
impl Pre<()> {
    /// Given a sequence of prefixes, attempt to return a full `Strategy` using
    /// just the prefixes.
    ///
    /// Basically, this occurs when the prefixes given are not just prefixes,
    /// but an enumeration of the entire language matched by the regular
    /// expression.
    ///
    /// A number of other conditions need to be true too. For example, there
    /// can be only one pattern, the number of explicit capture groups is 0, no
    /// look-around assertions and so on.
    ///
    /// Note that this ignores `Config::get_auto_prefilter` because if this
    /// returns something, then it isn't a prefilter but a matcher itself.
    /// Therefore, it shouldn't suffer from the problems typical to prefilters
    /// (such as a high false positive rate).
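    ///
    /// As an illustration of the conditions described above: a pattern like
    /// `foo|bar|quux` can be handled here, but `foo(bar)` cannot (it has an
    /// explicit capture group) and neither can `foo\bquux` (it has a
    /// look-around assertion).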
    fn from_prefixes(
        info: &RegexInfo,
        prefixes: &literal::Seq,
    ) -> Option<Arc<dyn Strategy>> {
        let kind = info.config().get_match_kind();
        // Check to see if our prefixes are exact, which means we might be
        // able to bypass the regex engine entirely and just rely on literal
        // searches.
        if !prefixes.is_exact() {
            return None;
        }
        // We also require that we have a single regex pattern. Namely,
        // we reuse the prefilter infrastructure to implement search and
        // prefilters only report spans. Prefilters don't know about pattern
        // IDs. The multi-regex case isn't a lost cause, we might still use
        // Aho-Corasick and we might still just use a regular prefilter, but
        // that's done below.
        if info.pattern_len() != 1 {
            return None;
        }
        // We can't have any capture groups either. The literal engines don't
        // know how to deal with things like '(foo)(bar)'. In that case, a
        // prefilter will just be used and then the regex engine will resolve
        // the capture groups.
        if info.props()[0].explicit_captures_len() != 0 {
            return None;
        }
        // We also require that it has zero look-around assertions. Namely,
        // literal extraction treats look-around assertions as if they match
        // *every* empty string. But of course, that isn't true. So for
        // example, 'foo\bquux' never matches anything, but 'fooquux' is
        // extracted from that as an exact literal. Such cases should just run
        // the regex engine. 'fooquux' will be used as a normal prefilter, and
        // then the regex engine will try to look for an actual match.
        if !info.props()[0].look_set().is_empty() {
            return None;
        }
        // Finally, currently, our prefilters are all oriented around
        // leftmost-first match semantics, so don't try to use them if the
        // caller asked for anything else.
        if kind != MatchKind::LeftmostFirst {
            return None;
        }
        // The above seems like a lot of requirements to meet, but it applies
        // to a lot of cases. 'foo', '[abc][123]' and 'foo|bar|quux' all meet
        // the above criteria, for example.
        //
        // Note that this is effectively a latency optimization. If we didn't
        // do this, then the extracted literals would still get bundled into
        // a prefilter, and every regex engine capable of running unanchored
        // searches supports prefilters. So this optimization merely sidesteps
        // having to run the regex engine at all to confirm the match. Thus, it
        // decreases the latency of a match.
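        //
        // For a (purely illustrative) example of that latency win: with the
        // pattern 'foo|bar|quux', a search of the haystack "xyzzy bar" can be
        // answered with the span 6..9 directly by the literal searcher,
        // without ever entering (or even constructing) a regex engine.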

        // OK because we know the set is exact and thus finite.
        let prefixes = prefixes.literals().unwrap();
        debug!(
            "trying to bypass regex engine by creating \
             prefilter from {} literals: {:?}",
            prefixes.len(),
            prefixes,
        );
        let choice = match prefilter::Choice::new(kind, prefixes) {
            Some(choice) => choice,
            None => {
                debug!(
                    "regex bypass failed because no prefilter could be built"
                );
                return None;
            }
        };
        let strat: Arc<dyn Strategy> = match choice {
            prefilter::Choice::Memchr(pre) => Pre::new(pre),
            prefilter::Choice::Memchr2(pre) => Pre::new(pre),
            prefilter::Choice::Memchr3(pre) => Pre::new(pre),
            prefilter::Choice::Memmem(pre) => Pre::new(pre),
            prefilter::Choice::Teddy(pre) => Pre::new(pre),
            prefilter::Choice::ByteSet(pre) => Pre::new(pre),
            prefilter::Choice::AhoCorasick(pre) => Pre::new(pre),
        };
        Some(strat)
    }

    /// Attempts to extract an alternation of literals, and if it's deemed
    /// worth doing, returns an Aho-Corasick prefilter as a strategy.
    ///
    /// And currently, this only returns something when 'hirs.len() == 1'. This
    /// could in theory do something if there are multiple HIRs where all of
    /// them are alternation of literals, but I haven't had the time to go down
    /// that path yet.
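    ///
    /// This is what handles, for example, a single pattern consisting of a
    /// plain alternation with a very large number of literals, where the
    /// heuristic prefix extraction used by `Pre::from_prefixes` would have
    /// given up because of its limits on the size and number of literals.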
    fn from_alternation_literals(
        info: &RegexInfo,
        hirs: &[&Hir],
    ) -> Option<Arc<dyn Strategy>> {
        use crate::util::prefilter::AhoCorasick;

        let lits = crate::meta::literal::alternation_literals(info, hirs)?;
        let ac = AhoCorasick::new(MatchKind::LeftmostFirst, &lits)?;
        Some(Pre::new(ac))
    }
}

// This implements Strategy for anything that implements PrefilterI.
//
// Note that this must only be used for regexes of length 1. Multi-regexes
// don't work here. The prefilter interface only provides the span of a match
// and not the pattern ID. (I did consider making it more expressive, but I
// couldn't figure out how to tie everything together elegantly.) Thus, so long
// as the regex only contains one pattern, we can simply assume that a match
// corresponds to PatternID::ZERO. And indeed, that's what we do here.
//
// In practice, since this impl is used to report matches directly and thus
// completely bypasses the regex engine, we only wind up using this under the
// following restrictions:
//
// * There must be only one pattern. As explained above.
// * The literal sequence must be finite and only contain exact literals.
// * There must not be any look-around assertions. If there are, the literals
// extracted might be exact, but a match doesn't necessarily imply an overall
// match. As a trivial example, 'foo\bbar' does not match 'foobar'.
// * The pattern must not have any explicit capturing groups. If it does, the
// caller might expect them to be resolved. e.g., 'foo(bar)'.
//
// So when all of those things are true, we use a prefilter directly as a
// strategy.
//
// In the case where the number of patterns is more than 1, we don't use this
// but do use a special Aho-Corasick strategy if all of the regexes are just
// simple literals or alternations of literals. (We also use the Aho-Corasick
// strategy when len(patterns)==1 if the number of literals is large. In that
// case, literal extraction gives up and will return an infinite set.)
impl<P: PrefilterI> Strategy for Pre<P> {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        &self.group_info
    }

    fn create_cache(&self) -> Cache {
        Cache {
            capmatches: Captures::all(self.group_info().clone()),
            pikevm: wrappers::PikeVMCache::none(),
            backtrack: wrappers::BoundedBacktrackerCache::none(),
            onepass: wrappers::OnePassCache::none(),
            hybrid: wrappers::HybridCache::none(),
            revhybrid: wrappers::ReverseHybridCache::none(),
        }
    }

    fn reset_cache(&self, _cache: &mut Cache) {}

    fn is_accelerated(&self) -> bool {
        self.pre.is_fast()
    }

    fn memory_usage(&self) -> usize {
        self.pre.memory_usage()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
        if input.is_done() {
            return None;
        }
        if input.get_anchored().is_anchored() {
            return self
                .pre
                .prefix(input.haystack(), input.get_span())
                .map(|sp| Match::new(PatternID::ZERO, sp));
        }
        self.pre
            .find(input.haystack(), input.get_span())
            .map(|sp| Match::new(PatternID::ZERO, sp))
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end()))
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        self.search(cache, input).is_some()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
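        // The Pre strategy reports exactly one pattern with only its implicit
        // capturing group (see Pre::new above), so slots 0 and 1 correspond
        // to the start and end of the overall match. Any additional slots the
        // caller may have provided are left untouched.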
        let m = self.search(cache, input)?;
        if let Some(slot) = slots.get_mut(0) {
            *slot = NonMaxUsize::new(m.start());
        }
        if let Some(slot) = slots.get_mut(1) {
            *slot = NonMaxUsize::new(m.end());
        }
        Some(m.pattern())
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        if self.search(cache, input).is_some() {
            patset.insert(PatternID::ZERO);
        }
    }
}

#[derive(Debug)]
struct Core {
    info: RegexInfo,
    pre: Option<Prefilter>,
    nfa: NFA,
    nfarev: Option<NFA>,
    pikevm: wrappers::PikeVM,
    backtrack: wrappers::BoundedBacktracker,
    onepass: wrappers::OnePass,
    hybrid: wrappers::Hybrid,
    dfa: wrappers::DFA,
}

impl Core {
    fn new(
        info: RegexInfo,
        pre: Option<Prefilter>,
        hirs: &[&Hir],
    ) -> Result<Core, BuildError> {
        let mut lookm = LookMatcher::new();
        lookm.set_line_terminator(info.config().get_line_terminator());
        let thompson_config = thompson::Config::new()
            .utf8(info.config().get_utf8_empty())
            .nfa_size_limit(info.config().get_nfa_size_limit())
            .shrink(false)
            .which_captures(info.config().get_which_captures())
            .look_matcher(lookm);
        let nfa = thompson::Compiler::new()
            .configure(thompson_config.clone())
            .build_many_from_hir(hirs)
            .map_err(BuildError::nfa)?;
        // It's possible for the PikeVM or the BB to fail to build, even though
        // at this point, we already have a full NFA in hand. They can fail
        // when a Unicode word boundary is used but where Unicode word boundary
        // support is disabled at compile time, thus making it impossible to
        // match. (Construction can also fail if the NFA was compiled without
        // captures, but we always enable that above.)
        let pikevm = wrappers::PikeVM::new(&info, pre.clone(), &nfa)?;
        let backtrack =
            wrappers::BoundedBacktracker::new(&info, pre.clone(), &nfa)?;
        // The onepass engine can of course fail to build, but we expect it to
        // fail in many cases because it is an optimization that doesn't apply
        // to all regexes. The 'OnePass' wrapper encapsulates this failure (and
        // logs a message if it occurs).
        let onepass = wrappers::OnePass::new(&info, &nfa);
        // We try to encapsulate whether a particular regex engine should be
        // used within each respective wrapper, but the DFAs need a reverse
        // NFA to build themselves, and we really do not want to build a
        // reverse NFA if we know we aren't going to use the lazy DFA. So we
        // do a config check up front, which is in practice the only way we
        // won't try to use the DFA.
        let (nfarev, hybrid, dfa) =
            if !info.config().get_hybrid() && !info.config().get_dfa() {
                (None, wrappers::Hybrid::none(), wrappers::DFA::none())
            } else {
                // FIXME: Technically, we don't quite yet KNOW that we need
                // a reverse NFA. It's possible for the DFAs below to both
                // fail to build just based on the forward NFA. In which case,
                // building the reverse NFA was totally wasted work. But...
                // fixing this requires breaking DFA construction apart into
                // two pieces: one for the forward part and another for the
                // reverse part. Quite annoying. Making it worse, when building
                // both DFAs fails, it's quite likely that the NFA is large and
                // that it will take quite some time to build the reverse NFA
                // too. So... it's really probably worth it to do this!
                let nfarev = thompson::Compiler::new()
                    // Currently, reverse NFAs don't support capturing groups,
                    // so we MUST disable them. But even if we didn't have to,
                    // we would, because nothing in this crate does anything
                    // useful with capturing groups in reverse. And of course,
                    // the lazy DFA ignores capturing groups in all cases.
                    .configure(
                        thompson_config
                            .clone()
                            .which_captures(WhichCaptures::None)
                            .reverse(true),
                    )
                    .build_many_from_hir(hirs)
                    .map_err(BuildError::nfa)?;
                let dfa = if !info.config().get_dfa() {
                    wrappers::DFA::none()
                } else {
                    wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev)
                };
                let hybrid = if !info.config().get_hybrid() {
                    wrappers::Hybrid::none()
                } else if dfa.is_some() {
                    debug!("skipping lazy DFA because we have a full DFA");
                    wrappers::Hybrid::none()
                } else {
                    wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev)
                };
                (Some(nfarev), hybrid, dfa)
            };
        Ok(Core {
            info,
            pre,
            nfa,
            nfarev,
            pikevm,
            backtrack,
            onepass,
            hybrid,
            dfa,
        })
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_mayfail(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<Result<Option<Match>, RetryFailError>> {
        if let Some(e) = self.dfa.get(input) {
            trace!("using full DFA for search at {:?}", input.get_span());
            Some(e.try_search(input))
        } else if let Some(e) = self.hybrid.get(input) {
            trace!("using lazy DFA for search at {:?}", input.get_span());
            Some(e.try_search(&mut cache.hybrid, input))
        } else {
            None
        }
    }

    fn search_nofail(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<Match> {
        let caps = &mut cache.capmatches;
        caps.set_pattern(None);
        // We manually inline 'try_search_slots_nofail' here because we need to
        // borrow from 'cache.capmatches' in this method, but if we do, then
        // we can't pass 'cache' wholesale to 'try_slots_no_hybrid'. It's a
        // classic example of how the borrow checker inhibits decomposition.
        // There are of course work-arounds (more types and/or interior
        // mutability), but that's more annoying than this IMO.
        let pid = if let Some(ref e) = self.onepass.get(input) {
            trace!("using OnePass for search at {:?}", input.get_span());
            e.search_slots(&mut cache.onepass, input, caps.slots_mut())
        } else if let Some(ref e) = self.backtrack.get(input) {
            trace!(
                "using BoundedBacktracker for search at {:?}",
                input.get_span()
            );
            e.search_slots(&mut cache.backtrack, input, caps.slots_mut())
        } else {
            trace!("using PikeVM for search at {:?}", input.get_span());
            let e = self.pikevm.get();
            e.search_slots(&mut cache.pikevm, input, caps.slots_mut())
        };
        caps.set_pattern(pid);
        caps.get_match()
    }

    fn search_half_nofail(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        // Only the lazy/full DFA returns half-matches, since the DFA requires
        // a reverse scan to find the start position. These fallback regex
        // engines can find the start and end in a single pass, so we just do
        // that and throw away the start offset to conform to the API.
        let m = self.search_nofail(cache, input)?;
        Some(HalfMatch::new(m.pattern(), m.end()))
    }

    fn search_slots_nofail(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if let Some(ref e) = self.onepass.get(input) {
            trace!(
                "using OnePass for capture search at {:?}",
                input.get_span()
            );
            e.search_slots(&mut cache.onepass, input, slots)
        } else if let Some(ref e) = self.backtrack.get(input) {
            trace!(
                "using BoundedBacktracker for capture search at {:?}",
                input.get_span()
            );
            e.search_slots(&mut cache.backtrack, input, slots)
        } else {
            trace!(
                "using PikeVM for capture search at {:?}",
                input.get_span()
            );
            let e = self.pikevm.get();
            e.search_slots(&mut cache.pikevm, input, slots)
        }
    }

    fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if let Some(ref e) = self.onepass.get(input) {
            trace!(
                "using OnePass for is-match search at {:?}",
                input.get_span()
            );
            e.search_slots(&mut cache.onepass, input, &mut []).is_some()
        } else if let Some(ref e) = self.backtrack.get(input) {
            trace!(
                "using BoundedBacktracker for is-match search at {:?}",
                input.get_span()
            );
            e.is_match(&mut cache.backtrack, input)
        } else {
            trace!(
                "using PikeVM for is-match search at {:?}",
                input.get_span()
            );
            let e = self.pikevm.get();
            e.is_match(&mut cache.pikevm, input)
        }
    }

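    /// Returns true if the given number of slots means the caller is asking
    /// for the offsets of explicit capturing groups, and not just the overall
    /// match span. For example (assuming a single pattern), the implicit slot
    /// length is 2, so a request for only 2 slots can be satisfied by any
    /// engine that reports the overall match bounds.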
    fn is_capture_search_needed(&self, slots_len: usize) -> bool {
        slots_len > self.nfa.group_info().implicit_slot_len()
    }
}

impl Strategy for Core {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        self.nfa.group_info()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn create_cache(&self) -> Cache {
        Cache {
            capmatches: Captures::all(self.group_info().clone()),
            pikevm: self.pikevm.create_cache(),
            backtrack: self.backtrack.create_cache(),
            onepass: self.onepass.create_cache(),
            hybrid: self.hybrid.create_cache(),
            revhybrid: wrappers::ReverseHybridCache::none(),
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn reset_cache(&self, cache: &mut Cache) {
        cache.pikevm.reset(&self.pikevm);
        cache.backtrack.reset(&self.backtrack);
        cache.onepass.reset(&self.onepass);
        cache.hybrid.reset(&self.hybrid);
    }

    fn is_accelerated(&self) -> bool {
        self.pre.as_ref().map_or(false, |pre| pre.is_fast())
    }

    fn memory_usage(&self) -> usize {
        self.info.memory_usage()
            + self.pre.as_ref().map_or(0, |pre| pre.memory_usage())
            + self.nfa.memory_usage()
            + self.nfarev.as_ref().map_or(0, |nfa| nfa.memory_usage())
            + self.onepass.memory_usage()
            + self.dfa.memory_usage()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
        // We manually inline try_search_mayfail here because letting the
        // compiler do it seems to produce pretty crappy codegen.
        return if let Some(e) = self.dfa.get(input) {
            trace!("using full DFA for full search at {:?}", input.get_span());
            match e.try_search(input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("full DFA search failed: {}", _err);
                    self.search_nofail(cache, input)
                }
            }
        } else if let Some(e) = self.hybrid.get(input) {
            trace!("using lazy DFA for full search at {:?}", input.get_span());
            match e.try_search(&mut cache.hybrid, input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("lazy DFA search failed: {}", _err);
                    self.search_nofail(cache, input)
                }
            }
        } else {
            self.search_nofail(cache, input)
        };
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        // The main difference with 'search' is that if we're using a DFA, we
        // can use a single forward scan without needing to run the reverse
        // DFA.
        if let Some(e) = self.dfa.get(input) {
            trace!("using full DFA for half search at {:?}", input.get_span());
            match e.try_search_half_fwd(input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("full DFA half search failed: {}", _err);
                    self.search_half_nofail(cache, input)
                }
            }
        } else if let Some(e) = self.hybrid.get(input) {
            trace!("using lazy DFA for half search at {:?}", input.get_span());
            match e.try_search_half_fwd(&mut cache.hybrid, input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("lazy DFA half search failed: {}", _err);
                    self.search_half_nofail(cache, input)
                }
            }
        } else {
            self.search_half_nofail(cache, input)
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if let Some(e) = self.dfa.get(input) {
            trace!(
                "using full DFA for is-match search at {:?}",
                input.get_span()
            );
            match e.try_search_half_fwd(input) {
                Ok(x) => x.is_some(),
                Err(_err) => {
                    trace!("full DFA half search failed: {}", _err);
                    self.is_match_nofail(cache, input)
                }
            }
        } else if let Some(e) = self.hybrid.get(input) {
            trace!(
                "using lazy DFA for is-match search at {:?}",
                input.get_span()
            );
            match e.try_search_half_fwd(&mut cache.hybrid, input) {
                Ok(x) => x.is_some(),
                Err(_err) => {
                    trace!("lazy DFA half search failed: {}", _err);
                    self.is_match_nofail(cache, input)
                }
            }
        } else {
            self.is_match_nofail(cache, input)
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        // Even if the regex has explicit capture groups, if the caller didn't
        // provide any explicit slots, then it doesn't make sense to try and do
        // extra work to get offsets for those slots. Ideally the caller should
        // realize this and not call this routine in the first place, but alas,
        // we try to save the caller from themselves if they do.
        if !self.is_capture_search_needed(slots.len()) {
            trace!("asked for slots unnecessarily, trying fast path");
            let m = self.search(cache, input)?;
            copy_match_to_slots(m, slots);
            return Some(m.pattern());
        }
        // If the onepass DFA is available for this search (which only happens
        // when it's anchored), then skip running a fallible DFA. The onepass
        // DFA isn't as fast as a full or lazy DFA, but it is typically quite
        // a bit faster than the backtracker or the PikeVM. So it isn't as
        // advantageous to try and do a full/lazy DFA scan first.
        //
        // We still theorize that it's better to do a full/lazy DFA scan, even
        // when it's anchored, because it's usually much faster and permits us
        // to say "no match" much more quickly. This does hurt the case of,
        // say, parsing each line in a log file into capture groups, because
        // in that case, the line always matches. So the lazy DFA scan is
        // usually just wasted work. But, the lazy DFA is usually quite fast
        // and doesn't cost too much here.
        if self.onepass.get(&input).is_some() {
            return self.search_slots_nofail(cache, &input, slots);
        }
        let m = match self.try_search_mayfail(cache, input) {
            Some(Ok(Some(m))) => m,
            Some(Ok(None)) => return None,
            Some(Err(_err)) => {
                trace!("fast capture search failed: {}", _err);
                return self.search_slots_nofail(cache, input, slots);
            }
            None => {
                return self.search_slots_nofail(cache, input, slots);
            }
        };
        // At this point, now that we've found the bounds of the
        // match, we need to re-run something that can resolve
        // capturing groups. But we only need to run it on the
        // match bounds and not the entire haystack.
        trace!(
            "match found at {}..{} in capture search, \
             using another engine to find captures",
            m.start(),
            m.end(),
        );
        let input = input
            .clone()
            .span(m.start()..m.end())
            .anchored(Anchored::Pattern(m.pattern()));
        Some(
            self.search_slots_nofail(cache, &input, slots)
                .expect("should find a match"),
        )
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        if let Some(e) = self.dfa.get(input) {
            trace!(
                "using full DFA for overlapping search at {:?}",
                input.get_span()
            );
            let _err = match e.try_which_overlapping_matches(input, patset) {
                Ok(()) => return,
                Err(err) => err,
            };
            trace!("fast overlapping search failed: {}", _err);
        } else if let Some(e) = self.hybrid.get(input) {
            trace!(
                "using lazy DFA for overlapping search at {:?}",
                input.get_span()
            );
            let _err = match e.try_which_overlapping_matches(
                &mut cache.hybrid,
                input,
                patset,
            ) {
                Ok(()) => {
                    return;
                }
                Err(err) => err,
            };
            trace!("fast overlapping search failed: {}", _err);
        }
        trace!(
            "using PikeVM for overlapping search at {:?}",
            input.get_span()
        );
        let e = self.pikevm.get();
        e.which_overlapping_matches(&mut cache.pikevm, input, patset)
    }
}

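// A strategy for when the regex is always anchored at the end: instead of
// scanning forward, we can run a single anchored reverse DFA scan from the
// end of the haystack to find the start of the match. (See the comments in
// 'new' and on the 'Strategy' impl below for the precise conditions under
// which this applies.)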
#[derive(Debug)]
struct ReverseAnchored {
    core: Core,
}

impl ReverseAnchored {
    fn new(core: Core) -> Result<ReverseAnchored, Core> {
        if !core.info.is_always_anchored_end() {
            debug!(
                "skipping reverse anchored optimization because \
                 the regex is not always anchored at the end"
            );
            return Err(core);
        }
        // Note that the caller can still request an anchored search even when
        // the regex isn't anchored at the start. We detect that case in the
        // search routines below and just fall back to the core engine. This
        // is fine because both searches are anchored. It's just a matter of
        // picking one. Falling back to the core engine is a little simpler,
        // since if we used the reverse anchored approach, we'd have to add an
        // extra check to ensure the match reported starts at the place where
        // the caller requested the search to start.
        if core.info.is_always_anchored_start() {
            debug!(
                "skipping reverse anchored optimization because \
                 the regex is also anchored at the start"
            );
            return Err(core);
        }
        // Only DFAs can do reverse searches (currently), so we need one of
        // them in order to do this optimization. It's possible (although
        // pretty unlikely) that we have neither and need to give up.
        if !core.hybrid.is_some() && !core.dfa.is_some() {
            debug!(
                "skipping reverse anchored optimization because \
                 we don't have a lazy DFA or a full DFA"
            );
            return Err(core);
        }
        Ok(ReverseAnchored { core })
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_anchored_rev(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Option<HalfMatch>, RetryFailError> {
        // We of course always want an anchored search. In theory, the
        // underlying regex engines should automatically enable anchored
        // searches since the regex is itself anchored, but this more clearly
        // expresses intent and is always correct.
        let input = input.clone().anchored(Anchored::Yes);
        if let Some(e) = self.core.dfa.get(&input) {
            trace!(
                "using full DFA for reverse anchored search at {:?}",
                input.get_span()
            );
            e.try_search_half_rev(&input)
        } else if let Some(e) = self.core.hybrid.get(&input) {
            trace!(
                "using lazy DFA for reverse anchored search at {:?}",
                input.get_span()
            );
            e.try_search_half_rev(&mut cache.hybrid, &input)
        } else {
            unreachable!("ReverseAnchored always has a DFA")
        }
    }
}

// Note that in this impl, we don't check that 'input.end() ==
// input.haystack().len()'. In particular, when that condition is false, a
// match is always impossible because we know that the regex is always anchored
// at the end (or else 'ReverseAnchored' won't be built). We don't check that
// here because the 'Regex' wrapper actually does that for us in all cases.
// Thus, in this impl, we can actually assume that the end position in 'input'
// is equivalent to the length of the haystack.
impl Strategy for ReverseAnchored {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        self.core.group_info()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn create_cache(&self) -> Cache {
        self.core.create_cache()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn reset_cache(&self, cache: &mut Cache) {
        self.core.reset_cache(cache);
    }

    fn is_accelerated(&self) -> bool {
        // Since this is anchored at the end, a reverse anchored search is
        // almost certainly guaranteed to result in a much faster search than
        // a standard forward search.
        true
    }

    fn memory_usage(&self) -> usize {
        self.core.memory_usage()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
        if input.get_anchored().is_anchored() {
            return self.core.search(cache, input);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.search_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm)) => {
                Some(Match::new(hm.pattern(), hm.offset()..input.end()))
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        if input.get_anchored().is_anchored() {
            return self.core.search_half(cache, input);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.search_half_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm)) => {
                // Careful here! 'try_search_half' is a *forward* search that
                // only cares about the *end* position of a match. But
                // 'hm.offset()' is actually the start of the match. So we
                // actually just throw that away here and, since we know we
                // have a match, return the only possible position at which a
                // match can occur: input.end().
                Some(HalfMatch::new(hm.pattern(), input.end()))
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if input.get_anchored().is_anchored() {
            return self.core.is_match(cache, input);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.is_match_nofail(cache, input)
            }
            Ok(None) => false,
            Ok(Some(_)) => true,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if input.get_anchored().is_anchored() {
            return self.core.search_slots(cache, input, slots);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.search_slots_nofail(cache, input, slots)
            }
            Ok(None) => None,
            Ok(Some(hm)) => {
                if !self.core.is_capture_search_needed(slots.len()) {
                    trace!("asked for slots unnecessarily, skipping captures");
                    let m = Match::new(hm.pattern(), hm.offset()..input.end());
                    copy_match_to_slots(m, slots);
                    return Some(m.pattern());
                }
                let start = hm.offset();
                let input = input
                    .clone()
                    .span(start..input.end())
                    .anchored(Anchored::Pattern(hm.pattern()));
                self.core.search_slots_nofail(cache, &input, slots)
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        // It seems like this could probably benefit from a reverse anchored
        // optimization, perhaps by doing an overlapping reverse search (which
        // the DFAs do support). I haven't given it much thought though, and
        // I'm currently focused more on the single pattern case.
        self.core.which_overlapping_matches(cache, input, patset)
    }
}

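// A strategy that uses a prefilter built from the longest common suffix of
// all patterns. When the suffix prefilter finds a candidate, an anchored
// reverse scan confirms (and finds the start of) the match, and a forward
// scan then finds its end. (This is just a summary; see 'new' and
// 'try_search_half_start' below for the details.)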
1115#[derive(Debug)]
1116struct ReverseSuffix {
1117 core: Core,
1118 pre: Prefilter,
1119}
1120
1121impl ReverseSuffix {
1122 fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseSuffix, Core> {
1123 if !core.info.config().get_auto_prefilter() {
1124 debug!(
1125 "skipping reverse suffix optimization because \
1126 automatic prefilters are disabled"
1127 );
1128 return Err(core);
1129 }
1130 // Like the reverse inner optimization, we don't do this for regexes
1131 // that are always anchored. It could lead to scanning too much, but
1132 // could say "no match" much more quickly than running the regex
1133 // engine if the initial literal scan doesn't match. With that said,
1134 // the reverse suffix optimization has lower overhead, since it only
1135 // requires a reverse scan after a literal match to confirm or reject
1136 // the match. (Although, in the case of confirmation, it then needs to
1137 // do another forward scan to find the end position.)
1138 //
1139 // Note that the caller can still request an anchored search even
1140 // when the regex isn't anchored. We detect that case in the search
1141 // routines below and just fallback to the core engine. Currently this
1142 // optimization assumes all searches are unanchored, so if we do want
1143 // to enable this optimization for anchored searches, it will need a
1144 // little work to support it.
1145 if core.info.is_always_anchored_start() {
1146 debug!(
1147 "skipping reverse suffix optimization because \
1148 the regex is always anchored at the start",
1149 );
1150 return Err(core);
1151 }
1152 // Only DFAs can do reverse searches (currently), so we need one of
1153 // them in order to do this optimization. It's possible (although
1154 // pretty unlikely) that we have neither and need to give up.
1155 if !core.hybrid.is_some() && !core.dfa.is_some() {
1156 debug!(
1157 "skipping reverse suffix optimization because \
1158 we don't have a lazy DFA or a full DFA"
1159 );
1160 return Err(core);
1161 }
1162 if core.pre.as_ref().map_or(false, |p| p.is_fast()) {
1163 debug!(
1164 "skipping reverse suffix optimization because \
1165 we already have a prefilter that we think is fast"
1166 );
1167 return Err(core);
1168 }
1169 let kind = core.info.config().get_match_kind();
1170 let suffixes = crate::util::prefilter::suffixes(kind, hirs);
1171 let lcs = match suffixes.longest_common_suffix() {
1172 None => {
1173 debug!(
1174 "skipping reverse suffix optimization because \
1175 a longest common suffix could not be found",
1176 );
1177 return Err(core);
1178 }
1179 Some(lcs) if lcs.is_empty() => {
1180 debug!(
1181 "skipping reverse suffix optimization because \
1182 the longest common suffix is the empty string",
1183 );
1184 return Err(core);
1185 }
1186 Some(lcs) => lcs,
1187 };
1188 let pre = match Prefilter::new(kind, &[lcs]) {
1189 Some(pre) => pre,
1190 None => {
1191 debug!(
1192 "skipping reverse suffix optimization because \
1193 a prefilter could not be constructed from the \
1194 longest common suffix",
1195 );
1196 return Err(core);
1197 }
1198 };
1199 if !pre.is_fast() {
1200 debug!(
1201 "skipping reverse suffix optimization because \
1202 while we have a suffix prefilter, it is not \
1203 believed to be 'fast'"
1204 );
1205 return Err(core);
1206 }
1207 Ok(ReverseSuffix { core, pre })
1208 }
1209
1210 #[cfg_attr(feature = "perf-inline", inline(always))]
1211 fn try_search_half_start(
1212 &self,
1213 cache: &mut Cache,
1214 input: &Input<'_>,
1215 ) -> Result<Option<HalfMatch>, RetryError> {
1216 let mut span = input.get_span();
1217 let mut min_start = 0;
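        // 'min_start' records the end of the previous suffix match. Each
        // reverse scan below is limited so that it gives up (with an error)
        // rather than re-scan bytes before that point, which is what avoids
        // quadratic behavior here. (See the trace messages in
        // 'try_search_half_rev_limited' below.)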
        loop {
            let litmatch = match self.pre.find(input.haystack(), span) {
                None => return Ok(None),
                Some(span) => span,
            };
            trace!("reverse suffix scan found suffix match at {:?}", litmatch);
            let revinput = input
                .clone()
                .anchored(Anchored::Yes)
                .span(input.start()..litmatch.end);
            match self
                .try_search_half_rev_limited(cache, &revinput, min_start)?
            {
                None => {
                    if span.start >= span.end {
                        break;
                    }
                    span.start = litmatch.start.checked_add(1).unwrap();
                }
                Some(hm) => return Ok(Some(hm)),
            }
            min_start = litmatch.end;
        }
        Ok(None)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_fwd(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Option<HalfMatch>, RetryFailError> {
        if let Some(e) = self.core.dfa.get(&input) {
            trace!(
                "using full DFA for forward reverse suffix search at {:?}",
                input.get_span()
            );
            e.try_search_half_fwd(&input)
        } else if let Some(e) = self.core.hybrid.get(&input) {
            trace!(
                "using lazy DFA for forward reverse suffix search at {:?}",
                input.get_span()
            );
            e.try_search_half_fwd(&mut cache.hybrid, &input)
        } else {
            unreachable!("ReverseSuffix always has a DFA")
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_rev_limited(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        min_start: usize,
    ) -> Result<Option<HalfMatch>, RetryError> {
        if let Some(e) = self.core.dfa.get(&input) {
            trace!(
                "using full DFA for reverse suffix search at {:?}, \
                 but will be stopped at {} to avoid quadratic behavior",
                input.get_span(),
                min_start,
            );
            e.try_search_half_rev_limited(&input, min_start)
        } else if let Some(e) = self.core.hybrid.get(&input) {
            trace!(
                "using lazy DFA for reverse suffix search at {:?}, \
                 but will be stopped at {} to avoid quadratic behavior",
                input.get_span(),
                min_start,
            );
            e.try_search_half_rev_limited(&mut cache.hybrid, &input, min_start)
        } else {
            unreachable!("ReverseSuffix always has a DFA")
        }
    }
}

impl Strategy for ReverseSuffix {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        self.core.group_info()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn create_cache(&self) -> Cache {
        self.core.create_cache()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn reset_cache(&self, cache: &mut Cache) {
        self.core.reset_cache(cache);
    }

    fn is_accelerated(&self) -> bool {
        self.pre.is_fast()
    }

    fn memory_usage(&self) -> usize {
        self.core.memory_usage() + self.pre.memory_usage()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
        if input.get_anchored().is_anchored() {
            return self.core.search(cache, input);
        }
        match self.try_search_half_start(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse suffix optimization failed: {}", _err);
                self.core.search(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse suffix reverse fast search failed: {}", _err);
                self.core.search_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm_start)) => {
                let fwdinput = input
                    .clone()
                    .anchored(Anchored::Pattern(hm_start.pattern()))
                    .span(hm_start.offset()..input.end());
                match self.try_search_half_fwd(cache, &fwdinput) {
                    Err(_err) => {
                        trace!(
                            "reverse suffix forward fast search failed: {}",
                            _err
                        );
                        self.core.search_nofail(cache, input)
                    }
                    Ok(None) => {
                        unreachable!(
                            "suffix match plus reverse match implies \
                             there must be a match",
                        )
                    }
                    Ok(Some(hm_end)) => Some(Match::new(
                        hm_start.pattern(),
                        hm_start.offset()..hm_end.offset(),
                    )),
                }
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        if input.get_anchored().is_anchored() {
            return self.core.search_half(cache, input);
        }
        match self.try_search_half_start(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse suffix half optimization failed: {}", _err);
                self.core.search_half(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!(
                    "reverse suffix reverse fast half search failed: {}",
                    _err
                );
                self.core.search_half_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm_start)) => {
                // This is a bit subtle. It is tempting to just stop searching
                // at this point and return a half-match with an offset
                // corresponding to where the suffix was found. But the suffix
                // match does not necessarily correspond to the end of the
                // proper leftmost-first match. Consider /[a-z]+ing/ against
                // 'tingling'. The first suffix match is the first 'ing', and
                // the /[a-z]+/ matches the 't'. So if we stopped here, then
                // we'd report 'ting' as the match. But 'tingling' is the
                // correct match because of greediness.
                let fwdinput = input
                    .clone()
                    .anchored(Anchored::Pattern(hm_start.pattern()))
                    .span(hm_start.offset()..input.end());
                match self.try_search_half_fwd(cache, &fwdinput) {
                    Err(_err) => {
                        trace!(
                            "reverse suffix forward fast search failed: {}",
                            _err
                        );
                        self.core.search_half_nofail(cache, input)
                    }
                    Ok(None) => {
                        unreachable!(
                            "suffix match plus reverse match implies \
                             there must be a match",
                        )
                    }
                    Ok(Some(hm_end)) => Some(hm_end),
                }
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if input.get_anchored().is_anchored() {
            return self.core.is_match(cache, input);
        }
        match self.try_search_half_start(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse suffix half optimization failed: {}", _err);
                self.core.is_match_nofail(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!(
                    "reverse suffix reverse fast half search failed: {}",
                    _err
                );
                self.core.is_match_nofail(cache, input)
            }
            Ok(None) => false,
            Ok(Some(_)) => true,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if input.get_anchored().is_anchored() {
            return self.core.search_slots(cache, input, slots);
        }
        if !self.core.is_capture_search_needed(slots.len()) {
            trace!("asked for slots unnecessarily, trying fast path");
            let m = self.search(cache, input)?;
            copy_match_to_slots(m, slots);
            return Some(m.pattern());
        }
        let hm_start = match self.try_search_half_start(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!(
                    "reverse suffix captures optimization failed: {}",
                    _err
                );
                return self.core.search_slots(cache, input, slots);
            }
            Err(RetryError::Fail(_err)) => {
                trace!(
                    "reverse suffix reverse fast captures search failed: {}",
                    _err
                );
                return self.core.search_slots_nofail(cache, input, slots);
            }
            Ok(None) => return None,
            Ok(Some(hm_start)) => hm_start,
        };
        trace!(
            "match found at {}..{} in capture search, \
             using another engine to find captures",
            hm_start.offset(),
            input.end(),
        );
        let start = hm_start.offset();
        let input = input
            .clone()
            .span(start..input.end())
            .anchored(Anchored::Pattern(hm_start.pattern()));
        self.core.search_slots_nofail(cache, &input, slots)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        self.core.which_overlapping_matches(cache, input, patset)
    }
}

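// A strategy that uses a prefilter for a literal found *inside* the regex
// (rather than a prefix or suffix). When the inner literal is found, a
// reverse scan with a DFA built from the portion of the regex preceding the
// literal finds the start of the match, and a forward scan with the core DFA
// finds the end. (This is just a summary; see 'new' and 'try_search_full'
// below for the details.)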
1500#[derive(Debug)]
1501struct ReverseInner {
1502 core: Core,
1503 preinner: Prefilter,
1504 nfarev: NFA,
1505 hybrid: wrappers::ReverseHybrid,
1506 dfa: wrappers::ReverseDFA,
1507}
1508
1509impl ReverseInner {
1510 fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseInner, Core> {
1511 if !core.info.config().get_auto_prefilter() {
1512 debug!(
1513 "skipping reverse inner optimization because \
1514 automatic prefilters are disabled"
1515 );
1516 return Err(core);
1517 }
1518 // Currently we hard-code the assumption of leftmost-first match
1519 // semantics. This isn't a huge deal because 'all' semantics tend to
1520 // only be used for forward overlapping searches with multiple regexes,
1521 // and this optimization only supports a single pattern at the moment.
1522 if core.info.config().get_match_kind() != MatchKind::LeftmostFirst {
1523 debug!(
1524 "skipping reverse inner optimization because \
1525 match kind is {:?} but this only supports leftmost-first",
1526 core.info.config().get_match_kind(),
1527 );
1528 return Err(core);
1529 }
1530 // It's likely that a reverse inner scan has too much overhead for it
1531 // to be worth it when the regex is anchored at the start. It is
1532 // possible for it to be quite a bit faster if the initial literal
1533 // scan fails to detect a match, in which case, we can say "no match"
1534 // very quickly. But this could be undesirable, e.g., scanning too far
1535 // or when the literal scan matches. If it matches, then confirming the
1536 // match requires a reverse scan followed by a forward scan to confirm
1537 // or reject, which is a fair bit of work.
1538 //
1539 // Note that the caller can still request an anchored search even
1540 // when the regex isn't anchored. We detect that case in the search
1541 // routines below and just fallback to the core engine. Currently this
1542 // optimization assumes all searches are unanchored, so if we do want
1543 // to enable this optimization for anchored searches, it will need a
1544 // little work to support it.
1545 if core.info.is_always_anchored_start() {
1546 debug!(
1547 "skipping reverse inner optimization because \
1548 the regex is always anchored at the start",
1549 );
1550 return Err(core);
1551 }
1552 // Only DFAs can do reverse searches (currently), so we need one of
1553 // them in order to do this optimization. It's possible (although
1554 // pretty unlikely) that we have neither and need to give up.
1555 if !core.hybrid.is_some() && !core.dfa.is_some() {
1556 debug!(
1557 "skipping reverse inner optimization because \
1558 we don't have a lazy DFA or a full DFA"
1559 );
1560 return Err(core);
1561 }
1562 if core.pre.as_ref().map_or(false, |p| p.is_fast()) {
1563 debug!(
1564 "skipping reverse inner optimization because \
1565 we already have a prefilter that we think is fast"
1566 );
1567 return Err(core);
1568 } else if core.pre.is_some() {
1569 debug!(
1570 "core engine has a prefix prefilter, but it is \
1571 probably not fast, so continuing with attempt to \
1572 use reverse inner prefilter"
1573 );
1574 }
1575 let (concat_prefix, preinner) = match reverse_inner::extract(hirs) {
1576 Some(x) => x,
1577 // N.B. the 'extract' function emits debug messages explaining
1578 // why we bailed out here.
1579 None => return Err(core),
1580 };
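        // 'concat_prefix' is the portion of the regex that precedes the inner
        // literal (as an Hir), and 'preinner' is the prefilter for the inner
        // literal itself. We compile 'concat_prefix' into a reverse NFA (and
        // DFA) below so that a hit from 'preinner' can be confirmed by
        // scanning backwards for the prefix.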
1581 debug!("building reverse NFA for prefix before inner literal");
1582 let mut lookm = LookMatcher::new();
1583 lookm.set_line_terminator(core.info.config().get_line_terminator());
1584 let thompson_config = thompson::Config::new()
1585 .reverse(true)
1586 .utf8(core.info.config().get_utf8_empty())
1587 .nfa_size_limit(core.info.config().get_nfa_size_limit())
1588 .shrink(false)
1589 .which_captures(WhichCaptures::None)
1590 .look_matcher(lookm);
1591 let result = thompson::Compiler::new()
1592 .configure(thompson_config)
1593 .build_from_hir(&concat_prefix);
1594 let nfarev = match result {
1595 Ok(nfarev) => nfarev,
1596 Err(_err) => {
1597 debug!(
1598 "skipping reverse inner optimization because the \
1599 reverse NFA failed to build: {}",
1600 _err,
1601 );
1602 return Err(core);
1603 }
1604 };
1605 debug!("building reverse DFA for prefix before inner literal");
1606 let dfa = if !core.info.config().get_dfa() {
1607 wrappers::ReverseDFA::none()
1608 } else {
1609 wrappers::ReverseDFA::new(&core.info, &nfarev)
1610 };
1611 let hybrid = if !core.info.config().get_hybrid() {
1612 wrappers::ReverseHybrid::none()
1613 } else if dfa.is_some() {
1614 debug!(
1615 "skipping lazy DFA for reverse inner optimization \
1616 because we have a full DFA"
1617 );
1618 wrappers::ReverseHybrid::none()
1619 } else {
1620 wrappers::ReverseHybrid::new(&core.info, &nfarev)
1621 };
1622 Ok(ReverseInner { core, preinner, nfarev, hybrid, dfa })
1623 }
1624
1625 #[cfg_attr(feature = "perf-inline", inline(always))]
1626 fn try_search_full(
1627 &self,
1628 cache: &mut Cache,
1629 input: &Input<'_>,
1630 ) -> Result<Option<Match>, RetryError> {
1631 let mut span = input.get_span();
1632 let mut min_match_start = 0;
1633 let mut min_pre_start = 0;
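        // A rough sketch of the loop below: repeatedly ask the inner
        // prefilter for a candidate occurrence of the inner literal, run an
        // anchored reverse scan from the candidate's start to find where a
        // match would have to begin, and then an anchored forward scan from
        // that start to find where it ends. `min_pre_start` and
        // `min_match_start` track how far previous scans have already
        // progressed; if a new scan would have to revisit text before those
        // points, we bail out with a "quadratic" retry error so the caller
        // can fall back to the core engine instead of risking O(n^2)
        // behavior.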
        loop {
            let litmatch = match self.preinner.find(input.haystack(), span) {
                None => return Ok(None),
                Some(span) => span,
            };
            if litmatch.start < min_pre_start {
                trace!(
                    "found inner prefilter match at {:?}, which starts \
                     before the end of the last forward scan at {}, \
                     quitting to avoid quadratic behavior",
                    litmatch,
                    min_pre_start,
                );
                return Err(RetryError::Quadratic(RetryQuadraticError::new()));
            }
            trace!("reverse inner scan found inner match at {:?}", litmatch);
            let revinput = input
                .clone()
                .anchored(Anchored::Yes)
                .span(input.start()..litmatch.start);
            // Note that in addition to the literal search above scanning past
            // our minimum start point, this routine can also return an error
            // as a result of detecting possible quadratic behavior if the
            // reverse scan goes past the minimum start point. That is, the
            // literal search might not, but the reverse regex search for the
            // prefix might!
            match self.try_search_half_rev_limited(
                cache,
                &revinput,
                min_match_start,
            )? {
                None => {
                    if span.start >= span.end {
                        break;
                    }
                    span.start = litmatch.start.checked_add(1).unwrap();
                }
                Some(hm_start) => {
                    let fwdinput = input
                        .clone()
                        .anchored(Anchored::Pattern(hm_start.pattern()))
                        .span(hm_start.offset()..input.end());
                    match self.try_search_half_fwd_stopat(cache, &fwdinput)? {
                        Err(stopat) => {
                            min_pre_start = stopat;
                            span.start =
                                litmatch.start.checked_add(1).unwrap();
                        }
                        Ok(hm_end) => {
                            return Ok(Some(Match::new(
                                hm_start.pattern(),
                                hm_start.offset()..hm_end.offset(),
                            )))
                        }
                    }
                }
            }
            min_match_start = litmatch.end;
        }
        Ok(None)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_fwd_stopat(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Result<HalfMatch, usize>, RetryFailError> {
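        // The nested result deserves a note: the outer error is a retryable
        // failure (e.g., the underlying DFA gave up), while an inner
        // `Err(offset)` means the forward scan stopped at `offset` without
        // finding the end of a match. The caller uses that offset to advance
        // its `min_pre_start` bound when detecting quadratic behavior.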
        if let Some(e) = self.core.dfa.get(&input) {
            trace!(
                "using full DFA for forward reverse inner search at {:?}",
                input.get_span()
            );
            e.try_search_half_fwd_stopat(&input)
        } else if let Some(e) = self.core.hybrid.get(&input) {
            trace!(
                "using lazy DFA for forward reverse inner search at {:?}",
                input.get_span()
            );
            e.try_search_half_fwd_stopat(&mut cache.hybrid, &input)
        } else {
            unreachable!("ReverseInner always has a DFA")
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_rev_limited(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        min_start: usize,
    ) -> Result<Option<HalfMatch>, RetryError> {
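        // `min_start` is a lower bound on how far back the reverse scan may
        // go. If the scan would cross it, the underlying engine reports a
        // quadratic retry error rather than continuing, which is what keeps
        // this strategy from re-scanning the same text over and over.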
        if let Some(e) = self.dfa.get(&input) {
            trace!(
                "using full DFA for reverse inner search at {:?}, \
                 but will be stopped at {} to avoid quadratic behavior",
                input.get_span(),
                min_start,
            );
            e.try_search_half_rev_limited(&input, min_start)
        } else if let Some(e) = self.hybrid.get(&input) {
            trace!(
                "using lazy DFA for reverse inner search at {:?}, \
                 but will be stopped at {} to avoid quadratic behavior",
                input.get_span(),
                min_start,
            );
            e.try_search_half_rev_limited(
                &mut cache.revhybrid,
                &input,
                min_start,
            )
        } else {
            unreachable!("ReverseInner always has a DFA")
        }
    }
}

impl Strategy for ReverseInner {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        self.core.group_info()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn create_cache(&self) -> Cache {
        let mut cache = self.core.create_cache();
        cache.revhybrid = self.hybrid.create_cache();
        cache
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn reset_cache(&self, cache: &mut Cache) {
        self.core.reset_cache(cache);
        cache.revhybrid.reset(&self.hybrid);
    }

    fn is_accelerated(&self) -> bool {
        self.preinner.is_fast()
    }

    fn memory_usage(&self) -> usize {
        self.core.memory_usage()
            + self.preinner.memory_usage()
            + self.nfarev.memory_usage()
            + self.dfa.memory_usage()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
        if input.get_anchored().is_anchored() {
            return self.core.search(cache, input);
        }
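        // The kind of error tells us how to retry: a quadratic error means
        // we deliberately gave up to avoid O(n^2) behavior, so we rerun the
        // search with the core engine from scratch. A failure error means an
        // underlying engine quit (e.g., a lazy DFA gave up), so we retry
        // with a core search that can't fail.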
        match self.try_search_full(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse inner optimization failed: {}", _err);
                self.core.search(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse inner fast search failed: {}", _err);
                self.core.search_nofail(cache, input)
            }
            Ok(matornot) => matornot,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        if input.get_anchored().is_anchored() {
            return self.core.search_half(cache, input);
        }
        match self.try_search_full(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse inner half optimization failed: {}", _err);
                self.core.search_half(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse inner fast half search failed: {}", _err);
                self.core.search_half_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(m)) => Some(HalfMatch::new(m.pattern(), m.end())),
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if input.get_anchored().is_anchored() {
            return self.core.is_match(cache, input);
        }
        match self.try_search_full(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse inner match optimization failed: {}", _err);
                self.core.is_match_nofail(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse inner fast match search failed: {}", _err);
                self.core.is_match_nofail(cache, input)
            }
            Ok(None) => false,
            Ok(Some(_)) => true,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if input.get_anchored().is_anchored() {
            return self.core.search_slots(cache, input, slots);
        }
        if !self.core.is_capture_search_needed(slots.len()) {
            trace!("asked for slots unnecessarily, trying fast path");
            let m = self.search(cache, input)?;
            copy_match_to_slots(m, slots);
            return Some(m.pattern());
        }
        let m = match self.try_search_full(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse inner captures optimization failed: {}", _err);
                return self.core.search_slots(cache, input, slots);
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse inner fast captures search failed: {}", _err);
                return self.core.search_slots_nofail(cache, input, slots);
            }
            Ok(None) => return None,
            Ok(Some(m)) => m,
        };
        trace!(
            "match found at {}..{} in capture search, \
             using another engine to find captures",
            m.start(),
            m.end(),
        );
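        // We already know the bounds of the overall match, so we re-run a
        // capture-aware engine anchored to exactly that span (and to the
        // matching pattern) to resolve the group offsets. Since a match is
        // guaranteed within that span, the "nofail" variant is fine here.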
        let input = input
            .clone()
            .span(m.start()..m.end())
            .anchored(Anchored::Pattern(m.pattern()));
        self.core.search_slots_nofail(cache, &input, slots)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
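        // There's no reverse inner fast path for overlapping searches, so
        // just defer to the core engine.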
        self.core.which_overlapping_matches(cache, input, patset)
    }
}

/// Copies the offsets in the given match to the corresponding positions in
/// `slots`.
///
/// In effect, this sets the slots corresponding to the implicit group for the
/// pattern in the given match. If the indices for the corresponding slots do
/// not exist, then no slots are set.
///
/// This is useful when the caller provides slots (or captures), but you use a
/// regex engine that doesn't operate on slots (like a lazy DFA). This function
/// lets you map the match you get back to the slots provided by the caller.
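///
/// For example, a match for pattern 2 is written to slot indices 4 and 5,
/// since the implicit group for pattern `i` occupies slots `2 * i` and
/// `2 * i + 1`.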
#[cfg_attr(feature = "perf-inline", inline(always))]
fn copy_match_to_slots(m: Match, slots: &mut [Option<NonMaxUsize>]) {
    let slot_start = m.pattern().as_usize() * 2;
    let slot_end = slot_start + 1;
    if let Some(slot) = slots.get_mut(slot_start) {
        *slot = NonMaxUsize::new(m.start());
    }
    if let Some(slot) = slots.get_mut(slot_end) {
        *slot = NonMaxUsize::new(m.end());
    }
}