| 1 | use core::{ |
| 2 | fmt::Debug, |
| 3 | panic::{RefUnwindSafe, UnwindSafe}, |
| 4 | }; |
| 5 | |
| 6 | use alloc::sync::Arc; |
| 7 | |
| 8 | use regex_syntax::hir::{literal, Hir}; |
| 9 | |
| 10 | use crate::{ |
| 11 | meta::{ |
| 12 | error::{BuildError, RetryError, RetryFailError, RetryQuadraticError}, |
| 13 | regex::{Cache, RegexInfo}, |
| 14 | reverse_inner, wrappers, |
| 15 | }, |
| 16 | nfa::thompson::{self, WhichCaptures, NFA}, |
| 17 | util::{ |
| 18 | captures::{Captures, GroupInfo}, |
| 19 | look::LookMatcher, |
| 20 | prefilter::{self, Prefilter, PrefilterI}, |
| 21 | primitives::{NonMaxUsize, PatternID}, |
| 22 | search::{Anchored, HalfMatch, Input, Match, MatchKind, PatternSet}, |
| 23 | }, |
| 24 | }; |
| 25 | |
| 26 | /// A trait that represents a single meta strategy. Its main utility is in |
| 27 | /// providing a way to do dynamic dispatch over a few choices. |
| 28 | /// |
| 29 | /// Why dynamic dispatch? I actually don't have a super compelling reason, and |
| 30 | /// importantly, I have not benchmarked it with the main alternative: an enum. |
| 31 | /// I went with dynamic dispatch initially because the regex engine search code |
| 32 | /// really can't be inlined into caller code in most cases because it's just |
| 33 | /// too big. In other words, it is already expected that every regex search |
| 34 | /// will entail at least the cost of a function call. |
| 35 | /// |
| 36 | /// I do wonder whether using enums would result in better codegen overall |
| 37 | /// though. It's a worthwhile experiment to try. Probably the most interesting |
| 38 | /// benchmark to run in such a case would be one with a high match count. That |
| 39 | /// is, a benchmark to test the overall latency of a search call. |
| 40 | pub(super) trait Strategy: |
| 41 | Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static |
| 42 | { |
| 43 | fn group_info(&self) -> &GroupInfo; |
| 44 | |
| 45 | fn create_cache(&self) -> Cache; |
| 46 | |
| 47 | fn reset_cache(&self, cache: &mut Cache); |
| 48 | |
| 49 | fn is_accelerated(&self) -> bool; |
| 50 | |
| 51 | fn memory_usage(&self) -> usize; |
| 52 | |
| 53 | fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match>; |
| 54 | |
| 55 | fn search_half( |
| 56 | &self, |
| 57 | cache: &mut Cache, |
| 58 | input: &Input<'_>, |
| 59 | ) -> Option<HalfMatch>; |
| 60 | |
| 61 | fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool; |
| 62 | |
| 63 | fn search_slots( |
| 64 | &self, |
| 65 | cache: &mut Cache, |
| 66 | input: &Input<'_>, |
| 67 | slots: &mut [Option<NonMaxUsize>], |
| 68 | ) -> Option<PatternID>; |
| 69 | |
| 70 | fn which_overlapping_matches( |
| 71 | &self, |
| 72 | cache: &mut Cache, |
| 73 | input: &Input<'_>, |
| 74 | patset: &mut PatternSet, |
| 75 | ); |
| 76 | } |
| 77 | |
| 78 | pub(super) fn new( |
| 79 | info: &RegexInfo, |
| 80 | hirs: &[&Hir], |
| 81 | ) -> Result<Arc<dyn Strategy>, BuildError> { |
| 82 | // At this point, we're committed to a regex engine of some kind. So pull |
| 83 | // out a prefilter if we can, which will feed to each of the constituent |
| 84 | // regex engines. |
| 85 | let pre = if info.is_always_anchored_start() { |
| 86 | // PERF: I'm not sure we necessarily want to do this... We may want to |
| 87 | // run a prefilter for quickly rejecting in some cases. The problem |
| 88 | // is that anchored searches overlap quite a bit with the use case |
| 89 | // of "run a regex on every line to extract data." In that case, the |
| 90 | // regex always matches, so running a prefilter doesn't really help us |
| 91 | // there. The main place where a prefilter helps in an anchored search |
| 92 | // is if the anchored search is not expected to match frequently. That |
| 93 | // is, the prefilter gives us a way to possibly reject a haystack very |
| 94 | // quickly. |
| 95 | // |
| 96 | // Maybe we should use a prefilter, but only for longer haystacks? |
| 97 | // Or maybe we should only use a prefilter when we think it's "fast"? |
| 98 | // |
| 99 | // Interestingly, I think we currently lack the infrastructure for |
| 100 | // disabling a prefilter based on haystack length. That would probably |
| 101 | // need to be a new 'Input' option. (Interestingly, an 'Input' used to |
| 102 | // carry a 'Prefilter' with it, but I moved away from that.) |
| 103 | debug!("skipping literal extraction since regex is anchored" ); |
| 104 | None |
| 105 | } else if let Some(pre) = info.config().get_prefilter() { |
| 106 | debug!( |
| 107 | "skipping literal extraction since the caller provided a prefilter" |
| 108 | ); |
| 109 | Some(pre.clone()) |
| 110 | } else if info.config().get_auto_prefilter() { |
| 111 | let kind = info.config().get_match_kind(); |
| 112 | let prefixes = crate::util::prefilter::prefixes(kind, hirs); |
| 113 | // If we can build a full `Strategy` from just the extracted prefixes, |
| 114 | // then we can short-circuit and avoid building a regex engine at all. |
| 115 | if let Some(pre) = Pre::from_prefixes(info, &prefixes) { |
| 116 | debug!( |
| 117 | "found that the regex can be broken down to a literal \ |
| 118 | search, avoiding the regex engine entirely" , |
| 119 | ); |
| 120 | return Ok(pre); |
| 121 | } |
| 122 | // This now attempts another short-circuit of the regex engine: if we |
| 123 | // have a huge alternation of just plain literals, then we can just use |
| 124 | // Aho-Corasick for that and avoid the regex engine entirely. |
| 125 | // |
| 126 | // You might think this case would just be handled by |
| 127 | // `Pre::from_prefixes`, but that technique relies on heuristic literal |
| 128 | // extraction from the corresponding `Hir`. That works, but part of |
| 129 | // the heuristics limit the size and number of literals returned. This case |
| 130 | // will specifically handle patterns with very large alternations. |
| 131 | // |
| 132 | // One wonders if we should just roll this into our heuristic literal |
| 133 | // extraction, and then I think this case could disappear entirely. |
| 134 | if let Some(pre) = Pre::from_alternation_literals(info, hirs) { |
| 135 | debug!( |
| 136 | "found plain alternation of literals, \ |
| 137 | avoiding regex engine entirely and using Aho-Corasick" |
| 138 | ); |
| 139 | return Ok(pre); |
| 140 | } |
| 141 | prefixes.literals().and_then(|strings| { |
| 142 | debug!( |
| 143 | "creating prefilter from {} literals: {:?}" , |
| 144 | strings.len(), |
| 145 | strings, |
| 146 | ); |
| 147 | Prefilter::new(kind, strings) |
| 148 | }) |
| 149 | } else { |
| 150 | debug!("skipping literal extraction since prefilters were disabled" ); |
| 151 | None |
| 152 | }; |
| 153 | let mut core = Core::new(info.clone(), pre.clone(), hirs)?; |
| 154 | // Now that we have our core regex engines built, there are a few cases |
| 155 | // where we can do a little bit better than just a normal "search forward |
| 156 | // and maybe use a prefilter when in a start state." However, these cases |
| 157 | // may not always work or otherwise build on top of the Core searcher. |
| 158 | // For example, the reverse anchored optimization seems like it might |
| 159 | // always work, but only the DFAs support reverse searching and the DFAs |
| 160 | // might give up or quit for various reasons. If we had, e.g., a PikeVM that |
| 161 | // supported reverse searching, then we could avoid building a full Core |
| 162 | // engine for this case. |
| 163 | core = match ReverseAnchored::new(core) { |
| 164 | Err(core) => core, |
| 165 | Ok(ra) => { |
| 166 | debug!("using reverse anchored strategy" ); |
| 167 | return Ok(Arc::new(ra)); |
| 168 | } |
| 169 | }; |
| 170 | core = match ReverseSuffix::new(core, hirs) { |
| 171 | Err(core) => core, |
| 172 | Ok(rs) => { |
| 173 | debug!("using reverse suffix strategy" ); |
| 174 | return Ok(Arc::new(rs)); |
| 175 | } |
| 176 | }; |
| 177 | core = match ReverseInner::new(core, hirs) { |
| 178 | Err(core) => core, |
| 179 | Ok(ri) => { |
| 180 | debug!("using reverse inner strategy" ); |
| 181 | return Ok(Arc::new(ri)); |
| 182 | } |
| 183 | }; |
| 184 | debug!("using core strategy" ); |
| 185 | Ok(Arc::new(core)) |
| 186 | } |
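
// A rough, illustrative summary of the selection order implemented by `new`
// above (approximate; the exact outcome depends on the Config and on which
// engines are available in the build):
//
//   `foo|bar|quux`   => Pre: the extracted literals cover the whole language,
//                       so a literal searcher is the entire strategy.
//   a huge alternation of plain literals
//                    => Pre backed by Aho-Corasick.
//   `\w+foo$` (always anchored at the end, with default `$` semantics)
//                    => ReverseAnchored wrapping Core.
//   `[a-z]+ing` (a short, "fast" common suffix)
//                    => likely ReverseSuffix wrapping Core.
//   `[a-z]+@[a-z]+` (a required inner literal)
//                    => possibly ReverseInner wrapping Core.
//   everything else  => Core, with a prefix prefilter attached when one can
//                       be extracted.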
| 187 | |
| 188 | #[derive(Clone, Debug)] |
| 189 | struct Pre<P> { |
| 190 | pre: P, |
| 191 | group_info: GroupInfo, |
| 192 | } |
| 193 | |
| 194 | impl<P: PrefilterI> Pre<P> { |
| 195 | fn new(pre: P) -> Arc<dyn Strategy> { |
| 196 | // The only thing we support when we use prefilters directly as a |
| 197 | // strategy is the start and end of the overall match for a single |
| 198 | // pattern. In other words, exactly one implicit capturing group. Which |
| 199 | // is exactly what we use here for a GroupInfo. |
| 200 | let group_info = GroupInfo::new([[None::<&str>]]).unwrap(); |
| 201 | Arc::new(Pre { pre, group_info }) |
| 202 | } |
| 203 | } |
| 204 | |
| 205 | // This is a little weird, but we don't actually care about the type parameter |
| 206 | // here because we're selecting which underlying prefilter to use. So we just |
| 207 | // define it on an arbitrary type. |
| 208 | impl Pre<()> { |
| 209 | /// Given a sequence of prefixes, attempt to return a full `Strategy` using |
| 210 | /// just the prefixes. |
| 211 | /// |
| 212 | /// Basically, this occurs when the prefixes given are not just prefixes, |
| 213 | /// but an enumeration of the entire language matched by the regular |
| 214 | /// expression. |
| 215 | /// |
| 216 | /// A number of other conditions need to be true too. For example, there |
| 217 | /// can be only one pattern, the pattern must have zero explicit capture |
| 218 | /// groups, no look-around assertions, and so on. |
| 219 | /// |
| 220 | /// Note that this ignores `Config::get_auto_prefilter` because if this |
| 221 | /// returns something, then it isn't a prefilter but a matcher itself. |
| 222 | /// Therefore, it shouldn't suffer from the problems typical to prefilters |
| 223 | /// (such as a high false positive rate). |
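///
/// As a rough sketch (illustrative only, not a doctest) using the same
/// helpers that `new` calls above: for a pattern like `foo|bar|quux` under
/// leftmost-first semantics,
///
/// ```ignore
/// let prefixes = crate::util::prefilter::prefixes(MatchKind::LeftmostFirst, hirs);
/// // The three literals enumerate the entire language matched, so...
/// assert!(prefixes.is_exact());
/// // ...this routine can return a literal searcher (e.g. Teddy or memmem)
/// // as the entire Strategy, and no regex engine is ever consulted.
/// ```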
| 224 | fn from_prefixes( |
| 225 | info: &RegexInfo, |
| 226 | prefixes: &literal::Seq, |
| 227 | ) -> Option<Arc<dyn Strategy>> { |
| 228 | let kind = info.config().get_match_kind(); |
| 229 | // Check to see if our prefixes are exact, which means we might be |
| 230 | // able to bypass the regex engine entirely and just rely on literal |
| 231 | // searches. |
| 232 | if !prefixes.is_exact() { |
| 233 | return None; |
| 234 | } |
| 235 | // We also require that we have a single regex pattern. Namely, |
| 236 | // we reuse the prefilter infrastructure to implement search and |
| 237 | // prefilters only report spans. Prefilters don't know about pattern |
| 238 | // IDs. The multi-regex case isn't a lost cause, we might still use |
| 239 | // Aho-Corasick and we might still just use a regular prefilter, but |
| 240 | // that's done below. |
| 241 | if info.pattern_len() != 1 { |
| 242 | return None; |
| 243 | } |
| 244 | // We can't have any capture groups either. The literal engines don't |
| 245 | // know how to deal with things like '(foo)(bar)'. In that case, a |
| 246 | // prefilter will just be used and then the regex engine will resolve |
| 247 | // the capture groups. |
| 248 | if info.props()[0].explicit_captures_len() != 0 { |
| 249 | return None; |
| 250 | } |
| 251 | // We also require that it has zero look-around assertions. Namely, |
| 252 | // literal extraction treats look-around assertions as if they match |
| 253 | // *every* empty string. But of course, that isn't true. So for |
| 254 | // example, 'foo\bquux' never matches anything, but 'fooquux' is |
| 255 | // extracted from that as an exact literal. Such cases should just run |
| 256 | // the regex engine. 'fooquux' will be used as a normal prefilter, and |
| 257 | // then the regex engine will try to look for an actual match. |
| 258 | if !info.props()[0].look_set().is_empty() { |
| 259 | return None; |
| 260 | } |
| 261 | // Finally, currently, our prefilters are all oriented around |
| 262 | // leftmost-first match semantics, so don't try to use them if the |
| 263 | // caller asked for anything else. |
| 264 | if kind != MatchKind::LeftmostFirst { |
| 265 | return None; |
| 266 | } |
| 267 | // The above seems like a lot of requirements to meet, but it applies |
| 268 | // to a lot of cases. 'foo', '[abc][123]' and 'foo|bar|quux' all meet |
| 269 | // the above criteria, for example. |
| 270 | // |
| 271 | // Note that this is effectively a latency optimization. If we didn't |
| 272 | // do this, then the extracted literals would still get bundled into |
| 273 | // a prefilter, and every regex engine capable of running unanchored |
| 274 | // searches supports prefilters. So this optimization merely sidesteps |
| 275 | // having to run the regex engine at all to confirm the match. Thus, it |
| 276 | // decreases the latency of a match. |
| 277 | |
| 278 | // OK because we know the set is exact and thus finite. |
| 279 | let prefixes = prefixes.literals().unwrap(); |
| 280 | debug!( |
| 281 | "trying to bypass regex engine by creating \ |
| 282 | prefilter from {} literals: {:?}" , |
| 283 | prefixes.len(), |
| 284 | prefixes, |
| 285 | ); |
| 286 | let choice = match prefilter::Choice::new(kind, prefixes) { |
| 287 | Some(choice) => choice, |
| 288 | None => { |
| 289 | debug!( |
| 290 | "regex bypass failed because no prefilter could be built" |
| 291 | ); |
| 292 | return None; |
| 293 | } |
| 294 | }; |
| 295 | let strat: Arc<dyn Strategy> = match choice { |
| 296 | prefilter::Choice::Memchr(pre) => Pre::new(pre), |
| 297 | prefilter::Choice::Memchr2(pre) => Pre::new(pre), |
| 298 | prefilter::Choice::Memchr3(pre) => Pre::new(pre), |
| 299 | prefilter::Choice::Memmem(pre) => Pre::new(pre), |
| 300 | prefilter::Choice::Teddy(pre) => Pre::new(pre), |
| 301 | prefilter::Choice::ByteSet(pre) => Pre::new(pre), |
| 302 | prefilter::Choice::AhoCorasick(pre) => Pre::new(pre), |
| 303 | }; |
| 304 | Some(strat) |
| 305 | } |
| 306 | |
| 307 | /// Attempts to extract an alternation of literals, and if it's deemed |
| 308 | /// worth doing, returns an Aho-Corasick prefilter as a strategy. |
| 309 | /// |
| 310 | /// And currently, this only returns something when 'hirs.len() == 1'. This |
| 311 | /// could in theory do something if there are multiple HIRs where all of |
| 312 | /// them are alternation of literals, but I haven't had the time to go down |
| 313 | /// that path yet. |
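///
/// For example (illustrative): a single pattern like `foobar|fizz|buzz`
/// qualifies, while `foo|ba+r` does not, since one of its branches is not a
/// plain literal.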
| 314 | fn from_alternation_literals( |
| 315 | info: &RegexInfo, |
| 316 | hirs: &[&Hir], |
| 317 | ) -> Option<Arc<dyn Strategy>> { |
| 318 | use crate::util::prefilter::AhoCorasick; |
| 319 | |
| 320 | let lits = crate::meta::literal::alternation_literals(info, hirs)?; |
| 321 | let ac = AhoCorasick::new(MatchKind::LeftmostFirst, &lits)?; |
| 322 | Some(Pre::new(ac)) |
| 323 | } |
| 324 | } |
| 325 | |
| 326 | // This implements Strategy for anything that implements PrefilterI. |
| 327 | // |
| 328 | // Note that this must only be used for regexes of length 1. Multi-regexes |
| 329 | // don't work here. The prefilter interface only provides the span of a match |
| 330 | // and not the pattern ID. (I did consider making it more expressive, but I |
| 331 | // couldn't figure out how to tie everything together elegantly.) Thus, so long |
| 332 | // as the regex only contains one pattern, we can simply assume that a match |
| 333 | // corresponds to PatternID::ZERO. And indeed, that's what we do here. |
| 334 | // |
| 335 | // In practice, since this impl is used to report matches directly and thus |
| 336 | // completely bypasses the regex engine, we only wind up using this under the |
| 337 | // following restrictions: |
| 338 | // |
| 339 | // * There must be only one pattern. As explained above. |
| 340 | // * The literal sequence must be finite and only contain exact literals. |
| 341 | // * There must not be any look-around assertions. If there are, the literals |
| 342 | // extracted might be exact, but a match doesn't necessarily imply an overall |
| 343 | // match. As a trivial example, 'foo\bbar' does not match 'foobar'. |
| 344 | // * The pattern must not have any explicit capturing groups. If it does, the |
| 345 | // caller might expect them to be resolved. e.g., 'foo(bar)'. |
| 346 | // |
| 347 | // So when all of those things are true, we use a prefilter directly as a |
| 348 | // strategy. |
| 349 | // |
| 350 | // In the case where the number of patterns is more than 1, we don't use this |
| 351 | // but do use a special Aho-Corasick strategy if all of the regexes are just |
| 352 | // simple literals or alternations of literals. (We also use the Aho-Corasick |
| 353 | // strategy when len(patterns)==1 if the number of literals is large. In that |
| 354 | // case, literal extraction gives up and will return an infinite set.) |
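//
// As a concrete (illustrative) sketch of what "prefilter as the whole
// strategy" means: for the pattern `foo|bar`, `search` on the haystack
// "xxbarxx" just runs the literal searcher, reports the span 2..5 and labels
// it with PatternID::ZERO. No NFA or DFA is ever consulted.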
| 355 | impl<P: PrefilterI> Strategy for Pre<P> { |
| 356 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 357 | fn group_info(&self) -> &GroupInfo { |
| 358 | &self.group_info |
| 359 | } |
| 360 | |
| 361 | fn create_cache(&self) -> Cache { |
| 362 | Cache { |
| 363 | capmatches: Captures::all(self.group_info().clone()), |
| 364 | pikevm: wrappers::PikeVMCache::none(), |
| 365 | backtrack: wrappers::BoundedBacktrackerCache::none(), |
| 366 | onepass: wrappers::OnePassCache::none(), |
| 367 | hybrid: wrappers::HybridCache::none(), |
| 368 | revhybrid: wrappers::ReverseHybridCache::none(), |
| 369 | } |
| 370 | } |
| 371 | |
| 372 | fn reset_cache(&self, _cache: &mut Cache) {} |
| 373 | |
| 374 | fn is_accelerated(&self) -> bool { |
| 375 | self.pre.is_fast() |
| 376 | } |
| 377 | |
| 378 | fn memory_usage(&self) -> usize { |
| 379 | self.pre.memory_usage() |
| 380 | } |
| 381 | |
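// Note: `prefix` only reports a match that begins exactly at the start of the
// given span (anchored semantics), while `find` scans forward for the first
// occurrence anywhere in the span.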
| 382 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 383 | fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> { |
| 384 | if input.is_done() { |
| 385 | return None; |
| 386 | } |
| 387 | if input.get_anchored().is_anchored() { |
| 388 | return self |
| 389 | .pre |
| 390 | .prefix(input.haystack(), input.get_span()) |
| 391 | .map(|sp| Match::new(PatternID::ZERO, sp)); |
| 392 | } |
| 393 | self.pre |
| 394 | .find(input.haystack(), input.get_span()) |
| 395 | .map(|sp| Match::new(PatternID::ZERO, sp)) |
| 396 | } |
| 397 | |
| 398 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 399 | fn search_half( |
| 400 | &self, |
| 401 | cache: &mut Cache, |
| 402 | input: &Input<'_>, |
| 403 | ) -> Option<HalfMatch> { |
| 404 | self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) |
| 405 | } |
| 406 | |
| 407 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 408 | fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { |
| 409 | self.search(cache, input).is_some() |
| 410 | } |
| 411 | |
| 412 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 413 | fn search_slots( |
| 414 | &self, |
| 415 | cache: &mut Cache, |
| 416 | input: &Input<'_>, |
| 417 | slots: &mut [Option<NonMaxUsize>], |
| 418 | ) -> Option<PatternID> { |
| 419 | let m = self.search(cache, input)?; |
| 420 | if let Some(slot) = slots.get_mut(0) { |
| 421 | *slot = NonMaxUsize::new(m.start()); |
| 422 | } |
| 423 | if let Some(slot) = slots.get_mut(1) { |
| 424 | *slot = NonMaxUsize::new(m.end()); |
| 425 | } |
| 426 | Some(m.pattern()) |
| 427 | } |
| 428 | |
| 429 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 430 | fn which_overlapping_matches( |
| 431 | &self, |
| 432 | cache: &mut Cache, |
| 433 | input: &Input<'_>, |
| 434 | patset: &mut PatternSet, |
| 435 | ) { |
| 436 | if self.search(cache, input).is_some() { |
| 437 | patset.insert(PatternID::ZERO); |
| 438 | } |
| 439 | } |
| 440 | } |
| 441 | |
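// The "everything" strategy: it builds (or at least tries to build) each of
// the general purpose regex engines and picks between them at search time.
// The more specialized strategies below all wrap a Core so that they can fall
// back to it when their own preconditions don't hold for a particular search.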
| 442 | #[derive(Debug)] |
| 443 | struct Core { |
| 444 | info: RegexInfo, |
| 445 | pre: Option<Prefilter>, |
| 446 | nfa: NFA, |
| 447 | nfarev: Option<NFA>, |
| 448 | pikevm: wrappers::PikeVM, |
| 449 | backtrack: wrappers::BoundedBacktracker, |
| 450 | onepass: wrappers::OnePass, |
| 451 | hybrid: wrappers::Hybrid, |
| 452 | dfa: wrappers::DFA, |
| 453 | } |
| 454 | |
| 455 | impl Core { |
| 456 | fn new( |
| 457 | info: RegexInfo, |
| 458 | pre: Option<Prefilter>, |
| 459 | hirs: &[&Hir], |
| 460 | ) -> Result<Core, BuildError> { |
| 461 | let mut lookm = LookMatcher::new(); |
| 462 | lookm.set_line_terminator(info.config().get_line_terminator()); |
| 463 | let thompson_config = thompson::Config::new() |
| 464 | .utf8(info.config().get_utf8_empty()) |
| 465 | .nfa_size_limit(info.config().get_nfa_size_limit()) |
| 466 | .shrink(false) |
| 467 | .which_captures(info.config().get_which_captures()) |
| 468 | .look_matcher(lookm); |
| 469 | let nfa = thompson::Compiler::new() |
| 470 | .configure(thompson_config.clone()) |
| 471 | .build_many_from_hir(hirs) |
| 472 | .map_err(BuildError::nfa)?; |
| 473 | // It's possible for the PikeVM or the BB to fail to build, even though |
| 474 | // at this point, we already have a full NFA in hand. They can fail |
| 475 | // when a Unicode word boundary is used but where Unicode word boundary |
| 476 | // support is disabled at compile time, thus making it impossible to |
| 477 | // match. (Construction can also fail if the NFA was compiled without |
| 478 | // captures, but we always enable that above.) |
| 479 | let pikevm = wrappers::PikeVM::new(&info, pre.clone(), &nfa)?; |
| 480 | let backtrack = |
| 481 | wrappers::BoundedBacktracker::new(&info, pre.clone(), &nfa)?; |
| 482 | // The onepass engine can of course fail to build, but we expect it to |
| 483 | // fail in many cases because it is an optimization that doesn't apply |
| 484 | // to all regexes. The 'OnePass' wrapper encapsulates this failure (and |
| 485 | // logs a message if it occurs). |
| 486 | let onepass = wrappers::OnePass::new(&info, &nfa); |
| 487 | // We try to encapsulate whether a particular regex engine should be |
| 488 | // used within each respective wrapper, but the DFAs need a reverse NFA |
| 489 | // to build themselves, and we really do not want to build a reverse NFA if |
| 490 | // we know we aren't going to use the lazy DFA. So we do a config check |
| 491 | // up front, which is in practice the only way we won't try to use the |
| 492 | // DFA. |
| 493 | let (nfarev, hybrid, dfa) = |
| 494 | if !info.config().get_hybrid() && !info.config().get_dfa() { |
| 495 | (None, wrappers::Hybrid::none(), wrappers::DFA::none()) |
| 496 | } else { |
| 497 | // FIXME: Technically, we don't quite yet KNOW that we need |
| 498 | // a reverse NFA. It's possible for the DFAs below to both |
| 499 | // fail to build just based on the forward NFA. In which case, |
| 500 | // building the reverse NFA was totally wasted work. But... |
| 501 | // fixing this requires breaking DFA construction apart into |
| 502 | // two pieces: one for the forward part and another for the |
| 503 | // reverse part. Quite annoying. Making it worse, when building |
| 504 | // both DFAs fails, it's quite likely that the NFA is large and |
| 505 | // that it will take quite some time to build the reverse NFA |
| 506 | // too. So... it's really probably worth it to do this! |
| 507 | let nfarev = thompson::Compiler::new() |
| 508 | // Currently, reverse NFAs don't support capturing groups, |
| 509 | // so we MUST disable them. But even if we didn't have to, |
| 510 | // we would, because nothing in this crate does anything |
| 511 | // useful with capturing groups in reverse. And of course, |
| 512 | // the lazy DFA ignores capturing groups in all cases. |
| 513 | .configure( |
| 514 | thompson_config |
| 515 | .clone() |
| 516 | .which_captures(WhichCaptures::None) |
| 517 | .reverse(true), |
| 518 | ) |
| 519 | .build_many_from_hir(hirs) |
| 520 | .map_err(BuildError::nfa)?; |
| 521 | let dfa = if !info.config().get_dfa() { |
| 522 | wrappers::DFA::none() |
| 523 | } else { |
| 524 | wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev) |
| 525 | }; |
| 526 | let hybrid = if !info.config().get_hybrid() { |
| 527 | wrappers::Hybrid::none() |
| 528 | } else if dfa.is_some() { |
| 529 | debug!("skipping lazy DFA because we have a full DFA" ); |
| 530 | wrappers::Hybrid::none() |
| 531 | } else { |
| 532 | wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev) |
| 533 | }; |
| 534 | (Some(nfarev), hybrid, dfa) |
| 535 | }; |
| 536 | Ok(Core { |
| 537 | info, |
| 538 | pre, |
| 539 | nfa, |
| 540 | nfarev, |
| 541 | pikevm, |
| 542 | backtrack, |
| 543 | onepass, |
| 544 | hybrid, |
| 545 | dfa, |
| 546 | }) |
| 547 | } |
| 548 | |
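// The return type is a bit dense: `None` means neither a full DFA nor a lazy
// DFA is available for this Input, `Some(Err(..))` means a DFA was tried but
// gave up mid-search (so the caller must retry with an infallible engine),
// and `Some(Ok(..))` is the usual match/no-match outcome.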
| 549 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 550 | fn try_search_mayfail( |
| 551 | &self, |
| 552 | cache: &mut Cache, |
| 553 | input: &Input<'_>, |
| 554 | ) -> Option<Result<Option<Match>, RetryFailError>> { |
| 555 | if let Some(e) = self.dfa.get(input) { |
| 556 | trace!("using full DFA for search at {:?}" , input.get_span()); |
| 557 | Some(e.try_search(input)) |
| 558 | } else if let Some(e) = self.hybrid.get(input) { |
| 559 | trace!("using lazy DFA for search at {:?}" , input.get_span()); |
| 560 | Some(e.try_search(&mut cache.hybrid, input)) |
| 561 | } else { |
| 562 | None |
| 563 | } |
| 564 | } |
| 565 | |
| 566 | fn search_nofail( |
| 567 | &self, |
| 568 | cache: &mut Cache, |
| 569 | input: &Input<'_>, |
| 570 | ) -> Option<Match> { |
| 571 | let caps = &mut cache.capmatches; |
| 572 | caps.set_pattern(None); |
| 573 | // We manually inline 'try_search_slots_nofail' here because we need to |
| 574 | // borrow from 'cache.capmatches' in this method, but if we do, then |
| 575 | // we can't pass 'cache' wholesale to 'try_slots_no_hybrid'. It's a |
| 576 | // classic example of how the borrow checker inhibits decomposition. |
| 577 | // There are of course work-arounds (more types and/or interior |
| 578 | // mutability), but that's more annoying than this IMO. |
| 579 | let pid = if let Some(ref e) = self.onepass.get(input) { |
| 580 | trace!("using OnePass for search at {:?}" , input.get_span()); |
| 581 | e.search_slots(&mut cache.onepass, input, caps.slots_mut()) |
| 582 | } else if let Some(ref e) = self.backtrack.get(input) { |
| 583 | trace!( |
| 584 | "using BoundedBacktracker for search at {:?}" , |
| 585 | input.get_span() |
| 586 | ); |
| 587 | e.search_slots(&mut cache.backtrack, input, caps.slots_mut()) |
| 588 | } else { |
| 589 | trace!("using PikeVM for search at {:?}" , input.get_span()); |
| 590 | let e = self.pikevm.get(); |
| 591 | e.search_slots(&mut cache.pikevm, input, caps.slots_mut()) |
| 592 | }; |
| 593 | caps.set_pattern(pid); |
| 594 | caps.get_match() |
| 595 | } |
| 596 | |
| 597 | fn search_half_nofail( |
| 598 | &self, |
| 599 | cache: &mut Cache, |
| 600 | input: &Input<'_>, |
| 601 | ) -> Option<HalfMatch> { |
| 602 | // Only the lazy/full DFA returns half-matches, since the DFA requires |
| 603 | // a reverse scan to find the start position. These fallback regex |
| 604 | // engines can find the start and end in a single pass, so we just do |
| 605 | // that and throw away the start offset to conform to the API. |
| 606 | let m = self.search_nofail(cache, input)?; |
| 607 | Some(HalfMatch::new(m.pattern(), m.end())) |
| 608 | } |
| 609 | |
| 610 | fn search_slots_nofail( |
| 611 | &self, |
| 612 | cache: &mut Cache, |
| 613 | input: &Input<'_>, |
| 614 | slots: &mut [Option<NonMaxUsize>], |
| 615 | ) -> Option<PatternID> { |
| 616 | if let Some(ref e) = self.onepass.get(input) { |
| 617 | trace!( |
| 618 | "using OnePass for capture search at {:?}" , |
| 619 | input.get_span() |
| 620 | ); |
| 621 | e.search_slots(&mut cache.onepass, input, slots) |
| 622 | } else if let Some(ref e) = self.backtrack.get(input) { |
| 623 | trace!( |
| 624 | "using BoundedBacktracker for capture search at {:?}" , |
| 625 | input.get_span() |
| 626 | ); |
| 627 | e.search_slots(&mut cache.backtrack, input, slots) |
| 628 | } else { |
| 629 | trace!( |
| 630 | "using PikeVM for capture search at {:?}" , |
| 631 | input.get_span() |
| 632 | ); |
| 633 | let e = self.pikevm.get(); |
| 634 | e.search_slots(&mut cache.pikevm, input, slots) |
| 635 | } |
| 636 | } |
| 637 | |
| 638 | fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool { |
| 639 | if let Some(ref e) = self.onepass.get(input) { |
| 640 | trace!( |
| 641 | "using OnePass for is-match search at {:?}" , |
| 642 | input.get_span() |
| 643 | ); |
| 644 | e.search_slots(&mut cache.onepass, input, &mut []).is_some() |
| 645 | } else if let Some(ref e) = self.backtrack.get(input) { |
| 646 | trace!( |
| 647 | "using BoundedBacktracker for is-match search at {:?}" , |
| 648 | input.get_span() |
| 649 | ); |
| 650 | e.is_match(&mut cache.backtrack, input) |
| 651 | } else { |
| 652 | trace!( |
| 653 | "using PikeVM for is-match search at {:?}" , |
| 654 | input.get_span() |
| 655 | ); |
| 656 | let e = self.pikevm.get(); |
| 657 | e.is_match(&mut cache.pikevm, input) |
| 658 | } |
| 659 | } |
| 660 | |
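// A note on the arithmetic below: every pattern gets two implicit slots (the
// start and end of the overall match), so for a single-pattern regex,
// `implicit_slot_len() == 2`. A caller asking for no more than that is only
// asking for the overall match bounds, which can be satisfied via a faster
// path that skips capture resolution entirely.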
| 661 | fn is_capture_search_needed(&self, slots_len: usize) -> bool { |
| 662 | slots_len > self.nfa.group_info().implicit_slot_len() |
| 663 | } |
| 664 | } |
| 665 | |
| 666 | impl Strategy for Core { |
| 667 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 668 | fn group_info(&self) -> &GroupInfo { |
| 669 | self.nfa.group_info() |
| 670 | } |
| 671 | |
| 672 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 673 | fn create_cache(&self) -> Cache { |
| 674 | Cache { |
| 675 | capmatches: Captures::all(self.group_info().clone()), |
| 676 | pikevm: self.pikevm.create_cache(), |
| 677 | backtrack: self.backtrack.create_cache(), |
| 678 | onepass: self.onepass.create_cache(), |
| 679 | hybrid: self.hybrid.create_cache(), |
| 680 | revhybrid: wrappers::ReverseHybridCache::none(), |
| 681 | } |
| 682 | } |
| 683 | |
| 684 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 685 | fn reset_cache(&self, cache: &mut Cache) { |
| 686 | cache.pikevm.reset(&self.pikevm); |
| 687 | cache.backtrack.reset(&self.backtrack); |
| 688 | cache.onepass.reset(&self.onepass); |
| 689 | cache.hybrid.reset(&self.hybrid); |
| 690 | } |
| 691 | |
| 692 | fn is_accelerated(&self) -> bool { |
| 693 | self.pre.as_ref().map_or(false, |pre| pre.is_fast()) |
| 694 | } |
| 695 | |
| 696 | fn memory_usage(&self) -> usize { |
| 697 | self.info.memory_usage() |
| 698 | + self.pre.as_ref().map_or(0, |pre| pre.memory_usage()) |
| 699 | + self.nfa.memory_usage() |
| 700 | + self.nfarev.as_ref().map_or(0, |nfa| nfa.memory_usage()) |
| 701 | + self.onepass.memory_usage() |
| 702 | + self.dfa.memory_usage() |
| 703 | } |
| 704 | |
| 705 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 706 | fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> { |
| 707 | // We manually inline try_search_mayfail here because letting the |
| 708 | // compiler do it seems to produce pretty crappy codegen. |
| 709 | return if let Some(e) = self.dfa.get(input) { |
| 710 | trace!("using full DFA for full search at {:?}" , input.get_span()); |
| 711 | match e.try_search(input) { |
| 712 | Ok(x) => x, |
| 713 | Err(_err) => { |
| 714 | trace!("full DFA search failed: {}" , _err); |
| 715 | self.search_nofail(cache, input) |
| 716 | } |
| 717 | } |
| 718 | } else if let Some(e) = self.hybrid.get(input) { |
| 719 | trace!("using lazy DFA for full search at {:?}" , input.get_span()); |
| 720 | match e.try_search(&mut cache.hybrid, input) { |
| 721 | Ok(x) => x, |
| 722 | Err(_err) => { |
| 723 | trace!("lazy DFA search failed: {}" , _err); |
| 724 | self.search_nofail(cache, input) |
| 725 | } |
| 726 | } |
| 727 | } else { |
| 728 | self.search_nofail(cache, input) |
| 729 | }; |
| 730 | } |
| 731 | |
| 732 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 733 | fn search_half( |
| 734 | &self, |
| 735 | cache: &mut Cache, |
| 736 | input: &Input<'_>, |
| 737 | ) -> Option<HalfMatch> { |
| 738 | // The main difference with 'search' is that if we're using a DFA, we |
| 739 | // can use a single forward scan without needing to run the reverse |
| 740 | // DFA. |
| 741 | if let Some(e) = self.dfa.get(input) { |
| 742 | trace!("using full DFA for half search at {:?}" , input.get_span()); |
| 743 | match e.try_search_half_fwd(input) { |
| 744 | Ok(x) => x, |
| 745 | Err(_err) => { |
| 746 | trace!("full DFA half search failed: {}" , _err); |
| 747 | self.search_half_nofail(cache, input) |
| 748 | } |
| 749 | } |
| 750 | } else if let Some(e) = self.hybrid.get(input) { |
| 751 | trace!("using lazy DFA for half search at {:?}" , input.get_span()); |
| 752 | match e.try_search_half_fwd(&mut cache.hybrid, input) { |
| 753 | Ok(x) => x, |
| 754 | Err(_err) => { |
| 755 | trace!("lazy DFA half search failed: {}" , _err); |
| 756 | self.search_half_nofail(cache, input) |
| 757 | } |
| 758 | } |
| 759 | } else { |
| 760 | self.search_half_nofail(cache, input) |
| 761 | } |
| 762 | } |
| 763 | |
| 764 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 765 | fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { |
| 766 | if let Some(e) = self.dfa.get(input) { |
| 767 | trace!( |
| 768 | "using full DFA for is-match search at {:?}" , |
| 769 | input.get_span() |
| 770 | ); |
| 771 | match e.try_search_half_fwd(input) { |
| 772 | Ok(x) => x.is_some(), |
| 773 | Err(_err) => { |
| 774 | trace!("full DFA half search failed: {}" , _err); |
| 775 | self.is_match_nofail(cache, input) |
| 776 | } |
| 777 | } |
| 778 | } else if let Some(e) = self.hybrid.get(input) { |
| 779 | trace!( |
| 780 | "using lazy DFA for is-match search at {:?}" , |
| 781 | input.get_span() |
| 782 | ); |
| 783 | match e.try_search_half_fwd(&mut cache.hybrid, input) { |
| 784 | Ok(x) => x.is_some(), |
| 785 | Err(_err) => { |
| 786 | trace!("lazy DFA half search failed: {}" , _err); |
| 787 | self.is_match_nofail(cache, input) |
| 788 | } |
| 789 | } |
| 790 | } else { |
| 791 | self.is_match_nofail(cache, input) |
| 792 | } |
| 793 | } |
| 794 | |
| 795 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 796 | fn search_slots( |
| 797 | &self, |
| 798 | cache: &mut Cache, |
| 799 | input: &Input<'_>, |
| 800 | slots: &mut [Option<NonMaxUsize>], |
| 801 | ) -> Option<PatternID> { |
| 802 | // Even if the regex has explicit capture groups, if the caller didn't |
| 803 | // provide any explicit slots, then it doesn't make sense to try and do |
| 804 | // extra work to get offsets for those slots. Ideally the caller should |
| 805 | // realize this and not call this routine in the first place, but alas, |
| 806 | // we try to save the caller from themselves if they do. |
| 807 | if !self.is_capture_search_needed(slots.len()) { |
| 808 | trace!("asked for slots unnecessarily, trying fast path" ); |
| 809 | let m = self.search(cache, input)?; |
| 810 | copy_match_to_slots(m, slots); |
| 811 | return Some(m.pattern()); |
| 812 | } |
| 813 | // If the onepass DFA is available for this search (which only happens |
| 814 | // when it's anchored), then skip running a fallible DFA. The onepass |
| 815 | // DFA isn't as fast as a full or lazy DFA, but it is typically quite |
| 816 | // a bit faster than the backtracker or the PikeVM. So it isn't as |
| 817 | // advantageous to try and do a full/lazy DFA scan first. |
| 818 | // |
| 819 | // We still theorize that it's better to do a full/lazy DFA scan, even |
| 820 | // when it's anchored, because it's usually much faster and permits us |
| 821 | // to say "no match" much more quickly. This does hurt the case of, |
| 822 | // say, parsing each line in a log file into capture groups, because |
| 823 | // in that case, the line always matches. So the lazy DFA scan is |
| 824 | // usually just wasted work. But, the lazy DFA is usually quite fast |
| 825 | // and doesn't cost too much here. |
| 826 | if self.onepass.get(&input).is_some() { |
| 827 | return self.search_slots_nofail(cache, &input, slots); |
| 828 | } |
| 829 | let m = match self.try_search_mayfail(cache, input) { |
| 830 | Some(Ok(Some(m))) => m, |
| 831 | Some(Ok(None)) => return None, |
| 832 | Some(Err(_err)) => { |
| 833 | trace!("fast capture search failed: {}" , _err); |
| 834 | return self.search_slots_nofail(cache, input, slots); |
| 835 | } |
| 836 | None => { |
| 837 | return self.search_slots_nofail(cache, input, slots); |
| 838 | } |
| 839 | }; |
| 840 | // At this point, now that we've found the bounds of the |
| 841 | // match, we need to re-run something that can resolve |
| 842 | // capturing groups. But we only need to run it on the |
| 843 | // match bounds and not the entire haystack. |
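// For example (illustrative): for `(\w+)=(\w+)` against "  key=value  ", the
// DFA reports the overall bounds 2..11, and the anchored re-run below is what
// fills in the group slots for "key" (2..5) and "value" (6..11).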
| 844 | trace!( |
| 845 | "match found at {}..{} in capture search, \ |
| 846 | using another engine to find captures" , |
| 847 | m.start(), |
| 848 | m.end(), |
| 849 | ); |
| 850 | let input = input |
| 851 | .clone() |
| 852 | .span(m.start()..m.end()) |
| 853 | .anchored(Anchored::Pattern(m.pattern())); |
| 854 | Some( |
| 855 | self.search_slots_nofail(cache, &input, slots) |
| 856 | .expect("should find a match" ), |
| 857 | ) |
| 858 | } |
| 859 | |
| 860 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 861 | fn which_overlapping_matches( |
| 862 | &self, |
| 863 | cache: &mut Cache, |
| 864 | input: &Input<'_>, |
| 865 | patset: &mut PatternSet, |
| 866 | ) { |
| 867 | if let Some(e) = self.dfa.get(input) { |
| 868 | trace!( |
| 869 | "using full DFA for overlapping search at {:?}" , |
| 870 | input.get_span() |
| 871 | ); |
| 872 | let _err = match e.try_which_overlapping_matches(input, patset) { |
| 873 | Ok(()) => return, |
| 874 | Err(err) => err, |
| 875 | }; |
| 876 | trace!("fast overlapping search failed: {}" , _err); |
| 877 | } else if let Some(e) = self.hybrid.get(input) { |
| 878 | trace!( |
| 879 | "using lazy DFA for overlapping search at {:?}" , |
| 880 | input.get_span() |
| 881 | ); |
| 882 | let _err = match e.try_which_overlapping_matches( |
| 883 | &mut cache.hybrid, |
| 884 | input, |
| 885 | patset, |
| 886 | ) { |
| 887 | Ok(()) => { |
| 888 | return; |
| 889 | } |
| 890 | Err(err) => err, |
| 891 | }; |
| 892 | trace!("fast overlapping search failed: {}" , _err); |
| 893 | } |
| 894 | trace!( |
| 895 | "using PikeVM for overlapping search at {:?}" , |
| 896 | input.get_span() |
| 897 | ); |
| 898 | let e = self.pikevm.get(); |
| 899 | e.which_overlapping_matches(&mut cache.pikevm, input, patset) |
| 900 | } |
| 901 | } |
| 902 | |
| 903 | #[derive(Debug)] |
| 904 | struct ReverseAnchored { |
| 905 | core: Core, |
| 906 | } |
| 907 | |
| 908 | impl ReverseAnchored { |
| 909 | fn new(core: Core) -> Result<ReverseAnchored, Core> { |
| 910 | if !core.info.is_always_anchored_end() { |
| 911 | debug!( |
| 912 | "skipping reverse anchored optimization because \ |
| 913 | the regex is not always anchored at the end" |
| 914 | ); |
| 915 | return Err(core); |
| 916 | } |
| 917 | // Note that the caller can still request an anchored search even when |
| 918 | // the regex isn't anchored at the start. We detect that case in the |
| 919 | // search routines below and just fallback to the core engine. This |
| 920 | // is fine because both searches are anchored. It's just a matter of |
| 921 | // picking one. Falling back to the core engine is a little simpler, |
| 922 | // since if we used the reverse anchored approach, we'd have to add an |
| 923 | // extra check to ensure the match reported starts at the place where |
| 924 | // the caller requested the search to start. |
| 925 | if core.info.is_always_anchored_start() { |
| 926 | debug!( |
| 927 | "skipping reverse anchored optimization because \ |
| 928 | the regex is also anchored at the start" |
| 929 | ); |
| 930 | return Err(core); |
| 931 | } |
| 932 | // Only DFAs can do reverse searches (currently), so we need one of |
| 933 | // them in order to do this optimization. It's possible (although |
| 934 | // pretty unlikely) that we have neither and need to give up. |
| 935 | if !core.hybrid.is_some() && !core.dfa.is_some() { |
| 936 | debug!( |
| 937 | "skipping reverse anchored optimization because \ |
| 938 | we don't have a lazy DFA or a full DFA" |
| 939 | ); |
| 940 | return Err(core); |
| 941 | } |
| 942 | Ok(ReverseAnchored { core }) |
| 943 | } |
| 944 | |
| 945 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 946 | fn try_search_half_anchored_rev( |
| 947 | &self, |
| 948 | cache: &mut Cache, |
| 949 | input: &Input<'_>, |
| 950 | ) -> Result<Option<HalfMatch>, RetryFailError> { |
| 951 | // We of course always want an anchored search. In theory, the |
| 952 | // underlying regex engines should automatically enable anchored |
| 953 | // searches since the regex is itself anchored, but this more clearly |
| 954 | // expresses intent and is always correct. |
| 955 | let input = input.clone().anchored(Anchored::Yes); |
| 956 | if let Some(e) = self.core.dfa.get(&input) { |
| 957 | trace!( |
| 958 | "using full DFA for reverse anchored search at {:?}" , |
| 959 | input.get_span() |
| 960 | ); |
| 961 | e.try_search_half_rev(&input) |
| 962 | } else if let Some(e) = self.core.hybrid.get(&input) { |
| 963 | trace!( |
| 964 | "using lazy DFA for reverse anchored search at {:?}" , |
| 965 | input.get_span() |
| 966 | ); |
| 967 | e.try_search_half_rev(&mut cache.hybrid, &input) |
| 968 | } else { |
| 969 | unreachable!("ReverseAnchored always has a DFA" ) |
| 970 | } |
| 971 | } |
| 972 | } |
| 973 | |
| 974 | // Note that in this impl, we don't check that 'input.end() == |
| 975 | // input.haystack().len()'. In particular, when that condition is false, a |
| 976 | // match is always impossible because we know that the regex is always anchored |
| 977 | // at the end (or else 'ReverseAnchored' won't be built). We don't check that |
| 978 | // here because the 'Regex' wrapper actually does that for us in all cases. |
| 979 | // Thus, in this impl, we can actually assume that the end position in 'input' |
| 980 | // is equivalent to the length of the haystack. |
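//
// As an illustrative example of the technique: for a pattern like `ing$` and
// the haystack "tingling", every match must end at the end of the haystack,
// so we run a single *reverse* half search from position 8. It reports the
// start offset 5, and `search` below turns that into the match 5..8 without
// any forward scan at all.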
| 981 | impl Strategy for ReverseAnchored { |
| 982 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 983 | fn group_info(&self) -> &GroupInfo { |
| 984 | self.core.group_info() |
| 985 | } |
| 986 | |
| 987 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 988 | fn create_cache(&self) -> Cache { |
| 989 | self.core.create_cache() |
| 990 | } |
| 991 | |
| 992 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 993 | fn reset_cache(&self, cache: &mut Cache) { |
| 994 | self.core.reset_cache(cache); |
| 995 | } |
| 996 | |
| 997 | fn is_accelerated(&self) -> bool { |
| 998 | // Since this is anchored at the end, a reverse anchored search is |
| 999 | // almost certainly guaranteed to result in a much faster search than |
| 1000 | // a standard forward search. |
| 1001 | true |
| 1002 | } |
| 1003 | |
| 1004 | fn memory_usage(&self) -> usize { |
| 1005 | self.core.memory_usage() |
| 1006 | } |
| 1007 | |
| 1008 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1009 | fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> { |
| 1010 | if input.get_anchored().is_anchored() { |
| 1011 | return self.core.search(cache, input); |
| 1012 | } |
| 1013 | match self.try_search_half_anchored_rev(cache, input) { |
| 1014 | Err(_err) => { |
| 1015 | trace!("fast reverse anchored search failed: {}" , _err); |
| 1016 | self.core.search_nofail(cache, input) |
| 1017 | } |
| 1018 | Ok(None) => None, |
| 1019 | Ok(Some(hm)) => { |
| 1020 | Some(Match::new(hm.pattern(), hm.offset()..input.end())) |
| 1021 | } |
| 1022 | } |
| 1023 | } |
| 1024 | |
| 1025 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1026 | fn search_half( |
| 1027 | &self, |
| 1028 | cache: &mut Cache, |
| 1029 | input: &Input<'_>, |
| 1030 | ) -> Option<HalfMatch> { |
| 1031 | if input.get_anchored().is_anchored() { |
| 1032 | return self.core.search_half(cache, input); |
| 1033 | } |
| 1034 | match self.try_search_half_anchored_rev(cache, input) { |
| 1035 | Err(_err) => { |
| 1036 | trace!("fast reverse anchored search failed: {}" , _err); |
| 1037 | self.core.search_half_nofail(cache, input) |
| 1038 | } |
| 1039 | Ok(None) => None, |
| 1040 | Ok(Some(hm)) => { |
| 1041 | // Careful here! 'try_search_half' is a *forward* search that |
| 1042 | // only cares about the *end* position of a match. But |
| 1043 | // 'hm.offset()' is actually the start of the match. So we |
| 1044 | // actually just throw that away here and, since we know we |
| 1045 | // have a match, return the only possible position at which a |
| 1046 | // match can occur: input.end(). |
| 1047 | Some(HalfMatch::new(hm.pattern(), input.end())) |
| 1048 | } |
| 1049 | } |
| 1050 | } |
| 1051 | |
| 1052 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1053 | fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { |
| 1054 | if input.get_anchored().is_anchored() { |
| 1055 | return self.core.is_match(cache, input); |
| 1056 | } |
| 1057 | match self.try_search_half_anchored_rev(cache, input) { |
| 1058 | Err(_err) => { |
| 1059 | trace!("fast reverse anchored search failed: {}" , _err); |
| 1060 | self.core.is_match_nofail(cache, input) |
| 1061 | } |
| 1062 | Ok(None) => false, |
| 1063 | Ok(Some(_)) => true, |
| 1064 | } |
| 1065 | } |
| 1066 | |
| 1067 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1068 | fn search_slots( |
| 1069 | &self, |
| 1070 | cache: &mut Cache, |
| 1071 | input: &Input<'_>, |
| 1072 | slots: &mut [Option<NonMaxUsize>], |
| 1073 | ) -> Option<PatternID> { |
| 1074 | if input.get_anchored().is_anchored() { |
| 1075 | return self.core.search_slots(cache, input, slots); |
| 1076 | } |
| 1077 | match self.try_search_half_anchored_rev(cache, input) { |
| 1078 | Err(_err) => { |
| 1079 | trace!("fast reverse anchored search failed: {}" , _err); |
| 1080 | self.core.search_slots_nofail(cache, input, slots) |
| 1081 | } |
| 1082 | Ok(None) => None, |
| 1083 | Ok(Some(hm)) => { |
| 1084 | if !self.core.is_capture_search_needed(slots.len()) { |
| 1085 | trace!("asked for slots unnecessarily, skipping captures" ); |
| 1086 | let m = Match::new(hm.pattern(), hm.offset()..input.end()); |
| 1087 | copy_match_to_slots(m, slots); |
| 1088 | return Some(m.pattern()); |
| 1089 | } |
| 1090 | let start = hm.offset(); |
| 1091 | let input = input |
| 1092 | .clone() |
| 1093 | .span(start..input.end()) |
| 1094 | .anchored(Anchored::Pattern(hm.pattern())); |
| 1095 | self.core.search_slots_nofail(cache, &input, slots) |
| 1096 | } |
| 1097 | } |
| 1098 | } |
| 1099 | |
| 1100 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1101 | fn which_overlapping_matches( |
| 1102 | &self, |
| 1103 | cache: &mut Cache, |
| 1104 | input: &Input<'_>, |
| 1105 | patset: &mut PatternSet, |
| 1106 | ) { |
| 1107 | // It seems like this could probably benefit from a reverse anchored |
| 1108 | // optimization, perhaps by doing an overlapping reverse search (which |
| 1109 | // the DFAs do support). I haven't given it much thought though, and |
| 1110 | // I'm currently focused more on the single pattern case. |
| 1111 | self.core.which_overlapping_matches(cache, input, patset) |
| 1112 | } |
| 1113 | } |
| 1114 | |
| 1115 | #[derive(Debug)] |
| 1116 | struct ReverseSuffix { |
| 1117 | core: Core, |
| 1118 | pre: Prefilter, |
| 1119 | } |
| 1120 | |
| 1121 | impl ReverseSuffix { |
| 1122 | fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseSuffix, Core> { |
| 1123 | if !core.info.config().get_auto_prefilter() { |
| 1124 | debug!( |
| 1125 | "skipping reverse suffix optimization because \ |
| 1126 | automatic prefilters are disabled" |
| 1127 | ); |
| 1128 | return Err(core); |
| 1129 | } |
| 1130 | // Like the reverse inner optimization, we don't do this for regexes |
| 1131 | // that are always anchored. It could lead to scanning too much, but |
| 1132 | // could say "no match" much more quickly than running the regex |
| 1133 | // engine if the initial literal scan doesn't match. With that said, |
| 1134 | // the reverse suffix optimization has lower overhead, since it only |
| 1135 | // requires a reverse scan after a literal match to confirm or reject |
| 1136 | // the match. (Although, in the case of confirmation, it then needs to |
| 1137 | // do another forward scan to find the end position.) |
| 1138 | // |
| 1139 | // Note that the caller can still request an anchored search even |
| 1140 | // when the regex isn't anchored. We detect that case in the search |
| 1141 | // routines below and just fallback to the core engine. Currently this |
| 1142 | // optimization assumes all searches are unanchored, so if we do want |
| 1143 | // to enable this optimization for anchored searches, it will need a |
| 1144 | // little work to support it. |
| 1145 | if core.info.is_always_anchored_start() { |
| 1146 | debug!( |
| 1147 | "skipping reverse suffix optimization because \ |
| 1148 | the regex is always anchored at the start" , |
| 1149 | ); |
| 1150 | return Err(core); |
| 1151 | } |
| 1152 | // Only DFAs can do reverse searches (currently), so we need one of |
| 1153 | // them in order to do this optimization. It's possible (although |
| 1154 | // pretty unlikely) that we have neither and need to give up. |
| 1155 | if !core.hybrid.is_some() && !core.dfa.is_some() { |
| 1156 | debug!( |
| 1157 | "skipping reverse suffix optimization because \ |
| 1158 | we don't have a lazy DFA or a full DFA" |
| 1159 | ); |
| 1160 | return Err(core); |
| 1161 | } |
| 1162 | if core.pre.as_ref().map_or(false, |p| p.is_fast()) { |
| 1163 | debug!( |
| 1164 | "skipping reverse suffix optimization because \ |
| 1165 | we already have a prefilter that we think is fast" |
| 1166 | ); |
| 1167 | return Err(core); |
| 1168 | } |
| 1169 | let kind = core.info.config().get_match_kind(); |
| 1170 | let suffixes = crate::util::prefilter::suffixes(kind, hirs); |
| 1171 | let lcs = match suffixes.longest_common_suffix() { |
| 1172 | None => { |
| 1173 | debug!( |
| 1174 | "skipping reverse suffix optimization because \ |
| 1175 | a longest common suffix could not be found" , |
| 1176 | ); |
| 1177 | return Err(core); |
| 1178 | } |
| 1179 | Some(lcs) if lcs.is_empty() => { |
| 1180 | debug!( |
| 1181 | "skipping reverse suffix optimization because \ |
| 1182 | the longest common suffix is the empty string" , |
| 1183 | ); |
| 1184 | return Err(core); |
| 1185 | } |
| 1186 | Some(lcs) => lcs, |
| 1187 | }; |
| 1188 | let pre = match Prefilter::new(kind, &[lcs]) { |
| 1189 | Some(pre) => pre, |
| 1190 | None => { |
| 1191 | debug!( |
| 1192 | "skipping reverse suffix optimization because \ |
| 1193 | a prefilter could not be constructed from the \ |
| 1194 | longest common suffix" , |
| 1195 | ); |
| 1196 | return Err(core); |
| 1197 | } |
| 1198 | }; |
| 1199 | if !pre.is_fast() { |
| 1200 | debug!( |
| 1201 | "skipping reverse suffix optimization because \ |
| 1202 | while we have a suffix prefilter, it is not \ |
| 1203 | believed to be 'fast'" |
| 1204 | ); |
| 1205 | return Err(core); |
| 1206 | } |
| 1207 | Ok(ReverseSuffix { core, pre }) |
| 1208 | } |
| 1209 | |
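// A sketch of how the routine below avoids quadratic behavior: for a pattern
// like `[a-z]+ing` and a haystack full of "ing" occurrences, the suffix
// prefilter reports a hit for each occurrence, and each hit kicks off a
// reverse scan. Without a limit, every one of those reverse scans could walk
// all the way back to the start of the haystack, which is O(n^2) overall. So
// each reverse scan is stopped at `min_start` (the end of the previous
// literal hit), and if that limit gets in the way, the whole optimization
// bails out with `RetryError::Quadratic` and the caller falls back to the
// core engine.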
| 1210 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1211 | fn try_search_half_start( |
| 1212 | &self, |
| 1213 | cache: &mut Cache, |
| 1214 | input: &Input<'_>, |
| 1215 | ) -> Result<Option<HalfMatch>, RetryError> { |
| 1216 | let mut span = input.get_span(); |
| 1217 | let mut min_start = 0; |
| 1218 | loop { |
| 1219 | let litmatch = match self.pre.find(input.haystack(), span) { |
| 1220 | None => return Ok(None), |
| 1221 | Some(span) => span, |
| 1222 | }; |
| 1223 | trace!("reverse suffix scan found suffix match at {:?}" , litmatch); |
| 1224 | let revinput = input |
| 1225 | .clone() |
| 1226 | .anchored(Anchored::Yes) |
| 1227 | .span(input.start()..litmatch.end); |
| 1228 | match self |
| 1229 | .try_search_half_rev_limited(cache, &revinput, min_start)? |
| 1230 | { |
| 1231 | None => { |
| 1232 | if span.start >= span.end { |
| 1233 | break; |
| 1234 | } |
| 1235 | span.start = litmatch.start.checked_add(1).unwrap(); |
| 1236 | } |
| 1237 | Some(hm) => return Ok(Some(hm)), |
| 1238 | } |
| 1239 | min_start = litmatch.end; |
| 1240 | } |
| 1241 | Ok(None) |
| 1242 | } |
| 1243 | |
| 1244 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1245 | fn try_search_half_fwd( |
| 1246 | &self, |
| 1247 | cache: &mut Cache, |
| 1248 | input: &Input<'_>, |
| 1249 | ) -> Result<Option<HalfMatch>, RetryFailError> { |
| 1250 | if let Some(e) = self.core.dfa.get(&input) { |
| 1251 | trace!( |
| 1252 | "using full DFA for forward reverse suffix search at {:?}" , |
| 1253 | input.get_span() |
| 1254 | ); |
| 1255 | e.try_search_half_fwd(&input) |
| 1256 | } else if let Some(e) = self.core.hybrid.get(&input) { |
| 1257 | trace!( |
| 1258 | "using lazy DFA for forward reverse suffix search at {:?}" , |
| 1259 | input.get_span() |
| 1260 | ); |
| 1261 | e.try_search_half_fwd(&mut cache.hybrid, &input) |
| 1262 | } else { |
| 1263 | unreachable!("ReverseSuffix always has a DFA" ) |
| 1264 | } |
| 1265 | } |
| 1266 | |
| 1267 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1268 | fn try_search_half_rev_limited( |
| 1269 | &self, |
| 1270 | cache: &mut Cache, |
| 1271 | input: &Input<'_>, |
| 1272 | min_start: usize, |
| 1273 | ) -> Result<Option<HalfMatch>, RetryError> { |
| 1274 | if let Some(e) = self.core.dfa.get(&input) { |
| 1275 | trace!( |
| 1276 | "using full DFA for reverse suffix search at {:?}, \ |
| 1277 | but will be stopped at {} to avoid quadratic behavior" , |
| 1278 | input.get_span(), |
| 1279 | min_start, |
| 1280 | ); |
| 1281 | e.try_search_half_rev_limited(&input, min_start) |
| 1282 | } else if let Some(e) = self.core.hybrid.get(&input) { |
| 1283 | trace!( |
| 1284 | "using lazy DFA for reverse suffix search at {:?}, \ |
| 1285 | but will be stopped at {} to avoid quadratic behavior" , |
| 1286 | input.get_span(), |
| 1287 | min_start, |
| 1288 | ); |
| 1289 | e.try_search_half_rev_limited(&mut cache.hybrid, &input, min_start) |
| 1290 | } else { |
| 1291 | unreachable!("ReverseSuffix always has a DFA" ) |
| 1292 | } |
| 1293 | } |
| 1294 | } |
| 1295 | |
| 1296 | impl Strategy for ReverseSuffix { |
| 1297 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1298 | fn group_info(&self) -> &GroupInfo { |
| 1299 | self.core.group_info() |
| 1300 | } |
| 1301 | |
| 1302 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1303 | fn create_cache(&self) -> Cache { |
| 1304 | self.core.create_cache() |
| 1305 | } |
| 1306 | |
| 1307 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1308 | fn reset_cache(&self, cache: &mut Cache) { |
| 1309 | self.core.reset_cache(cache); |
| 1310 | } |
| 1311 | |
| 1312 | fn is_accelerated(&self) -> bool { |
| 1313 | self.pre.is_fast() |
| 1314 | } |
| 1315 | |
| 1316 | fn memory_usage(&self) -> usize { |
| 1317 | self.core.memory_usage() + self.pre.memory_usage() |
| 1318 | } |
| 1319 | |
| 1320 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1321 | fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> { |
| 1322 | if input.get_anchored().is_anchored() { |
| 1323 | return self.core.search(cache, input); |
| 1324 | } |
| 1325 | match self.try_search_half_start(cache, input) { |
| 1326 | Err(RetryError::Quadratic(_err)) => { |
| 1327 | trace!("reverse suffix optimization failed: {}" , _err); |
| 1328 | self.core.search(cache, input) |
| 1329 | } |
| 1330 | Err(RetryError::Fail(_err)) => { |
| 1331 | trace!("reverse suffix reverse fast search failed: {}" , _err); |
| 1332 | self.core.search_nofail(cache, input) |
| 1333 | } |
| 1334 | Ok(None) => None, |
| 1335 | Ok(Some(hm_start)) => { |
| 1336 | let fwdinput = input |
| 1337 | .clone() |
| 1338 | .anchored(Anchored::Pattern(hm_start.pattern())) |
| 1339 | .span(hm_start.offset()..input.end()); |
| 1340 | match self.try_search_half_fwd(cache, &fwdinput) { |
| 1341 | Err(_err) => { |
| 1342 | trace!( |
| 1343 | "reverse suffix forward fast search failed: {}" , |
| 1344 | _err |
| 1345 | ); |
| 1346 | self.core.search_nofail(cache, input) |
| 1347 | } |
| 1348 | Ok(None) => { |
| 1349 | unreachable!( |
| 1350 | "suffix match plus reverse match implies \ |
| 1351 | there must be a match" , |
| 1352 | ) |
| 1353 | } |
| 1354 | Ok(Some(hm_end)) => Some(Match::new( |
| 1355 | hm_start.pattern(), |
| 1356 | hm_start.offset()..hm_end.offset(), |
| 1357 | )), |
| 1358 | } |
| 1359 | } |
| 1360 | } |
| 1361 | } |
| 1362 | |
| 1363 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1364 | fn search_half( |
| 1365 | &self, |
| 1366 | cache: &mut Cache, |
| 1367 | input: &Input<'_>, |
| 1368 | ) -> Option<HalfMatch> { |
| 1369 | if input.get_anchored().is_anchored() { |
| 1370 | return self.core.search_half(cache, input); |
| 1371 | } |
| 1372 | match self.try_search_half_start(cache, input) { |
| 1373 | Err(RetryError::Quadratic(_err)) => { |
| 1374 | trace!("reverse suffix half optimization failed: {}" , _err); |
| 1375 | self.core.search_half(cache, input) |
| 1376 | } |
| 1377 | Err(RetryError::Fail(_err)) => { |
trace!(
"reverse suffix reverse fast half search failed: {}",
| 1380 | _err |
| 1381 | ); |
| 1382 | self.core.search_half_nofail(cache, input) |
| 1383 | } |
| 1384 | Ok(None) => None, |
| 1385 | Ok(Some(hm_start)) => { |
| 1386 | // This is a bit subtle. It is tempting to just stop searching |
| 1387 | // at this point and return a half-match with an offset |
| 1388 | // corresponding to where the suffix was found. But the suffix |
| 1389 | // match does not necessarily correspond to the end of the |
| 1390 | // proper leftmost-first match. Consider /[a-z]+ing/ against |
| 1391 | // 'tingling'. The first suffix match is the first 'ing', and |
| 1392 | // the /[a-z]+/ matches the 't'. So if we stopped here, then |
| 1393 | // we'd report 'ting' as the match. But 'tingling' is the |
| 1394 | // correct match because of greediness. |
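// Concretely: the suffix scan finds the first 'ing' at 1..4, the
// reverse scan then reports a match start of 0, and the anchored
// forward scan below extends that to the full match 0..8, i.e.,
// all of 'tingling'.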
| 1395 | let fwdinput = input |
| 1396 | .clone() |
| 1397 | .anchored(Anchored::Pattern(hm_start.pattern())) |
| 1398 | .span(hm_start.offset()..input.end()); |
| 1399 | match self.try_search_half_fwd(cache, &fwdinput) { |
| 1400 | Err(_err) => { |
trace!(
"reverse suffix forward fast search failed: {}",
| 1403 | _err |
| 1404 | ); |
| 1405 | self.core.search_half_nofail(cache, input) |
| 1406 | } |
| 1407 | Ok(None) => { |
| 1408 | unreachable!( |
| 1409 | "suffix match plus reverse match implies \ |
there must be a match",
| 1411 | ) |
| 1412 | } |
| 1413 | Ok(Some(hm_end)) => Some(hm_end), |
| 1414 | } |
| 1415 | } |
| 1416 | } |
| 1417 | } |
| 1418 | |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1420 | fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { |
| 1421 | if input.get_anchored().is_anchored() { |
| 1422 | return self.core.is_match(cache, input); |
| 1423 | } |
| 1424 | match self.try_search_half_start(cache, input) { |
| 1425 | Err(RetryError::Quadratic(_err)) => { |
trace!("reverse suffix half optimization failed: {}", _err);
| 1427 | self.core.is_match_nofail(cache, input) |
| 1428 | } |
| 1429 | Err(RetryError::Fail(_err)) => { |
trace!(
"reverse suffix reverse fast half search failed: {}",
| 1432 | _err |
| 1433 | ); |
| 1434 | self.core.is_match_nofail(cache, input) |
| 1435 | } |
| 1436 | Ok(None) => false, |
| 1437 | Ok(Some(_)) => true, |
| 1438 | } |
| 1439 | } |
| 1440 | |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1442 | fn search_slots( |
| 1443 | &self, |
| 1444 | cache: &mut Cache, |
| 1445 | input: &Input<'_>, |
| 1446 | slots: &mut [Option<NonMaxUsize>], |
| 1447 | ) -> Option<PatternID> { |
| 1448 | if input.get_anchored().is_anchored() { |
| 1449 | return self.core.search_slots(cache, input, slots); |
| 1450 | } |
| 1451 | if !self.core.is_capture_search_needed(slots.len()) { |
trace!("asked for slots unnecessarily, trying fast path");
| 1453 | let m = self.search(cache, input)?; |
| 1454 | copy_match_to_slots(m, slots); |
| 1455 | return Some(m.pattern()); |
| 1456 | } |
| 1457 | let hm_start = match self.try_search_half_start(cache, input) { |
| 1458 | Err(RetryError::Quadratic(_err)) => { |
trace!(
"reverse suffix captures optimization failed: {}",
| 1461 | _err |
| 1462 | ); |
| 1463 | return self.core.search_slots(cache, input, slots); |
| 1464 | } |
| 1465 | Err(RetryError::Fail(_err)) => { |
trace!(
"reverse suffix reverse fast captures search failed: {}",
| 1468 | _err |
| 1469 | ); |
| 1470 | return self.core.search_slots_nofail(cache, input, slots); |
| 1471 | } |
| 1472 | Ok(None) => return None, |
| 1473 | Ok(Some(hm_start)) => hm_start, |
| 1474 | }; |
| 1475 | trace!( |
| 1476 | "match found at {}..{} in capture search, \ |
using another engine to find captures",
| 1478 | hm_start.offset(), |
| 1479 | input.end(), |
| 1480 | ); |
| 1481 | let start = hm_start.offset(); |
| 1482 | let input = input |
| 1483 | .clone() |
| 1484 | .span(start..input.end()) |
| 1485 | .anchored(Anchored::Pattern(hm_start.pattern())); |
| 1486 | self.core.search_slots_nofail(cache, &input, slots) |
| 1487 | } |
| 1488 | |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1490 | fn which_overlapping_matches( |
| 1491 | &self, |
| 1492 | cache: &mut Cache, |
| 1493 | input: &Input<'_>, |
| 1494 | patset: &mut PatternSet, |
| 1495 | ) { |
| 1496 | self.core.which_overlapping_matches(cache, input, patset) |
| 1497 | } |
| 1498 | } |
| 1499 | |
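/// An implementation of the "reverse inner" optimization. In effect, a
/// prefilter is built for a literal that occurs somewhere in the interior of
/// the regex. When that prefilter finds a candidate, a reverse search of the
/// portion of the regex preceding the literal finds the match start, and a
/// forward search of the full regex from that start finds the match end.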
#[derive(Debug)]
| 1501 | struct ReverseInner { |
| 1502 | core: Core, |
| 1503 | preinner: Prefilter, |
| 1504 | nfarev: NFA, |
| 1505 | hybrid: wrappers::ReverseHybrid, |
| 1506 | dfa: wrappers::ReverseDFA, |
| 1507 | } |
| 1508 | |
| 1509 | impl ReverseInner { |
| 1510 | fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseInner, Core> { |
| 1511 | if !core.info.config().get_auto_prefilter() { |
| 1512 | debug!( |
| 1513 | "skipping reverse inner optimization because \ |
| 1514 | automatic prefilters are disabled" |
| 1515 | ); |
| 1516 | return Err(core); |
| 1517 | } |
| 1518 | // Currently we hard-code the assumption of leftmost-first match |
| 1519 | // semantics. This isn't a huge deal because 'all' semantics tend to |
| 1520 | // only be used for forward overlapping searches with multiple regexes, |
| 1521 | // and this optimization only supports a single pattern at the moment. |
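// (Indeed, 'which_overlapping_matches' below just defers to the core
// engine unconditionally.)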
| 1522 | if core.info.config().get_match_kind() != MatchKind::LeftmostFirst { |
| 1523 | debug!( |
| 1524 | "skipping reverse inner optimization because \ |
match kind is {:?} but this only supports leftmost-first",
| 1526 | core.info.config().get_match_kind(), |
| 1527 | ); |
| 1528 | return Err(core); |
| 1529 | } |
| 1530 | // It's likely that a reverse inner scan has too much overhead for it |
| 1531 | // to be worth it when the regex is anchored at the start. It is |
| 1532 | // possible for it to be quite a bit faster if the initial literal |
| 1533 | // scan fails to detect a match, in which case, we can say "no match" |
| 1534 | // very quickly. But this could be undesirable, e.g., scanning too far |
| 1535 | // or when the literal scan matches. If it matches, then confirming the |
| 1536 | // match requires a reverse scan followed by a forward scan to confirm |
| 1537 | // or reject, which is a fair bit of work. |
| 1538 | // |
| 1539 | // Note that the caller can still request an anchored search even |
| 1540 | // when the regex isn't anchored. We detect that case in the search |
// routines below and just fall back to the core engine. Currently this
| 1542 | // optimization assumes all searches are unanchored, so if we do want |
| 1543 | // to enable this optimization for anchored searches, it will need a |
| 1544 | // little work to support it. |
| 1545 | if core.info.is_always_anchored_start() { |
| 1546 | debug!( |
| 1547 | "skipping reverse inner optimization because \ |
the regex is always anchored at the start",
| 1549 | ); |
| 1550 | return Err(core); |
| 1551 | } |
| 1552 | // Only DFAs can do reverse searches (currently), so we need one of |
| 1553 | // them in order to do this optimization. It's possible (although |
| 1554 | // pretty unlikely) that we have neither and need to give up. |
| 1555 | if !core.hybrid.is_some() && !core.dfa.is_some() { |
| 1556 | debug!( |
| 1557 | "skipping reverse inner optimization because \ |
| 1558 | we don't have a lazy DFA or a full DFA" |
| 1559 | ); |
| 1560 | return Err(core); |
| 1561 | } |
| 1562 | if core.pre.as_ref().map_or(false, |p| p.is_fast()) { |
| 1563 | debug!( |
| 1564 | "skipping reverse inner optimization because \ |
| 1565 | we already have a prefilter that we think is fast" |
| 1566 | ); |
| 1567 | return Err(core); |
| 1568 | } else if core.pre.is_some() { |
| 1569 | debug!( |
| 1570 | "core engine has a prefix prefilter, but it is \ |
| 1571 | probably not fast, so continuing with attempt to \ |
| 1572 | use reverse inner prefilter" |
| 1573 | ); |
| 1574 | } |
| 1575 | let (concat_prefix, preinner) = match reverse_inner::extract(hirs) { |
| 1576 | Some(x) => x, |
| 1577 | // N.B. the 'extract' function emits debug messages explaining |
| 1578 | // why we bailed out here. |
| 1579 | None => return Err(core), |
| 1580 | }; |
debug!("building reverse NFA for prefix before inner literal");
| 1582 | let mut lookm = LookMatcher::new(); |
| 1583 | lookm.set_line_terminator(core.info.config().get_line_terminator()); |
| 1584 | let thompson_config = thompson::Config::new() |
| 1585 | .reverse(true) |
| 1586 | .utf8(core.info.config().get_utf8_empty()) |
| 1587 | .nfa_size_limit(core.info.config().get_nfa_size_limit()) |
| 1588 | .shrink(false) |
| 1589 | .which_captures(WhichCaptures::None) |
| 1590 | .look_matcher(lookm); |
| 1591 | let result = thompson::Compiler::new() |
| 1592 | .configure(thompson_config) |
| 1593 | .build_from_hir(&concat_prefix); |
| 1594 | let nfarev = match result { |
| 1595 | Ok(nfarev) => nfarev, |
| 1596 | Err(_err) => { |
| 1597 | debug!( |
| 1598 | "skipping reverse inner optimization because the \ |
reverse NFA failed to build: {}",
| 1600 | _err, |
| 1601 | ); |
| 1602 | return Err(core); |
| 1603 | } |
| 1604 | }; |
debug!("building reverse DFA for prefix before inner literal");
| 1606 | let dfa = if !core.info.config().get_dfa() { |
| 1607 | wrappers::ReverseDFA::none() |
| 1608 | } else { |
| 1609 | wrappers::ReverseDFA::new(&core.info, &nfarev) |
| 1610 | }; |
| 1611 | let hybrid = if !core.info.config().get_hybrid() { |
| 1612 | wrappers::ReverseHybrid::none() |
| 1613 | } else if dfa.is_some() { |
| 1614 | debug!( |
| 1615 | "skipping lazy DFA for reverse inner optimization \ |
| 1616 | because we have a full DFA" |
| 1617 | ); |
| 1618 | wrappers::ReverseHybrid::none() |
| 1619 | } else { |
| 1620 | wrappers::ReverseHybrid::new(&core.info, &nfarev) |
| 1621 | }; |
| 1622 | Ok(ReverseInner { core, preinner, nfarev, hybrid, dfa }) |
| 1623 | } |
| 1624 | |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1626 | fn try_search_full( |
| 1627 | &self, |
| 1628 | cache: &mut Cache, |
| 1629 | input: &Input<'_>, |
| 1630 | ) -> Result<Option<Match>, RetryError> { |
| 1631 | let mut span = input.get_span(); |
| 1632 | let mut min_match_start = 0; |
| 1633 | let mut min_pre_start = 0; |
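// These two lower bounds are what protect this routine from quadratic
// behavior. 'min_match_start' is bumped to the end of each literal
// candidate we finish processing, and the limited reverse scan below
// errors out rather than walk back past it. 'min_pre_start' records how
// far a failed forward confirmation scan got; if a later literal
// candidate starts before that point, we're re-scanning haystack we've
// already covered and give up on this optimization (the check at the
// top of the loop).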
| 1634 | loop { |
| 1635 | let litmatch = match self.preinner.find(input.haystack(), span) { |
| 1636 | None => return Ok(None), |
| 1637 | Some(span) => span, |
| 1638 | }; |
| 1639 | if litmatch.start < min_pre_start { |
| 1640 | trace!( |
| 1641 | "found inner prefilter match at {:?}, which starts \ |
| 1642 | before the end of the last forward scan at {}, \ |
quitting to avoid quadratic behavior",
| 1644 | litmatch, |
| 1645 | min_pre_start, |
| 1646 | ); |
| 1647 | return Err(RetryError::Quadratic(RetryQuadraticError::new())); |
| 1648 | } |
trace!("reverse inner scan found inner match at {:?}", litmatch);
| 1650 | let revinput = input |
| 1651 | .clone() |
| 1652 | .anchored(Anchored::Yes) |
| 1653 | .span(input.start()..litmatch.start); |
| 1654 | // Note that in addition to the literal search above scanning past |
| 1655 | // our minimum start point, this routine can also return an error |
| 1656 | // as a result of detecting possible quadratic behavior if the |
| 1657 | // reverse scan goes past the minimum start point. That is, the |
| 1658 | // literal search might not, but the reverse regex search for the |
| 1659 | // prefix might! |
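// (Concretely: up to O(n) literal candidates times O(n) work per
// rescan is the quadratic blow-up these limits are designed to rule
// out.)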
| 1660 | match self.try_search_half_rev_limited( |
| 1661 | cache, |
| 1662 | &revinput, |
| 1663 | min_match_start, |
| 1664 | )? { |
| 1665 | None => { |
| 1666 | if span.start >= span.end { |
| 1667 | break; |
| 1668 | } |
| 1669 | span.start = litmatch.start.checked_add(1).unwrap(); |
| 1670 | } |
| 1671 | Some(hm_start) => { |
| 1672 | let fwdinput = input |
| 1673 | .clone() |
| 1674 | .anchored(Anchored::Pattern(hm_start.pattern())) |
| 1675 | .span(hm_start.offset()..input.end()); |
| 1676 | match self.try_search_half_fwd_stopat(cache, &fwdinput)? { |
| 1677 | Err(stopat) => { |
| 1678 | min_pre_start = stopat; |
| 1679 | span.start = |
| 1680 | litmatch.start.checked_add(1).unwrap(); |
| 1681 | } |
| 1682 | Ok(hm_end) => { |
| 1683 | return Ok(Some(Match::new( |
| 1684 | hm_start.pattern(), |
| 1685 | hm_start.offset()..hm_end.offset(), |
| 1686 | ))) |
| 1687 | } |
| 1688 | } |
| 1689 | } |
| 1690 | } |
| 1691 | min_match_start = litmatch.end; |
| 1692 | } |
| 1693 | Ok(None) |
| 1694 | } |
| 1695 | |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1697 | fn try_search_half_fwd_stopat( |
| 1698 | &self, |
| 1699 | cache: &mut Cache, |
| 1700 | input: &Input<'_>, |
| 1701 | ) -> Result<Result<HalfMatch, usize>, RetryFailError> { |
| 1702 | if let Some(e) = self.core.dfa.get(&input) { |
trace!(
"using full DFA for forward reverse inner search at {:?}",
| 1705 | input.get_span() |
| 1706 | ); |
| 1707 | e.try_search_half_fwd_stopat(&input) |
| 1708 | } else if let Some(e) = self.core.hybrid.get(&input) { |
trace!(
"using lazy DFA for forward reverse inner search at {:?}",
| 1711 | input.get_span() |
| 1712 | ); |
| 1713 | e.try_search_half_fwd_stopat(&mut cache.hybrid, &input) |
| 1714 | } else { |
unreachable!("ReverseInner always has a DFA")
| 1716 | } |
| 1717 | } |
| 1718 | |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1720 | fn try_search_half_rev_limited( |
| 1721 | &self, |
| 1722 | cache: &mut Cache, |
| 1723 | input: &Input<'_>, |
| 1724 | min_start: usize, |
| 1725 | ) -> Result<Option<HalfMatch>, RetryError> { |
| 1726 | if let Some(e) = self.dfa.get(&input) { |
| 1727 | trace!( |
| 1728 | "using full DFA for reverse inner search at {:?}, \ |
but will be stopped at {} to avoid quadratic behavior",
| 1730 | input.get_span(), |
| 1731 | min_start, |
| 1732 | ); |
| 1733 | e.try_search_half_rev_limited(&input, min_start) |
| 1734 | } else if let Some(e) = self.hybrid.get(&input) { |
| 1735 | trace!( |
| 1736 | "using lazy DFA for reverse inner search at {:?}, \ |
but will be stopped at {} to avoid quadratic behavior",
| 1738 | input.get_span(), |
| 1739 | min_start, |
| 1740 | ); |
| 1741 | e.try_search_half_rev_limited( |
| 1742 | &mut cache.revhybrid, |
| 1743 | &input, |
| 1744 | min_start, |
| 1745 | ) |
| 1746 | } else { |
unreachable!("ReverseInner always has a DFA")
| 1748 | } |
| 1749 | } |
| 1750 | } |
| 1751 | |
| 1752 | impl Strategy for ReverseInner { |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1754 | fn group_info(&self) -> &GroupInfo { |
| 1755 | self.core.group_info() |
| 1756 | } |
| 1757 | |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1759 | fn create_cache(&self) -> Cache { |
| 1760 | let mut cache = self.core.create_cache(); |
| 1761 | cache.revhybrid = self.hybrid.create_cache(); |
| 1762 | cache |
| 1763 | } |
| 1764 | |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1766 | fn reset_cache(&self, cache: &mut Cache) { |
| 1767 | self.core.reset_cache(cache); |
| 1768 | cache.revhybrid.reset(&self.hybrid); |
| 1769 | } |
| 1770 | |
| 1771 | fn is_accelerated(&self) -> bool { |
| 1772 | self.preinner.is_fast() |
| 1773 | } |
| 1774 | |
| 1775 | fn memory_usage(&self) -> usize { |
| 1776 | self.core.memory_usage() |
| 1777 | + self.preinner.memory_usage() |
| 1778 | + self.nfarev.memory_usage() |
| 1779 | + self.dfa.memory_usage() |
| 1780 | } |
| 1781 | |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1783 | fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> { |
| 1784 | if input.get_anchored().is_anchored() { |
| 1785 | return self.core.search(cache, input); |
| 1786 | } |
| 1787 | match self.try_search_full(cache, input) { |
| 1788 | Err(RetryError::Quadratic(_err)) => { |
trace!("reverse inner optimization failed: {}", _err);
| 1790 | self.core.search(cache, input) |
| 1791 | } |
| 1792 | Err(RetryError::Fail(_err)) => { |
trace!("reverse inner fast search failed: {}", _err);
| 1794 | self.core.search_nofail(cache, input) |
| 1795 | } |
| 1796 | Ok(matornot) => matornot, |
| 1797 | } |
| 1798 | } |
| 1799 | |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1801 | fn search_half( |
| 1802 | &self, |
| 1803 | cache: &mut Cache, |
| 1804 | input: &Input<'_>, |
| 1805 | ) -> Option<HalfMatch> { |
| 1806 | if input.get_anchored().is_anchored() { |
| 1807 | return self.core.search_half(cache, input); |
| 1808 | } |
| 1809 | match self.try_search_full(cache, input) { |
| 1810 | Err(RetryError::Quadratic(_err)) => { |
trace!("reverse inner half optimization failed: {}", _err);
| 1812 | self.core.search_half(cache, input) |
| 1813 | } |
| 1814 | Err(RetryError::Fail(_err)) => { |
trace!("reverse inner fast half search failed: {}", _err);
| 1816 | self.core.search_half_nofail(cache, input) |
| 1817 | } |
| 1818 | Ok(None) => None, |
| 1819 | Ok(Some(m)) => Some(HalfMatch::new(m.pattern(), m.end())), |
| 1820 | } |
| 1821 | } |
| 1822 | |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1824 | fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { |
| 1825 | if input.get_anchored().is_anchored() { |
| 1826 | return self.core.is_match(cache, input); |
| 1827 | } |
| 1828 | match self.try_search_full(cache, input) { |
| 1829 | Err(RetryError::Quadratic(_err)) => { |
trace!("reverse inner half optimization failed: {}", _err);
| 1831 | self.core.is_match_nofail(cache, input) |
| 1832 | } |
| 1833 | Err(RetryError::Fail(_err)) => { |
trace!("reverse inner fast half search failed: {}", _err);
| 1835 | self.core.is_match_nofail(cache, input) |
| 1836 | } |
| 1837 | Ok(None) => false, |
| 1838 | Ok(Some(_)) => true, |
| 1839 | } |
| 1840 | } |
| 1841 | |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1843 | fn search_slots( |
| 1844 | &self, |
| 1845 | cache: &mut Cache, |
| 1846 | input: &Input<'_>, |
| 1847 | slots: &mut [Option<NonMaxUsize>], |
| 1848 | ) -> Option<PatternID> { |
| 1849 | if input.get_anchored().is_anchored() { |
| 1850 | return self.core.search_slots(cache, input, slots); |
| 1851 | } |
| 1852 | if !self.core.is_capture_search_needed(slots.len()) { |
trace!("asked for slots unnecessarily, trying fast path");
| 1854 | let m = self.search(cache, input)?; |
| 1855 | copy_match_to_slots(m, slots); |
| 1856 | return Some(m.pattern()); |
| 1857 | } |
| 1858 | let m = match self.try_search_full(cache, input) { |
| 1859 | Err(RetryError::Quadratic(_err)) => { |
trace!("reverse inner captures optimization failed: {}", _err);
| 1861 | return self.core.search_slots(cache, input, slots); |
| 1862 | } |
| 1863 | Err(RetryError::Fail(_err)) => { |
trace!("reverse inner fast captures search failed: {}", _err);
| 1865 | return self.core.search_slots_nofail(cache, input, slots); |
| 1866 | } |
| 1867 | Ok(None) => return None, |
| 1868 | Ok(Some(m)) => m, |
| 1869 | }; |
| 1870 | trace!( |
| 1871 | "match found at {}..{} in capture search, \ |
using another engine to find captures",
| 1873 | m.start(), |
| 1874 | m.end(), |
| 1875 | ); |
| 1876 | let input = input |
| 1877 | .clone() |
| 1878 | .span(m.start()..m.end()) |
| 1879 | .anchored(Anchored::Pattern(m.pattern())); |
| 1880 | self.core.search_slots_nofail(cache, &input, slots) |
| 1881 | } |
| 1882 | |
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1884 | fn which_overlapping_matches( |
| 1885 | &self, |
| 1886 | cache: &mut Cache, |
| 1887 | input: &Input<'_>, |
| 1888 | patset: &mut PatternSet, |
| 1889 | ) { |
| 1890 | self.core.which_overlapping_matches(cache, input, patset) |
| 1891 | } |
| 1892 | } |
| 1893 | |
| 1894 | /// Copies the offsets in the given match to the corresponding positions in |
| 1895 | /// `slots`. |
| 1896 | /// |
| 1897 | /// In effect, this sets the slots corresponding to the implicit group for the |
| 1898 | /// pattern in the given match. If the indices for the corresponding slots do |
| 1899 | /// not exist, then no slots are set. |
| 1900 | /// |
| 1901 | /// This is useful when the caller provides slots (or captures), but you use a |
| 1902 | /// regex engine that doesn't operate on slots (like a lazy DFA). This function |
| 1903 | /// lets you map the match you get back to the slots provided by the caller. |
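///
/// For example, if `m` is a match of pattern `1` spanning `3..7`, then this
/// writes `Some(3)` to `slots[2]` and `Some(7)` to `slots[3]` (assuming the
/// given slice has at least four slots).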
#[cfg_attr(feature = "perf-inline", inline(always))]
| 1905 | fn copy_match_to_slots(m: Match, slots: &mut [Option<NonMaxUsize>]) { |
let slot_start = m.pattern().as_usize() * 2;
let slot_end = slot_start + 1;
if let Some(slot) = slots.get_mut(slot_start) {
| 1909 | *slot = NonMaxUsize::new(m.start()); |
| 1910 | } |
if let Some(slot) = slots.get_mut(slot_end) {
| 1912 | *slot = NonMaxUsize::new(m.end()); |
| 1913 | } |
| 1914 | } |
| 1915 | |