| 1 | /*! |
| 2 | Defines a prefilter for accelerating regex searches. |
| 3 | |
| 4 | A prefilter can be created by building a [`Prefilter`] value. |
| 5 | |
| 6 | A prefilter represents one of the most important optimizations available for |
| 7 | accelerating regex searches. The idea of a prefilter is to very quickly find |
| 8 | candidate locations in a haystack where a regex _could_ match. Once a candidate |
| 9 | is found, it is then intended for the regex engine to run at that position to |
| 10 | determine whether the candidate is a match or a false positive. |
| 11 | |
| 12 | In the aforementioned description of the prefilter optimization also lay its |
| 13 | demise. Namely, if a prefilter has a high false positive rate and it produces |
| 14 | lots of candidates, then a prefilter can overall make a regex search slower. |
| 15 | It can run more slowly because more time is spent ping-ponging between the |
| 16 | prefilter search and the regex engine attempting to confirm each candidate as |
| 17 | a match. This ping-ponging has overhead that adds up, and is exacerbated by |
| 18 | a high false positive rate. |
| 19 | |
| 20 | Nevertheless, the optimization is still generally worth performing in most |
| 21 | cases. Particularly given just how much throughput can be improved. (It is not |
| 22 | uncommon for prefilter optimizations to improve throughput by one or two orders |
| 23 | of magnitude.) |
| 24 | |
| 25 | Typically a prefilter is used to find occurrences of literal prefixes from a |
| 26 | regex pattern, but this isn't required. A prefilter can be used to look for |
| 27 | suffixes or even inner literals. |
| 28 | |
| 29 | Note that as of now, prefilters throw away information about which pattern |
| 30 | each literal comes from. In other words, when a prefilter finds a match, |
| 31 | there's no way to know which pattern (or patterns) it came from. Therefore, |
| 32 | in order to confirm a match, you'll have to check all of the patterns by |
| 33 | running the full regex engine. |
| 34 | */ |
| 35 | |
| 36 | mod aho_corasick; |
| 37 | mod byteset; |
| 38 | mod memchr; |
| 39 | mod memmem; |
| 40 | mod teddy; |
| 41 | |
| 42 | use core::{ |
| 43 | borrow::Borrow, |
| 44 | fmt::Debug, |
| 45 | panic::{RefUnwindSafe, UnwindSafe}, |
| 46 | }; |
| 47 | |
| 48 | #[cfg (feature = "alloc" )] |
| 49 | use alloc::sync::Arc; |
| 50 | |
| 51 | #[cfg (feature = "syntax" )] |
| 52 | use regex_syntax::hir::{literal, Hir}; |
| 53 | |
| 54 | use crate::util::search::{MatchKind, Span}; |
| 55 | |
| 56 | pub(crate) use crate::util::prefilter::{ |
| 57 | aho_corasick::AhoCorasick, |
| 58 | byteset::ByteSet, |
| 59 | memchr::{Memchr, Memchr2, Memchr3}, |
| 60 | memmem::Memmem, |
| 61 | teddy::Teddy, |
| 62 | }; |
| 63 | |
| 64 | /// A prefilter for accelerating regex searches. |
| 65 | /// |
| 66 | /// If you already have your literals that you want to search with, |
| 67 | /// then the vanilla [`Prefilter::new`] constructor is for you. But |
| 68 | /// if you have an [`Hir`] value from the `regex-syntax` crate, then |
| 69 | /// [`Prefilter::from_hir_prefix`] might be more convenient. Namely, it uses |
| 70 | /// the [`regex-syntax::hir::literal`](regex_syntax::hir::literal) module to |
| 71 | /// extract literal prefixes for you, optimize them and then select and build a |
| 72 | /// prefilter matcher. |
| 73 | /// |
| 74 | /// A prefilter must have **zero false negatives**. However, by its very |
| 75 | /// nature, it may produce false positives. That is, a prefilter will never |
| 76 | /// skip over a position in the haystack that corresponds to a match of the |
| 77 | /// original regex pattern, but it *may* produce a match for a position |
| 78 | /// in the haystack that does *not* correspond to a match of the original |
| 79 | /// regex pattern. If you use either the [`Prefilter::from_hir_prefix`] or |
| 80 | /// [`Prefilter::from_hirs_prefix`] constructors, then this guarantee is |
| 81 | /// upheld for you automatically. This guarantee is not preserved if you use |
| 82 | /// [`Prefilter::new`] though, since it is up to the caller to provide correct |
| 83 | /// literal strings with respect to the original regex pattern. |
| 84 | /// |
| 85 | /// # Cloning |
| 86 | /// |
| 87 | /// It is an API guarantee that cloning a prefilter is cheap. That is, cloning |
| 88 | /// it will not duplicate whatever heap memory is used to represent the |
| 89 | /// underlying matcher. |
| 90 | /// |
| 91 | /// # Example |
| 92 | /// |
| 93 | /// This example shows how to attach a `Prefilter` to the |
| 94 | /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) in order to accelerate |
| 95 | /// searches. |
| 96 | /// |
| 97 | /// ``` |
| 98 | /// use regex_automata::{ |
| 99 | /// nfa::thompson::pikevm::PikeVM, |
| 100 | /// util::prefilter::Prefilter, |
| 101 | /// Match, MatchKind, |
| 102 | /// }; |
| 103 | /// |
| 104 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Bruce " ]) |
| 105 | /// .expect("a prefilter" ); |
| 106 | /// let re = PikeVM::builder() |
| 107 | /// .configure(PikeVM::config().prefilter(Some(pre))) |
| 108 | /// .build(r"Bruce \w+" )?; |
| 109 | /// let mut cache = re.create_cache(); |
| 110 | /// assert_eq!( |
| 111 | /// Some(Match::must(0, 6..23)), |
| 112 | /// re.find(&mut cache, "Hello Bruce Springsteen!" ), |
| 113 | /// ); |
| 114 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 115 | /// ``` |
| 116 | /// |
| 117 | /// But note that if you get your prefilter incorrect, it could lead to an |
| 118 | /// incorrect result! |
| 119 | /// |
| 120 | /// ``` |
| 121 | /// use regex_automata::{ |
| 122 | /// nfa::thompson::pikevm::PikeVM, |
| 123 | /// util::prefilter::Prefilter, |
| 124 | /// Match, MatchKind, |
| 125 | /// }; |
| 126 | /// |
| 127 | /// // This prefilter is wrong! |
| 128 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Patti " ]) |
| 129 | /// .expect("a prefilter" ); |
| 130 | /// let re = PikeVM::builder() |
| 131 | /// .configure(PikeVM::config().prefilter(Some(pre))) |
| 132 | /// .build(r"Bruce \w+" )?; |
| 133 | /// let mut cache = re.create_cache(); |
| 134 | /// // We find no match even though the regex does match. |
| 135 | /// assert_eq!( |
| 136 | /// None, |
| 137 | /// re.find(&mut cache, "Hello Bruce Springsteen!" ), |
| 138 | /// ); |
| 139 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 140 | /// ``` |
| 141 | #[derive (Clone, Debug)] |
| 142 | pub struct Prefilter { |
| 143 | #[cfg (not(feature = "alloc" ))] |
| 144 | _unused: (), |
| 145 | #[cfg (feature = "alloc" )] |
| 146 | pre: Arc<dyn PrefilterI>, |
| 147 | #[cfg (feature = "alloc" )] |
| 148 | is_fast: bool, |
| 149 | #[cfg (feature = "alloc" )] |
| 150 | max_needle_len: usize, |
| 151 | } |
| 152 | |
| 153 | impl Prefilter { |
| 154 | /// Create a new prefilter from a sequence of needles and a corresponding |
| 155 | /// match semantics. |
| 156 | /// |
| 157 | /// This may return `None` for a variety of reasons, for example, if |
| 158 | /// a suitable prefilter could not be constructed. That might occur |
| 159 | /// if they are unavailable (e.g., the `perf-literal-substring` and |
| 160 | /// `perf-literal-multisubstring` features aren't enabled), or it might |
| 161 | /// occur because of heuristics or other artifacts of how the prefilter |
| 162 | /// works. |
| 163 | /// |
| 164 | /// Note that if you have an [`Hir`] expression, it may be more convenient |
| 165 | /// to use [`Prefilter::from_hir_prefix`]. It will automatically handle the |
| 166 | /// task of extracting prefix literals for you. |
| 167 | /// |
| 168 | /// # Example |
| 169 | /// |
| 170 | /// This example shows how match semantics can impact the matching |
| 171 | /// algorithm used by the prefilter. For this reason, it is important to |
| 172 | /// ensure that the match semantics given here are consistent with the |
| 173 | /// match semantics intended for the regular expression that the literals |
| 174 | /// were extracted from. |
| 175 | /// |
| 176 | /// ``` |
| 177 | /// use regex_automata::{ |
| 178 | /// util::{prefilter::Prefilter, syntax}, |
| 179 | /// MatchKind, Span, |
| 180 | /// }; |
| 181 | /// |
| 182 | /// let hay = "Hello samwise" ; |
| 183 | /// |
| 184 | /// // With leftmost-first, we find 'samwise' here because it comes |
| 185 | /// // before 'sam' in the sequence we give it.. |
| 186 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["samwise" , "sam" ]) |
| 187 | /// .expect("a prefilter" ); |
| 188 | /// assert_eq!( |
| 189 | /// Some(Span::from(6..13)), |
| 190 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
| 191 | /// ); |
| 192 | /// // Still with leftmost-first but with the literals reverse, now 'sam' |
| 193 | /// // will match instead! |
| 194 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["sam" , "samwise" ]) |
| 195 | /// .expect("a prefilter" ); |
| 196 | /// assert_eq!( |
| 197 | /// Some(Span::from(6..9)), |
| 198 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
| 199 | /// ); |
| 200 | /// |
| 201 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 202 | /// ``` |
| 203 | pub fn new<B: AsRef<[u8]>>( |
| 204 | kind: MatchKind, |
| 205 | needles: &[B], |
| 206 | ) -> Option<Prefilter> { |
| 207 | Choice::new(kind, needles).and_then(|choice| { |
| 208 | let max_needle_len = |
| 209 | needles.iter().map(|b| b.as_ref().len()).max().unwrap_or(0); |
| 210 | Prefilter::from_choice(choice, max_needle_len) |
| 211 | }) |
| 212 | } |
| 213 | |
| 214 | /// This turns a prefilter selection into a `Prefilter`. That is, in turns |
| 215 | /// the enum given into a trait object. |
| 216 | fn from_choice( |
| 217 | choice: Choice, |
| 218 | max_needle_len: usize, |
| 219 | ) -> Option<Prefilter> { |
| 220 | #[cfg (not(feature = "alloc" ))] |
| 221 | { |
| 222 | None |
| 223 | } |
| 224 | #[cfg (feature = "alloc" )] |
| 225 | { |
| 226 | let pre: Arc<dyn PrefilterI> = match choice { |
| 227 | Choice::Memchr(p) => Arc::new(p), |
| 228 | Choice::Memchr2(p) => Arc::new(p), |
| 229 | Choice::Memchr3(p) => Arc::new(p), |
| 230 | Choice::Memmem(p) => Arc::new(p), |
| 231 | Choice::Teddy(p) => Arc::new(p), |
| 232 | Choice::ByteSet(p) => Arc::new(p), |
| 233 | Choice::AhoCorasick(p) => Arc::new(p), |
| 234 | }; |
| 235 | let is_fast = pre.is_fast(); |
| 236 | Some(Prefilter { pre, is_fast, max_needle_len }) |
| 237 | } |
| 238 | } |
| 239 | |
| 240 | /// This attempts to extract prefixes from the given `Hir` expression for |
| 241 | /// the given match semantics, and if possible, builds a prefilter for |
| 242 | /// them. |
| 243 | /// |
| 244 | /// # Example |
| 245 | /// |
| 246 | /// This example shows how to build a prefilter directly from an [`Hir`] |
| 247 | /// expression, and use to find an occurrence of a prefix from the regex |
| 248 | /// pattern. |
| 249 | /// |
| 250 | /// ``` |
| 251 | /// use regex_automata::{ |
| 252 | /// util::{prefilter::Prefilter, syntax}, |
| 253 | /// MatchKind, Span, |
| 254 | /// }; |
| 255 | /// |
| 256 | /// let hir = syntax::parse(r"(Bruce|Patti) \w+" )?; |
| 257 | /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) |
| 258 | /// .expect("a prefilter" ); |
| 259 | /// let hay = "Hello Patti Scialfa!" ; |
| 260 | /// assert_eq!( |
| 261 | /// Some(Span::from(6..12)), |
| 262 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
| 263 | /// ); |
| 264 | /// |
| 265 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 266 | /// ``` |
| 267 | #[cfg (feature = "syntax" )] |
| 268 | pub fn from_hir_prefix(kind: MatchKind, hir: &Hir) -> Option<Prefilter> { |
| 269 | Prefilter::from_hirs_prefix(kind, &[hir]) |
| 270 | } |
| 271 | |
| 272 | /// This attempts to extract prefixes from the given `Hir` expressions for |
| 273 | /// the given match semantics, and if possible, builds a prefilter for |
| 274 | /// them. |
| 275 | /// |
| 276 | /// Note that as of now, prefilters throw away information about which |
| 277 | /// pattern each literal comes from. In other words, when a prefilter finds |
| 278 | /// a match, there's no way to know which pattern (or patterns) it came |
| 279 | /// from. Therefore, in order to confirm a match, you'll have to check all |
| 280 | /// of the patterns by running the full regex engine. |
| 281 | /// |
| 282 | /// # Example |
| 283 | /// |
| 284 | /// This example shows how to build a prefilter directly from multiple |
| 285 | /// `Hir` expressions expression, and use it to find an occurrence of a |
| 286 | /// prefix from the regex patterns. |
| 287 | /// |
| 288 | /// ``` |
| 289 | /// use regex_automata::{ |
| 290 | /// util::{prefilter::Prefilter, syntax}, |
| 291 | /// MatchKind, Span, |
| 292 | /// }; |
| 293 | /// |
| 294 | /// let hirs = syntax::parse_many(&[ |
| 295 | /// r"(Bruce|Patti) \w+" , |
| 296 | /// r"Mrs?\. Doubtfire" , |
| 297 | /// ])?; |
| 298 | /// let pre = Prefilter::from_hirs_prefix(MatchKind::LeftmostFirst, &hirs) |
| 299 | /// .expect("a prefilter" ); |
| 300 | /// let hay = "Hello Mrs. Doubtfire" ; |
| 301 | /// assert_eq!( |
| 302 | /// Some(Span::from(6..20)), |
| 303 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
| 304 | /// ); |
| 305 | /// |
| 306 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 307 | /// ``` |
| 308 | #[cfg (feature = "syntax" )] |
| 309 | pub fn from_hirs_prefix<H: Borrow<Hir>>( |
| 310 | kind: MatchKind, |
| 311 | hirs: &[H], |
| 312 | ) -> Option<Prefilter> { |
| 313 | prefixes(kind, hirs) |
| 314 | .literals() |
| 315 | .and_then(|lits| Prefilter::new(kind, lits)) |
| 316 | } |
| 317 | |
| 318 | /// Run this prefilter on `haystack[span.start..end]` and return a matching |
| 319 | /// span if one exists. |
| 320 | /// |
| 321 | /// The span returned is guaranteed to have a start position greater than |
| 322 | /// or equal to the one given, and an end position less than or equal to |
| 323 | /// the one given. |
| 324 | /// |
| 325 | /// # Example |
| 326 | /// |
| 327 | /// This example shows how to build a prefilter directly from an [`Hir`] |
| 328 | /// expression, and use it to find an occurrence of a prefix from the regex |
| 329 | /// pattern. |
| 330 | /// |
| 331 | /// ``` |
| 332 | /// use regex_automata::{ |
| 333 | /// util::{prefilter::Prefilter, syntax}, |
| 334 | /// MatchKind, Span, |
| 335 | /// }; |
| 336 | /// |
| 337 | /// let hir = syntax::parse(r"Bruce \w+" )?; |
| 338 | /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) |
| 339 | /// .expect("a prefilter" ); |
| 340 | /// let hay = "Hello Bruce Springsteen!" ; |
| 341 | /// assert_eq!( |
| 342 | /// Some(Span::from(6..12)), |
| 343 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
| 344 | /// ); |
| 345 | /// |
| 346 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 347 | /// ``` |
| 348 | #[inline ] |
| 349 | pub fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { |
| 350 | #[cfg (not(feature = "alloc" ))] |
| 351 | { |
| 352 | unreachable!() |
| 353 | } |
| 354 | #[cfg (feature = "alloc" )] |
| 355 | { |
| 356 | self.pre.find(haystack, span) |
| 357 | } |
| 358 | } |
| 359 | |
| 360 | /// Returns the span of a prefix of `haystack[span.start..span.end]` if |
| 361 | /// the prefilter matches. |
| 362 | /// |
| 363 | /// The span returned is guaranteed to have a start position equivalent to |
| 364 | /// the one given, and an end position less than or equal to the one given. |
| 365 | /// |
| 366 | /// # Example |
| 367 | /// |
| 368 | /// This example shows how to build a prefilter directly from an [`Hir`] |
| 369 | /// expression, and use it to find an occurrence of a prefix from the regex |
| 370 | /// pattern that begins at the start of a haystack only. |
| 371 | /// |
| 372 | /// ``` |
| 373 | /// use regex_automata::{ |
| 374 | /// util::{prefilter::Prefilter, syntax}, |
| 375 | /// MatchKind, Span, |
| 376 | /// }; |
| 377 | /// |
| 378 | /// let hir = syntax::parse(r"Bruce \w+" )?; |
| 379 | /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) |
| 380 | /// .expect("a prefilter" ); |
| 381 | /// let hay = "Hello Bruce Springsteen!" ; |
| 382 | /// // Nothing is found here because 'Bruce' does |
| 383 | /// // not occur at the beginning of our search. |
| 384 | /// assert_eq!( |
| 385 | /// None, |
| 386 | /// pre.prefix(hay.as_bytes(), Span::from(0..hay.len())), |
| 387 | /// ); |
| 388 | /// // But if we change where we start the search |
| 389 | /// // to begin where 'Bruce ' begins, then a |
| 390 | /// // match will be found. |
| 391 | /// assert_eq!( |
| 392 | /// Some(Span::from(6..12)), |
| 393 | /// pre.prefix(hay.as_bytes(), Span::from(6..hay.len())), |
| 394 | /// ); |
| 395 | /// |
| 396 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 397 | /// ``` |
| 398 | #[inline ] |
| 399 | pub fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { |
| 400 | #[cfg (not(feature = "alloc" ))] |
| 401 | { |
| 402 | unreachable!() |
| 403 | } |
| 404 | #[cfg (feature = "alloc" )] |
| 405 | { |
| 406 | self.pre.prefix(haystack, span) |
| 407 | } |
| 408 | } |
| 409 | |
| 410 | /// Returns the heap memory, in bytes, used by the underlying prefilter. |
| 411 | #[inline ] |
| 412 | pub fn memory_usage(&self) -> usize { |
| 413 | #[cfg (not(feature = "alloc" ))] |
| 414 | { |
| 415 | unreachable!() |
| 416 | } |
| 417 | #[cfg (feature = "alloc" )] |
| 418 | { |
| 419 | self.pre.memory_usage() |
| 420 | } |
| 421 | } |
| 422 | |
| 423 | /// Return the length of the longest needle |
| 424 | /// in this Prefilter |
| 425 | #[inline ] |
| 426 | pub fn max_needle_len(&self) -> usize { |
| 427 | #[cfg (not(feature = "alloc" ))] |
| 428 | { |
| 429 | unreachable!() |
| 430 | } |
| 431 | #[cfg (feature = "alloc" )] |
| 432 | { |
| 433 | self.max_needle_len |
| 434 | } |
| 435 | } |
| 436 | |
| 437 | /// Implementations might return true here if they believe themselves to |
| 438 | /// be "fast." The concept of "fast" is deliberately left vague, but in |
| 439 | /// practice this usually corresponds to whether it's believed that SIMD |
| 440 | /// will be used. |
| 441 | /// |
| 442 | /// Why do we care about this? Well, some prefilter tricks tend to come |
| 443 | /// with their own bits of overhead, and so might only make sense if we |
| 444 | /// know that a scan will be *much* faster than the regex engine itself. |
| 445 | /// Otherwise, the trick may not be worth doing. Whether something is |
| 446 | /// "much" faster than the regex engine generally boils down to whether |
| 447 | /// SIMD is used. (But not always. Even a SIMD matcher with a high false |
| 448 | /// positive rate can become quite slow.) |
| 449 | /// |
| 450 | /// Even if this returns true, it is still possible for the prefilter to |
| 451 | /// be "slow." Remember, prefilters are just heuristics. We can't really |
| 452 | /// *know* a prefilter will be fast without actually trying the prefilter. |
| 453 | /// (Which of course we cannot afford to do.) |
| 454 | #[inline ] |
| 455 | pub fn is_fast(&self) -> bool { |
| 456 | #[cfg (not(feature = "alloc" ))] |
| 457 | { |
| 458 | unreachable!() |
| 459 | } |
| 460 | #[cfg (feature = "alloc" )] |
| 461 | { |
| 462 | self.is_fast |
| 463 | } |
| 464 | } |
| 465 | } |
| 466 | |
| 467 | /// A trait for abstracting over prefilters. Basically, a prefilter is |
| 468 | /// something that do an unanchored *and* an anchored search in a haystack |
| 469 | /// within a given span. |
| 470 | /// |
| 471 | /// This exists pretty much only so that we can use prefilters as a trait |
| 472 | /// object (which is what `Prefilter` is). If we ever move off of trait objects |
| 473 | /// and to an enum, then it's likely this trait could be removed. |
| 474 | pub(crate) trait PrefilterI: |
| 475 | Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static |
| 476 | { |
| 477 | /// Run this prefilter on `haystack[span.start..end]` and return a matching |
| 478 | /// span if one exists. |
| 479 | /// |
| 480 | /// The span returned is guaranteed to have a start position greater than |
| 481 | /// or equal to the one given, and an end position less than or equal to |
| 482 | /// the one given. |
| 483 | fn find(&self, haystack: &[u8], span: Span) -> Option<Span>; |
| 484 | |
| 485 | /// Returns the span of a prefix of `haystack[span.start..span.end]` if |
| 486 | /// the prefilter matches. |
| 487 | /// |
| 488 | /// The span returned is guaranteed to have a start position equivalent to |
| 489 | /// the one given, and an end position less than or equal to the one given. |
| 490 | fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span>; |
| 491 | |
| 492 | /// Returns the heap memory, in bytes, used by the underlying prefilter. |
| 493 | fn memory_usage(&self) -> usize; |
| 494 | |
| 495 | /// Implementations might return true here if they believe themselves to |
| 496 | /// be "fast." See [`Prefilter::is_fast`] for more details. |
| 497 | fn is_fast(&self) -> bool; |
| 498 | } |
| 499 | |
| 500 | #[cfg (feature = "alloc" )] |
| 501 | impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> { |
| 502 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 503 | fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { |
| 504 | (&**self).find(haystack, span) |
| 505 | } |
| 506 | |
| 507 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 508 | fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { |
| 509 | (&**self).prefix(haystack, span) |
| 510 | } |
| 511 | |
| 512 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 513 | fn memory_usage(&self) -> usize { |
| 514 | (&**self).memory_usage() |
| 515 | } |
| 516 | |
| 517 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 518 | fn is_fast(&self) -> bool { |
| 519 | (&**self).is_fast() |
| 520 | } |
| 521 | } |
| 522 | |
| 523 | /// A type that encapsulates the selection of a prefilter algorithm from a |
| 524 | /// sequence of needles. |
| 525 | /// |
| 526 | /// The existence of this type is a little tricky, because we don't (currently) |
| 527 | /// use it for performing a search. Instead, we really only consume it by |
| 528 | /// converting the underlying prefilter into a trait object, whether that be |
| 529 | /// `dyn PrefilterI` or `dyn Strategy` (for the meta regex engine). In order |
| 530 | /// to avoid re-copying the prefilter selection logic, we isolate it here, and |
| 531 | /// then force anything downstream that wants to convert it to a trait object |
| 532 | /// to do trivial case analysis on it. |
| 533 | /// |
| 534 | /// One wonders whether we *should* use an enum instead of a trait object. |
| 535 | /// At time of writing, I chose trait objects based on instinct because 1) I |
| 536 | /// knew I wasn't going to inline anything and 2) there would potentially be |
| 537 | /// many different choices. However, as of time of writing, I haven't actually |
| 538 | /// compared the trait object approach to the enum approach. That probably |
| 539 | /// should be litigated, but I ran out of steam. |
| 540 | /// |
| 541 | /// Note that if the `alloc` feature is disabled, then values of this type |
| 542 | /// are (and should) never be constructed. Also, in practice, for any of the |
| 543 | /// prefilters to be selected, you'll need at least one of the `perf-literal-*` |
| 544 | /// features enabled. |
| 545 | #[derive (Clone, Debug)] |
| 546 | pub(crate) enum Choice { |
| 547 | Memchr(Memchr), |
| 548 | Memchr2(Memchr2), |
| 549 | Memchr3(Memchr3), |
| 550 | Memmem(Memmem), |
| 551 | Teddy(Teddy), |
| 552 | ByteSet(ByteSet), |
| 553 | AhoCorasick(AhoCorasick), |
| 554 | } |
| 555 | |
| 556 | impl Choice { |
| 557 | /// Select what is believed to be the best prefilter algorithm for the |
| 558 | /// match semantics and sequence of needles given. |
| 559 | /// |
| 560 | /// This selection algorithm uses the needles as given without any |
| 561 | /// modification. For example, if `[bar]` is given, then this doesn't |
| 562 | /// try to select `memchr` for `b`. Instead, it would select `memmem` |
| 563 | /// for `bar`. If callers would want `memchr` selected for `[bar]`, then |
| 564 | /// callers should massages the literals themselves. That is, callers are |
| 565 | /// responsible for heuristics surrounding which sequence of literals is |
| 566 | /// best. |
| 567 | /// |
| 568 | /// What this selection algorithm does is attempt to use the fastest |
| 569 | /// prefilter that works for the literals given. So if `[a, b]`, is given, |
| 570 | /// then `memchr2` is selected. |
| 571 | /// |
| 572 | /// Of course, which prefilter is selected is also subject to what |
| 573 | /// is available. For example, if `alloc` isn't enabled, then |
| 574 | /// that limits which prefilters can be selected. Similarly, if |
| 575 | /// `perf-literal-substring` isn't enabled, then nothing from the `memchr` |
| 576 | /// crate can be returned. |
| 577 | pub(crate) fn new<B: AsRef<[u8]>>( |
| 578 | kind: MatchKind, |
| 579 | needles: &[B], |
| 580 | ) -> Option<Choice> { |
| 581 | // An empty set means the regex matches nothing, so no sense in |
| 582 | // building a prefilter. |
| 583 | if needles.len() == 0 { |
| 584 | debug!("prefilter building failed: found empty set of literals" ); |
| 585 | return None; |
| 586 | } |
| 587 | // If the regex can match the empty string, then the prefilter |
| 588 | // will by definition match at every position. This is obviously |
| 589 | // completely ineffective. |
| 590 | if needles.iter().any(|n| n.as_ref().is_empty()) { |
| 591 | debug!("prefilter building failed: literals match empty string" ); |
| 592 | return None; |
| 593 | } |
| 594 | // BREADCRUMBS: Perhaps the literal optimizer should special case |
| 595 | // sequences of length two or three if the leading bytes of each are |
| 596 | // "rare"? Or perhaps, if there are two or three total possible leading |
| 597 | // bytes, regardless of the number of literals, and all are rare... |
| 598 | // Then well, perhaps we should use memchr2 or memchr3 in those cases? |
| 599 | if let Some(pre) = Memchr::new(kind, needles) { |
| 600 | debug!("prefilter built: memchr" ); |
| 601 | return Some(Choice::Memchr(pre)); |
| 602 | } |
| 603 | if let Some(pre) = Memchr2::new(kind, needles) { |
| 604 | debug!("prefilter built: memchr2" ); |
| 605 | return Some(Choice::Memchr2(pre)); |
| 606 | } |
| 607 | if let Some(pre) = Memchr3::new(kind, needles) { |
| 608 | debug!("prefilter built: memchr3" ); |
| 609 | return Some(Choice::Memchr3(pre)); |
| 610 | } |
| 611 | if let Some(pre) = Memmem::new(kind, needles) { |
| 612 | debug!("prefilter built: memmem" ); |
| 613 | return Some(Choice::Memmem(pre)); |
| 614 | } |
| 615 | if let Some(pre) = Teddy::new(kind, needles) { |
| 616 | debug!("prefilter built: teddy" ); |
| 617 | return Some(Choice::Teddy(pre)); |
| 618 | } |
| 619 | if let Some(pre) = ByteSet::new(kind, needles) { |
| 620 | debug!("prefilter built: byteset" ); |
| 621 | return Some(Choice::ByteSet(pre)); |
| 622 | } |
| 623 | if let Some(pre) = AhoCorasick::new(kind, needles) { |
| 624 | debug!("prefilter built: aho-corasick" ); |
| 625 | return Some(Choice::AhoCorasick(pre)); |
| 626 | } |
| 627 | debug!("prefilter building failed: no strategy could be found" ); |
| 628 | None |
| 629 | } |
| 630 | } |
| 631 | |
| 632 | /// Extracts all of the prefix literals from the given HIR expressions into a |
| 633 | /// single `Seq`. The literals in the sequence are ordered with respect to the |
| 634 | /// order of the given HIR expressions and consistent with the match semantics |
| 635 | /// given. |
| 636 | /// |
| 637 | /// The sequence returned is "optimized." That is, they may be shrunk or even |
| 638 | /// truncated according to heuristics with the intent of making them more |
| 639 | /// useful as a prefilter. (Which translates to both using faster algorithms |
| 640 | /// and minimizing the false positive rate.) |
| 641 | /// |
| 642 | /// Note that this erases any connection between the literals and which pattern |
| 643 | /// (or patterns) they came from. |
| 644 | /// |
| 645 | /// The match kind given must correspond to the match semantics of the regex |
| 646 | /// that is represented by the HIRs given. The match semantics may change the |
| 647 | /// literal sequence returned. |
| 648 | #[cfg (feature = "syntax" )] |
| 649 | pub(crate) fn prefixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq |
| 650 | where |
| 651 | H: core::borrow::Borrow<Hir>, |
| 652 | { |
| 653 | let mut extractor = literal::Extractor::new(); |
| 654 | extractor.kind(literal::ExtractKind::Prefix); |
| 655 | |
| 656 | let mut prefixes = literal::Seq::empty(); |
| 657 | for hir in hirs { |
| 658 | prefixes.union(&mut extractor.extract(hir.borrow())); |
| 659 | } |
| 660 | debug!( |
| 661 | "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}" , |
| 662 | prefixes.len(), |
| 663 | prefixes.is_exact(), |
| 664 | prefixes |
| 665 | ); |
| 666 | match kind { |
| 667 | MatchKind::All => { |
| 668 | prefixes.sort(); |
| 669 | prefixes.dedup(); |
| 670 | } |
| 671 | MatchKind::LeftmostFirst => { |
| 672 | prefixes.optimize_for_prefix_by_preference(); |
| 673 | } |
| 674 | } |
| 675 | debug!( |
| 676 | "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}" , |
| 677 | prefixes.len(), |
| 678 | prefixes.is_exact(), |
| 679 | prefixes |
| 680 | ); |
| 681 | prefixes |
| 682 | } |
| 683 | |
| 684 | /// Like `prefixes`, but for all suffixes of all matches for the given HIRs. |
| 685 | #[cfg (feature = "syntax" )] |
| 686 | pub(crate) fn suffixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq |
| 687 | where |
| 688 | H: core::borrow::Borrow<Hir>, |
| 689 | { |
| 690 | let mut extractor = literal::Extractor::new(); |
| 691 | extractor.kind(literal::ExtractKind::Suffix); |
| 692 | |
| 693 | let mut suffixes = literal::Seq::empty(); |
| 694 | for hir in hirs { |
| 695 | suffixes.union(&mut extractor.extract(hir.borrow())); |
| 696 | } |
| 697 | debug!( |
| 698 | "suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}" , |
| 699 | suffixes.len(), |
| 700 | suffixes.is_exact(), |
| 701 | suffixes |
| 702 | ); |
| 703 | match kind { |
| 704 | MatchKind::All => { |
| 705 | suffixes.sort(); |
| 706 | suffixes.dedup(); |
| 707 | } |
| 708 | MatchKind::LeftmostFirst => { |
| 709 | suffixes.optimize_for_suffix_by_preference(); |
| 710 | } |
| 711 | } |
| 712 | debug!( |
| 713 | "suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}" , |
| 714 | suffixes.len(), |
| 715 | suffixes.is_exact(), |
| 716 | suffixes |
| 717 | ); |
| 718 | suffixes |
| 719 | } |
| 720 | |