1 | /*! |
2 | Defines a prefilter for accelerating regex searches. |
3 | |
4 | A prefilter can be created by building a [`Prefilter`] value. |
5 | |
6 | A prefilter represents one of the most important optimizations available for |
7 | accelerating regex searches. The idea of a prefilter is to very quickly find |
8 | candidate locations in a haystack where a regex _could_ match. Once a candidate |
9 | is found, it is then intended for the regex engine to run at that position to |
10 | determine whether the candidate is a match or a false positive. |
11 | |
12 | In the aforementioned description of the prefilter optimization also lay its |
13 | demise. Namely, if a prefilter has a high false positive rate and it produces |
14 | lots of candidates, then a prefilter can overall make a regex search slower. |
15 | It can run more slowly because more time is spent ping-ponging between the |
16 | prefilter search and the regex engine attempting to confirm each candidate as |
17 | a match. This ping-ponging has overhead that adds up, and is exacerbated by |
18 | a high false positive rate. |
19 | |
20 | Nevertheless, the optimization is still generally worth performing in most |
21 | cases. Particularly given just how much throughput can be improved. (It is not |
22 | uncommon for prefilter optimizations to improve throughput by one or two orders |
23 | of magnitude.) |
24 | |
25 | Typically a prefilter is used to find occurrences of literal prefixes from a |
26 | regex pattern, but this isn't required. A prefilter can be used to look for |
27 | suffixes or even inner literals. |
28 | |
29 | Note that as of now, prefilters throw away information about which pattern |
30 | each literal comes from. In other words, when a prefilter finds a match, |
31 | there's no way to know which pattern (or patterns) it came from. Therefore, |
32 | in order to confirm a match, you'll have to check all of the patterns by |
33 | running the full regex engine. |
34 | */ |
35 | |
36 | mod aho_corasick; |
37 | mod byteset; |
38 | mod memchr; |
39 | mod memmem; |
40 | mod teddy; |
41 | |
42 | use core::{ |
43 | borrow::Borrow, |
44 | fmt::Debug, |
45 | panic::{RefUnwindSafe, UnwindSafe}, |
46 | }; |
47 | |
48 | #[cfg (feature = "alloc" )] |
49 | use alloc::sync::Arc; |
50 | |
51 | #[cfg (feature = "syntax" )] |
52 | use regex_syntax::hir::{literal, Hir}; |
53 | |
54 | use crate::util::search::{MatchKind, Span}; |
55 | |
56 | pub(crate) use crate::util::prefilter::{ |
57 | aho_corasick::AhoCorasick, |
58 | byteset::ByteSet, |
59 | memchr::{Memchr, Memchr2, Memchr3}, |
60 | memmem::Memmem, |
61 | teddy::Teddy, |
62 | }; |
63 | |
64 | /// A prefilter for accelerating regex searches. |
65 | /// |
66 | /// If you already have your literals that you want to search with, |
67 | /// then the vanilla [`Prefilter::new`] constructor is for you. But |
68 | /// if you have an [`Hir`] value from the `regex-syntax` crate, then |
69 | /// [`Prefilter::from_hir_prefix`] might be more convenient. Namely, it uses |
70 | /// the [`regex-syntax::hir::literal`](regex_syntax::hir::literal) module to |
71 | /// extract literal prefixes for you, optimize them and then select and build a |
72 | /// prefilter matcher. |
73 | /// |
74 | /// A prefilter must have **zero false negatives**. However, by its very |
75 | /// nature, it may produce false positives. That is, a prefilter will never |
76 | /// skip over a position in the haystack that corresponds to a match of the |
77 | /// original regex pattern, but it *may* produce a match for a position |
78 | /// in the haystack that does *not* correspond to a match of the original |
79 | /// regex pattern. If you use either the [`Prefilter::from_hir_prefix`] or |
80 | /// [`Prefilter::from_hirs_prefix`] constructors, then this guarantee is |
81 | /// upheld for you automatically. This guarantee is not preserved if you use |
82 | /// [`Prefilter::new`] though, since it is up to the caller to provide correct |
83 | /// literal strings with respect to the original regex pattern. |
84 | /// |
85 | /// # Cloning |
86 | /// |
87 | /// It is an API guarantee that cloning a prefilter is cheap. That is, cloning |
88 | /// it will not duplicate whatever heap memory is used to represent the |
89 | /// underlying matcher. |
90 | /// |
91 | /// # Example |
92 | /// |
93 | /// This example shows how to attach a `Prefilter` to the |
94 | /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) in order to accelerate |
95 | /// searches. |
96 | /// |
97 | /// ``` |
98 | /// use regex_automata::{ |
99 | /// nfa::thompson::pikevm::PikeVM, |
100 | /// util::prefilter::Prefilter, |
101 | /// Match, MatchKind, |
102 | /// }; |
103 | /// |
104 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Bruce " ]) |
105 | /// .expect("a prefilter" ); |
106 | /// let re = PikeVM::builder() |
107 | /// .configure(PikeVM::config().prefilter(Some(pre))) |
108 | /// .build(r"Bruce \w+" )?; |
109 | /// let mut cache = re.create_cache(); |
110 | /// assert_eq!( |
111 | /// Some(Match::must(0, 6..23)), |
112 | /// re.find(&mut cache, "Hello Bruce Springsteen!" ), |
113 | /// ); |
114 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
115 | /// ``` |
116 | /// |
117 | /// But note that if you get your prefilter incorrect, it could lead to an |
118 | /// incorrect result! |
119 | /// |
120 | /// ``` |
121 | /// use regex_automata::{ |
122 | /// nfa::thompson::pikevm::PikeVM, |
123 | /// util::prefilter::Prefilter, |
124 | /// Match, MatchKind, |
125 | /// }; |
126 | /// |
127 | /// // This prefilter is wrong! |
128 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Patti " ]) |
129 | /// .expect("a prefilter" ); |
130 | /// let re = PikeVM::builder() |
131 | /// .configure(PikeVM::config().prefilter(Some(pre))) |
132 | /// .build(r"Bruce \w+" )?; |
133 | /// let mut cache = re.create_cache(); |
134 | /// // We find no match even though the regex does match. |
135 | /// assert_eq!( |
136 | /// None, |
137 | /// re.find(&mut cache, "Hello Bruce Springsteen!" ), |
138 | /// ); |
139 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
140 | /// ``` |
141 | #[derive(Clone, Debug)] |
142 | pub struct Prefilter { |
143 | #[cfg (not(feature = "alloc" ))] |
144 | _unused: (), |
145 | #[cfg (feature = "alloc" )] |
146 | pre: Arc<dyn PrefilterI>, |
147 | #[cfg (feature = "alloc" )] |
148 | is_fast: bool, |
149 | } |
150 | |
151 | impl Prefilter { |
152 | /// Create a new prefilter from a sequence of needles and a corresponding |
153 | /// match semantics. |
154 | /// |
155 | /// This may return `None` for a variety of reasons, for example, if |
156 | /// a suitable prefilter could not be constructed. That might occur |
157 | /// if they are unavailable (e.g., the `perf-literal-substring` and |
158 | /// `perf-literal-multisubstring` features aren't enabled), or it might |
159 | /// occur because of heuristics or other artifacts of how the prefilter |
160 | /// works. |
161 | /// |
162 | /// Note that if you have an [`Hir`] expression, it may be more convenient |
163 | /// to use [`Prefilter::from_hir_prefix`]. It will automatically handle the |
164 | /// task of extracting prefix literals for you. |
165 | /// |
166 | /// # Example |
167 | /// |
168 | /// This example shows how match semantics can impact the matching |
169 | /// algorithm used by the prefilter. For this reason, it is important to |
170 | /// ensure that the match semantics given here are consistent with the |
171 | /// match semantics intended for the regular expression that the literals |
172 | /// were extracted from. |
173 | /// |
174 | /// ``` |
175 | /// use regex_automata::{ |
176 | /// util::{prefilter::Prefilter, syntax}, |
177 | /// MatchKind, Span, |
178 | /// }; |
179 | /// |
180 | /// let hay = "Hello samwise" ; |
181 | /// |
182 | /// // With leftmost-first, we find 'samwise' here because it comes |
183 | /// // before 'sam' in the sequence we give it.. |
184 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["samwise" , "sam" ]) |
185 | /// .expect("a prefilter" ); |
186 | /// assert_eq!( |
187 | /// Some(Span::from(6..13)), |
188 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
189 | /// ); |
190 | /// // Still with leftmost-first but with the literals reverse, now 'sam' |
191 | /// // will match instead! |
192 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["sam" , "samwise" ]) |
193 | /// .expect("a prefilter" ); |
194 | /// assert_eq!( |
195 | /// Some(Span::from(6..9)), |
196 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
197 | /// ); |
198 | /// |
199 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
200 | /// ``` |
201 | pub fn new<B: AsRef<[u8]>>( |
202 | kind: MatchKind, |
203 | needles: &[B], |
204 | ) -> Option<Prefilter> { |
205 | Choice::new(kind, needles).and_then(Prefilter::from_choice) |
206 | } |
207 | |
208 | /// This turns a prefilter selection into a `Prefilter`. That is, in turns |
209 | /// the enum given into a trait object. |
210 | fn from_choice(choice: Choice) -> Option<Prefilter> { |
211 | #[cfg (not(feature = "alloc" ))] |
212 | { |
213 | None |
214 | } |
215 | #[cfg (feature = "alloc" )] |
216 | { |
217 | let pre: Arc<dyn PrefilterI> = match choice { |
218 | Choice::Memchr(p) => Arc::new(p), |
219 | Choice::Memchr2(p) => Arc::new(p), |
220 | Choice::Memchr3(p) => Arc::new(p), |
221 | Choice::Memmem(p) => Arc::new(p), |
222 | Choice::Teddy(p) => Arc::new(p), |
223 | Choice::ByteSet(p) => Arc::new(p), |
224 | Choice::AhoCorasick(p) => Arc::new(p), |
225 | }; |
226 | let is_fast = pre.is_fast(); |
227 | Some(Prefilter { pre, is_fast }) |
228 | } |
229 | } |
230 | |
231 | /// This attempts to extract prefixes from the given `Hir` expression for |
232 | /// the given match semantics, and if possible, builds a prefilter for |
233 | /// them. |
234 | /// |
235 | /// # Example |
236 | /// |
237 | /// This example shows how to build a prefilter directly from an [`Hir`] |
238 | /// expression, and use to find an occurrence of a prefix from the regex |
239 | /// pattern. |
240 | /// |
241 | /// ``` |
242 | /// use regex_automata::{ |
243 | /// util::{prefilter::Prefilter, syntax}, |
244 | /// MatchKind, Span, |
245 | /// }; |
246 | /// |
247 | /// let hir = syntax::parse(r"(Bruce|Patti) \w+" )?; |
248 | /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) |
249 | /// .expect("a prefilter" ); |
250 | /// let hay = "Hello Patti Scialfa!" ; |
251 | /// assert_eq!( |
252 | /// Some(Span::from(6..12)), |
253 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
254 | /// ); |
255 | /// |
256 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
257 | /// ``` |
258 | #[cfg (feature = "syntax" )] |
259 | pub fn from_hir_prefix(kind: MatchKind, hir: &Hir) -> Option<Prefilter> { |
260 | Prefilter::from_hirs_prefix(kind, &[hir]) |
261 | } |
262 | |
263 | /// This attempts to extract prefixes from the given `Hir` expressions for |
264 | /// the given match semantics, and if possible, builds a prefilter for |
265 | /// them. |
266 | /// |
267 | /// Note that as of now, prefilters throw away information about which |
268 | /// pattern each literal comes from. In other words, when a prefilter finds |
269 | /// a match, there's no way to know which pattern (or patterns) it came |
270 | /// from. Therefore, in order to confirm a match, you'll have to check all |
271 | /// of the patterns by running the full regex engine. |
272 | /// |
273 | /// # Example |
274 | /// |
275 | /// This example shows how to build a prefilter directly from multiple |
276 | /// `Hir` expressions expression, and use it to find an occurrence of a |
277 | /// prefix from the regex patterns. |
278 | /// |
279 | /// ``` |
280 | /// use regex_automata::{ |
281 | /// util::{prefilter::Prefilter, syntax}, |
282 | /// MatchKind, Span, |
283 | /// }; |
284 | /// |
285 | /// let hirs = syntax::parse_many(&[ |
286 | /// r"(Bruce|Patti) \w+" , |
287 | /// r"Mrs?\. Doubtfire" , |
288 | /// ])?; |
289 | /// let pre = Prefilter::from_hirs_prefix(MatchKind::LeftmostFirst, &hirs) |
290 | /// .expect("a prefilter" ); |
291 | /// let hay = "Hello Mrs. Doubtfire" ; |
292 | /// assert_eq!( |
293 | /// Some(Span::from(6..20)), |
294 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
295 | /// ); |
296 | /// |
297 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
298 | /// ``` |
299 | #[cfg (feature = "syntax" )] |
300 | pub fn from_hirs_prefix<H: Borrow<Hir>>( |
301 | kind: MatchKind, |
302 | hirs: &[H], |
303 | ) -> Option<Prefilter> { |
304 | prefixes(kind, hirs) |
305 | .literals() |
306 | .and_then(|lits| Prefilter::new(kind, lits)) |
307 | } |
308 | |
309 | /// Run this prefilter on `haystack[span.start..end]` and return a matching |
310 | /// span if one exists. |
311 | /// |
312 | /// The span returned is guaranteed to have a start position greater than |
313 | /// or equal to the one given, and an end position less than or equal to |
314 | /// the one given. |
315 | /// |
316 | /// # Example |
317 | /// |
318 | /// This example shows how to build a prefilter directly from an [`Hir`] |
319 | /// expression, and use it to find an occurrence of a prefix from the regex |
320 | /// pattern. |
321 | /// |
322 | /// ``` |
323 | /// use regex_automata::{ |
324 | /// util::{prefilter::Prefilter, syntax}, |
325 | /// MatchKind, Span, |
326 | /// }; |
327 | /// |
328 | /// let hir = syntax::parse(r"Bruce \w+" )?; |
329 | /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) |
330 | /// .expect("a prefilter" ); |
331 | /// let hay = "Hello Bruce Springsteen!" ; |
332 | /// assert_eq!( |
333 | /// Some(Span::from(6..12)), |
334 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
335 | /// ); |
336 | /// |
337 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
338 | /// ``` |
339 | #[inline ] |
340 | pub fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { |
341 | #[cfg (not(feature = "alloc" ))] |
342 | { |
343 | unreachable!() |
344 | } |
345 | #[cfg (feature = "alloc" )] |
346 | { |
347 | self.pre.find(haystack, span) |
348 | } |
349 | } |
350 | |
351 | /// Returns the span of a prefix of `haystack[span.start..span.end]` if |
352 | /// the prefilter matches. |
353 | /// |
354 | /// The span returned is guaranteed to have a start position equivalent to |
355 | /// the one given, and an end position less than or equal to the one given. |
356 | /// |
357 | /// # Example |
358 | /// |
359 | /// This example shows how to build a prefilter directly from an [`Hir`] |
360 | /// expression, and use it to find an occurrence of a prefix from the regex |
361 | /// pattern that begins at the start of a haystack only. |
362 | /// |
363 | /// ``` |
364 | /// use regex_automata::{ |
365 | /// util::{prefilter::Prefilter, syntax}, |
366 | /// MatchKind, Span, |
367 | /// }; |
368 | /// |
369 | /// let hir = syntax::parse(r"Bruce \w+" )?; |
370 | /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) |
371 | /// .expect("a prefilter" ); |
372 | /// let hay = "Hello Bruce Springsteen!" ; |
373 | /// // Nothing is found here because 'Bruce' does |
374 | /// // not occur at the beginning of our search. |
375 | /// assert_eq!( |
376 | /// None, |
377 | /// pre.prefix(hay.as_bytes(), Span::from(0..hay.len())), |
378 | /// ); |
379 | /// // But if we change where we start the search |
380 | /// // to begin where 'Bruce ' begins, then a |
381 | /// // match will be found. |
382 | /// assert_eq!( |
383 | /// Some(Span::from(6..12)), |
384 | /// pre.prefix(hay.as_bytes(), Span::from(6..hay.len())), |
385 | /// ); |
386 | /// |
387 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
388 | /// ``` |
389 | #[inline ] |
390 | pub fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { |
391 | #[cfg (not(feature = "alloc" ))] |
392 | { |
393 | unreachable!() |
394 | } |
395 | #[cfg (feature = "alloc" )] |
396 | { |
397 | self.pre.prefix(haystack, span) |
398 | } |
399 | } |
400 | |
401 | /// Returns the heap memory, in bytes, used by the underlying prefilter. |
402 | #[inline ] |
403 | pub fn memory_usage(&self) -> usize { |
404 | #[cfg (not(feature = "alloc" ))] |
405 | { |
406 | unreachable!() |
407 | } |
408 | #[cfg (feature = "alloc" )] |
409 | { |
410 | self.pre.memory_usage() |
411 | } |
412 | } |
413 | |
414 | /// Implementations might return true here if they believe themselves to |
415 | /// be "fast." The concept of "fast" is deliberately left vague, but in |
416 | /// practice this usually corresponds to whether it's believed that SIMD |
417 | /// will be used. |
418 | /// |
419 | /// Why do we care about this? Well, some prefilter tricks tend to come |
420 | /// with their own bits of overhead, and so might only make sense if we |
421 | /// know that a scan will be *much* faster than the regex engine itself. |
422 | /// Otherwise, the trick may not be worth doing. Whether something is |
423 | /// "much" faster than the regex engine generally boils down to whether |
424 | /// SIMD is used. (But not always. Even a SIMD matcher with a high false |
425 | /// positive rate can become quite slow.) |
426 | /// |
427 | /// Even if this returns true, it is still possible for the prefilter to |
428 | /// be "slow." Remember, prefilters are just heuristics. We can't really |
429 | /// *know* a prefilter will be fast without actually trying the prefilter. |
430 | /// (Which of course we cannot afford to do.) |
431 | #[inline ] |
432 | pub(crate) fn is_fast(&self) -> bool { |
433 | #[cfg (not(feature = "alloc" ))] |
434 | { |
435 | unreachable!() |
436 | } |
437 | #[cfg (feature = "alloc" )] |
438 | { |
439 | self.is_fast |
440 | } |
441 | } |
442 | } |
443 | |
444 | /// A trait for abstracting over prefilters. Basically, a prefilter is |
445 | /// something that do an unanchored *and* an anchored search in a haystack |
446 | /// within a given span. |
447 | /// |
448 | /// This exists pretty much only so that we can use prefilters as a trait |
449 | /// object (which is what `Prefilter` is). If we ever move off of trait objects |
450 | /// and to an enum, then it's likely this trait could be removed. |
451 | pub(crate) trait PrefilterI: |
452 | Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static |
453 | { |
454 | /// Run this prefilter on `haystack[span.start..end]` and return a matching |
455 | /// span if one exists. |
456 | /// |
457 | /// The span returned is guaranteed to have a start position greater than |
458 | /// or equal to the one given, and an end position less than or equal to |
459 | /// the one given. |
460 | fn find(&self, haystack: &[u8], span: Span) -> Option<Span>; |
461 | |
462 | /// Returns the span of a prefix of `haystack[span.start..span.end]` if |
463 | /// the prefilter matches. |
464 | /// |
465 | /// The span returned is guaranteed to have a start position equivalent to |
466 | /// the one given, and an end position less than or equal to the one given. |
467 | fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span>; |
468 | |
469 | /// Returns the heap memory, in bytes, used by the underlying prefilter. |
470 | fn memory_usage(&self) -> usize; |
471 | |
472 | /// Implementations might return true here if they believe themselves to |
473 | /// be "fast." See [`Prefilter::is_fast`] for more details. |
474 | fn is_fast(&self) -> bool; |
475 | } |
476 | |
477 | #[cfg (feature = "alloc" )] |
478 | impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> { |
479 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
480 | fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { |
481 | (&**self).find(haystack, span) |
482 | } |
483 | |
484 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
485 | fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { |
486 | (&**self).prefix(haystack, span) |
487 | } |
488 | |
489 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
490 | fn memory_usage(&self) -> usize { |
491 | (&**self).memory_usage() |
492 | } |
493 | |
494 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
495 | fn is_fast(&self) -> bool { |
496 | (&**self).is_fast() |
497 | } |
498 | } |
499 | |
500 | /// A type that encapsulates the selection of a prefilter algorithm from a |
501 | /// sequence of needles. |
502 | /// |
503 | /// The existence of this type is a little tricky, because we don't (currently) |
504 | /// use it for performing a search. Instead, we really only consume it by |
505 | /// converting the underlying prefilter into a trait object, whether that be |
506 | /// `dyn PrefilterI` or `dyn Strategy` (for the meta regex engine). In order |
507 | /// to avoid re-copying the prefilter selection logic, we isolate it here, and |
508 | /// then force anything downstream that wants to convert it to a trait object |
509 | /// to do trivial case analysis on it. |
510 | /// |
511 | /// One wonders whether we *should* use an enum instead of a trait object. |
512 | /// At time of writing, I chose trait objects based on instinct because 1) I |
513 | /// knew I wasn't going to inline anything and 2) there would potentially be |
514 | /// many different choices. However, as of time of writing, I haven't actually |
515 | /// compared the trait object approach to the enum approach. That probably |
516 | /// should be litigated, but I ran out of steam. |
517 | /// |
518 | /// Note that if the `alloc` feature is disabled, then values of this type |
519 | /// are (and should) never be constructed. Also, in practice, for any of the |
520 | /// prefilters to be selected, you'll need at least one of the `perf-literal-*` |
521 | /// features enabled. |
522 | #[derive(Clone, Debug)] |
523 | pub(crate) enum Choice { |
524 | Memchr(Memchr), |
525 | Memchr2(Memchr2), |
526 | Memchr3(Memchr3), |
527 | Memmem(Memmem), |
528 | Teddy(Teddy), |
529 | ByteSet(ByteSet), |
530 | AhoCorasick(AhoCorasick), |
531 | } |
532 | |
533 | impl Choice { |
534 | /// Select what is believed to be the best prefilter algorithm for the |
535 | /// match semantics and sequence of needles given. |
536 | /// |
537 | /// This selection algorithm uses the needles as given without any |
538 | /// modification. For example, if `[bar]` is given, then this doesn't |
539 | /// try to select `memchr` for `b`. Instead, it would select `memmem` |
540 | /// for `bar`. If callers would want `memchr` selected for `[bar]`, then |
541 | /// callers should massages the literals themselves. That is, callers are |
542 | /// responsible for heuristics surrounding which sequence of literals is |
543 | /// best. |
544 | /// |
545 | /// What this selection algorithm does is attempt to use the fastest |
546 | /// prefilter that works for the literals given. So if `[a, b]`, is given, |
547 | /// then `memchr2` is selected. |
548 | /// |
549 | /// Of course, which prefilter is selected is also subject to what |
550 | /// is available. For example, if `alloc` isn't enabled, then |
551 | /// that limits which prefilters can be selected. Similarly, if |
552 | /// `perf-literal-substring` isn't enabled, then nothing from the `memchr` |
553 | /// crate can be returned. |
554 | pub(crate) fn new<B: AsRef<[u8]>>( |
555 | kind: MatchKind, |
556 | needles: &[B], |
557 | ) -> Option<Choice> { |
558 | // An empty set means the regex matches nothing, so no sense in |
559 | // building a prefilter. |
560 | if needles.len() == 0 { |
561 | debug!("prefilter building failed: found empty set of literals" ); |
562 | return None; |
563 | } |
564 | // If the regex can match the empty string, then the prefilter |
565 | // will by definition match at every position. This is obviously |
566 | // completely ineffective. |
567 | if needles.iter().any(|n| n.as_ref().is_empty()) { |
568 | debug!("prefilter building failed: literals match empty string" ); |
569 | return None; |
570 | } |
571 | // BREADCRUMBS: Perhaps the literal optimizer should special case |
572 | // sequences of length two or three if the leading bytes of each are |
573 | // "rare"? Or perhaps, if there are two or three total possible leading |
574 | // bytes, regardless of the number of literals, and all are rare... |
575 | // Then well, perhaps we should use memchr2 or memchr3 in those cases? |
576 | if let Some(pre) = Memchr::new(kind, needles) { |
577 | debug!("prefilter built: memchr" ); |
578 | return Some(Choice::Memchr(pre)); |
579 | } |
580 | if let Some(pre) = Memchr2::new(kind, needles) { |
581 | debug!("prefilter built: memchr2" ); |
582 | return Some(Choice::Memchr2(pre)); |
583 | } |
584 | if let Some(pre) = Memchr3::new(kind, needles) { |
585 | debug!("prefilter built: memchr3" ); |
586 | return Some(Choice::Memchr3(pre)); |
587 | } |
588 | if let Some(pre) = Memmem::new(kind, needles) { |
589 | debug!("prefilter built: memmem" ); |
590 | return Some(Choice::Memmem(pre)); |
591 | } |
592 | if let Some(pre) = Teddy::new(kind, needles) { |
593 | debug!("prefilter built: teddy" ); |
594 | return Some(Choice::Teddy(pre)); |
595 | } |
596 | if let Some(pre) = ByteSet::new(kind, needles) { |
597 | debug!("prefilter built: byteset" ); |
598 | return Some(Choice::ByteSet(pre)); |
599 | } |
600 | if let Some(pre) = AhoCorasick::new(kind, needles) { |
601 | debug!("prefilter built: aho-corasick" ); |
602 | return Some(Choice::AhoCorasick(pre)); |
603 | } |
604 | debug!("prefilter building failed: no strategy could be found" ); |
605 | None |
606 | } |
607 | } |
608 | |
609 | /// Extracts all of the prefix literals from the given HIR expressions into a |
610 | /// single `Seq`. The literals in the sequence are ordered with respect to the |
611 | /// order of the given HIR expressions and consistent with the match semantics |
612 | /// given. |
613 | /// |
614 | /// The sequence returned is "optimized." That is, they may be shrunk or even |
615 | /// truncated according to heuristics with the intent of making them more |
616 | /// useful as a prefilter. (Which translates to both using faster algorithms |
617 | /// and minimizing the false positive rate.) |
618 | /// |
619 | /// Note that this erases any connection between the literals and which pattern |
620 | /// (or patterns) they came from. |
621 | /// |
622 | /// The match kind given must correspond to the match semantics of the regex |
623 | /// that is represented by the HIRs given. The match semantics may change the |
624 | /// literal sequence returned. |
625 | #[cfg (feature = "syntax" )] |
626 | pub(crate) fn prefixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq |
627 | where |
628 | H: core::borrow::Borrow<Hir>, |
629 | { |
630 | let mut extractor = literal::Extractor::new(); |
631 | extractor.kind(literal::ExtractKind::Prefix); |
632 | |
633 | let mut prefixes = literal::Seq::empty(); |
634 | for hir in hirs { |
635 | prefixes.union(&mut extractor.extract(hir.borrow())); |
636 | } |
637 | debug!( |
638 | "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}" , |
639 | prefixes.len(), |
640 | prefixes.is_exact(), |
641 | prefixes |
642 | ); |
643 | match kind { |
644 | MatchKind::All => { |
645 | prefixes.sort(); |
646 | prefixes.dedup(); |
647 | } |
648 | MatchKind::LeftmostFirst => { |
649 | prefixes.optimize_for_prefix_by_preference(); |
650 | } |
651 | } |
652 | debug!( |
653 | "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}" , |
654 | prefixes.len(), |
655 | prefixes.is_exact(), |
656 | prefixes |
657 | ); |
658 | prefixes |
659 | } |
660 | |
661 | /// Like `prefixes`, but for all suffixes of all matches for the given HIRs. |
662 | #[cfg (feature = "syntax" )] |
663 | pub(crate) fn suffixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq |
664 | where |
665 | H: core::borrow::Borrow<Hir>, |
666 | { |
667 | let mut extractor = literal::Extractor::new(); |
668 | extractor.kind(literal::ExtractKind::Suffix); |
669 | |
670 | let mut suffixes = literal::Seq::empty(); |
671 | for hir in hirs { |
672 | suffixes.union(&mut extractor.extract(hir.borrow())); |
673 | } |
674 | debug!( |
675 | "suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}" , |
676 | suffixes.len(), |
677 | suffixes.is_exact(), |
678 | suffixes |
679 | ); |
680 | match kind { |
681 | MatchKind::All => { |
682 | suffixes.sort(); |
683 | suffixes.dedup(); |
684 | } |
685 | MatchKind::LeftmostFirst => { |
686 | suffixes.optimize_for_suffix_by_preference(); |
687 | } |
688 | } |
689 | debug!( |
690 | "suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}" , |
691 | suffixes.len(), |
692 | suffixes.is_exact(), |
693 | suffixes |
694 | ); |
695 | suffixes |
696 | } |
697 | |