1 | /*! |
2 | Defines a prefilter for accelerating regex searches. |
3 | |
4 | A prefilter can be created by building a [`Prefilter`] value. |
5 | |
6 | A prefilter represents one of the most important optimizations available for |
7 | accelerating regex searches. The idea of a prefilter is to very quickly find |
8 | candidate locations in a haystack where a regex _could_ match. Once a candidate |
9 | is found, it is then intended for the regex engine to run at that position to |
10 | determine whether the candidate is a match or a false positive. |
11 | |
12 | In the aforementioned description of the prefilter optimization also lay its |
13 | demise. Namely, if a prefilter has a high false positive rate and it produces |
14 | lots of candidates, then a prefilter can overall make a regex search slower. |
15 | It can run more slowly because more time is spent ping-ponging between the |
16 | prefilter search and the regex engine attempting to confirm each candidate as |
17 | a match. This ping-ponging has overhead that adds up, and is exacerbated by |
18 | a high false positive rate. |
19 | |
20 | Nevertheless, the optimization is still generally worth performing in most |
21 | cases. Particularly given just how much throughput can be improved. (It is not |
22 | uncommon for prefilter optimizations to improve throughput by one or two orders |
23 | of magnitude.) |
24 | |
25 | Typically a prefilter is used to find occurrences of literal prefixes from a |
26 | regex pattern, but this isn't required. A prefilter can be used to look for |
27 | suffixes or even inner literals. |
28 | |
29 | Note that as of now, prefilters throw away information about which pattern |
30 | each literal comes from. In other words, when a prefilter finds a match, |
31 | there's no way to know which pattern (or patterns) it came from. Therefore, |
32 | in order to confirm a match, you'll have to check all of the patterns by |
33 | running the full regex engine. |
34 | */ |
35 | |
36 | mod aho_corasick; |
37 | mod byteset; |
38 | mod memchr; |
39 | mod memmem; |
40 | mod teddy; |
41 | |
42 | use core::{ |
43 | borrow::Borrow, |
44 | fmt::Debug, |
45 | panic::{RefUnwindSafe, UnwindSafe}, |
46 | }; |
47 | |
48 | #[cfg (feature = "alloc" )] |
49 | use alloc::sync::Arc; |
50 | |
51 | #[cfg (feature = "syntax" )] |
52 | use regex_syntax::hir::{literal, Hir}; |
53 | |
54 | use crate::util::search::{MatchKind, Span}; |
55 | |
56 | pub(crate) use crate::util::prefilter::{ |
57 | aho_corasick::AhoCorasick, |
58 | byteset::ByteSet, |
59 | memchr::{Memchr, Memchr2, Memchr3}, |
60 | memmem::Memmem, |
61 | teddy::Teddy, |
62 | }; |
63 | |
64 | /// A prefilter for accelerating regex searches. |
65 | /// |
66 | /// If you already have your literals that you want to search with, |
67 | /// then the vanilla [`Prefilter::new`] constructor is for you. But |
68 | /// if you have an [`Hir`] value from the `regex-syntax` crate, then |
69 | /// [`Prefilter::from_hir_prefix`] might be more convenient. Namely, it uses |
70 | /// the [`regex-syntax::hir::literal`](regex_syntax::hir::literal) module to |
71 | /// extract literal prefixes for you, optimize them and then select and build a |
72 | /// prefilter matcher. |
73 | /// |
74 | /// A prefilter must have **zero false negatives**. However, by its very |
75 | /// nature, it may produce false positives. That is, a prefilter will never |
76 | /// skip over a position in the haystack that corresponds to a match of the |
77 | /// original regex pattern, but it *may* produce a match for a position |
78 | /// in the haystack that does *not* correspond to a match of the original |
79 | /// regex pattern. If you use either the [`Prefilter::from_hir_prefix`] or |
80 | /// [`Prefilter::from_hirs_prefix`] constructors, then this guarantee is |
81 | /// upheld for you automatically. This guarantee is not preserved if you use |
82 | /// [`Prefilter::new`] though, since it is up to the caller to provide correct |
83 | /// literal strings with respect to the original regex pattern. |
84 | /// |
85 | /// # Cloning |
86 | /// |
87 | /// It is an API guarantee that cloning a prefilter is cheap. That is, cloning |
88 | /// it will not duplicate whatever heap memory is used to represent the |
89 | /// underlying matcher. |
90 | /// |
91 | /// # Example |
92 | /// |
93 | /// This example shows how to attach a `Prefilter` to the |
94 | /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) in order to accelerate |
95 | /// searches. |
96 | /// |
97 | /// ``` |
98 | /// use regex_automata::{ |
99 | /// nfa::thompson::pikevm::PikeVM, |
100 | /// util::prefilter::Prefilter, |
101 | /// Match, MatchKind, |
102 | /// }; |
103 | /// |
104 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Bruce " ]) |
105 | /// .expect("a prefilter" ); |
106 | /// let re = PikeVM::builder() |
107 | /// .configure(PikeVM::config().prefilter(Some(pre))) |
108 | /// .build(r"Bruce \w+" )?; |
109 | /// let mut cache = re.create_cache(); |
110 | /// assert_eq!( |
111 | /// Some(Match::must(0, 6..23)), |
112 | /// re.find(&mut cache, "Hello Bruce Springsteen!" ), |
113 | /// ); |
114 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
115 | /// ``` |
116 | /// |
117 | /// But note that if you get your prefilter incorrect, it could lead to an |
118 | /// incorrect result! |
119 | /// |
120 | /// ``` |
121 | /// use regex_automata::{ |
122 | /// nfa::thompson::pikevm::PikeVM, |
123 | /// util::prefilter::Prefilter, |
124 | /// Match, MatchKind, |
125 | /// }; |
126 | /// |
127 | /// // This prefilter is wrong! |
128 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Patti " ]) |
129 | /// .expect("a prefilter" ); |
130 | /// let re = PikeVM::builder() |
131 | /// .configure(PikeVM::config().prefilter(Some(pre))) |
132 | /// .build(r"Bruce \w+" )?; |
133 | /// let mut cache = re.create_cache(); |
134 | /// // We find no match even though the regex does match. |
135 | /// assert_eq!( |
136 | /// None, |
137 | /// re.find(&mut cache, "Hello Bruce Springsteen!" ), |
138 | /// ); |
139 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
140 | /// ``` |
141 | #[derive (Clone, Debug)] |
142 | pub struct Prefilter { |
143 | #[cfg (not(feature = "alloc" ))] |
144 | _unused: (), |
145 | #[cfg (feature = "alloc" )] |
146 | pre: Arc<dyn PrefilterI>, |
147 | #[cfg (feature = "alloc" )] |
148 | is_fast: bool, |
149 | #[cfg (feature = "alloc" )] |
150 | max_needle_len: usize, |
151 | } |
152 | |
153 | impl Prefilter { |
154 | /// Create a new prefilter from a sequence of needles and a corresponding |
155 | /// match semantics. |
156 | /// |
157 | /// This may return `None` for a variety of reasons, for example, if |
158 | /// a suitable prefilter could not be constructed. That might occur |
159 | /// if they are unavailable (e.g., the `perf-literal-substring` and |
160 | /// `perf-literal-multisubstring` features aren't enabled), or it might |
161 | /// occur because of heuristics or other artifacts of how the prefilter |
162 | /// works. |
163 | /// |
164 | /// Note that if you have an [`Hir`] expression, it may be more convenient |
165 | /// to use [`Prefilter::from_hir_prefix`]. It will automatically handle the |
166 | /// task of extracting prefix literals for you. |
167 | /// |
168 | /// # Example |
169 | /// |
170 | /// This example shows how match semantics can impact the matching |
171 | /// algorithm used by the prefilter. For this reason, it is important to |
172 | /// ensure that the match semantics given here are consistent with the |
173 | /// match semantics intended for the regular expression that the literals |
174 | /// were extracted from. |
175 | /// |
176 | /// ``` |
177 | /// use regex_automata::{ |
178 | /// util::{prefilter::Prefilter, syntax}, |
179 | /// MatchKind, Span, |
180 | /// }; |
181 | /// |
182 | /// let hay = "Hello samwise" ; |
183 | /// |
184 | /// // With leftmost-first, we find 'samwise' here because it comes |
185 | /// // before 'sam' in the sequence we give it.. |
186 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["samwise" , "sam" ]) |
187 | /// .expect("a prefilter" ); |
188 | /// assert_eq!( |
189 | /// Some(Span::from(6..13)), |
190 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
191 | /// ); |
192 | /// // Still with leftmost-first but with the literals reverse, now 'sam' |
193 | /// // will match instead! |
194 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["sam" , "samwise" ]) |
195 | /// .expect("a prefilter" ); |
196 | /// assert_eq!( |
197 | /// Some(Span::from(6..9)), |
198 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
199 | /// ); |
200 | /// |
201 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
202 | /// ``` |
203 | pub fn new<B: AsRef<[u8]>>( |
204 | kind: MatchKind, |
205 | needles: &[B], |
206 | ) -> Option<Prefilter> { |
207 | Choice::new(kind, needles).and_then(|choice| { |
208 | let max_needle_len = |
209 | needles.iter().map(|b| b.as_ref().len()).max().unwrap_or(0); |
210 | Prefilter::from_choice(choice, max_needle_len) |
211 | }) |
212 | } |
213 | |
214 | /// This turns a prefilter selection into a `Prefilter`. That is, in turns |
215 | /// the enum given into a trait object. |
216 | fn from_choice( |
217 | choice: Choice, |
218 | max_needle_len: usize, |
219 | ) -> Option<Prefilter> { |
220 | #[cfg (not(feature = "alloc" ))] |
221 | { |
222 | None |
223 | } |
224 | #[cfg (feature = "alloc" )] |
225 | { |
226 | let pre: Arc<dyn PrefilterI> = match choice { |
227 | Choice::Memchr(p) => Arc::new(p), |
228 | Choice::Memchr2(p) => Arc::new(p), |
229 | Choice::Memchr3(p) => Arc::new(p), |
230 | Choice::Memmem(p) => Arc::new(p), |
231 | Choice::Teddy(p) => Arc::new(p), |
232 | Choice::ByteSet(p) => Arc::new(p), |
233 | Choice::AhoCorasick(p) => Arc::new(p), |
234 | }; |
235 | let is_fast = pre.is_fast(); |
236 | Some(Prefilter { pre, is_fast, max_needle_len }) |
237 | } |
238 | } |
239 | |
240 | /// This attempts to extract prefixes from the given `Hir` expression for |
241 | /// the given match semantics, and if possible, builds a prefilter for |
242 | /// them. |
243 | /// |
244 | /// # Example |
245 | /// |
246 | /// This example shows how to build a prefilter directly from an [`Hir`] |
247 | /// expression, and use to find an occurrence of a prefix from the regex |
248 | /// pattern. |
249 | /// |
250 | /// ``` |
251 | /// use regex_automata::{ |
252 | /// util::{prefilter::Prefilter, syntax}, |
253 | /// MatchKind, Span, |
254 | /// }; |
255 | /// |
256 | /// let hir = syntax::parse(r"(Bruce|Patti) \w+" )?; |
257 | /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) |
258 | /// .expect("a prefilter" ); |
259 | /// let hay = "Hello Patti Scialfa!" ; |
260 | /// assert_eq!( |
261 | /// Some(Span::from(6..12)), |
262 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
263 | /// ); |
264 | /// |
265 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
266 | /// ``` |
267 | #[cfg (feature = "syntax" )] |
268 | pub fn from_hir_prefix(kind: MatchKind, hir: &Hir) -> Option<Prefilter> { |
269 | Prefilter::from_hirs_prefix(kind, &[hir]) |
270 | } |
271 | |
272 | /// This attempts to extract prefixes from the given `Hir` expressions for |
273 | /// the given match semantics, and if possible, builds a prefilter for |
274 | /// them. |
275 | /// |
276 | /// Note that as of now, prefilters throw away information about which |
277 | /// pattern each literal comes from. In other words, when a prefilter finds |
278 | /// a match, there's no way to know which pattern (or patterns) it came |
279 | /// from. Therefore, in order to confirm a match, you'll have to check all |
280 | /// of the patterns by running the full regex engine. |
281 | /// |
282 | /// # Example |
283 | /// |
284 | /// This example shows how to build a prefilter directly from multiple |
285 | /// `Hir` expressions expression, and use it to find an occurrence of a |
286 | /// prefix from the regex patterns. |
287 | /// |
288 | /// ``` |
289 | /// use regex_automata::{ |
290 | /// util::{prefilter::Prefilter, syntax}, |
291 | /// MatchKind, Span, |
292 | /// }; |
293 | /// |
294 | /// let hirs = syntax::parse_many(&[ |
295 | /// r"(Bruce|Patti) \w+" , |
296 | /// r"Mrs?\. Doubtfire" , |
297 | /// ])?; |
298 | /// let pre = Prefilter::from_hirs_prefix(MatchKind::LeftmostFirst, &hirs) |
299 | /// .expect("a prefilter" ); |
300 | /// let hay = "Hello Mrs. Doubtfire" ; |
301 | /// assert_eq!( |
302 | /// Some(Span::from(6..20)), |
303 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
304 | /// ); |
305 | /// |
306 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
307 | /// ``` |
308 | #[cfg (feature = "syntax" )] |
309 | pub fn from_hirs_prefix<H: Borrow<Hir>>( |
310 | kind: MatchKind, |
311 | hirs: &[H], |
312 | ) -> Option<Prefilter> { |
313 | prefixes(kind, hirs) |
314 | .literals() |
315 | .and_then(|lits| Prefilter::new(kind, lits)) |
316 | } |
317 | |
318 | /// Run this prefilter on `haystack[span.start..end]` and return a matching |
319 | /// span if one exists. |
320 | /// |
321 | /// The span returned is guaranteed to have a start position greater than |
322 | /// or equal to the one given, and an end position less than or equal to |
323 | /// the one given. |
324 | /// |
325 | /// # Example |
326 | /// |
327 | /// This example shows how to build a prefilter directly from an [`Hir`] |
328 | /// expression, and use it to find an occurrence of a prefix from the regex |
329 | /// pattern. |
330 | /// |
331 | /// ``` |
332 | /// use regex_automata::{ |
333 | /// util::{prefilter::Prefilter, syntax}, |
334 | /// MatchKind, Span, |
335 | /// }; |
336 | /// |
337 | /// let hir = syntax::parse(r"Bruce \w+" )?; |
338 | /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) |
339 | /// .expect("a prefilter" ); |
340 | /// let hay = "Hello Bruce Springsteen!" ; |
341 | /// assert_eq!( |
342 | /// Some(Span::from(6..12)), |
343 | /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), |
344 | /// ); |
345 | /// |
346 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
347 | /// ``` |
348 | #[inline ] |
349 | pub fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { |
350 | #[cfg (not(feature = "alloc" ))] |
351 | { |
352 | unreachable!() |
353 | } |
354 | #[cfg (feature = "alloc" )] |
355 | { |
356 | self.pre.find(haystack, span) |
357 | } |
358 | } |
359 | |
360 | /// Returns the span of a prefix of `haystack[span.start..span.end]` if |
361 | /// the prefilter matches. |
362 | /// |
363 | /// The span returned is guaranteed to have a start position equivalent to |
364 | /// the one given, and an end position less than or equal to the one given. |
365 | /// |
366 | /// # Example |
367 | /// |
368 | /// This example shows how to build a prefilter directly from an [`Hir`] |
369 | /// expression, and use it to find an occurrence of a prefix from the regex |
370 | /// pattern that begins at the start of a haystack only. |
371 | /// |
372 | /// ``` |
373 | /// use regex_automata::{ |
374 | /// util::{prefilter::Prefilter, syntax}, |
375 | /// MatchKind, Span, |
376 | /// }; |
377 | /// |
378 | /// let hir = syntax::parse(r"Bruce \w+" )?; |
379 | /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) |
380 | /// .expect("a prefilter" ); |
381 | /// let hay = "Hello Bruce Springsteen!" ; |
382 | /// // Nothing is found here because 'Bruce' does |
383 | /// // not occur at the beginning of our search. |
384 | /// assert_eq!( |
385 | /// None, |
386 | /// pre.prefix(hay.as_bytes(), Span::from(0..hay.len())), |
387 | /// ); |
388 | /// // But if we change where we start the search |
389 | /// // to begin where 'Bruce ' begins, then a |
390 | /// // match will be found. |
391 | /// assert_eq!( |
392 | /// Some(Span::from(6..12)), |
393 | /// pre.prefix(hay.as_bytes(), Span::from(6..hay.len())), |
394 | /// ); |
395 | /// |
396 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
397 | /// ``` |
398 | #[inline ] |
399 | pub fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { |
400 | #[cfg (not(feature = "alloc" ))] |
401 | { |
402 | unreachable!() |
403 | } |
404 | #[cfg (feature = "alloc" )] |
405 | { |
406 | self.pre.prefix(haystack, span) |
407 | } |
408 | } |
409 | |
410 | /// Returns the heap memory, in bytes, used by the underlying prefilter. |
411 | #[inline ] |
412 | pub fn memory_usage(&self) -> usize { |
413 | #[cfg (not(feature = "alloc" ))] |
414 | { |
415 | unreachable!() |
416 | } |
417 | #[cfg (feature = "alloc" )] |
418 | { |
419 | self.pre.memory_usage() |
420 | } |
421 | } |
422 | |
423 | /// Return the length of the longest needle |
424 | /// in this Prefilter |
425 | #[inline ] |
426 | pub fn max_needle_len(&self) -> usize { |
427 | #[cfg (not(feature = "alloc" ))] |
428 | { |
429 | unreachable!() |
430 | } |
431 | #[cfg (feature = "alloc" )] |
432 | { |
433 | self.max_needle_len |
434 | } |
435 | } |
436 | |
437 | /// Implementations might return true here if they believe themselves to |
438 | /// be "fast." The concept of "fast" is deliberately left vague, but in |
439 | /// practice this usually corresponds to whether it's believed that SIMD |
440 | /// will be used. |
441 | /// |
442 | /// Why do we care about this? Well, some prefilter tricks tend to come |
443 | /// with their own bits of overhead, and so might only make sense if we |
444 | /// know that a scan will be *much* faster than the regex engine itself. |
445 | /// Otherwise, the trick may not be worth doing. Whether something is |
446 | /// "much" faster than the regex engine generally boils down to whether |
447 | /// SIMD is used. (But not always. Even a SIMD matcher with a high false |
448 | /// positive rate can become quite slow.) |
449 | /// |
450 | /// Even if this returns true, it is still possible for the prefilter to |
451 | /// be "slow." Remember, prefilters are just heuristics. We can't really |
452 | /// *know* a prefilter will be fast without actually trying the prefilter. |
453 | /// (Which of course we cannot afford to do.) |
454 | #[inline ] |
455 | pub fn is_fast(&self) -> bool { |
456 | #[cfg (not(feature = "alloc" ))] |
457 | { |
458 | unreachable!() |
459 | } |
460 | #[cfg (feature = "alloc" )] |
461 | { |
462 | self.is_fast |
463 | } |
464 | } |
465 | } |
466 | |
467 | /// A trait for abstracting over prefilters. Basically, a prefilter is |
468 | /// something that do an unanchored *and* an anchored search in a haystack |
469 | /// within a given span. |
470 | /// |
471 | /// This exists pretty much only so that we can use prefilters as a trait |
472 | /// object (which is what `Prefilter` is). If we ever move off of trait objects |
473 | /// and to an enum, then it's likely this trait could be removed. |
474 | pub(crate) trait PrefilterI: |
475 | Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static |
476 | { |
477 | /// Run this prefilter on `haystack[span.start..end]` and return a matching |
478 | /// span if one exists. |
479 | /// |
480 | /// The span returned is guaranteed to have a start position greater than |
481 | /// or equal to the one given, and an end position less than or equal to |
482 | /// the one given. |
483 | fn find(&self, haystack: &[u8], span: Span) -> Option<Span>; |
484 | |
485 | /// Returns the span of a prefix of `haystack[span.start..span.end]` if |
486 | /// the prefilter matches. |
487 | /// |
488 | /// The span returned is guaranteed to have a start position equivalent to |
489 | /// the one given, and an end position less than or equal to the one given. |
490 | fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span>; |
491 | |
492 | /// Returns the heap memory, in bytes, used by the underlying prefilter. |
493 | fn memory_usage(&self) -> usize; |
494 | |
495 | /// Implementations might return true here if they believe themselves to |
496 | /// be "fast." See [`Prefilter::is_fast`] for more details. |
497 | fn is_fast(&self) -> bool; |
498 | } |
499 | |
500 | #[cfg (feature = "alloc" )] |
501 | impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> { |
502 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
503 | fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { |
504 | (&**self).find(haystack, span) |
505 | } |
506 | |
507 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
508 | fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { |
509 | (&**self).prefix(haystack, span) |
510 | } |
511 | |
512 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
513 | fn memory_usage(&self) -> usize { |
514 | (&**self).memory_usage() |
515 | } |
516 | |
517 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
518 | fn is_fast(&self) -> bool { |
519 | (&**self).is_fast() |
520 | } |
521 | } |
522 | |
523 | /// A type that encapsulates the selection of a prefilter algorithm from a |
524 | /// sequence of needles. |
525 | /// |
526 | /// The existence of this type is a little tricky, because we don't (currently) |
527 | /// use it for performing a search. Instead, we really only consume it by |
528 | /// converting the underlying prefilter into a trait object, whether that be |
529 | /// `dyn PrefilterI` or `dyn Strategy` (for the meta regex engine). In order |
530 | /// to avoid re-copying the prefilter selection logic, we isolate it here, and |
531 | /// then force anything downstream that wants to convert it to a trait object |
532 | /// to do trivial case analysis on it. |
533 | /// |
534 | /// One wonders whether we *should* use an enum instead of a trait object. |
535 | /// At time of writing, I chose trait objects based on instinct because 1) I |
536 | /// knew I wasn't going to inline anything and 2) there would potentially be |
537 | /// many different choices. However, as of time of writing, I haven't actually |
538 | /// compared the trait object approach to the enum approach. That probably |
539 | /// should be litigated, but I ran out of steam. |
540 | /// |
541 | /// Note that if the `alloc` feature is disabled, then values of this type |
542 | /// are (and should) never be constructed. Also, in practice, for any of the |
543 | /// prefilters to be selected, you'll need at least one of the `perf-literal-*` |
544 | /// features enabled. |
545 | #[derive (Clone, Debug)] |
546 | pub(crate) enum Choice { |
547 | Memchr(Memchr), |
548 | Memchr2(Memchr2), |
549 | Memchr3(Memchr3), |
550 | Memmem(Memmem), |
551 | Teddy(Teddy), |
552 | ByteSet(ByteSet), |
553 | AhoCorasick(AhoCorasick), |
554 | } |
555 | |
556 | impl Choice { |
557 | /// Select what is believed to be the best prefilter algorithm for the |
558 | /// match semantics and sequence of needles given. |
559 | /// |
560 | /// This selection algorithm uses the needles as given without any |
561 | /// modification. For example, if `[bar]` is given, then this doesn't |
562 | /// try to select `memchr` for `b`. Instead, it would select `memmem` |
563 | /// for `bar`. If callers would want `memchr` selected for `[bar]`, then |
564 | /// callers should massages the literals themselves. That is, callers are |
565 | /// responsible for heuristics surrounding which sequence of literals is |
566 | /// best. |
567 | /// |
568 | /// What this selection algorithm does is attempt to use the fastest |
569 | /// prefilter that works for the literals given. So if `[a, b]`, is given, |
570 | /// then `memchr2` is selected. |
571 | /// |
572 | /// Of course, which prefilter is selected is also subject to what |
573 | /// is available. For example, if `alloc` isn't enabled, then |
574 | /// that limits which prefilters can be selected. Similarly, if |
575 | /// `perf-literal-substring` isn't enabled, then nothing from the `memchr` |
576 | /// crate can be returned. |
577 | pub(crate) fn new<B: AsRef<[u8]>>( |
578 | kind: MatchKind, |
579 | needles: &[B], |
580 | ) -> Option<Choice> { |
581 | // An empty set means the regex matches nothing, so no sense in |
582 | // building a prefilter. |
583 | if needles.len() == 0 { |
584 | debug!("prefilter building failed: found empty set of literals" ); |
585 | return None; |
586 | } |
587 | // If the regex can match the empty string, then the prefilter |
588 | // will by definition match at every position. This is obviously |
589 | // completely ineffective. |
590 | if needles.iter().any(|n| n.as_ref().is_empty()) { |
591 | debug!("prefilter building failed: literals match empty string" ); |
592 | return None; |
593 | } |
594 | // BREADCRUMBS: Perhaps the literal optimizer should special case |
595 | // sequences of length two or three if the leading bytes of each are |
596 | // "rare"? Or perhaps, if there are two or three total possible leading |
597 | // bytes, regardless of the number of literals, and all are rare... |
598 | // Then well, perhaps we should use memchr2 or memchr3 in those cases? |
599 | if let Some(pre) = Memchr::new(kind, needles) { |
600 | debug!("prefilter built: memchr" ); |
601 | return Some(Choice::Memchr(pre)); |
602 | } |
603 | if let Some(pre) = Memchr2::new(kind, needles) { |
604 | debug!("prefilter built: memchr2" ); |
605 | return Some(Choice::Memchr2(pre)); |
606 | } |
607 | if let Some(pre) = Memchr3::new(kind, needles) { |
608 | debug!("prefilter built: memchr3" ); |
609 | return Some(Choice::Memchr3(pre)); |
610 | } |
611 | if let Some(pre) = Memmem::new(kind, needles) { |
612 | debug!("prefilter built: memmem" ); |
613 | return Some(Choice::Memmem(pre)); |
614 | } |
615 | if let Some(pre) = Teddy::new(kind, needles) { |
616 | debug!("prefilter built: teddy" ); |
617 | return Some(Choice::Teddy(pre)); |
618 | } |
619 | if let Some(pre) = ByteSet::new(kind, needles) { |
620 | debug!("prefilter built: byteset" ); |
621 | return Some(Choice::ByteSet(pre)); |
622 | } |
623 | if let Some(pre) = AhoCorasick::new(kind, needles) { |
624 | debug!("prefilter built: aho-corasick" ); |
625 | return Some(Choice::AhoCorasick(pre)); |
626 | } |
627 | debug!("prefilter building failed: no strategy could be found" ); |
628 | None |
629 | } |
630 | } |
631 | |
632 | /// Extracts all of the prefix literals from the given HIR expressions into a |
633 | /// single `Seq`. The literals in the sequence are ordered with respect to the |
634 | /// order of the given HIR expressions and consistent with the match semantics |
635 | /// given. |
636 | /// |
637 | /// The sequence returned is "optimized." That is, they may be shrunk or even |
638 | /// truncated according to heuristics with the intent of making them more |
639 | /// useful as a prefilter. (Which translates to both using faster algorithms |
640 | /// and minimizing the false positive rate.) |
641 | /// |
642 | /// Note that this erases any connection between the literals and which pattern |
643 | /// (or patterns) they came from. |
644 | /// |
645 | /// The match kind given must correspond to the match semantics of the regex |
646 | /// that is represented by the HIRs given. The match semantics may change the |
647 | /// literal sequence returned. |
648 | #[cfg (feature = "syntax" )] |
649 | pub(crate) fn prefixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq |
650 | where |
651 | H: core::borrow::Borrow<Hir>, |
652 | { |
653 | let mut extractor = literal::Extractor::new(); |
654 | extractor.kind(literal::ExtractKind::Prefix); |
655 | |
656 | let mut prefixes = literal::Seq::empty(); |
657 | for hir in hirs { |
658 | prefixes.union(&mut extractor.extract(hir.borrow())); |
659 | } |
660 | debug!( |
661 | "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}" , |
662 | prefixes.len(), |
663 | prefixes.is_exact(), |
664 | prefixes |
665 | ); |
666 | match kind { |
667 | MatchKind::All => { |
668 | prefixes.sort(); |
669 | prefixes.dedup(); |
670 | } |
671 | MatchKind::LeftmostFirst => { |
672 | prefixes.optimize_for_prefix_by_preference(); |
673 | } |
674 | } |
675 | debug!( |
676 | "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}" , |
677 | prefixes.len(), |
678 | prefixes.is_exact(), |
679 | prefixes |
680 | ); |
681 | prefixes |
682 | } |
683 | |
684 | /// Like `prefixes`, but for all suffixes of all matches for the given HIRs. |
685 | #[cfg (feature = "syntax" )] |
686 | pub(crate) fn suffixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq |
687 | where |
688 | H: core::borrow::Borrow<Hir>, |
689 | { |
690 | let mut extractor = literal::Extractor::new(); |
691 | extractor.kind(literal::ExtractKind::Suffix); |
692 | |
693 | let mut suffixes = literal::Seq::empty(); |
694 | for hir in hirs { |
695 | suffixes.union(&mut extractor.extract(hir.borrow())); |
696 | } |
697 | debug!( |
698 | "suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}" , |
699 | suffixes.len(), |
700 | suffixes.is_exact(), |
701 | suffixes |
702 | ); |
703 | match kind { |
704 | MatchKind::All => { |
705 | suffixes.sort(); |
706 | suffixes.dedup(); |
707 | } |
708 | MatchKind::LeftmostFirst => { |
709 | suffixes.optimize_for_suffix_by_preference(); |
710 | } |
711 | } |
712 | debug!( |
713 | "suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}" , |
714 | suffixes.len(), |
715 | suffixes.is_exact(), |
716 | suffixes |
717 | ); |
718 | suffixes |
719 | } |
720 | |