1/*!
2Defines a prefilter for accelerating regex searches.
3
4A prefilter can be created by building a [`Prefilter`] value.
5
6A prefilter represents one of the most important optimizations available for
7accelerating regex searches. The idea of a prefilter is to very quickly find
8candidate locations in a haystack where a regex _could_ match. Once a candidate
9is found, it is then intended for the regex engine to run at that position to
10determine whether the candidate is a match or a false positive.
11
12In the aforementioned description of the prefilter optimization also lay its
13demise. Namely, if a prefilter has a high false positive rate and it produces
14lots of candidates, then a prefilter can overall make a regex search slower.
15It can run more slowly because more time is spent ping-ponging between the
16prefilter search and the regex engine attempting to confirm each candidate as
17a match. This ping-ponging has overhead that adds up, and is exacerbated by
18a high false positive rate.
19
20Nevertheless, the optimization is still generally worth performing in most
21cases. Particularly given just how much throughput can be improved. (It is not
22uncommon for prefilter optimizations to improve throughput by one or two orders
23of magnitude.)
24
25Typically a prefilter is used to find occurrences of literal prefixes from a
26regex pattern, but this isn't required. A prefilter can be used to look for
27suffixes or even inner literals.
28
29Note that as of now, prefilters throw away information about which pattern
30each literal comes from. In other words, when a prefilter finds a match,
31there's no way to know which pattern (or patterns) it came from. Therefore,
32in order to confirm a match, you'll have to check all of the patterns by
33running the full regex engine.
34*/
35
36mod aho_corasick;
37mod byteset;
38mod memchr;
39mod memmem;
40mod teddy;
41
42use core::{
43 borrow::Borrow,
44 fmt::Debug,
45 panic::{RefUnwindSafe, UnwindSafe},
46};
47
48#[cfg(feature = "alloc")]
49use alloc::sync::Arc;
50
51#[cfg(feature = "syntax")]
52use regex_syntax::hir::{literal, Hir};
53
54use crate::util::search::{MatchKind, Span};
55
56pub(crate) use crate::util::prefilter::{
57 aho_corasick::AhoCorasick,
58 byteset::ByteSet,
59 memchr::{Memchr, Memchr2, Memchr3},
60 memmem::Memmem,
61 teddy::Teddy,
62};
63
64/// A prefilter for accelerating regex searches.
65///
66/// If you already have your literals that you want to search with,
67/// then the vanilla [`Prefilter::new`] constructor is for you. But
68/// if you have an [`Hir`] value from the `regex-syntax` crate, then
69/// [`Prefilter::from_hir_prefix`] might be more convenient. Namely, it uses
70/// the [`regex-syntax::hir::literal`](regex_syntax::hir::literal) module to
71/// extract literal prefixes for you, optimize them and then select and build a
72/// prefilter matcher.
73///
74/// A prefilter must have **zero false negatives**. However, by its very
75/// nature, it may produce false positives. That is, a prefilter will never
76/// skip over a position in the haystack that corresponds to a match of the
77/// original regex pattern, but it *may* produce a match for a position
78/// in the haystack that does *not* correspond to a match of the original
79/// regex pattern. If you use either the [`Prefilter::from_hir_prefix`] or
80/// [`Prefilter::from_hirs_prefix`] constructors, then this guarantee is
81/// upheld for you automatically. This guarantee is not preserved if you use
82/// [`Prefilter::new`] though, since it is up to the caller to provide correct
83/// literal strings with respect to the original regex pattern.
84///
85/// # Cloning
86///
87/// It is an API guarantee that cloning a prefilter is cheap. That is, cloning
88/// it will not duplicate whatever heap memory is used to represent the
89/// underlying matcher.
90///
91/// # Example
92///
93/// This example shows how to attach a `Prefilter` to the
94/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) in order to accelerate
95/// searches.
96///
97/// ```
98/// use regex_automata::{
99/// nfa::thompson::pikevm::PikeVM,
100/// util::prefilter::Prefilter,
101/// Match, MatchKind,
102/// };
103///
104/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Bruce "])
105/// .expect("a prefilter");
106/// let re = PikeVM::builder()
107/// .configure(PikeVM::config().prefilter(Some(pre)))
108/// .build(r"Bruce \w+")?;
109/// let mut cache = re.create_cache();
110/// assert_eq!(
111/// Some(Match::must(0, 6..23)),
112/// re.find(&mut cache, "Hello Bruce Springsteen!"),
113/// );
114/// # Ok::<(), Box<dyn std::error::Error>>(())
115/// ```
116///
117/// But note that if you get your prefilter incorrect, it could lead to an
118/// incorrect result!
119///
120/// ```
121/// use regex_automata::{
122/// nfa::thompson::pikevm::PikeVM,
123/// util::prefilter::Prefilter,
124/// Match, MatchKind,
125/// };
126///
127/// // This prefilter is wrong!
128/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Patti "])
129/// .expect("a prefilter");
130/// let re = PikeVM::builder()
131/// .configure(PikeVM::config().prefilter(Some(pre)))
132/// .build(r"Bruce \w+")?;
133/// let mut cache = re.create_cache();
134/// // We find no match even though the regex does match.
135/// assert_eq!(
136/// None,
137/// re.find(&mut cache, "Hello Bruce Springsteen!"),
138/// );
139/// # Ok::<(), Box<dyn std::error::Error>>(())
140/// ```
141#[derive(Clone, Debug)]
142pub struct Prefilter {
143 #[cfg(not(feature = "alloc"))]
144 _unused: (),
145 #[cfg(feature = "alloc")]
146 pre: Arc<dyn PrefilterI>,
147 #[cfg(feature = "alloc")]
148 is_fast: bool,
149 #[cfg(feature = "alloc")]
150 max_needle_len: usize,
151}
152
153impl Prefilter {
154 /// Create a new prefilter from a sequence of needles and a corresponding
155 /// match semantics.
156 ///
157 /// This may return `None` for a variety of reasons, for example, if
158 /// a suitable prefilter could not be constructed. That might occur
159 /// if they are unavailable (e.g., the `perf-literal-substring` and
160 /// `perf-literal-multisubstring` features aren't enabled), or it might
161 /// occur because of heuristics or other artifacts of how the prefilter
162 /// works.
163 ///
164 /// Note that if you have an [`Hir`] expression, it may be more convenient
165 /// to use [`Prefilter::from_hir_prefix`]. It will automatically handle the
166 /// task of extracting prefix literals for you.
167 ///
168 /// # Example
169 ///
170 /// This example shows how match semantics can impact the matching
171 /// algorithm used by the prefilter. For this reason, it is important to
172 /// ensure that the match semantics given here are consistent with the
173 /// match semantics intended for the regular expression that the literals
174 /// were extracted from.
175 ///
176 /// ```
177 /// use regex_automata::{
178 /// util::{prefilter::Prefilter, syntax},
179 /// MatchKind, Span,
180 /// };
181 ///
182 /// let hay = "Hello samwise";
183 ///
184 /// // With leftmost-first, we find 'samwise' here because it comes
185 /// // before 'sam' in the sequence we give it..
186 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["samwise", "sam"])
187 /// .expect("a prefilter");
188 /// assert_eq!(
189 /// Some(Span::from(6..13)),
190 /// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
191 /// );
192 /// // Still with leftmost-first but with the literals reverse, now 'sam'
193 /// // will match instead!
194 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["sam", "samwise"])
195 /// .expect("a prefilter");
196 /// assert_eq!(
197 /// Some(Span::from(6..9)),
198 /// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
199 /// );
200 ///
201 /// # Ok::<(), Box<dyn std::error::Error>>(())
202 /// ```
203 pub fn new<B: AsRef<[u8]>>(
204 kind: MatchKind,
205 needles: &[B],
206 ) -> Option<Prefilter> {
207 Choice::new(kind, needles).and_then(|choice| {
208 let max_needle_len =
209 needles.iter().map(|b| b.as_ref().len()).max().unwrap_or(0);
210 Prefilter::from_choice(choice, max_needle_len)
211 })
212 }
213
214 /// This turns a prefilter selection into a `Prefilter`. That is, in turns
215 /// the enum given into a trait object.
216 fn from_choice(
217 choice: Choice,
218 max_needle_len: usize,
219 ) -> Option<Prefilter> {
220 #[cfg(not(feature = "alloc"))]
221 {
222 None
223 }
224 #[cfg(feature = "alloc")]
225 {
226 let pre: Arc<dyn PrefilterI> = match choice {
227 Choice::Memchr(p) => Arc::new(p),
228 Choice::Memchr2(p) => Arc::new(p),
229 Choice::Memchr3(p) => Arc::new(p),
230 Choice::Memmem(p) => Arc::new(p),
231 Choice::Teddy(p) => Arc::new(p),
232 Choice::ByteSet(p) => Arc::new(p),
233 Choice::AhoCorasick(p) => Arc::new(p),
234 };
235 let is_fast = pre.is_fast();
236 Some(Prefilter { pre, is_fast, max_needle_len })
237 }
238 }
239
240 /// This attempts to extract prefixes from the given `Hir` expression for
241 /// the given match semantics, and if possible, builds a prefilter for
242 /// them.
243 ///
244 /// # Example
245 ///
246 /// This example shows how to build a prefilter directly from an [`Hir`]
247 /// expression, and use to find an occurrence of a prefix from the regex
248 /// pattern.
249 ///
250 /// ```
251 /// use regex_automata::{
252 /// util::{prefilter::Prefilter, syntax},
253 /// MatchKind, Span,
254 /// };
255 ///
256 /// let hir = syntax::parse(r"(Bruce|Patti) \w+")?;
257 /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
258 /// .expect("a prefilter");
259 /// let hay = "Hello Patti Scialfa!";
260 /// assert_eq!(
261 /// Some(Span::from(6..12)),
262 /// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
263 /// );
264 ///
265 /// # Ok::<(), Box<dyn std::error::Error>>(())
266 /// ```
267 #[cfg(feature = "syntax")]
268 pub fn from_hir_prefix(kind: MatchKind, hir: &Hir) -> Option<Prefilter> {
269 Prefilter::from_hirs_prefix(kind, &[hir])
270 }
271
272 /// This attempts to extract prefixes from the given `Hir` expressions for
273 /// the given match semantics, and if possible, builds a prefilter for
274 /// them.
275 ///
276 /// Note that as of now, prefilters throw away information about which
277 /// pattern each literal comes from. In other words, when a prefilter finds
278 /// a match, there's no way to know which pattern (or patterns) it came
279 /// from. Therefore, in order to confirm a match, you'll have to check all
280 /// of the patterns by running the full regex engine.
281 ///
282 /// # Example
283 ///
284 /// This example shows how to build a prefilter directly from multiple
285 /// `Hir` expressions expression, and use it to find an occurrence of a
286 /// prefix from the regex patterns.
287 ///
288 /// ```
289 /// use regex_automata::{
290 /// util::{prefilter::Prefilter, syntax},
291 /// MatchKind, Span,
292 /// };
293 ///
294 /// let hirs = syntax::parse_many(&[
295 /// r"(Bruce|Patti) \w+",
296 /// r"Mrs?\. Doubtfire",
297 /// ])?;
298 /// let pre = Prefilter::from_hirs_prefix(MatchKind::LeftmostFirst, &hirs)
299 /// .expect("a prefilter");
300 /// let hay = "Hello Mrs. Doubtfire";
301 /// assert_eq!(
302 /// Some(Span::from(6..20)),
303 /// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
304 /// );
305 ///
306 /// # Ok::<(), Box<dyn std::error::Error>>(())
307 /// ```
308 #[cfg(feature = "syntax")]
309 pub fn from_hirs_prefix<H: Borrow<Hir>>(
310 kind: MatchKind,
311 hirs: &[H],
312 ) -> Option<Prefilter> {
313 prefixes(kind, hirs)
314 .literals()
315 .and_then(|lits| Prefilter::new(kind, lits))
316 }
317
318 /// Run this prefilter on `haystack[span.start..end]` and return a matching
319 /// span if one exists.
320 ///
321 /// The span returned is guaranteed to have a start position greater than
322 /// or equal to the one given, and an end position less than or equal to
323 /// the one given.
324 ///
325 /// # Example
326 ///
327 /// This example shows how to build a prefilter directly from an [`Hir`]
328 /// expression, and use it to find an occurrence of a prefix from the regex
329 /// pattern.
330 ///
331 /// ```
332 /// use regex_automata::{
333 /// util::{prefilter::Prefilter, syntax},
334 /// MatchKind, Span,
335 /// };
336 ///
337 /// let hir = syntax::parse(r"Bruce \w+")?;
338 /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
339 /// .expect("a prefilter");
340 /// let hay = "Hello Bruce Springsteen!";
341 /// assert_eq!(
342 /// Some(Span::from(6..12)),
343 /// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
344 /// );
345 ///
346 /// # Ok::<(), Box<dyn std::error::Error>>(())
347 /// ```
348 #[inline]
349 pub fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
350 #[cfg(not(feature = "alloc"))]
351 {
352 unreachable!()
353 }
354 #[cfg(feature = "alloc")]
355 {
356 self.pre.find(haystack, span)
357 }
358 }
359
360 /// Returns the span of a prefix of `haystack[span.start..span.end]` if
361 /// the prefilter matches.
362 ///
363 /// The span returned is guaranteed to have a start position equivalent to
364 /// the one given, and an end position less than or equal to the one given.
365 ///
366 /// # Example
367 ///
368 /// This example shows how to build a prefilter directly from an [`Hir`]
369 /// expression, and use it to find an occurrence of a prefix from the regex
370 /// pattern that begins at the start of a haystack only.
371 ///
372 /// ```
373 /// use regex_automata::{
374 /// util::{prefilter::Prefilter, syntax},
375 /// MatchKind, Span,
376 /// };
377 ///
378 /// let hir = syntax::parse(r"Bruce \w+")?;
379 /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
380 /// .expect("a prefilter");
381 /// let hay = "Hello Bruce Springsteen!";
382 /// // Nothing is found here because 'Bruce' does
383 /// // not occur at the beginning of our search.
384 /// assert_eq!(
385 /// None,
386 /// pre.prefix(hay.as_bytes(), Span::from(0..hay.len())),
387 /// );
388 /// // But if we change where we start the search
389 /// // to begin where 'Bruce ' begins, then a
390 /// // match will be found.
391 /// assert_eq!(
392 /// Some(Span::from(6..12)),
393 /// pre.prefix(hay.as_bytes(), Span::from(6..hay.len())),
394 /// );
395 ///
396 /// # Ok::<(), Box<dyn std::error::Error>>(())
397 /// ```
398 #[inline]
399 pub fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
400 #[cfg(not(feature = "alloc"))]
401 {
402 unreachable!()
403 }
404 #[cfg(feature = "alloc")]
405 {
406 self.pre.prefix(haystack, span)
407 }
408 }
409
410 /// Returns the heap memory, in bytes, used by the underlying prefilter.
411 #[inline]
412 pub fn memory_usage(&self) -> usize {
413 #[cfg(not(feature = "alloc"))]
414 {
415 unreachable!()
416 }
417 #[cfg(feature = "alloc")]
418 {
419 self.pre.memory_usage()
420 }
421 }
422
423 /// Return the length of the longest needle
424 /// in this Prefilter
425 #[inline]
426 pub fn max_needle_len(&self) -> usize {
427 #[cfg(not(feature = "alloc"))]
428 {
429 unreachable!()
430 }
431 #[cfg(feature = "alloc")]
432 {
433 self.max_needle_len
434 }
435 }
436
437 /// Implementations might return true here if they believe themselves to
438 /// be "fast." The concept of "fast" is deliberately left vague, but in
439 /// practice this usually corresponds to whether it's believed that SIMD
440 /// will be used.
441 ///
442 /// Why do we care about this? Well, some prefilter tricks tend to come
443 /// with their own bits of overhead, and so might only make sense if we
444 /// know that a scan will be *much* faster than the regex engine itself.
445 /// Otherwise, the trick may not be worth doing. Whether something is
446 /// "much" faster than the regex engine generally boils down to whether
447 /// SIMD is used. (But not always. Even a SIMD matcher with a high false
448 /// positive rate can become quite slow.)
449 ///
450 /// Even if this returns true, it is still possible for the prefilter to
451 /// be "slow." Remember, prefilters are just heuristics. We can't really
452 /// *know* a prefilter will be fast without actually trying the prefilter.
453 /// (Which of course we cannot afford to do.)
454 #[inline]
455 pub fn is_fast(&self) -> bool {
456 #[cfg(not(feature = "alloc"))]
457 {
458 unreachable!()
459 }
460 #[cfg(feature = "alloc")]
461 {
462 self.is_fast
463 }
464 }
465}
466
467/// A trait for abstracting over prefilters. Basically, a prefilter is
468/// something that do an unanchored *and* an anchored search in a haystack
469/// within a given span.
470///
471/// This exists pretty much only so that we can use prefilters as a trait
472/// object (which is what `Prefilter` is). If we ever move off of trait objects
473/// and to an enum, then it's likely this trait could be removed.
474pub(crate) trait PrefilterI:
475 Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static
476{
477 /// Run this prefilter on `haystack[span.start..end]` and return a matching
478 /// span if one exists.
479 ///
480 /// The span returned is guaranteed to have a start position greater than
481 /// or equal to the one given, and an end position less than or equal to
482 /// the one given.
483 fn find(&self, haystack: &[u8], span: Span) -> Option<Span>;
484
485 /// Returns the span of a prefix of `haystack[span.start..span.end]` if
486 /// the prefilter matches.
487 ///
488 /// The span returned is guaranteed to have a start position equivalent to
489 /// the one given, and an end position less than or equal to the one given.
490 fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span>;
491
492 /// Returns the heap memory, in bytes, used by the underlying prefilter.
493 fn memory_usage(&self) -> usize;
494
495 /// Implementations might return true here if they believe themselves to
496 /// be "fast." See [`Prefilter::is_fast`] for more details.
497 fn is_fast(&self) -> bool;
498}
499
500#[cfg(feature = "alloc")]
501impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
502 #[cfg_attr(feature = "perf-inline", inline(always))]
503 fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
504 (&**self).find(haystack, span)
505 }
506
507 #[cfg_attr(feature = "perf-inline", inline(always))]
508 fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
509 (&**self).prefix(haystack, span)
510 }
511
512 #[cfg_attr(feature = "perf-inline", inline(always))]
513 fn memory_usage(&self) -> usize {
514 (&**self).memory_usage()
515 }
516
517 #[cfg_attr(feature = "perf-inline", inline(always))]
518 fn is_fast(&self) -> bool {
519 (&**self).is_fast()
520 }
521}
522
523/// A type that encapsulates the selection of a prefilter algorithm from a
524/// sequence of needles.
525///
526/// The existence of this type is a little tricky, because we don't (currently)
527/// use it for performing a search. Instead, we really only consume it by
528/// converting the underlying prefilter into a trait object, whether that be
529/// `dyn PrefilterI` or `dyn Strategy` (for the meta regex engine). In order
530/// to avoid re-copying the prefilter selection logic, we isolate it here, and
531/// then force anything downstream that wants to convert it to a trait object
532/// to do trivial case analysis on it.
533///
534/// One wonders whether we *should* use an enum instead of a trait object.
535/// At time of writing, I chose trait objects based on instinct because 1) I
536/// knew I wasn't going to inline anything and 2) there would potentially be
537/// many different choices. However, as of time of writing, I haven't actually
538/// compared the trait object approach to the enum approach. That probably
539/// should be litigated, but I ran out of steam.
540///
541/// Note that if the `alloc` feature is disabled, then values of this type
542/// are (and should) never be constructed. Also, in practice, for any of the
543/// prefilters to be selected, you'll need at least one of the `perf-literal-*`
544/// features enabled.
545#[derive(Clone, Debug)]
546pub(crate) enum Choice {
547 Memchr(Memchr),
548 Memchr2(Memchr2),
549 Memchr3(Memchr3),
550 Memmem(Memmem),
551 Teddy(Teddy),
552 ByteSet(ByteSet),
553 AhoCorasick(AhoCorasick),
554}
555
556impl Choice {
557 /// Select what is believed to be the best prefilter algorithm for the
558 /// match semantics and sequence of needles given.
559 ///
560 /// This selection algorithm uses the needles as given without any
561 /// modification. For example, if `[bar]` is given, then this doesn't
562 /// try to select `memchr` for `b`. Instead, it would select `memmem`
563 /// for `bar`. If callers would want `memchr` selected for `[bar]`, then
564 /// callers should massages the literals themselves. That is, callers are
565 /// responsible for heuristics surrounding which sequence of literals is
566 /// best.
567 ///
568 /// What this selection algorithm does is attempt to use the fastest
569 /// prefilter that works for the literals given. So if `[a, b]`, is given,
570 /// then `memchr2` is selected.
571 ///
572 /// Of course, which prefilter is selected is also subject to what
573 /// is available. For example, if `alloc` isn't enabled, then
574 /// that limits which prefilters can be selected. Similarly, if
575 /// `perf-literal-substring` isn't enabled, then nothing from the `memchr`
576 /// crate can be returned.
577 pub(crate) fn new<B: AsRef<[u8]>>(
578 kind: MatchKind,
579 needles: &[B],
580 ) -> Option<Choice> {
581 // An empty set means the regex matches nothing, so no sense in
582 // building a prefilter.
583 if needles.len() == 0 {
584 debug!("prefilter building failed: found empty set of literals");
585 return None;
586 }
587 // If the regex can match the empty string, then the prefilter
588 // will by definition match at every position. This is obviously
589 // completely ineffective.
590 if needles.iter().any(|n| n.as_ref().is_empty()) {
591 debug!("prefilter building failed: literals match empty string");
592 return None;
593 }
594 // BREADCRUMBS: Perhaps the literal optimizer should special case
595 // sequences of length two or three if the leading bytes of each are
596 // "rare"? Or perhaps, if there are two or three total possible leading
597 // bytes, regardless of the number of literals, and all are rare...
598 // Then well, perhaps we should use memchr2 or memchr3 in those cases?
599 if let Some(pre) = Memchr::new(kind, needles) {
600 debug!("prefilter built: memchr");
601 return Some(Choice::Memchr(pre));
602 }
603 if let Some(pre) = Memchr2::new(kind, needles) {
604 debug!("prefilter built: memchr2");
605 return Some(Choice::Memchr2(pre));
606 }
607 if let Some(pre) = Memchr3::new(kind, needles) {
608 debug!("prefilter built: memchr3");
609 return Some(Choice::Memchr3(pre));
610 }
611 if let Some(pre) = Memmem::new(kind, needles) {
612 debug!("prefilter built: memmem");
613 return Some(Choice::Memmem(pre));
614 }
615 if let Some(pre) = Teddy::new(kind, needles) {
616 debug!("prefilter built: teddy");
617 return Some(Choice::Teddy(pre));
618 }
619 if let Some(pre) = ByteSet::new(kind, needles) {
620 debug!("prefilter built: byteset");
621 return Some(Choice::ByteSet(pre));
622 }
623 if let Some(pre) = AhoCorasick::new(kind, needles) {
624 debug!("prefilter built: aho-corasick");
625 return Some(Choice::AhoCorasick(pre));
626 }
627 debug!("prefilter building failed: no strategy could be found");
628 None
629 }
630}
631
632/// Extracts all of the prefix literals from the given HIR expressions into a
633/// single `Seq`. The literals in the sequence are ordered with respect to the
634/// order of the given HIR expressions and consistent with the match semantics
635/// given.
636///
637/// The sequence returned is "optimized." That is, they may be shrunk or even
638/// truncated according to heuristics with the intent of making them more
639/// useful as a prefilter. (Which translates to both using faster algorithms
640/// and minimizing the false positive rate.)
641///
642/// Note that this erases any connection between the literals and which pattern
643/// (or patterns) they came from.
644///
645/// The match kind given must correspond to the match semantics of the regex
646/// that is represented by the HIRs given. The match semantics may change the
647/// literal sequence returned.
648#[cfg(feature = "syntax")]
649pub(crate) fn prefixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq
650where
651 H: core::borrow::Borrow<Hir>,
652{
653 let mut extractor = literal::Extractor::new();
654 extractor.kind(literal::ExtractKind::Prefix);
655
656 let mut prefixes = literal::Seq::empty();
657 for hir in hirs {
658 prefixes.union(&mut extractor.extract(hir.borrow()));
659 }
660 debug!(
661 "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
662 prefixes.len(),
663 prefixes.is_exact(),
664 prefixes
665 );
666 match kind {
667 MatchKind::All => {
668 prefixes.sort();
669 prefixes.dedup();
670 }
671 MatchKind::LeftmostFirst => {
672 prefixes.optimize_for_prefix_by_preference();
673 }
674 }
675 debug!(
676 "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
677 prefixes.len(),
678 prefixes.is_exact(),
679 prefixes
680 );
681 prefixes
682}
683
684/// Like `prefixes`, but for all suffixes of all matches for the given HIRs.
685#[cfg(feature = "syntax")]
686pub(crate) fn suffixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq
687where
688 H: core::borrow::Borrow<Hir>,
689{
690 let mut extractor = literal::Extractor::new();
691 extractor.kind(literal::ExtractKind::Suffix);
692
693 let mut suffixes = literal::Seq::empty();
694 for hir in hirs {
695 suffixes.union(&mut extractor.extract(hir.borrow()));
696 }
697 debug!(
698 "suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
699 suffixes.len(),
700 suffixes.is_exact(),
701 suffixes
702 );
703 match kind {
704 MatchKind::All => {
705 suffixes.sort();
706 suffixes.dedup();
707 }
708 MatchKind::LeftmostFirst => {
709 suffixes.optimize_for_suffix_by_preference();
710 }
711 }
712 debug!(
713 "suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
714 suffixes.len(),
715 suffixes.is_exact(),
716 suffixes
717 );
718 suffixes
719}
720