mod.rs source code [crates/regex-automata-0.4.6/src/util/prefilter/mod.rs]

1	/!*
2	Defines a prefilter for accelerating regex searches.
3
4	A prefilter can be created by building a [`Prefilter`] value.
5
6	A prefilter represents one of the most important optimizations available for
7	accelerating regex searches. The idea of a prefilter is to very quickly find
8	candidate locations in a haystack where a regex _could_ match. Once a candidate
9	is found, it is then intended for the regex engine to run at that position to
10	determine whether the candidate is a match or a false positive.
11
12	In the aforementioned description of the prefilter optimization also lay its
13	demise. Namely, if a prefilter has a high false positive rate and it produces
14	lots of candidates, then a prefilter can overall make a regex search slower.
15	It can run more slowly because more time is spent ping-ponging between the
16	prefilter search and the regex engine attempting to confirm each candidate as
17	a match. This ping-ponging has overhead that adds up, and is exacerbated by
18	a high false positive rate.
19
20	Nevertheless, the optimization is still generally worth performing in most
21	cases. Particularly given just how much throughput can be improved. (It is not
22	uncommon for prefilter optimizations to improve throughput by one or two orders
23	of magnitude.)
24
25	Typically a prefilter is used to find occurrences of literal prefixes from a
26	regex pattern, but this isn't required. A prefilter can be used to look for
27	suffixes or even inner literals.
28
29	Note that as of now, prefilters throw away information about which pattern
30	each literal comes from. In other words, when a prefilter finds a match,
31	there's no way to know which pattern (or patterns) it came from. Therefore,
32	in order to confirm a match, you'll have to check all of the patterns by
33	running the full regex engine.
34	*/
35
36	mod aho_corasick;
37	mod byteset;
38	mod memchr;
39	mod memmem;
40	mod teddy;
41
42	use core::{
43	borrow::Borrow,
44	fmt::Debug,
45	panic::{RefUnwindSafe, UnwindSafe},
46	};
47
48	#[cfg(feature = "alloc")]
49	use alloc::sync::Arc;
50
51	#[cfg(feature = "syntax")]
52	use regex_syntax::hir::{literal, Hir};
53
54	use crate::util::search::{MatchKind, Span};
55
56	pub(crate) use crate::util::prefilter::{
57	aho_corasick::AhoCorasick,
58	byteset::ByteSet,
59	memchr::{Memchr, Memchr2, Memchr3},
60	memmem::Memmem,
61	teddy::Teddy,
62	};
63
64	/// A prefilter for accelerating regex searches.
65	///
66	/// If you already have your literals that you want to search with,
67	/// then the vanilla [`Prefilter::new`] constructor is for you. But
68	/// if you have an [`Hir`] value from the `regex-syntax` crate, then
69	/// [`Prefilter::from_hir_prefix`] might be more convenient. Namely, it uses
70	/// the [`regex-syntax::hir::literal`](regex_syntax::hir::literal) module to
71	/// extract literal prefixes for you, optimize them and then select and build a
72	/// prefilter matcher.
73	///
74	/// A prefilter must have zero false negatives. However, by its very
75	/// nature, it may produce false positives. That is, a prefilter will never
76	/// skip over a position in the haystack that corresponds to a match of the
77	/// original regex pattern, but it may* produce a match for a position*
78	/// in the haystack that does not* correspond to a match of the original*
79	/// regex pattern. If you use either the [`Prefilter::from_hir_prefix`] or
80	/// [`Prefilter::from_hirs_prefix`] constructors, then this guarantee is
81	/// upheld for you automatically. This guarantee is not preserved if you use
82	/// [`Prefilter::new`] though, since it is up to the caller to provide correct
83	/// literal strings with respect to the original regex pattern.
84	///
85	/// # Cloning
86	///
87	/// It is an API guarantee that cloning a prefilter is cheap. That is, cloning
88	/// it will not duplicate whatever heap memory is used to represent the
89	/// underlying matcher.
90	///
91	/// # Example
92	///
93	/// This example shows how to attach a `Prefilter` to the
94	/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) in order to accelerate
95	/// searches.
96	///
97	/// ```
98	/// use regex_automata::{
99	/// nfa::thompson::pikevm::PikeVM,
100	/// util::prefilter::Prefilter,
101	/// Match, MatchKind,
102	/// };
103	///
104	/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Bruce "])
105	/// .expect("a prefilter");
106	/// let re = PikeVM::builder()
107	/// .configure(PikeVM::config().prefilter(Some(pre)))
108	/// .build(r"Bruce \w+")?;
109	/// let mut cache = re.create_cache();
110	/// assert_eq!(
111	/// Some(Match::must(`0`, `6`..`23`)),
112	/// re.find(&mut cache, "Hello Bruce Springsteen!"),
113	/// );
114	/// # Ok::<(), Box<dyn std::error::Error>>(())
115	/// ```
116	///
117	/// But note that if you get your prefilter incorrect, it could lead to an
118	/// incorrect result!
119	///
120	/// ```
121	/// use regex_automata::{
122	/// nfa::thompson::pikevm::PikeVM,
123	/// util::prefilter::Prefilter,
124	/// Match, MatchKind,
125	/// };
126	///
127	/// // This prefilter is wrong!
128	/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Patti "])
129	/// .expect("a prefilter");
130	/// let re = PikeVM::builder()
131	/// .configure(PikeVM::config().prefilter(Some(pre)))
132	/// .build(r"Bruce \w+")?;
133	/// let mut cache = re.create_cache();
134	/// // We find no match even though the regex does match.
135	/// assert_eq!(
136	/// None,
137	/// re.find(&mut cache, "Hello Bruce Springsteen!"),
138	/// );
139	/// # Ok::<(), Box<dyn std::error::Error>>(())
140	/// ```
141	#[derive(Clone, Debug)]
142	pub struct Prefilter {
143	#[cfg(not(feature = "alloc"))]
144	_unused: (),
145	#[cfg(feature = "alloc")]
146	pre: Arc<dyn PrefilterI>,
147	#[cfg(feature = "alloc")]
148	is_fast: bool,
149	#[cfg(feature = "alloc")]
150	max_needle_len: usize,
151	}
152
153	impl Prefilter {
154	/// Create a new prefilter from a sequence of needles and a corresponding
155	/// match semantics.
156	///
157	/// This may return `None` for a variety of reasons, for example, if
158	/// a suitable prefilter could not be constructed. That might occur
159	/// if they are unavailable (e.g., the `perf-literal-substring` and
160	/// `perf-literal-multisubstring` features aren't enabled), or it might
161	/// occur because of heuristics or other artifacts of how the prefilter
162	/// works.
163	///
164	/// Note that if you have an [`Hir`] expression, it may be more convenient
165	/// to use [`Prefilter::from_hir_prefix`]. It will automatically handle the
166	/// task of extracting prefix literals for you.
167	///
168	/// # Example
169	///
170	/// This example shows how match semantics can impact the matching
171	/// algorithm used by the prefilter. For this reason, it is important to
172	/// ensure that the match semantics given here are consistent with the
173	/// match semantics intended for the regular expression that the literals
174	/// were extracted from.
175	///
176	/// ```
177	/// use regex_automata::{
178	/// util::{prefilter::Prefilter, syntax},
179	/// MatchKind, Span,
180	/// };
181	///
182	/// let hay = "Hello samwise";
183	///
184	/// // With leftmost-first, we find 'samwise' here because it comes
185	/// // before 'sam' in the sequence we give it..
186	/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["samwise", "sam"])
187	/// .expect("a prefilter");
188	/// assert_eq!(
189	/// Some(Span::from(`6`..`13`)),
190	/// pre.find(hay.as_bytes(), Span::from(`0`..hay.len())),
191	/// );
192	/// // Still with leftmost-first but with the literals reverse, now 'sam'
193	/// // will match instead!
194	/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["sam", "samwise"])
195	/// .expect("a prefilter");
196	/// assert_eq!(
197	/// Some(Span::from(`6`..`9`)),
198	/// pre.find(hay.as_bytes(), Span::from(`0`..hay.len())),
199	/// );
200	///
201	/// # Ok::<(), Box<dyn std::error::Error>>(())
202	/// ```
203	pub fn new<B: AsRef<[u8]>>(
204	kind: MatchKind,
205	needles: &[B],
206	) -> Option<Prefilter> {
207	Choice::new(kind, needles).and_then(\|choice\| {
208	let max_needle_len =
209	needles.iter().map(\|b\| b.as_ref().len()).max().unwrap_or(`0`);
210	Prefilter::from_choice(choice, max_needle_len)
211	})
212	}
213
214	/// This turns a prefilter selection into a `Prefilter`. That is, in turns
215	/// the enum given into a trait object.
216	fn from_choice(
217	choice: Choice,
218	max_needle_len: usize,
219	) -> Option<Prefilter> {
220	#[cfg(not(feature = "alloc"))]
221	{
222	None
223	}
224	#[cfg(feature = "alloc")]
225	{
226	let pre: Arc<dyn PrefilterI> = match choice {
227	Choice::Memchr(p) => Arc::new(p),
228	Choice::Memchr2(p) => Arc::new(p),
229	Choice::Memchr3(p) => Arc::new(p),
230	Choice::Memmem(p) => Arc::new(p),
231	Choice::Teddy(p) => Arc::new(p),
232	Choice::ByteSet(p) => Arc::new(p),
233	Choice::AhoCorasick(p) => Arc::new(p),
234	};
235	let is_fast = pre.is_fast();
236	Some(Prefilter { pre, is_fast, max_needle_len })
237	}
238	}
239
240	/// This attempts to extract prefixes from the given `Hir` expression for
241	/// the given match semantics, and if possible, builds a prefilter for
242	/// them.
243	///
244	/// # Example
245	///
246	/// This example shows how to build a prefilter directly from an [`Hir`]
247	/// expression, and use to find an occurrence of a prefix from the regex
248	/// pattern.
249	///
250	/// ```
251	/// use regex_automata::{
252	/// util::{prefilter::Prefilter, syntax},
253	/// MatchKind, Span,
254	/// };
255	///
256	/// let hir = syntax::parse(r"(Bruce\|Patti) \w+")?;
257	/// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
258	/// .expect("a prefilter");
259	/// let hay = "Hello Patti Scialfa!";
260	/// assert_eq!(
261	/// Some(Span::from(`6`..`12`)),
262	/// pre.find(hay.as_bytes(), Span::from(`0`..hay.len())),
263	/// );
264	///
265	/// # Ok::<(), Box<dyn std::error::Error>>(())
266	/// ```
267	#[cfg(feature = "syntax")]
268	pub fn from_hir_prefix(kind: MatchKind, hir: &Hir) -> Option<Prefilter> {
269	Prefilter::from_hirs_prefix(kind, &[hir])
270	}
271
272	/// This attempts to extract prefixes from the given `Hir` expressions for
273	/// the given match semantics, and if possible, builds a prefilter for
274	/// them.
275	///
276	/// Note that as of now, prefilters throw away information about which
277	/// pattern each literal comes from. In other words, when a prefilter finds
278	/// a match, there's no way to know which pattern (or patterns) it came
279	/// from. Therefore, in order to confirm a match, you'll have to check all
280	/// of the patterns by running the full regex engine.
281	///
282	/// # Example
283	///
284	/// This example shows how to build a prefilter directly from multiple
285	/// `Hir` expressions expression, and use it to find an occurrence of a
286	/// prefix from the regex patterns.
287	///
288	/// ```
289	/// use regex_automata::{
290	/// util::{prefilter::Prefilter, syntax},
291	/// MatchKind, Span,
292	/// };
293	///
294	/// let hirs = syntax::parse_many(&[
295	/// r"(Bruce\|Patti) \w+",
296	/// r"Mrs?\. Doubtfire",
297	/// ])?;
298	/// let pre = Prefilter::from_hirs_prefix(MatchKind::LeftmostFirst, &hirs)
299	/// .expect("a prefilter");
300	/// let hay = "Hello Mrs. Doubtfire";
301	/// assert_eq!(
302	/// Some(Span::from(`6`..`20`)),
303	/// pre.find(hay.as_bytes(), Span::from(`0`..hay.len())),
304	/// );
305	///
306	/// # Ok::<(), Box<dyn std::error::Error>>(())
307	/// ```
308	#[cfg(feature = "syntax")]
309	pub fn from_hirs_prefix<H: Borrow<Hir>>(
310	kind: MatchKind,
311	hirs: &[H],
312	) -> Option<Prefilter> {
313	prefixes(kind, hirs)
314	.literals()
315	.and_then(\|lits\| Prefilter::new(kind, lits))
316	}
317
318	/// Run this prefilter on `haystack[span.start..end]` and return a matching
319	/// span if one exists.
320	///
321	/// The span returned is guaranteed to have a start position greater than
322	/// or equal to the one given, and an end position less than or equal to
323	/// the one given.
324	///
325	/// # Example
326	///
327	/// This example shows how to build a prefilter directly from an [`Hir`]
328	/// expression, and use it to find an occurrence of a prefix from the regex
329	/// pattern.
330	///
331	/// ```
332	/// use regex_automata::{
333	/// util::{prefilter::Prefilter, syntax},
334	/// MatchKind, Span,
335	/// };
336	///
337	/// let hir = syntax::parse(r"Bruce \w+")?;
338	/// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
339	/// .expect("a prefilter");
340	/// let hay = "Hello Bruce Springsteen!";
341	/// assert_eq!(
342	/// Some(Span::from(`6`..`12`)),
343	/// pre.find(hay.as_bytes(), Span::from(`0`..hay.len())),
344	/// );
345	///
346	/// # Ok::<(), Box<dyn std::error::Error>>(())
347	/// ```
348	#[inline]
349	pub fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
350	#[cfg(not(feature = "alloc"))]
351	{
352	unreachable!()
353	}
354	#[cfg(feature = "alloc")]
355	{
356	self.pre.find(haystack, span)
357	}
358	}
359
360	/// Returns the span of a prefix of `haystack[span.start..span.end]` if
361	/// the prefilter matches.
362	///
363	/// The span returned is guaranteed to have a start position equivalent to
364	/// the one given, and an end position less than or equal to the one given.
365	///
366	/// # Example
367	///
368	/// This example shows how to build a prefilter directly from an [`Hir`]
369	/// expression, and use it to find an occurrence of a prefix from the regex
370	/// pattern that begins at the start of a haystack only.
371	///
372	/// ```
373	/// use regex_automata::{
374	/// util::{prefilter::Prefilter, syntax},
375	/// MatchKind, Span,
376	/// };
377	///
378	/// let hir = syntax::parse(r"Bruce \w+")?;
379	/// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
380	/// .expect("a prefilter");
381	/// let hay = "Hello Bruce Springsteen!";
382	/// // Nothing is found here because 'Bruce' does
383	/// // not occur at the beginning of our search.
384	/// assert_eq!(
385	/// None,
386	/// pre.prefix(hay.as_bytes(), Span::from(`0`..hay.len())),
387	/// );
388	/// // But if we change where we start the search
389	/// // to begin where 'Bruce ' begins, then a
390	/// // match will be found.
391	/// assert_eq!(
392	/// Some(Span::from(`6`..`12`)),
393	/// pre.prefix(hay.as_bytes(), Span::from(`6`..hay.len())),
394	/// );
395	///
396	/// # Ok::<(), Box<dyn std::error::Error>>(())
397	/// ```
398	#[inline]
399	pub fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
400	#[cfg(not(feature = "alloc"))]
401	{
402	unreachable!()
403	}
404	#[cfg(feature = "alloc")]
405	{
406	self.pre.prefix(haystack, span)
407	}
408	}
409
410	/// Returns the heap memory, in bytes, used by the underlying prefilter.
411	#[inline]
412	pub fn memory_usage(&self) -> usize {
413	#[cfg(not(feature = "alloc"))]
414	{
415	unreachable!()
416	}
417	#[cfg(feature = "alloc")]
418	{
419	self.pre.memory_usage()
420	}
421	}
422
423	/// Return the length of the longest needle
424	/// in this Prefilter
425	#[inline]
426	pub fn max_needle_len(&self) -> usize {
427	#[cfg(not(feature = "alloc"))]
428	{
429	unreachable!()
430	}
431	#[cfg(feature = "alloc")]
432	{
433	self.max_needle_len
434	}
435	}
436
437	/// Implementations might return true here if they believe themselves to
438	/// be "fast." The concept of "fast" is deliberately left vague, but in
439	/// practice this usually corresponds to whether it's believed that SIMD
440	/// will be used.
441	///
442	/// Why do we care about this? Well, some prefilter tricks tend to come
443	/// with their own bits of overhead, and so might only make sense if we
444	/// know that a scan will be much* faster than the regex engine itself.*
445	/// Otherwise, the trick may not be worth doing. Whether something is
446	/// "much" faster than the regex engine generally boils down to whether
447	/// SIMD is used. (But not always. Even a SIMD matcher with a high false
448	/// positive rate can become quite slow.)
449	///
450	/// Even if this returns true, it is still possible for the prefilter to
451	/// be "slow." Remember, prefilters are just heuristics. We can't really
452	/// know* a prefilter will be fast without actually trying the prefilter.*
453	/// (Which of course we cannot afford to do.)
454	#[inline]
455	pub fn is_fast(&self) -> bool {
456	#[cfg(not(feature = "alloc"))]
457	{
458	unreachable!()
459	}
460	#[cfg(feature = "alloc")]
461	{
462	self.is_fast
463	}
464	}
465	}
466
467	/// A trait for abstracting over prefilters. Basically, a prefilter is
468	/// something that do an unanchored and* an anchored search in a haystack*
469	/// within a given span.
470	///
471	/// This exists pretty much only so that we can use prefilters as a trait
472	/// object (which is what `Prefilter` is). If we ever move off of trait objects
473	/// and to an enum, then it's likely this trait could be removed.
474	pub(crate) trait PrefilterI:
475	Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static
476	{
477	/// Run this prefilter on `haystack[span.start..end]` and return a matching
478	/// span if one exists.
479	///
480	/// The span returned is guaranteed to have a start position greater than
481	/// or equal to the one given, and an end position less than or equal to
482	/// the one given.
483	fn find(&self, haystack: &[u8], span: Span) -> Option<Span>;
484
485	/// Returns the span of a prefix of `haystack[span.start..span.end]` if
486	/// the prefilter matches.
487	///
488	/// The span returned is guaranteed to have a start position equivalent to
489	/// the one given, and an end position less than or equal to the one given.
490	fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span>;
491
492	/// Returns the heap memory, in bytes, used by the underlying prefilter.
493	fn memory_usage(&self) -> usize;
494
495	/// Implementations might return true here if they believe themselves to
496	/// be "fast." See [`Prefilter::is_fast`] for more details.
497	fn is_fast(&self) -> bool;
498	}
499
500	#[cfg(feature = "alloc")]
501	impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
502	#[cfg_attr(feature = "perf-inline", inline(always))]
503	fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
504	(&**self).find(haystack, span)
505	}
506
507	#[cfg_attr(feature = "perf-inline", inline(always))]
508	fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
509	(&**self).prefix(haystack, span)
510	}
511
512	#[cfg_attr(feature = "perf-inline", inline(always))]
513	fn memory_usage(&self) -> usize {
514	(&**self).memory_usage()
515	}
516
517	#[cfg_attr(feature = "perf-inline", inline(always))]
518	fn is_fast(&self) -> bool {
519	(&**self).is_fast()
520	}
521	}
522
523	/// A type that encapsulates the selection of a prefilter algorithm from a
524	/// sequence of needles.
525	///
526	/// The existence of this type is a little tricky, because we don't (currently)
527	/// use it for performing a search. Instead, we really only consume it by
528	/// converting the underlying prefilter into a trait object, whether that be
529	/// `dyn PrefilterI` or `dyn Strategy` (for the meta regex engine). In order
530	/// to avoid re-copying the prefilter selection logic, we isolate it here, and
531	/// then force anything downstream that wants to convert it to a trait object
532	/// to do trivial case analysis on it.
533	///
534	/// One wonders whether we should* use an enum instead of a trait object.*
535	/// At time of writing, I chose trait objects based on instinct because 1) I
536	/// knew I wasn't going to inline anything and 2) there would potentially be
537	/// many different choices. However, as of time of writing, I haven't actually
538	/// compared the trait object approach to the enum approach. That probably
539	/// should be litigated, but I ran out of steam.
540	///
541	/// Note that if the `alloc` feature is disabled, then values of this type
542	/// are (and should) never be constructed. Also, in practice, for any of the
543	/// prefilters to be selected, you'll need at least one of the `perf-literal-`*
544	/// features enabled.
545	#[derive(Clone, Debug)]
546	pub(crate) enum Choice {
547	Memchr(Memchr),
548	Memchr2(Memchr2),
549	Memchr3(Memchr3),
550	Memmem(Memmem),
551	Teddy(Teddy),
552	ByteSet(ByteSet),
553	AhoCorasick(AhoCorasick),
554	}
555
556	impl Choice {
557	/// Select what is believed to be the best prefilter algorithm for the
558	/// match semantics and sequence of needles given.
559	///
560	/// This selection algorithm uses the needles as given without any
561	/// modification. For example, if `[bar]` is given, then this doesn't
562	/// try to select `memchr` for `b`. Instead, it would select `memmem`
563	/// for `bar`. If callers would want `memchr` selected for `[bar]`, then
564	/// callers should massages the literals themselves. That is, callers are
565	/// responsible for heuristics surrounding which sequence of literals is
566	/// best.
567	///
568	/// What this selection algorithm does is attempt to use the fastest
569	/// prefilter that works for the literals given. So if `[a, b]`, is given,
570	/// then `memchr2` is selected.
571	///
572	/// Of course, which prefilter is selected is also subject to what
573	/// is available. For example, if `alloc` isn't enabled, then
574	/// that limits which prefilters can be selected. Similarly, if
575	/// `perf-literal-substring` isn't enabled, then nothing from the `memchr`
576	/// crate can be returned.
577	pub(crate) fn new<B: AsRef<[u8]>>(
578	kind: MatchKind,
579	needles: &[B],
580	) -> Option<Choice> {
581	// An empty set means the regex matches nothing, so no sense in
582	// building a prefilter.
583	if needles.len() == `0` {
584	debug!("prefilter building failed: found empty set of literals");
585	return None;
586	}
587	// If the regex can match the empty string, then the prefilter
588	// will by definition match at every position. This is obviously
589	// completely ineffective.
590	if needles.iter().any(\|n\| n.as_ref().is_empty()) {
591	debug!("prefilter building failed: literals match empty string");
592	return None;
593	}
594	// BREADCRUMBS: Perhaps the literal optimizer should special case
595	// sequences of length two or three if the leading bytes of each are
596	// "rare"? Or perhaps, if there are two or three total possible leading
597	// bytes, regardless of the number of literals, and all are rare...
598	// Then well, perhaps we should use memchr2 or memchr3 in those cases?
599	if let Some(pre) = Memchr::new(kind, needles) {
600	debug!("prefilter built: memchr");
601	return Some(Choice::Memchr(pre));
602	}
603	if let Some(pre) = Memchr2::new(kind, needles) {
604	debug!("prefilter built: memchr2");
605	return Some(Choice::Memchr2(pre));
606	}
607	if let Some(pre) = Memchr3::new(kind, needles) {
608	debug!("prefilter built: memchr3");
609	return Some(Choice::Memchr3(pre));
610	}
611	if let Some(pre) = Memmem::new(kind, needles) {
612	debug!("prefilter built: memmem");
613	return Some(Choice::Memmem(pre));
614	}
615	if let Some(pre) = Teddy::new(kind, needles) {
616	debug!("prefilter built: teddy");
617	return Some(Choice::Teddy(pre));
618	}
619	if let Some(pre) = ByteSet::new(kind, needles) {
620	debug!("prefilter built: byteset");
621	return Some(Choice::ByteSet(pre));
622	}
623	if let Some(pre) = AhoCorasick::new(kind, needles) {
624	debug!("prefilter built: aho-corasick");
625	return Some(Choice::AhoCorasick(pre));
626	}
627	debug!("prefilter building failed: no strategy could be found");
628	None
629	}
630	}
631
632	/// Extracts all of the prefix literals from the given HIR expressions into a
633	/// single `Seq`. The literals in the sequence are ordered with respect to the
634	/// order of the given HIR expressions and consistent with the match semantics
635	/// given.
636	///
637	/// The sequence returned is "optimized." That is, they may be shrunk or even
638	/// truncated according to heuristics with the intent of making them more
639	/// useful as a prefilter. (Which translates to both using faster algorithms
640	/// and minimizing the false positive rate.)
641	///
642	/// Note that this erases any connection between the literals and which pattern
643	/// (or patterns) they came from.
644	///
645	/// The match kind given must correspond to the match semantics of the regex
646	/// that is represented by the HIRs given. The match semantics may change the
647	/// literal sequence returned.
648	#[cfg(feature = "syntax")]
649	pub(crate) fn prefixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq
650	where
651	H: core::borrow::Borrow<Hir>,
652	{
653	let mut extractor = literal::Extractor::new();
654	extractor.kind(literal::ExtractKind::Prefix);
655
656	let mut prefixes = literal::Seq::empty();
657	for hir in hirs {
658	prefixes.union(&mut extractor.extract(hir.borrow()));
659	}
660	debug!(
661	"prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
662	prefixes.len(),
663	prefixes.is_exact(),
664	prefixes
665	);
666	match kind {
667	MatchKind::All => {
668	prefixes.sort();
669	prefixes.dedup();
670	}
671	MatchKind::LeftmostFirst => {
672	prefixes.optimize_for_prefix_by_preference();
673	}
674	}
675	debug!(
676	"prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
677	prefixes.len(),
678	prefixes.is_exact(),
679	prefixes
680	);
681	prefixes
682	}
683
684	/// Like `prefixes`, but for all suffixes of all matches for the given HIRs.
685	#[cfg(feature = "syntax")]
686	pub(crate) fn suffixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq
687	where
688	H: core::borrow::Borrow<Hir>,
689	{
690	let mut extractor = literal::Extractor::new();
691	extractor.kind(literal::ExtractKind::Suffix);
692
693	let mut suffixes = literal::Seq::empty();
694	for hir in hirs {
695	suffixes.union(&mut extractor.extract(hir.borrow()));
696	}
697	debug!(
698	"suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
699	suffixes.len(),
700	suffixes.is_exact(),
701	suffixes
702	);
703	match kind {
704	MatchKind::All => {
705	suffixes.sort();
706	suffixes.dedup();
707	}
708	MatchKind::LeftmostFirst => {
709	suffixes.optimize_for_suffix_by_preference();
710	}
711	}
712	debug!(
713	"suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
714	suffixes.len(),
715	suffixes.is_exact(),
716	suffixes
717	);
718	suffixes
719	}
720

Provided by KDAB

Definitions