aho_corasick.rs - Codebrowser

1	use crate::util::{
2	prefilter::PrefilterI,
3	search::{MatchKind, Span},
4	};
5
6	#[derive(Clone, Debug)]
7	pub(crate) struct AhoCorasick {
8	#[cfg(not(feature = "perf-literal-multisubstring"))]
9	_unused: (),
10	#[cfg(feature = "perf-literal-multisubstring")]
11	ac: aho_corasick::AhoCorasick,
12	}
13
14	impl AhoCorasick {
15	pub(crate) fn new<B: AsRef<[u8]>>(
16	kind: MatchKind,
17	needles: &[B],
18	) -> Option<AhoCorasick> {
19	#[cfg(not(feature = "perf-literal-multisubstring"))]
20	{
21	None
22	}
23	#[cfg(feature = "perf-literal-multisubstring")]
24	{
25	// We used to use `aho_corasick::MatchKind::Standard` here when
26	// `kind` was `MatchKind::All`, but this is not correct. The
27	// "standard" Aho-Corasick match semantics are to report a match
28	// immediately as soon as it is seen, but `All` isn't like that.
29	// In particular, with "standard" semantics, given the needles
30	// "abc" and "b" and the haystack "abc," it would report a match
31	// at offset 1 before a match at offset 0. This is never what we
32	// want in the context of the regex engine, regardless of whether
33	// we have leftmost-first or 'all' semantics. Namely, we always
34	// want the leftmost match.
35	let ac_match_kind = match kind {
36	MatchKind::LeftmostFirst \| MatchKind::All => {
37	aho_corasick::MatchKind::LeftmostFirst
38	}
39	};
40	// This is kind of just an arbitrary number, but basically, if we
41	// have a small enough set of literals, then we try to use the VERY
42	// memory hungry DFA. Otherwise, we whimp out and use an NFA. The
43	// upshot is that the NFA is quite lean and decently fast. Faster
44	// than a naive Aho-Corasick NFA anyway.
45	let ac_kind = if needles.len() <= `500` {
46	aho_corasick::AhoCorasickKind::DFA
47	} else {
48	aho_corasick::AhoCorasickKind::ContiguousNFA
49	};
50	let result = aho_corasick::AhoCorasick::builder()
51	.kind(Some(ac_kind))
52	.match_kind(ac_match_kind)
53	.start_kind(aho_corasick::StartKind::Both)
54	// We try to handle all of the prefilter cases in the super
55	// module, and only use Aho-Corasick for the actual automaton.
56	// The aho-corasick crate does have some extra prefilters,
57	// namely, looking for rare bytes to feed to memchr{,2,3}
58	// instead of just the first byte. If we end up wanting
59	// those---and they are somewhat tricky to implement---then
60	// we could port them to this crate.
61	//
62	// The main reason for doing things this way is so we have a
63	// complete and easy to understand picture of which prefilters
64	// are available and how they work. Otherwise it seems too
65	// easy to get into a situation where we have a prefilter
66	// layered on top of prefilter, and that might have unintended
67	// consequences.
68	.prefilter(`false`)
69	.build(needles);
70	let ac = match result {
71	Ok(ac) => ac,
72	Err(_err) => {
73	debug!("aho-corasick prefilter failed to build: {}", _err);
74	return None;
75	}
76	};
77	Some(AhoCorasick { ac })
78	}
79	}
80	}
81
82	impl PrefilterI for AhoCorasick {
83	fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
84	#[cfg(not(feature = "perf-literal-multisubstring"))]
85	{
86	unreachable!()
87	}
88	#[cfg(feature = "perf-literal-multisubstring")]
89	{
90	let input =
91	aho_corasick::Input::new(haystack).span(span.start..span.end);
92	self.ac
93	.find(input)
94	.map(\|m\| Span { start: m.start(), end: m.end() })
95	}
96	}
97
98	fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
99	#[cfg(not(feature = "perf-literal-multisubstring"))]
100	{
101	unreachable!()
102	}
103	#[cfg(feature = "perf-literal-multisubstring")]
104	{
105	let input = aho_corasick::Input::new(haystack)
106	.anchored(aho_corasick::Anchored::Yes)
107	.span(span.start..span.end);
108	self.ac
109	.find(input)
110	.map(\|m\| Span { start: m.start(), end: m.end() })
111	}
112	}
113
114	fn memory_usage(&self) -> usize {
115	#[cfg(not(feature = "perf-literal-multisubstring"))]
116	{
117	unreachable!()
118	}
119	#[cfg(feature = "perf-literal-multisubstring")]
120	{
121	self.ac.memory_usage()
122	}
123	}
124
125	fn is_fast(&self) -> bool {
126	#[cfg(not(feature = "perf-literal-multisubstring"))]
127	{
128	unreachable!()
129	}
130	#[cfg(feature = "perf-literal-multisubstring")]
131	{
132	// Aho-Corasick is never considered "fast" because it's never
133	// going to be even close to an order of magnitude faster than the
134	// regex engine itself (assuming a DFA is used). In fact, it is
135	// usually slower. The magic of Aho-Corasick is that it can search
136	// a large* number of literals with a relatively small amount of*
137	// memory. The regex engines are far more wasteful.
138	//
139	// Aho-Corasick may be "fast" when the regex engine corresponds
140	// to, say, the PikeVM. That happens when the lazy DFA couldn't be
141	// built or used for some reason. But in these cases, the regex
142	// itself is likely quite big and we're probably hosed no matter
143	// what we do. (In this case, the best bet is for the caller to
144	// increase some of the memory limits on the hybrid cache capacity
145	// and hope that's enough.)
146	`false`
147	}
148	}
149	}
150