imp.rs source code [crates/regex-1.8.4/src/literal/imp.rs]

1	use std::mem;
2
3	use aho_corasick::{self, packed, AhoCorasick};
4	use memchr::{memchr, memchr2, memchr3, memmem};
5	use regex_syntax::hir::literal::{Literal, Seq};
6
7	/// A prefix extracted from a compiled regular expression.
8	///
9	/// A regex prefix is a set of literal strings that must* be matched at the*
10	/// beginning of a regex in order for the entire regex to match. Similarly
11	/// for a regex suffix.
12	#[derive(Clone, Debug)]
13	pub struct LiteralSearcher {
14	complete: bool,
15	lcp: Memmem,
16	lcs: Memmem,
17	matcher: Matcher,
18	}
19
20	#[derive(Clone, Debug)]
21	enum Matcher {
22	/// No literals. (Never advances through the input.)
23	Empty,
24	/// A set of four or more single byte literals.
25	Bytes(SingleByteSet),
26	/// A single substring, using vector accelerated routines when available.
27	Memmem(Memmem),
28	/// An Aho-Corasick automaton.
29	AC { ac: AhoCorasick, lits: Vec<Literal> },
30	/// A packed multiple substring searcher, using SIMD.
31	///
32	/// Note that Aho-Corasick will actually use this packed searcher
33	/// internally automatically, however, there is some overhead associated
34	/// with going through the Aho-Corasick machinery. So using the packed
35	/// searcher directly results in some gains.
36	Packed { s: packed::Searcher, lits: Vec<Literal> },
37	}
38
39	impl LiteralSearcher {
40	/// Returns a matcher that never matches and never advances the input.
41	pub fn empty() -> Self {
42	Self::new(Seq::infinite(), Matcher::Empty)
43	}
44
45	/// Returns a matcher for literal prefixes from the given set.
46	pub fn prefixes(lits: Seq) -> Self {
47	let matcher = Matcher::prefixes(&lits);
48	Self::new(lits, matcher)
49	}
50
51	/// Returns a matcher for literal suffixes from the given set.
52	pub fn suffixes(lits: Seq) -> Self {
53	let matcher = Matcher::suffixes(&lits);
54	Self::new(lits, matcher)
55	}
56
57	fn new(lits: Seq, matcher: Matcher) -> Self {
58	LiteralSearcher {
59	complete: lits.is_exact(),
60	lcp: Memmem::new(lits.longest_common_prefix().unwrap_or(b"")),
61	lcs: Memmem::new(lits.longest_common_suffix().unwrap_or(b"")),
62	matcher,
63	}
64	}
65
66	/// Returns true if all matches comprise the entire regular expression.
67	///
68	/// This does not necessarily mean that a literal match implies a match
69	/// of the regular expression. For example, the regular expression `^a`
70	/// is comprised of a single complete literal `a`, but the regular
71	/// expression demands that it only match at the beginning of a string.
72	pub fn complete(&self) -> bool {
73	self.complete && !self.is_empty()
74	}
75
76	/// Find the position of a literal in `haystack` if it exists.
77	#[cfg_attr(feature = "perf-inline", inline(always))]
78	pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
79	use self::Matcher::*;
80	match self.matcher {
81	Empty => Some((`0`, `0`)),
82	Bytes(ref sset) => sset.find(haystack).map(\|i\| (i, i + `1`)),
83	Memmem(ref s) => s.find(haystack).map(\|i\| (i, i + s.len())),
84	AC { ref ac, .. } => {
85	ac.find(haystack).map(\|m\| (m.start(), m.end()))
86	}
87	Packed { ref s, .. } => {
88	s.find(haystack).map(\|m\| (m.start(), m.end()))
89	}
90	}
91	}
92
93	/// Like find, except matches must start at index `0`.
94	pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
95	for lit in self.iter() {
96	if lit.len() > haystack.len() {
97	continue;
98	}
99	if lit == &haystack[`0`..lit.len()] {
100	return Some((`0`, lit.len()));
101	}
102	}
103	None
104	}
105
106	/// Like find, except matches must end at index `haystack.len()`.
107	pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
108	for lit in self.iter() {
109	if lit.len() > haystack.len() {
110	continue;
111	}
112	if lit == &haystack[haystack.len() - lit.len()..] {
113	return Some((haystack.len() - lit.len(), haystack.len()));
114	}
115	}
116	None
117	}
118
119	/// Returns an iterator over all literals to be matched.
120	pub fn iter(&self) -> LiteralIter<'_> {
121	match self.matcher {
122	Matcher::Empty => LiteralIter::Empty,
123	Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
124	Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
125	Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
126	Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
127	}
128	}
129
130	/// Returns a matcher for the longest common prefix of this matcher.
131	pub fn lcp(&self) -> &Memmem {
132	&self.lcp
133	}
134
135	/// Returns a matcher for the longest common suffix of this matcher.
136	pub fn lcs(&self) -> &Memmem {
137	&self.lcs
138	}
139
140	/// Returns true iff this prefix is empty.
141	pub fn is_empty(&self) -> bool {
142	self.len() == `0`
143	}
144
145	/// Returns the number of prefixes in this machine.
146	pub fn len(&self) -> usize {
147	use self::Matcher::*;
148	match self.matcher {
149	Empty => `0`,
150	Bytes(ref sset) => sset.dense.len(),
151	Memmem(_) => `1`,
152	AC { ref ac, .. } => ac.patterns_len(),
153	Packed { ref lits, .. } => lits.len(),
154	}
155	}
156
157	/// Return the approximate heap usage of literals in bytes.
158	pub fn approximate_size(&self) -> usize {
159	use self::Matcher::*;
160	match self.matcher {
161	Empty => `0`,
162	Bytes(ref sset) => sset.approximate_size(),
163	Memmem(ref single) => single.approximate_size(),
164	AC { ref ac, .. } => ac.memory_usage(),
165	Packed { ref s, .. } => s.memory_usage(),
166	}
167	}
168	}
169
170	impl Matcher {
171	fn prefixes(lits: &Seq) -> Self {
172	let sset = SingleByteSet::prefixes(lits);
173	Matcher::new(lits, sset)
174	}
175
176	fn suffixes(lits: &Seq) -> Self {
177	let sset = SingleByteSet::suffixes(lits);
178	Matcher::new(lits, sset)
179	}
180
181	fn new(lits: &Seq, sset: SingleByteSet) -> Self {
182	if lits.is_empty() \|\| lits.min_literal_len() == Some(`0`) {
183	return Matcher::Empty;
184	}
185	let lits = match lits.literals() {
186	None => return Matcher::Empty,
187	Some(members) => members,
188	};
189	if sset.dense.len() >= `26` {
190	// Avoid trying to match a large number of single bytes.
191	// This is very* sensitive to a frequency analysis comparison*
192	// between the bytes in sset and the composition of the haystack.
193	// No matter the size of sset, if its members all are rare in the
194	// haystack, then it'd be worth using it. How to tune this... IDK.
195	// ---AG
196	return Matcher::Empty;
197	}
198	if sset.complete {
199	return Matcher::Bytes(sset);
200	}
201	if lits.len() == `1` {
202	return Matcher::Memmem(Memmem::new(lits[`0`].as_bytes()));
203	}
204
205	let pats: Vec<&[u8]> = lits.iter().map(\|lit\| lit.as_bytes()).collect();
206	let is_aho_corasick_fast = sset.dense.len() <= `1` && sset.all_ascii;
207	if lits.len() <= `100` && !is_aho_corasick_fast {
208	let mut builder = packed::Config::new()
209	.match_kind(packed::MatchKind::LeftmostFirst)
210	.builder();
211	if let Some(s) = builder.extend(&pats).build() {
212	return Matcher::Packed { s, lits: lits.to_owned() };
213	}
214	}
215	let ac = AhoCorasick::builder()
216	.match_kind(aho_corasick::MatchKind::LeftmostFirst)
217	.kind(Some(aho_corasick::AhoCorasickKind::DFA))
218	.build(&pats)
219	.unwrap();
220	Matcher::AC { ac, lits: lits.to_owned() }
221	}
222	}
223
224	#[derive(Debug)]
225	pub enum LiteralIter<'a> {
226	Empty,
227	Bytes(&'a [u8]),
228	Single(&'a [u8]),
229	AC(&'a [Literal]),
230	Packed(&'a [Literal]),
231	}
232
233	impl<'a> Iterator for LiteralIter<'a> {
234	type Item = &'a [u8];
235
236	fn next(&mut self) -> Option<Self::Item> {
237	match *self {
238	LiteralIter::Empty => None,
239	LiteralIter::Bytes(ref mut many) => {
240	if many.is_empty() {
241	None
242	} else {
243	let next = &many[`0`..`1`];
244	*many = &many[`1`..];
245	Some(next)
246	}
247	}
248	LiteralIter::Single(ref mut one) => {
249	if one.is_empty() {
250	None
251	} else {
252	let next = &one[..];
253	*one = &[];
254	Some(next)
255	}
256	}
257	LiteralIter::AC(ref mut lits) => {
258	if lits.is_empty() {
259	None
260	} else {
261	let next = &lits[`0`];
262	*lits = &lits[`1`..];
263	Some(next.as_bytes())
264	}
265	}
266	LiteralIter::Packed(ref mut lits) => {
267	if lits.is_empty() {
268	None
269	} else {
270	let next = &lits[`0`];
271	*lits = &lits[`1`..];
272	Some(next.as_bytes())
273	}
274	}
275	}
276	}
277	}
278
279	#[derive(Clone, Debug)]
280	struct SingleByteSet {
281	sparse: Vec<bool>,
282	dense: Vec<u8>,
283	complete: bool,
284	all_ascii: bool,
285	}
286
287	impl SingleByteSet {
288	fn new() -> SingleByteSet {
289	SingleByteSet {
290	sparse: vec![`false`; `256`],
291	dense: vec![],
292	complete: `true`,
293	all_ascii: `true`,
294	}
295	}
296
297	fn prefixes(lits: &Seq) -> SingleByteSet {
298	let mut sset = SingleByteSet::new();
299	let lits = match lits.literals() {
300	None => return sset,
301	Some(lits) => lits,
302	};
303	for lit in lits.iter() {
304	sset.complete = sset.complete && lit.len() == `1`;
305	if let Some(&b) = lit.as_bytes().get(`0`) {
306	if !sset.sparse[b as usize] {
307	if b > `0x7F` {
308	sset.all_ascii = `false`;
309	}
310	sset.dense.push(b);
311	sset.sparse[b as usize] = `true`;
312	}
313	}
314	}
315	sset
316	}
317
318	fn suffixes(lits: &Seq) -> SingleByteSet {
319	let mut sset = SingleByteSet::new();
320	let lits = match lits.literals() {
321	None => return sset,
322	Some(lits) => lits,
323	};
324	for lit in lits.iter() {
325	sset.complete = sset.complete && lit.len() == `1`;
326	if let Some(&b) = lit.as_bytes().last() {
327	if !sset.sparse[b as usize] {
328	if b > `0x7F` {
329	sset.all_ascii = `false`;
330	}
331	sset.dense.push(b);
332	sset.sparse[b as usize] = `true`;
333	}
334	}
335	}
336	sset
337	}
338
339	/// Faster find that special cases certain sizes to use memchr.
340	#[cfg_attr(feature = "perf-inline", inline(always))]
341	fn find(&self, text: &[u8]) -> Option<usize> {
342	match self.dense.len() {
343	`0` => None,
344	`1` => memchr(self.dense[`0`], text),
345	`2` => memchr2(self.dense[`0`], self.dense[`1`], text),
346	`3` => memchr3(self.dense[`0`], self.dense[`1`], self.dense[`2`], text),
347	_ => self._find(text),
348	}
349	}
350
351	/// Generic find that works on any sized set.
352	fn _find(&self, haystack: &[u8]) -> Option<usize> {
353	for (i, &b) in haystack.iter().enumerate() {
354	if self.sparse[b as usize] {
355	return Some(i);
356	}
357	}
358	None
359	}
360
361	fn approximate_size(&self) -> usize {
362	(self.dense.len() * mem::size_of::<u8>())
363	+ (self.sparse.len() * mem::size_of::<bool>())
364	}
365	}
366
367	/// A simple wrapper around the memchr crate's memmem implementation.
368	///
369	/// The API this exposes mirrors the API of previous substring searchers that
370	/// this supplanted.
371	#[derive(Clone, Debug)]
372	pub struct Memmem {
373	finder: memmem::Finder<'static>,
374	char_len: usize,
375	}
376
377	impl Memmem {
378	fn new(pat: &[u8]) -> Memmem {
379	Memmem {
380	finder: memmem::Finder::new(pat).into_owned(),
381	char_len: char_len_lossy(pat),
382	}
383	}
384
385	#[cfg_attr(feature = "perf-inline", inline(always))]
386	pub fn find(&self, haystack: &[u8]) -> Option<usize> {
387	self.finder.find(haystack)
388	}
389
390	#[cfg_attr(feature = "perf-inline", inline(always))]
391	pub fn is_suffix(&self, text: &[u8]) -> bool {
392	if text.len() < self.len() {
393	return `false`;
394	}
395	&text[text.len() - self.len()..] == self.finder.needle()
396	}
397
398	pub fn len(&self) -> usize {
399	self.finder.needle().len()
400	}
401
402	pub fn char_len(&self) -> usize {
403	self.char_len
404	}
405
406	fn approximate_size(&self) -> usize {
407	self.finder.needle().len() * mem::size_of::<u8>()
408	}
409	}
410
411	fn char_len_lossy(bytes: &[u8]) -> usize {
412	String::from_utf8_lossy(bytes).chars().count()
413	}
414