mod.rs source code [crates/memchr/src/arch/all/packedpair/mod.rs]

1	/!*
2	Provides an architecture independent implementation of the "packed pair"
3	algorithm.
4
5	The "packed pair" algorithm is based on the [generic SIMD] algorithm. The main
6	difference is that it (by default) uses a background distribution of byte
7	frequencies to heuristically select the pair of bytes to search for. Note that
8	this module provides an architecture independent version that doesn't do as
9	good of a job keeping the search for candidates inside a SIMD hot path. It
10	however can be good enough in many circumstances.
11
12	[generic SIMD]: http://0x80.pl/articles/simd-strfind.html#first-and-last
13	*/
14
15	use crate::memchr;
16
17	mod default_rank;
18
19	/// An architecture independent "packed pair" finder.
20	///
21	/// This finder picks two bytes that it believes have high predictive power for
22	/// indicating an overall match of a needle. At search time, it reports offsets
23	/// where the needle could match based on whether the pair of bytes it chose
24	/// match.
25	///
26	/// This is architecture independent because it utilizes `memchr` to find the
27	/// occurrence of one of the bytes in the pair, and then checks whether the
28	/// second byte matches. If it does, in the case of [`Finder::find_prefilter`],
29	/// the location at which the needle could match is returned.
30	///
31	/// It is generally preferred to use architecture specific routines for a
32	/// "packed pair" prefilter, but this can be a useful fallback when the
33	/// architecture independent routines are unavailable.
34	#[derive(Clone, Copy, Debug)]
35	pub struct Finder {
36	pair: Pair,
37	byte1: u8,
38	byte2: u8,
39	}
40
41	impl Finder {
42	/// Create a new prefilter that reports possible locations where the given
43	/// needle matches.
44	#[inline]
45	pub fn new(needle: &[u8]) -> Option<Finder> {
46	Finder::with_pair(needle, Pair::new(needle)?)
47	}
48
49	/// Create a new prefilter using the pair given.
50	///
51	/// If the prefilter could not be constructed, then `None` is returned.
52	///
53	/// This constructor permits callers to control precisely which pair of
54	/// bytes is used as a predicate.
55	#[inline]
56	pub fn with_pair(needle: &[u8], pair: Pair) -> Option<Finder> {
57	let byte1 = needle[usize::from(pair.index1())];
58	let byte2 = needle[usize::from(pair.index2())];
59	// Currently this can never fail so we could just return a Finder,
60	// but it's conceivable this could change.
61	Some(Finder { pair, byte1, byte2 })
62	}
63
64	/// Run this finder on the given haystack as a prefilter.
65	///
66	/// If a candidate match is found, then an offset where the needle could
67	/// begin in the haystack is returned.
68	#[inline]
69	pub fn find_prefilter(&self, haystack: &[u8]) -> Option<usize> {
70	let mut i = `0`;
71	let index1 = usize::from(self.pair.index1());
72	let index2 = usize::from(self.pair.index2());
73	loop {
74	// Use a fast vectorized implementation to skip to the next
75	// occurrence of the rarest byte (heuristically chosen) in the
76	// needle.
77	i += memchr(self.byte1, &haystack[i..])?;
78	let found = i;
79	i += `1`;
80
81	// If we can't align our first byte match with the haystack, then a
82	// match is impossible.
83	let aligned1 = match found.checked_sub(index1) {
84	None => continue,
85	Some(aligned1) => aligned1,
86	};
87
88	// Now align the second byte match with the haystack. A mismatch
89	// means that a match is impossible.
90	let aligned2 = match aligned1.checked_add(index2) {
91	None => continue,
92	Some(aligned_index2) => aligned_index2,
93	};
94	if haystack.get(aligned2).map_or(`true`, \|&b\| b != self.byte2) {
95	continue;
96	}
97
98	// We've done what we can. There might be a match here.
99	return Some(aligned1);
100	}
101	}
102
103	/// Returns the pair of offsets (into the needle) used to check as a
104	/// predicate before confirming whether a needle exists at a particular
105	/// position.
106	#[inline]
107	pub fn pair(&self) -> &Pair {
108	&self.pair
109	}
110	}
111
112	/// A pair of byte offsets into a needle to use as a predicate.
113	///
114	/// This pair is used as a predicate to quickly filter out positions in a
115	/// haystack in which a needle cannot match. In some cases, this pair can even
116	/// be used in vector algorithms such that the vector algorithm only switches
117	/// over to scalar code once this pair has been found.
118	///
119	/// A pair of offsets can be used in both substring search implementations and
120	/// in prefilters. The former will report matches of a needle in a haystack
121	/// where as the latter will only report possible matches of a needle.
122	///
123	/// The offsets are limited each to a maximum of 255 to keep memory usage low.
124	/// Moreover, it's rarely advantageous to create a predicate using offsets
125	/// greater than 255 anyway.
126	///
127	/// The only guarantee enforced on the pair of offsets is that they are not
128	/// equivalent. It is not necessarily the case that `index1 < index2` for
129	/// example. By convention, `index1` corresponds to the byte in the needle
130	/// that is believed to be most the predictive. Note also that because of the
131	/// requirement that the indices be both valid for the needle used to build
132	/// the pair and not equal, it follows that a pair can only be constructed for
133	/// needles with length at least 2.
134	#[derive(Clone, Copy, Debug)]
135	pub struct Pair {
136	index1: u8,
137	index2: u8,
138	}
139
140	impl Pair {
141	/// Create a new pair of offsets from the given needle.
142	///
143	/// If a pair could not be created (for example, if the needle is too
144	/// short), then `None` is returned.
145	///
146	/// This chooses the pair in the needle that is believed to be as
147	/// predictive of an overall match of the needle as possible.
148	#[inline]
149	pub fn new(needle: &[u8]) -> Option<Pair> {
150	Pair::with_ranker(needle, DefaultFrequencyRank)
151	}
152
153	/// Create a new pair of offsets from the given needle and ranker.
154	///
155	/// This permits the caller to choose a background frequency distribution
156	/// with which bytes are selected. The idea is to select a pair of bytes
157	/// that is believed to strongly predict a match in the haystack. This
158	/// usually means selecting bytes that occur rarely in a haystack.
159	///
160	/// If a pair could not be created (for example, if the needle is too
161	/// short), then `None` is returned.
162	#[inline]
163	pub fn with_ranker<R: HeuristicFrequencyRank>(
164	needle: &[u8],
165	ranker: R,
166	) -> Option<Pair> {
167	if needle.len() <= `1` {
168	return None;
169	}
170	// Find the rarest two bytes. We make them distinct indices by
171	// construction. (The actual byte value may be the same in degenerate
172	// cases, but that's OK.)
173	let (mut rare1, mut index1) = (needle[`0`], `0`);
174	let (mut rare2, mut index2) = (needle[`1`], `1`);
175	if ranker.rank(rare2) < ranker.rank(rare1) {
176	core::mem::swap(&mut rare1, &mut rare2);
177	core::mem::swap(&mut index1, &mut index2);
178	}
179	let max = usize::from(core::u8::MAX);
180	for (i, &b) in needle.iter().enumerate().take(max).skip(`2`) {
181	if ranker.rank(b) < ranker.rank(rare1) {
182	rare2 = rare1;
183	index2 = index1;
184	rare1 = b;
185	index1 = u8::try_from(i).unwrap();
186	} else if b != rare1 && ranker.rank(b) < ranker.rank(rare2) {
187	rare2 = b;
188	index2 = u8::try_from(i).unwrap();
189	}
190	}
191	// While not strictly required for how a Pair is normally used, we
192	// really don't want these to be equivalent. If they were, it would
193	// reduce the effectiveness of candidate searching using these rare
194	// bytes by increasing the rate of false positives.
195	assert_ne!(index1, index2);
196	Some(Pair { index1, index2 })
197	}
198
199	/// Create a new pair using the offsets given for the needle given.
200	///
201	/// This bypasses any sort of heuristic process for choosing the offsets
202	/// and permits the caller to choose the offsets themselves.
203	///
204	/// Indices are limited to valid `u8` values so that a `Pair` uses less
205	/// memory. It is not possible to create a `Pair` with offsets bigger than
206	/// `u8::MAX`. It's likely that such a thing is not needed, but if it is,
207	/// it's suggested to build your own bespoke algorithm because you're
208	/// likely working on a very niche case. (File an issue if this suggestion
209	/// does not make sense to you.)
210	///
211	/// If a pair could not be created (for example, if the needle is too
212	/// short), then `None` is returned.
213	#[inline]
214	pub fn with_indices(
215	needle: &[u8],
216	index1: u8,
217	index2: u8,
218	) -> Option<Pair> {
219	// While not strictly required for how a Pair is normally used, we
220	// really don't want these to be equivalent. If they were, it would
221	// reduce the effectiveness of candidate searching using these rare
222	// bytes by increasing the rate of false positives.
223	if index1 == index2 {
224	return None;
225	}
226	// Similarly, invalid indices means the Pair is invalid too.
227	if usize::from(index1) >= needle.len() {
228	return None;
229	}
230	if usize::from(index2) >= needle.len() {
231	return None;
232	}
233	Some(Pair { index1, index2 })
234	}
235
236	/// Returns the first offset of the pair.
237	#[inline]
238	pub fn index1(&self) -> u8 {
239	self.index1
240	}
241
242	/// Returns the second offset of the pair.
243	#[inline]
244	pub fn index2(&self) -> u8 {
245	self.index2
246	}
247	}
248
249	/// This trait allows the user to customize the heuristic used to determine the
250	/// relative frequency of a given byte in the dataset being searched.
251	///
252	/// The use of this trait can have a dramatic impact on performance depending
253	/// on the type of data being searched. The details of why are explained in the
254	/// docs of [`crate::memmem::Prefilter`]. To summarize, the core algorithm uses
255	/// a prefilter to quickly identify candidate matches that are later verified
256	/// more slowly. This prefilter is implemented in terms of trying to find
257	/// `rare` bytes at specific offsets that will occur less frequently in the
258	/// dataset. While the concept of a `rare` byte is similar for most datasets,
259	/// there are some specific datasets (like binary executables) that have
260	/// dramatically different byte distributions. For these datasets customizing
261	/// the byte frequency heuristic can have a massive impact on performance, and
262	/// might even need to be done at runtime.
263	///
264	/// The default implementation of `HeuristicFrequencyRank` reads from the
265	/// static frequency table defined in `src/memmem/byte_frequencies.rs`. This
266	/// is optimal for most inputs, so if you are unsure of the impact of using a
267	/// custom `HeuristicFrequencyRank` you should probably just use the default.
268	///
269	/// # Example
270	///
271	/// ```
272	/// use memchr::{
273	/// arch::all::packedpair::HeuristicFrequencyRank,
274	/// memmem::FinderBuilder,
275	/// };
276	///
277	/// /// A byte-frequency table that is good for scanning binary executables.
278	/// struct Binary;
279	///
280	/// impl HeuristicFrequencyRank for Binary {
281	/// fn rank(&self, byte: u8) -> u8 {
282	/// const TABLE: [u8; `256`] = [
283	/// `255`, `128`, `61`, `43`, `50`, `41`, `27`, `28`, `57`, `15`, `21`, `13`, `24`, `17`, `17`,
284	/// `89`, `58`, `16`, `11`, `7`, `14`, `23`, `7`, `6`, `24`, `9`, `6`, `5`, `9`, `4`, `7`, `16`,
285	/// `68`, `11`, `9`, `6`, `88`, `7`, `4`, `4`, `23`, `9`, `4`, `8`, `8`, `5`, `10`, `4`, `30`, `11`,
286	/// `9`, `24`, `11`, `5`, `5`, `5`, `19`, `11`, `6`, `17`, `9`, `9`, `6`, `8`,
287	/// `48`, `58`, `11`, `14`, `53`, `40`, `9`, `9`, `254`, `35`, `3`, `6`, `52`, `23`, `6`, `6`, `27`,
288	/// `4`, `7`, `11`, `14`, `13`, `10`, `11`, `11`, `5`, `2`, `10`, `16`, `12`, `6`, `19`,
289	/// `19`, `20`, `5`, `14`, `16`, `31`, `19`, `7`, `14`, `20`, `4`, `4`, `19`, `8`, `18`, `20`, `24`,
290	/// `1`, `25`, `19`, `58`, `29`, `10`, `5`, `15`, `20`, `2`, `2`, `9`, `4`, `3`, `5`,
291	/// `51`, `11`, `4`, `53`, `23`, `39`, `6`, `4`, `13`, `81`, `4`, `186`, `5`, `67`, `3`, `2`, `15`,
292	/// `0`, `0`, `1`, `3`, `2`, `0`, `0`, `5`, `0`, `0`, `0`, `2`, `0`, `0`, `0`,
293	/// `12`, `2`, `1`, `1`, `3`, `1`, `1`, `1`, `6`, `1`, `2`, `1`, `3`, `1`, `1`, `2`, `9`, `1`, `1`, `0`,
294	/// `2`, `2`, `4`, `4`, `11`, `6`, `7`, `3`, `6`, `9`, `4`, `5`,
295	/// `46`, `18`, `8`, `18`, `17`, `3`, `8`, `20`, `16`, `10`, `3`, `7`, `175`, `4`, `6`, `7`, `13`,
296	/// `3`, `7`, `3`, `3`, `1`, `3`, `3`, `10`, `3`, `1`, `5`, `2`, `0`, `1`, `2`,
297	/// `16`, `3`, `5`, `1`, `6`, `1`, `1`, `2`, `58`, `20`, `3`, `14`, `12`, `2`, `1`, `3`, `16`, `3`, `5`,
298	/// `8`, `3`, `1`, `8`, `6`, `17`, `6`, `5`, `3`, `8`, `6`, `13`, `175`,
299	/// ];
300	/// TABLE[byte as usize]
301	/// }
302	/// }
303	/// // Create a new finder with the custom heuristic.
304	/// let finder = FinderBuilder::new()
305	/// .build_forward_with_ranker(Binary, b"`\x00\x00\xdd\xdd`");
306	/// // Find needle with custom heuristic.
307	/// assert!(finder.find(b"`\x00\x00\x00\xdd\xdd`").is_some());
308	/// ```
309	pub trait HeuristicFrequencyRank {
310	/// Return the heuristic frequency rank of the given byte. A lower rank
311	/// means the byte is believed to occur less frequently in the haystack.
312	///
313	/// Some uses of this heuristic may treat arbitrary absolute rank values as
314	/// significant. For example, an implementation detail in this crate may
315	/// determine that heuristic prefilters are inappropriate if every byte in
316	/// the needle has a "high" rank.
317	fn rank(&self, byte: u8) -> u8;
318	}
319
320	/// The default byte frequency heuristic that is good for most haystacks.
321	pub(crate) struct DefaultFrequencyRank;
322
323	impl HeuristicFrequencyRank for DefaultFrequencyRank {
324	fn rank(&self, byte: u8) -> u8 {
325	self::default_rank::RANK[usize::from(byte)]
326	}
327	}
328
329	/// This permits passing any implementation of `HeuristicFrequencyRank` as a
330	/// borrowed version of itself.
331	impl<'a, R> HeuristicFrequencyRank for &'a R
332	where
333	R: HeuristicFrequencyRank,
334	{
335	fn rank(&self, byte: u8) -> u8 {
336	(**self).rank(byte)
337	}
338	}
339
340	#[cfg(test)]
341	mod tests {
342	use super::*;
343
344	#[test]
345	fn forward_packedpair() {
346	fn find(
347	haystack: &[u8],
348	needle: &[u8],
349	_index1: u8,
350	_index2: u8,
351	) -> Option<Option<usize>> {
352	// We ignore the index positions requested since it winds up making
353	// this test too slow overall.
354	let f = Finder::new(needle)?;
355	Some(f.find_prefilter(haystack))
356	}
357	crate::tests::packedpair::Runner::new().fwd(find).run()
358	}
359	}
360