genericsimd.rs source code [crates/memchr-2.5.0/src/memmem/genericsimd.rs]

1	use core::mem::size_of;
2
3	use crate::memmem::{util::memcmp, vector::Vector, NeedleInfo};
4
5	/// The minimum length of a needle required for this algorithm. The minimum
6	/// is 2 since a length of 1 should just use memchr and a length of 0 isn't
7	/// a case handled by this searcher.
8	pub(crate) const MIN_NEEDLE_LEN: usize = `2`;
9
10	/// The maximum length of a needle required for this algorithm.
11	///
12	/// In reality, there is no hard max here. The code below can handle any
13	/// length needle. (Perhaps that suggests there are missing optimizations.)
14	/// Instead, this is a heuristic and a bound guaranteeing our linear time
15	/// complexity.
16	///
17	/// It is a heuristic because when a candidate match is found, memcmp is run.
18	/// For very large needles with lots of false positives, memcmp can make the
19	/// code run quite slow.
20	///
21	/// It is a bound because the worst case behavior with memcmp is multiplicative
22	/// in the size of the needle and haystack, and we want to keep that additive.
23	/// This bound ensures we still meet that bound theoretically, since it's just
24	/// a constant. We aren't acting in bad faith here, memcmp on tiny needles
25	/// is so fast that even in pathological cases (see pathological vector
26	/// benchmarks), this is still just as fast or faster in practice.
27	///
28	/// This specific number was chosen by tweaking a bit and running benchmarks.
29	/// The rare-medium-needle, for example, gets about 5% faster by using this
30	/// algorithm instead of a prefilter-accelerated Two-Way. There's also a
31	/// theoretical desire to keep this number reasonably low, to mitigate the
32	/// impact of pathological cases. I did try 64, and some benchmarks got a
33	/// little better, and others (particularly the pathological ones), got a lot
34	/// worse. So... 32 it is?
35	pub(crate) const MAX_NEEDLE_LEN: usize = `32`;
36
37	/// The implementation of the forward vector accelerated substring search.
38	///
39	/// This is extremely similar to the prefilter vector module by the same name.
40	/// The key difference is that this is not a prefilter. Instead, it handles
41	/// confirming its own matches. The trade off is that this only works with
42	/// smaller needles. The speed up here is that an inlined memcmp on a tiny
43	/// needle is very quick, even on pathological inputs. This is much better than
44	/// combining a prefilter with Two-Way, where using Two-Way to confirm the
45	/// match has higher latency.
46	///
47	/// So why not use this for all needles? We could, and it would probably work
48	/// really well on most inputs. But its worst case is multiplicative and we
49	/// want to guarantee worst case additive time. Some of the benchmarks try to
50	/// justify this (see the pathological ones).
51	///
52	/// The prefilter variant of this has more comments. Also note that we only
53	/// implement this for forward searches for now. If you have a compelling use
54	/// case for accelerated reverse search, please file an issue.
55	#[derive(Clone, Copy, Debug)]
56	pub(crate) struct Forward {
57	rare1i: u8,
58	rare2i: u8,
59	}
60
61	impl Forward {
62	/// Create a new "generic simd" forward searcher. If one could not be
63	/// created from the given inputs, then None is returned.
64	pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option<Forward> {
65	let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_u8();
66	// If the needle is too short or too long, give up. Also, give up
67	// if the rare bytes detected are at the same position. (It likely
68	// suggests a degenerate case, although it should technically not be
69	// possible.)
70	if needle.len() < MIN_NEEDLE_LEN
71	\|\| needle.len() > MAX_NEEDLE_LEN
72	\|\| rare1i == rare2i
73	{
74	return None;
75	}
76	Some(Forward { rare1i, rare2i })
77	}
78
79	/// Returns the minimum length of haystack that is needed for this searcher
80	/// to work for a particular vector. Passing a haystack with a length
81	/// smaller than this will cause `fwd_find` to panic.
82	#[inline(always)]
83	pub(crate) fn min_haystack_len<V: Vector>(&self) -> usize {
84	self.rare2i as usize + size_of::<V>()
85	}
86	}
87
88	/// Searches the given haystack for the given needle. The needle given should
89	/// be the same as the needle that this searcher was initialized with.
90	///
91	/// # Panics
92	///
93	/// When the given haystack has a length smaller than `min_haystack_len`.
94	///
95	/// # Safety
96	///
97	/// Since this is meant to be used with vector functions, callers need to
98	/// specialize this inside of a function with a `target_feature` attribute.
99	/// Therefore, callers must ensure that whatever target feature is being used
100	/// supports the vector functions that this function is specialized for. (For
101	/// the specific vector functions used, see the Vector trait implementations.)
102	#[inline(always)]
103	pub(crate) unsafe fn fwd_find<V: Vector>(
104	fwd: &Forward,
105	haystack: &[u8],
106	needle: &[u8],
107	) -> Option<usize> {
108	// It would be nice if we didn't have this check here, since the meta
109	// searcher should handle it for us. But without this, I don't think we
110	// guarantee that end_ptr.sub(needle.len()) won't result in UB. We could
111	// put it as part of the safety contract, but it makes it more complicated
112	// than necessary.
113	if haystack.len() < needle.len() {
114	return None;
115	}
116	let min_haystack_len = fwd.min_haystack_len::<V>();
117	assert!(haystack.len() >= min_haystack_len, "haystack too small");
118	debug_assert!(needle.len() <= haystack.len());
119	debug_assert!(
120	needle.len() >= MIN_NEEDLE_LEN,
121	"needle must be at least {} bytes",
122	MIN_NEEDLE_LEN,
123	);
124	debug_assert!(
125	needle.len() <= MAX_NEEDLE_LEN,
126	"needle must be at most {} bytes",
127	MAX_NEEDLE_LEN,
128	);
129
130	let (rare1i, rare2i) = (fwd.rare1i as usize, fwd.rare2i as usize);
131	let rare1chunk = V::splat(needle[rare1i]);
132	let rare2chunk = V::splat(needle[rare2i]);
133
134	let start_ptr = haystack.as_ptr();
135	let end_ptr = start_ptr.add(haystack.len());
136	let max_ptr = end_ptr.sub(min_haystack_len);
137	let mut ptr = start_ptr;
138
139	// N.B. I did experiment with unrolling the loop to deal with size(V)
140	// bytes at a time and 2size(V) bytes at a time. The double unroll was*
141	// marginally faster while the quadruple unroll was unambiguously slower.
142	// In the end, I decided the complexity from unrolling wasn't worth it. I
143	// used the memmem/krate/prebuilt/huge-en/ benchmarks to compare.
144	while ptr <= max_ptr {
145	let m = fwd_find_in_chunk(
146	fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, !`0`,
147	);
148	if let Some(chunki) = m {
149	return Some(matched(start_ptr, ptr, chunki));
150	}
151	ptr = ptr.add(size_of::<V>());
152	}
153	if ptr < end_ptr {
154	let remaining = diff(end_ptr, ptr);
155	debug_assert!(
156	remaining < min_haystack_len,
157	"remaining bytes should be smaller than the minimum haystack \
158	length of {}, but there are {} bytes remaining",
159	min_haystack_len,
160	remaining,
161	);
162	if remaining < needle.len() {
163	return None;
164	}
165	debug_assert!(
166	max_ptr < ptr,
167	"after main loop, ptr should have exceeded max_ptr",
168	);
169	let overlap = diff(ptr, max_ptr);
170	debug_assert!(
171	overlap > `0`,
172	"overlap ({}) must always be non-zero",
173	overlap,
174	);
175	debug_assert!(
176	overlap < size_of::<V>(),
177	"overlap ({}) cannot possibly be >= than a vector ({})",
178	overlap,
179	size_of::<V>(),
180	);
181	// The mask has all of its bits set except for the first N least
182	// significant bits, where N=overlap. This way, any matches that
183	// occur in find_in_chunk within the overlap are automatically
184	// ignored.
185	let mask = !((`1` << overlap) - `1`);
186	ptr = max_ptr;
187	let m = fwd_find_in_chunk(
188	fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, mask,
189	);
190	if let Some(chunki) = m {
191	return Some(matched(start_ptr, ptr, chunki));
192	}
193	}
194	None
195	}
196
197	/// Search for an occurrence of two rare bytes from the needle in the chunk
198	/// pointed to by ptr, with the end of the haystack pointed to by end_ptr. When
199	/// an occurrence is found, memcmp is run to check if a match occurs at the
200	/// corresponding position.
201	///
202	/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
203	/// bytes repeated in each 8-bit lane, respectively.
204	///
205	/// mask should have bits set corresponding the positions in the chunk in which
206	/// matches are considered. This is only used for the last vector load where
207	/// the beginning of the vector might have overlapped with the last load in
208	/// the main loop. The mask lets us avoid visiting positions that have already
209	/// been discarded as matches.
210	///
211	/// # Safety
212	///
213	/// It must be safe to do an unaligned read of size(V) bytes starting at both
214	/// (ptr + rare1i) and (ptr + rare2i). It must also be safe to do unaligned
215	/// loads on ptr up to (end_ptr - needle.len()).
216	#[inline(always)]
217	unsafe fn fwd_find_in_chunk<V: Vector>(
218	fwd: &Forward,
219	needle: &[u8],
220	ptr: *const u8,
221	end_ptr: *const u8,
222	rare1chunk: V,
223	rare2chunk: V,
224	mask: u32,
225	) -> Option<usize> {
226	let chunk0: V = V::load_unaligned(data:ptr.add(count:fwd.rare1i as usize));
227	let chunk1: V = V::load_unaligned(data:ptr.add(count:fwd.rare2i as usize));
228
229	let eq0: V = chunk0.cmpeq(vector2:rare1chunk);
230	let eq1: V = chunk1.cmpeq(vector2:rare2chunk);
231
232	let mut match_offsets: u32 = eq0.and(vector2:eq1).movemask() & mask;
233	while match_offsets != `0` {
234	let offset: usize = match_offsets.trailing_zeros() as usize;
235	let ptr: *const u8 = ptr.add(count:offset);
236	if end_ptr.sub(count:needle.len()) < ptr {
237	return None;
238	}
239	let chunk: &[u8] = core::slice::from_raw_parts(data:ptr, needle.len());
240	if memcmp(x:needle, y:chunk) {
241	return Some(offset);
242	}
243	match_offsets &= match_offsets - `1`;
244	}
245	None
246	}
247
248	/// Accepts a chunk-relative offset and returns a haystack relative offset
249	/// after updating the prefilter state.
250	///
251	/// See the same function with the same name in the prefilter variant of this
252	/// algorithm to learned why it's tagged with inline(never). Even here, where
253	/// the function is simpler, inlining it leads to poorer codegen. (Although
254	/// it does improve some benchmarks, like prebuiltiter/huge-en/common-you.)
255	#[cold]
256	#[inline(never)]
257	fn matched(start_ptr: *const u8, ptr: *const u8, chunki: usize) -> usize {
258	diff(a:ptr, b:start_ptr) + chunki
259	}
260
261	/// Subtract `b` from `a` and return the difference. `a` must be greater than
262	/// or equal to `b`.
263	fn diff(a: *const u8, b: *const u8) -> usize {
264	debug_assert!(a >= b);
265	(a as usize) - (b as usize)
266	}
267