// These routines are meant to be optimized specifically for low latency as
// compared to the equivalent routines offered by std. (Which may invoke the
// dynamic linker and call out to libc, which introduces a bit more latency
// than we'd like.)

/// Returns true if and only if needle is a prefix of haystack.
#[inline(always)]
pub(crate) fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
    needle.len() <= haystack.len() && memcmp(&haystack[..needle.len()], needle)
}

/// Returns true if and only if needle is a suffix of haystack.
#[inline(always)]
pub(crate) fn is_suffix(haystack: &[u8], needle: &[u8]) -> bool {
    needle.len() <= haystack.len()
        && memcmp(&haystack[haystack.len() - needle.len()..], needle)
}

/// Returns true if and only if x.len() == y.len() && x[i] == y[i] for all
/// 0 <= i < x.len().
///
/// Why not just use actual memcmp for this? Well, memcmp requires calling out
/// to libc, and this routine is called in fairly hot code paths. Beyond the
/// cost of the call itself, it also seems to result in worse codegen. By
/// rolling our own memcmp in pure Rust, the code appears to be friendlier to
/// the optimizer.
///
/// We mark this as inline always, although some callers may not want it
/// inlined for better codegen (like Rabin-Karp). In that case, callers are
/// advised to create a non-inlineable wrapper routine that calls memcmp. (A
/// sketch of such a wrapper follows this function.)
#[inline(always)]
pub(crate) fn memcmp(x: &[u8], y: &[u8]) -> bool {
    if x.len() != y.len() {
        return false;
    }
    // If we don't have enough bytes to do 4-byte at a time loads, then
    // fall back to the naive slow version.
    //
    // TODO: We could do a copy_nonoverlapping combined with a mask instead
    // of a loop. Benchmark it.
    if x.len() < 4 {
        for (&b1, &b2) in x.iter().zip(y) {
            if b1 != b2 {
                return false;
            }
        }
        return true;
    }
    // When we have 4 or more bytes to compare, then proceed in chunks of 4 at
    // a time using unaligned loads.
    //
    // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is
    // that this particular version of memcmp is likely to be called with tiny
    // needles. That means that if we do 8 byte loads, then a higher proportion
    // of memcmp calls will use the slower variant above. With that said, this
    // is a hypothesis and is only loosely supported by benchmarks. There's
    // likely some improvement that could be made here. The main thing here
    // though is to optimize for latency, not throughput.

    // SAFETY: Via the conditional above, we know that both `x` and `y`
    // have the same length, so `px < pxend` implies that `py < pyend`.
    // Thus, dereferencing both `px` and `py` in the loop below is safe.
    //
    // Moreover, we set `pxend` and `pyend` to be 4 bytes before the actual
    // end of `px` and `py`. Thus, the final dereference outside of the
    // loop is guaranteed to be valid. (The final comparison will overlap with
    // the last comparison done in the loop for lengths that aren't multiples
    // of four.)
    //
    // Finally, we needn't worry about alignment here, since we do unaligned
    // loads.
    unsafe {
        let (mut px, mut py) = (x.as_ptr(), y.as_ptr());
        let (pxend, pyend) = (px.add(x.len() - 4), py.add(y.len() - 4));
        while px < pxend {
            let vx = (px as *const u32).read_unaligned();
            let vy = (py as *const u32).read_unaligned();
            if vx != vy {
                return false;
            }
            px = px.add(4);
            py = py.add(4);
        }
        let vx = (pxend as *const u32).read_unaligned();
        let vy = (pyend as *const u32).read_unaligned();
        vx == vy
    }
}
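
// What follows is not from the original source: a minimal sketch of the
// non-inlineable wrapper suggested in the docs on `memcmp` above, for callers
// (like Rabin-Karp) that prefer not to inline the comparison into their own
// hot loop. The name `memcmp_not_inlined` is assumed here for illustration.
#[inline(never)]
#[allow(dead_code)]
pub(crate) fn memcmp_not_inlined(x: &[u8], y: &[u8]) -> bool {
    memcmp(x, y)
}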
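
// A small test sketch (also not from the original source) exercising the
// routines above. The cases are chosen to hit both memcmp paths: needles
// shorter than 4 bytes (the byte-at-a-time fallback) and lengths that are
// not a multiple of 4, where the final 4-byte load overlaps the last loop
// iteration.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn prefix_and_suffix() {
        let haystack = b"hello world";
        assert!(is_prefix(haystack, b"hello"));
        assert!(!is_prefix(haystack, b"world"));
        assert!(is_suffix(haystack, b"world"));
        assert!(!is_suffix(haystack, b"hello"));
        // An empty needle is a prefix and a suffix of anything.
        assert!(is_prefix(haystack, b""));
        assert!(is_suffix(haystack, b""));
        // A needle longer than the haystack is neither.
        assert!(!is_prefix(b"hi", b"hello"));
        assert!(!is_suffix(b"hi", b"hello"));
    }

    #[test]
    fn memcmp_small_and_odd_lengths() {
        // Shorter than 4 bytes: byte-at-a-time fallback.
        assert!(memcmp(b"abc", b"abc"));
        assert!(!memcmp(b"abc", b"abd"));
        // Exactly 4 bytes: a single unaligned 4-byte load.
        assert!(memcmp(b"abcd", b"abcd"));
        // Length 7 is not a multiple of 4, so the final load overlaps the
        // last loop iteration; a difference only in the last byte must
        // still be detected.
        assert!(memcmp(b"abcdefg", b"abcdefg"));
        assert!(!memcmp(b"abcdefg", b"abcdefh"));
        // Mismatched lengths are never equal.
        assert!(!memcmp(b"abcd", b"abcde"));
    }
}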