// These routines are meant to be optimized specifically for low latency as
// compared to the equivalent routines offered by std. (Which may invoke the
// dynamic linker and call out to libc, which introduces a bit more latency
// than we'd like.)

/// Returns true if and only if needle is a prefix of haystack.
#[inline(always)]
pub(crate) fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
    needle.len() <= haystack.len() && memcmp(&haystack[..needle.len()], needle)
}

/// Returns true if and only if needle is a suffix of haystack.
#[inline(always)]
pub(crate) fn is_suffix(haystack: &[u8], needle: &[u8]) -> bool {
    needle.len() <= haystack.len()
        && memcmp(&haystack[haystack.len() - needle.len()..], needle)
}

/// Returns true if and only if x.len() == y.len() && x[i] == y[i] for all
/// 0 <= i < x.len().
///
/// Why not just use actual memcmp for this? Well, memcmp requires calling out
/// to libc, and this routine is called in fairly hot code paths. Beyond the
/// cost of the call itself, it also seems to result in worse codegen. By
/// rolling our own memcmp in pure Rust, the code appears to be friendlier to
/// the optimizer.
///
/// We mark this as inline always, although some callers may not want it
/// inlined for better codegen (like Rabin-Karp). In that case, callers are
/// advised to create a non-inlineable wrapper routine that calls memcmp. (A
/// sketch of such a wrapper follows this function.)
#[inline(always)]
pub(crate) fn memcmp(x: &[u8], y: &[u8]) -> bool {
    if x.len() != y.len() {
        return false;
    }
    // If we don't have enough bytes to do 4-byte at a time loads, then
    // fall back to the naive slow version.
    //
    // TODO: We could do a copy_nonoverlapping combined with a mask instead
    // of a loop. Benchmark it.
    if x.len() < 4 {
        for (&b1, &b2) in x.iter().zip(y) {
            if b1 != b2 {
                return false;
            }
        }
        return true;
    }
    // When we have 4 or more bytes to compare, then proceed in chunks of 4 at
    // a time using unaligned loads.
    //
    // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is
    // that this particular version of memcmp is likely to be called with tiny
    // needles. That means that if we do 8 byte loads, then a higher proportion
    // of memcmp calls will use the slower variant above. With that said, this
    // is a hypothesis and is only loosely supported by benchmarks. There's
    // likely some improvement that could be made here. The main thing here
    // though is to optimize for latency, not throughput.

    // SAFETY: Via the conditional above, we know that both `x` and `y`
    // have the same length, so `px < pxend` implies that `py < pyend`.
    // Thus, dereferencing both `px` and `py` in the loop below is safe.
    //
    // Moreover, we set `pxend` and `pyend` to be 4 bytes before the actual
    // end of `px` and `py`. Thus, the final dereference outside of the
    // loop is guaranteed to be valid. (The final comparison will overlap with
    // the last comparison done in the loop for lengths that aren't multiples
    // of four.)
    //
    // Finally, we needn't worry about alignment here, since we do unaligned
    // loads.
    unsafe {
        let (mut px, mut py) = (x.as_ptr(), y.as_ptr());
        let (pxend, pyend) = (px.add(x.len() - 4), py.add(y.len() - 4));
        while px < pxend {
            let vx = (px as *const u32).read_unaligned();
            let vy = (py as *const u32).read_unaligned();
            if vx != vy {
                return false;
            }
            px = px.add(4);
            py = py.add(4);
        }
        let vx = (pxend as *const u32).read_unaligned();
        let vy = (pyend as *const u32).read_unaligned();
        vx == vy
    }
}
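
// What follows is not from the original source: a minimal sketch of the
// non-inlineable wrapper suggested in the docs on `memcmp` above, for callers
// (like Rabin-Karp) that prefer not to inline the comparison into their own
// hot loop. The name `memcmp_not_inlined` is assumed here for illustration.
#[inline(never)]
#[allow(dead_code)]
pub(crate) fn memcmp_not_inlined(x: &[u8], y: &[u8]) -> bool {
    memcmp(x, y)
}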
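
// A small test sketch (also not from the original source) exercising the
// routines above. The cases are chosen to hit both memcmp paths: needles
// shorter than 4 bytes (the byte-at-a-time fallback) and lengths that are
// not a multiple of 4, where the final 4-byte load overlaps the last loop
// iteration.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn prefix_and_suffix() {
        let haystack = b"hello world";
        assert!(is_prefix(haystack, b"hello"));
        assert!(!is_prefix(haystack, b"world"));
        assert!(is_suffix(haystack, b"world"));
        assert!(!is_suffix(haystack, b"hello"));
        // An empty needle is a prefix and a suffix of anything.
        assert!(is_prefix(haystack, b""));
        assert!(is_suffix(haystack, b""));
        // A needle longer than the haystack is neither.
        assert!(!is_prefix(b"hi", b"hello"));
        assert!(!is_suffix(b"hi", b"hello"));
    }

    #[test]
    fn memcmp_small_and_odd_lengths() {
        // Shorter than 4 bytes: byte-at-a-time fallback.
        assert!(memcmp(b"abc", b"abc"));
        assert!(!memcmp(b"abc", b"abd"));
        // Exactly 4 bytes: a single unaligned 4-byte load.
        assert!(memcmp(b"abcd", b"abcd"));
        // Length 7 is not a multiple of 4, so the final load overlaps the
        // last loop iteration; a difference only in the last byte must
        // still be detected.
        assert!(memcmp(b"abcdefg", b"abcdefg"));
        assert!(!memcmp(b"abcdefg", b"abcdefh"));
        // Mismatched lengths are never equal.
        assert!(!memcmp(b"abcd", b"abcde"));
    }
}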