/*!
Generic crate-internal routines for the "packed pair" SIMD algorithm.

The "packed pair" algorithm is based on the [generic SIMD] algorithm. The main
difference is that it (by default) uses a background distribution of byte
frequencies to heuristically select the pair of bytes to search for.

[generic SIMD]: http://0x80.pl/articles/simd-strfind.html#first-and-last
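
At a high level, each chunk of the haystack is examined roughly as in the
sketch below. This is a simplified illustration of the per-chunk comparison
(the real implementation is `Finder::find_in_chunk` in this module):

```ignore
// Simplified: v1/v2 are vectors splatted with the two chosen needle bytes,
// and index1/index2 are the offsets of those bytes within the needle.
let chunk1 = V::load_unaligned(cur.add(index1));
let chunk2 = V::load_unaligned(cur.add(index2));
let candidates = chunk1.cmpeq(v1).and(chunk2.cmpeq(v2)).movemask();
// Each set bit in `candidates` corresponds to a position at which both
// bytes of the pair matched, and thus a position at which the full needle
// might start.
```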
*/

use crate::{
    arch::all::{is_equal_raw, packedpair::Pair},
    ext::Pointer,
    vector::{MoveMask, Vector},
};

/// A generic architecture dependent "packed pair" finder.
///
/// This finder picks two bytes that it believes have high predictive power
/// for indicating an overall match of a needle. Depending on whether
/// `Finder::find` or `Finder::find_prefilter` is used, it reports offsets
/// where the needle matches or could match. In the prefilter case, candidates
/// are reported whenever the [`Pair`] of bytes given matches.
///
/// This is architecture dependent because it uses specific vector operations
/// to look for occurrences of the pair of bytes.
///
/// This type is not meant to be exported and is instead meant to be used as
/// the implementation for architecture specific facades. Why? Because it's a
/// bit of a quirky API that requires `inline(always)` annotations. And pretty
/// much everything has safety obligations due (at least) to the caller needing
/// to inline calls into routines marked with
/// `#[target_feature(enable = "...")]`.
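///
/// As a rough illustration, an architecture specific facade might wrap this
/// type along the lines of the sketch below. The wrapper name and helper
/// method here are hypothetical; see the real facades (e.g.,
/// `arch::x86_64::sse2::packedpair`) for how this is actually done.
///
/// ```ignore
/// pub struct Sse2Finder(Finder<core::arch::x86_64::__m128i>);
///
/// impl Sse2Finder {
///     #[target_feature(enable = "sse2")]
///     #[inline]
///     unsafe fn find_impl(
///         &self,
///         haystack: &[u8],
///         needle: &[u8],
///     ) -> Option<usize> {
///         // SAFETY: the caller has verified that SSE2 is available, and
///         // the remaining preconditions of the generic finder are
///         // forwarded to our own caller.
///         self.0.find(haystack, needle)
///     }
/// }
/// ```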
#[derive(Clone, Copy, Debug)]
pub(crate) struct Finder<V> {
    pair: Pair,
    v1: V,
    v2: V,
    min_haystack_len: usize,
}

impl<V: Vector> Finder<V> {
    /// Create a new pair searcher. The searcher returned can either report
    /// exact matches of `needle` or act as a prefilter and report candidate
    /// positions of `needle`.
    ///
    /// # Safety
    ///
    /// Callers must ensure that whatever vector type this routine is called
    /// with is supported by the current environment.
    ///
    /// Callers must also ensure that `needle.len() >= 2`.
    #[inline(always)]
    pub(crate) unsafe fn new(needle: &[u8], pair: Pair) -> Finder<V> {
        let max_index = pair.index1().max(pair.index2());
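        // The minimum haystack length needs to permit an unaligned vector
        // load starting at the larger of the two pair indices, and it also
        // needs to be at least the needle length so that a candidate position
        // can be confirmed with a full needle comparison.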
        let min_haystack_len =
            core::cmp::max(needle.len(), usize::from(max_index) + V::BYTES);
        let v1 = V::splat(needle[usize::from(pair.index1())]);
        let v2 = V::splat(needle[usize::from(pair.index2())]);
        Finder { pair, v1, v2, min_haystack_len }
    }

    /// Searches the given haystack for the given needle. The needle given
    /// should be the same as the needle that this finder was initialized
    /// with.
    ///
    /// # Panics
    ///
    /// When `haystack.len()` is less than [`Finder::min_haystack_len`].
    ///
    /// # Safety
    ///
    /// Since this is meant to be used with vector functions, callers need to
    /// specialize this inside of a function with a `target_feature` attribute.
    /// Therefore, callers must ensure that whatever target feature is being
    /// used supports the vector functions that this function is specialized
    /// for. (For the specific vector functions used, see the Vector trait
    /// implementations.)
    #[inline(always)]
    pub(crate) unsafe fn find(
        &self,
        haystack: &[u8],
        needle: &[u8],
    ) -> Option<usize> {
        assert!(
            haystack.len() >= self.min_haystack_len,
            "haystack too small, should be at least {} but got {}",
            self.min_haystack_len,
            haystack.len(),
        );

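        // With an argument of 0, no low bits are cleared, so this mask has
        // every bit set. In the main loop there is no overlap with a
        // previously searched chunk, so no candidate positions need to be
        // masked out.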
        let all = V::Mask::all_zeros_except_least_significant(0);
        let start = haystack.as_ptr();
        let end = start.add(haystack.len());
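        // 'max' is the last position at which at least min_haystack_len
        // bytes remain, and thus the last position at which the unaligned
        // vector loads done by find_in_chunk are guaranteed to stay in
        // bounds.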
        let max = end.sub(self.min_haystack_len);
        let mut cur = start;

        // N.B. I did experiment with unrolling the loop to deal with
        // 2*size(V) bytes at a time and 4*size(V) bytes at a time. The double
        // unroll was marginally faster while the quadruple unroll was
        // unambiguously slower. In the end, I decided the complexity from
        // unrolling wasn't worth it. I used the memmem/krate/prebuilt/huge-en/
        // benchmarks to compare.
        while cur <= max {
            if let Some(chunki) = self.find_in_chunk(needle, cur, end, all) {
                return Some(matched(start, cur, chunki));
            }
            cur = cur.add(V::BYTES);
        }
        if cur < end {
            let remaining = end.distance(cur);
            debug_assert!(
                remaining < self.min_haystack_len,
                "remaining bytes should be smaller than the minimum haystack \
                 length of {}, but there are {} bytes remaining",
                self.min_haystack_len,
                remaining,
            );
            if remaining < needle.len() {
                return None;
            }
            debug_assert!(
                max < cur,
                "after main loop, cur should have exceeded max",
            );
            let overlap = cur.distance(max);
            debug_assert!(
                overlap > 0,
                "overlap ({}) must always be non-zero",
                overlap,
            );
            debug_assert!(
                overlap < V::BYTES,
                "overlap ({}) cannot possibly be >= the size of a vector ({})",
                overlap,
                V::BYTES,
            );
            // The mask has all of its bits set except for the first N least
            // significant bits, where N=overlap. This way, any matches that
            // occur in find_in_chunk within the overlap are automatically
            // ignored.
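            // For example, with V::BYTES=16 and overlap=3, the first 3
            // positions of the final chunk starting at 'max' were already
            // examined by the main loop, so the mask clears the 3 least
            // significant bits and only the remaining 13 positions are
            // considered.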
            let mask = V::Mask::all_zeros_except_least_significant(overlap);
            cur = max;
            let m = self.find_in_chunk(needle, cur, end, mask);
            if let Some(chunki) = m {
                return Some(matched(start, cur, chunki));
            }
        }
        None
    }

    /// Searches the given haystack for offsets that represent candidate
    /// matches of the `needle` given to this finder's constructor. The offsets
    /// returned, if they are a match, correspond to the starting offset of
    /// `needle` in the given `haystack`.
    ///
    /// # Panics
    ///
    /// When `haystack.len()` is less than [`Finder::min_haystack_len`].
    ///
    /// # Safety
    ///
    /// Since this is meant to be used with vector functions, callers need to
    /// specialize this inside of a function with a `target_feature` attribute.
    /// Therefore, callers must ensure that whatever target feature is being
    /// used supports the vector functions that this function is specialized
    /// for. (For the specific vector functions used, see the Vector trait
    /// implementations.)
    #[inline(always)]
    pub(crate) unsafe fn find_prefilter(
        &self,
        haystack: &[u8],
    ) -> Option<usize> {
        assert!(
            haystack.len() >= self.min_haystack_len,
            "haystack too small, should be at least {} but got {}",
            self.min_haystack_len,
            haystack.len(),
        );

        let start = haystack.as_ptr();
        let end = start.add(haystack.len());
        let max = end.sub(self.min_haystack_len);
        let mut cur = start;

        // N.B. I did experiment with unrolling the loop to deal with
        // 2*size(V) bytes at a time and 4*size(V) bytes at a time. The double
        // unroll was marginally faster while the quadruple unroll was
        // unambiguously slower. In the end, I decided the complexity from
        // unrolling wasn't worth it. I used the memmem/krate/prebuilt/huge-en/
        // benchmarks to compare.
        while cur <= max {
            if let Some(chunki) = self.find_prefilter_in_chunk(cur) {
                return Some(matched(start, cur, chunki));
            }
            cur = cur.add(V::BYTES);
        }
        if cur < end {
            // This routine immediately quits if a candidate match is found.
            // That means that if we're here, no candidate matches have been
            // found at or before 'cur'. Thus, we don't need to mask anything
            // out even though we might technically search part of the haystack
            // that we've already searched (because we know it can't match).
            cur = max;
            if let Some(chunki) = self.find_prefilter_in_chunk(cur) {
                return Some(matched(start, cur, chunki));
            }
        }
        None
    }

    /// Search for an occurrence of our byte pair from the needle in the chunk
    /// pointed to by cur, with the end of the haystack pointed to by end.
    /// When an occurrence is found, memcmp is run to check if a match occurs
    /// at the corresponding position.
    ///
    /// `mask` should have bits set corresponding to the positions in the chunk
    /// in which matches are considered. This is only used for the last vector
    /// load where the beginning of the vector might have overlapped with the
    /// last load in the main loop. The mask lets us avoid visiting positions
    /// that have already been discarded as matches.
    ///
    /// # Safety
    ///
    /// It must be safe to do an unaligned read of size(V) bytes starting at
    /// both (cur + self.index1) and (cur + self.index2). It must also be safe
    /// to do unaligned loads on cur up to (end - needle.len()).
    #[inline(always)]
    unsafe fn find_in_chunk(
        &self,
        needle: &[u8],
        cur: *const u8,
        end: *const u8,
        mask: V::Mask,
    ) -> Option<usize> {
        let index1 = usize::from(self.pair.index1());
        let index2 = usize::from(self.pair.index2());
        let chunk1 = V::load_unaligned(cur.add(index1));
        let chunk2 = V::load_unaligned(cur.add(index2));
        let eq1 = chunk1.cmpeq(self.v1);
        let eq2 = chunk2.cmpeq(self.v2);

        let mut offsets = eq1.and(eq2).movemask().and(mask);
        while offsets.has_non_zero() {
            let offset = offsets.first_offset();
            let cur = cur.add(offset);
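            // A candidate that starts this close to the end of the haystack
            // can't hold a full needle, and neither can any later candidate
            // in this chunk, so there is nothing more to find here.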
            if end.sub(needle.len()) < cur {
                return None;
            }
            if is_equal_raw(needle.as_ptr(), cur, needle.len()) {
                return Some(offset);
            }
            offsets = offsets.clear_least_significant_bit();
        }
        None
    }

    /// Search for an occurrence of our byte pair from the needle in the chunk
    /// pointed to by cur. Unlike `find_in_chunk`, this does not confirm a
    /// candidate with memcmp; it simply reports the offset of the first
    /// occurrence of the pair, if there is one.
    ///
    /// # Safety
    ///
    /// It must be safe to do an unaligned read of size(V) bytes starting at
    /// both (cur + self.index1) and (cur + self.index2). It must also be safe
    /// to do unaligned reads on cur up to (end - needle.len()).
    #[inline(always)]
    unsafe fn find_prefilter_in_chunk(&self, cur: *const u8) -> Option<usize> {
        let index1 = usize::from(self.pair.index1());
        let index2 = usize::from(self.pair.index2());
        let chunk1 = V::load_unaligned(cur.add(index1));
        let chunk2 = V::load_unaligned(cur.add(index2));
        let eq1 = chunk1.cmpeq(self.v1);
        let eq2 = chunk2.cmpeq(self.v2);

        let offsets = eq1.and(eq2).movemask();
        if !offsets.has_non_zero() {
            return None;
        }
        Some(offsets.first_offset())
    }

    /// Returns the pair of offsets (into the needle) used to check as a
    /// predicate before confirming whether a needle exists at a particular
    /// position.
    #[inline]
    pub(crate) fn pair(&self) -> &Pair {
        &self.pair
    }

    /// Returns the minimum haystack length that this `Finder` can search.
    ///
    /// Providing a haystack to this `Finder` shorter than this length is
    /// guaranteed to result in a panic.
    #[inline(always)]
    pub(crate) fn min_haystack_len(&self) -> usize {
        self.min_haystack_len
    }
}

/// Accepts a chunk-relative offset and returns a haystack relative offset.
///
/// This used to be marked `#[cold]` and `#[inline(never)]`, but I couldn't
/// observe a consistent measurable difference between that and just inlining
/// it. So we go with inlining it.
///
/// # Safety
///
/// Same as `ptr::offset_from` in addition to `cur >= start`.
#[inline(always)]
unsafe fn matched(start: *const u8, cur: *const u8, chunki: usize) -> usize {
    cur.distance(start) + chunki
}

// If you're looking for tests, those are run for each instantiation of the
// above code. So for example, see arch::x86_64::sse2::packedpair.