1// NOTE: The descriptions for each of the vector methods on the traits below
2// are pretty inscrutable. For this reason, there are tests for every method
// on every trait impl below. If you're confused about what an op does,
4// consult its test. (They probably should be doc tests, but I couldn't figure
5// out how to write them in a non-annoying way.)
6
7use core::{
8 fmt::Debug,
9 panic::{RefUnwindSafe, UnwindSafe},
10};
11
12/// A trait for describing vector operations used by vectorized searchers.
13///
14/// The trait is highly constrained to low level vector operations needed for
15/// the specific algorithms used in this crate. In general, it was invented
/// mostly to be generic over x86's __m128i and __m256i types. At the time of
/// writing, it also supports wasm and aarch64 128-bit vector types.
18///
19/// # Safety
20///
/// All methods are `unsafe` since they are intended to be implemented using
/// vendor intrinsics, which are themselves `unsafe`. Callers must ensure that
23/// the appropriate target features are enabled in the calling function,
24/// and that the current CPU supports them. All implementations should
25/// avoid marking the routines with `#[target_feature]` and instead mark
26/// them as `#[inline(always)]` to ensure they get appropriately inlined.
27/// (`inline(always)` cannot be used with target_feature.)
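///
/// # Example
///
/// A minimal sketch (not compiled as a doctest) of how a caller is expected
/// to uphold this contract. The `find` and `find_impl` functions and their
/// bodies are hypothetical and purely illustrative; the important parts are
/// the runtime CPU feature check performed by the caller and the
/// `#[target_feature]` attribute on the function that uses the vector ops:
///
/// ```ignore
/// fn find(haystack: &[u8]) -> Option<usize> {
///     if std::is_x86_feature_detected!("ssse3") {
///         // SAFETY: we just confirmed that the current CPU supports SSSE3.
///         unsafe { find_impl(haystack) }
///     } else {
///         None // fall back to a non-vectorized implementation
///     }
/// }
///
/// #[target_feature(enable = "ssse3")]
/// unsafe fn find_impl(haystack: &[u8]) -> Option<usize> {
///     use core::arch::x86_64::__m128i;
///     let needle = __m128i::splat(b'a');
///     // ... load chunks of `haystack` with `load_unaligned`, compare them
///     // against `needle` with `cmpeq`, and so on ...
///     None
/// }
/// ```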
28pub(crate) trait Vector:
29 Copy + Debug + Send + Sync + UnwindSafe + RefUnwindSafe
30{
31 /// The number of bits in the vector.
32 const BITS: usize;
33 /// The number of bytes in the vector. That is, this is the size of the
34 /// vector in memory.
35 const BYTES: usize;
36
37 /// Create a vector with 8-bit lanes with the given byte repeated into each
38 /// lane.
39 ///
40 /// # Safety
41 ///
42 /// Callers must ensure that this is okay to call in the current target for
43 /// the current CPU.
44 unsafe fn splat(byte: u8) -> Self;
45
46 /// Read a vector-size number of bytes from the given pointer. The pointer
47 /// does not need to be aligned.
48 ///
49 /// # Safety
50 ///
51 /// Callers must ensure that this is okay to call in the current target for
52 /// the current CPU.
53 ///
54 /// Callers must guarantee that at least `BYTES` bytes are readable from
55 /// `data`.
56 unsafe fn load_unaligned(data: *const u8) -> Self;
57
58 /// Returns true if and only if this vector has zero in all of its lanes.
59 ///
60 /// # Safety
61 ///
62 /// Callers must ensure that this is okay to call in the current target for
63 /// the current CPU.
64 unsafe fn is_zero(self) -> bool;
65
66 /// Do an 8-bit pairwise equality check. If lane `i` is equal in this
67 /// vector and the one given, then lane `i` in the resulting vector is set
68 /// to `0xFF`. Otherwise, it is set to `0x00`.
69 ///
70 /// # Safety
71 ///
72 /// Callers must ensure that this is okay to call in the current target for
73 /// the current CPU.
74 unsafe fn cmpeq(self, vector2: Self) -> Self;
75
76 /// Perform a bitwise 'and' of this vector and the one given and return
77 /// the result.
78 ///
79 /// # Safety
80 ///
81 /// Callers must ensure that this is okay to call in the current target for
82 /// the current CPU.
83 unsafe fn and(self, vector2: Self) -> Self;
84
85 /// Perform a bitwise 'or' of this vector and the one given and return
86 /// the result.
87 ///
88 /// # Safety
89 ///
90 /// Callers must ensure that this is okay to call in the current target for
91 /// the current CPU.
92 unsafe fn or(self, vector2: Self) -> Self;
93
94 /// Shift each 8-bit lane in this vector to the right by the number of
    /// bits indicated by the `BITS` const parameter.
96 ///
97 /// # Safety
98 ///
99 /// Callers must ensure that this is okay to call in the current target for
100 /// the current CPU.
101 unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self;
102
103 /// Shift this vector to the left by one byte and shift the most
104 /// significant byte of `vector2` into the least significant position of
105 /// this vector.
106 ///
107 /// Stated differently, this behaves as if `self` and `vector2` were
108 /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
109 /// right by `Self::BYTES - 1` bytes.
110 ///
111 /// With respect to the Teddy algorithm, `vector2` is usually a previous
112 /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last byte
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 1`
    /// bytes from the current chunk, which in turn permits aligning the
    /// result of various shuffles so that they can be and-ed together and a
    /// possible candidate discovered.
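    ///
    /// For example (an illustrative sketch, mirroring the tests below rather
    /// than a doctest), with 128-bit vectors holding the byte values `1..=16`
    /// (`self`) and `17..=32` (`vector2`):
    ///
    /// ```ignore
    /// let result = self.shift_in_one_byte(vector2);
    /// // result = [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    /// ```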
118 ///
119 /// # Safety
120 ///
121 /// Callers must ensure that this is okay to call in the current target for
122 /// the current CPU.
123 unsafe fn shift_in_one_byte(self, vector2: Self) -> Self;
124
125 /// Shift this vector to the left by two bytes and shift the two most
126 /// significant bytes of `vector2` into the least significant position of
127 /// this vector.
128 ///
129 /// Stated differently, this behaves as if `self` and `vector2` were
130 /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
131 /// right by `Self::BYTES - 2` bytes.
132 ///
133 /// With respect to the Teddy algorithm, `vector2` is usually a previous
134 /// `Self::BYTES` chunk from the haystack and `self` is the chunk
135 /// immediately following it. This permits combining the last two bytes
136 /// from the previous chunk (`vector2`) with the first `Self::BYTES - 2`
137 /// bytes from the current chunk. This permits aligning the result of
138 /// various shuffles so that they can be and-ed together and a possible
139 /// candidate discovered.
140 ///
141 /// # Safety
142 ///
143 /// Callers must ensure that this is okay to call in the current target for
144 /// the current CPU.
145 unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self;
146
147 /// Shift this vector to the left by three bytes and shift the three most
148 /// significant bytes of `vector2` into the least significant position of
149 /// this vector.
150 ///
151 /// Stated differently, this behaves as if `self` and `vector2` were
152 /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
153 /// right by `Self::BYTES - 3` bytes.
154 ///
155 /// With respect to the Teddy algorithm, `vector2` is usually a previous
156 /// `Self::BYTES` chunk from the haystack and `self` is the chunk
157 /// immediately following it. This permits combining the last three bytes
158 /// from the previous chunk (`vector2`) with the first `Self::BYTES - 3`
159 /// bytes from the current chunk. This permits aligning the result of
160 /// various shuffles so that they can be and-ed together and a possible
161 /// candidate discovered.
162 ///
163 /// # Safety
164 ///
165 /// Callers must ensure that this is okay to call in the current target for
166 /// the current CPU.
167 unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self;
168
169 /// Shuffles the bytes in this vector according to the indices in each of
170 /// the corresponding lanes in `indices`.
171 ///
    /// If `i` is the index of a lane, `A` is this vector, `B` is `indices`
    /// and `C` is the resulting vector, then `C[i] = A[B[i]]`.
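    ///
    /// For example (an illustrative sketch, mirroring the tests below rather
    /// than a doctest):
    ///
    /// ```ignore
    /// // self    = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
    /// // indices = [0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]
    /// let result = self.shuffle_bytes(indices);
    /// // result  = [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13]
    /// ```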
174 ///
175 /// # Safety
176 ///
177 /// Callers must ensure that this is okay to call in the current target for
178 /// the current CPU.
179 unsafe fn shuffle_bytes(self, indices: Self) -> Self;
180
181 /// Call the provided function for each 64-bit lane in this vector. The
182 /// given function is provided the lane index and lane value as a `u64`.
183 ///
184 /// If `f` returns `Some`, then iteration over the lanes is stopped and the
185 /// value is returned. Otherwise, this returns `None`.
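    ///
    /// For example (an illustrative sketch, not a doctest), a caller might
    /// use this to find the byte offset of the least significant set bit
    /// across the whole vector, where `candidate` is some vector value:
    ///
    /// ```ignore
    /// let offset: Option<usize> = candidate.for_each_64bit_lane(|i, lane| {
    ///     if lane != 0 {
    ///         Some(i * 8 + (lane.trailing_zeros() as usize) / 8)
    ///     } else {
    ///         None
    ///     }
    /// });
    /// ```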
186 ///
187 /// # Notes
188 ///
    /// Conceptually it would be nice if we could have an
190 /// `unpack64(self) -> [u64; BITS / 64]` method, but defining that is
191 /// tricky given Rust's [current support for const generics][support].
192 /// And even if we could, it would be tricky to write generic code over
193 /// it. (Not impossible. We could introduce another layer that requires
194 /// `AsRef<[u64]>` or something.)
195 ///
196 /// [support]: https://github.com/rust-lang/rust/issues/60551
197 ///
198 /// # Safety
199 ///
200 /// Callers must ensure that this is okay to call in the current target for
201 /// the current CPU.
202 unsafe fn for_each_64bit_lane<T>(
203 self,
204 f: impl FnMut(usize, u64) -> Option<T>,
205 ) -> Option<T>;
206}
207
208/// This trait extends the `Vector` trait with additional operations to support
209/// Fat Teddy.
210///
/// Fat Teddy uses 16 buckets instead of 8, but reads half a vector's worth of
/// bytes per iteration instead of a full vector's worth. For example, when
/// using a 256-bit vector, Slim Teddy reads 32 bytes at a time but Fat Teddy
/// reads 16 bytes at a time.
215///
216/// Fat Teddy is useful when searching for a large number of literals.
217/// The extra number of buckets spreads the literals out more and reduces
218/// verification time.
219///
220/// Currently we only implement this for AVX on x86_64. It would be nice to
221/// implement this for SSE on x86_64 and NEON on aarch64, with the latter two
/// only reading 8 bytes at a time. It's not clear how well it would work, and
/// there are some tricky things to figure out in terms of implementation. The
/// `half_shift_in_{one,two,three}_bytes` methods in particular are probably
/// the trickiest of the bunch. For AVX2, these are implemented by taking
/// advantage of the fact that `_mm256_alignr_epi8` operates on each 128-bit
/// half instead of the full 256-bit vector. (Whereas `_mm_alignr_epi8`
/// operates on the full 128-bit vector and not on each 64-bit half.) I didn't
229/// do a careful survey of NEON to see if it could easily support these
230/// operations.
231pub(crate) trait FatVector: Vector {
232 type Half: Vector;
233
234 /// Read a half-vector-size number of bytes from the given pointer, and
    /// broadcast it across both halves of a full vector. The pointer does not
236 /// need to be aligned.
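    ///
    /// For example (an illustrative sketch, mirroring the tests below rather
    /// than a doctest), for the 256-bit implementation, if `data` points at
    /// the bytes `1..=16`:
    ///
    /// ```ignore
    /// let v = __m256i::load_half_unaligned(data);
    /// // v = [1, 2, ..., 16, 1, 2, ..., 16]
    /// ```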
237 ///
238 /// # Safety
239 ///
240 /// Callers must ensure that this is okay to call in the current target for
241 /// the current CPU.
242 ///
    /// Callers must guarantee that at least `Self::Half::BYTES` bytes are
244 /// readable from `data`.
245 unsafe fn load_half_unaligned(data: *const u8) -> Self;
246
247 /// Like `Vector::shift_in_one_byte`, except this is done for each half
248 /// of the vector instead.
249 ///
250 /// # Safety
251 ///
252 /// Callers must ensure that this is okay to call in the current target for
253 /// the current CPU.
254 unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self;
255
256 /// Like `Vector::shift_in_two_bytes`, except this is done for each half
257 /// of the vector instead.
258 ///
259 /// # Safety
260 ///
261 /// Callers must ensure that this is okay to call in the current target for
262 /// the current CPU.
263 unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self;
264
    /// Like `Vector::shift_in_three_bytes`, except this is done for each half
266 /// of the vector instead.
267 ///
268 /// # Safety
269 ///
270 /// Callers must ensure that this is okay to call in the current target for
271 /// the current CPU.
272 unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self;
273
274 /// Swap the 128-bit lanes in this vector.
275 ///
276 /// # Safety
277 ///
278 /// Callers must ensure that this is okay to call in the current target for
279 /// the current CPU.
280 unsafe fn swap_halves(self) -> Self;
281
282 /// Unpack and interleave the 8-bit lanes from the low 128 bits of each
283 /// vector and return the result.
284 ///
285 /// # Safety
286 ///
287 /// Callers must ensure that this is okay to call in the current target for
288 /// the current CPU.
289 unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self;
290
291 /// Unpack and interleave the 8-bit lanes from the high 128 bits of each
292 /// vector and return the result.
293 ///
294 /// # Safety
295 ///
296 /// Callers must ensure that this is okay to call in the current target for
297 /// the current CPU.
298 unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self;
299
    /// Call the provided function for each 64-bit lane in the lower half
    /// of this vector and then for each 64-bit lane in the lower half of
    /// `vector2`. The given function is provided the lane index and lane
    /// value as a `u64`. (The high 128 bits of each vector are ignored.)
304 ///
305 /// If `f` returns `Some`, then iteration over the lanes is stopped and the
306 /// value is returned. Otherwise, this returns `None`.
307 ///
308 /// # Safety
309 ///
310 /// Callers must ensure that this is okay to call in the current target for
311 /// the current CPU.
312 unsafe fn for_each_low_64bit_lane<T>(
313 self,
314 vector2: Self,
315 f: impl FnMut(usize, u64) -> Option<T>,
316 ) -> Option<T>;
317}
318
319#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
320mod x86_64_ssse3 {
321 use core::arch::x86_64::*;
322
323 use crate::util::int::{I32, I8};
324
325 use super::Vector;
326
327 impl Vector for __m128i {
328 const BITS: usize = 128;
329 const BYTES: usize = 16;
330
331 #[inline(always)]
332 unsafe fn splat(byte: u8) -> __m128i {
333 _mm_set1_epi8(i8::from_bits(byte))
334 }
335
336 #[inline(always)]
337 unsafe fn load_unaligned(data: *const u8) -> __m128i {
338 _mm_loadu_si128(data.cast::<__m128i>())
339 }
340
341 #[inline(always)]
342 unsafe fn is_zero(self) -> bool {
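            // `cmpeq` against a zero vector sets a lane to 0xFF precisely
            // when the corresponding lane of `self` is zero, and `movemask`
            // collects the most significant bit of each 8-bit lane. So the
            // mask is 0xFFFF if and only if every lane of `self` is zero.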
343 let cmp = self.cmpeq(Self::splat(0));
344 _mm_movemask_epi8(cmp).to_bits() == 0xFFFF
345 }
346
347 #[inline(always)]
348 unsafe fn cmpeq(self, vector2: Self) -> __m128i {
349 _mm_cmpeq_epi8(self, vector2)
350 }
351
352 #[inline(always)]
353 unsafe fn and(self, vector2: Self) -> __m128i {
354 _mm_and_si128(self, vector2)
355 }
356
357 #[inline(always)]
358 unsafe fn or(self, vector2: Self) -> __m128i {
359 _mm_or_si128(self, vector2)
360 }
361
362 #[inline(always)]
363 unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
364 // Apparently there is no _mm_srli_epi8, so we emulate it by
365 // shifting 16-bit integers and masking out the high nybble of each
366 // 8-bit lane (since that nybble will contain bits from the low
            // nybble of the neighboring lane).
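            //
            // For example, with `BITS == 4`: the 16-bit lane 0xABCD is the
            // two 8-bit lanes 0xAB and 0xCD. Shifting the 16-bit lane right
            // by 4 gives 0x0ABC, i.e. the 8-bit lanes 0x0A and 0xBC. The 0xB
            // in the low lane leaked in from the high lane, so masking with
            // 0x0F yields 0x0A and 0x0C, which is each original lane shifted
            // right by 4, as desired.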
368 let lomask = Self::splat(0xF);
369 _mm_srli_epi16(self, BITS).and(lomask)
370 }
371
372 #[inline(always)]
373 unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
374 _mm_alignr_epi8(self, vector2, 15)
375 }
376
377 #[inline(always)]
378 unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
379 _mm_alignr_epi8(self, vector2, 14)
380 }
381
382 #[inline(always)]
383 unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
384 _mm_alignr_epi8(self, vector2, 13)
385 }
386
387 #[inline(always)]
388 unsafe fn shuffle_bytes(self, indices: Self) -> Self {
389 _mm_shuffle_epi8(self, indices)
390 }
391
392 #[inline(always)]
393 unsafe fn for_each_64bit_lane<T>(
394 self,
395 mut f: impl FnMut(usize, u64) -> Option<T>,
396 ) -> Option<T> {
397 // We could just use _mm_extract_epi64 here, but that requires
398 // SSE 4.1. It isn't necessarily a problem to just require SSE 4.1,
399 // but everything else works with SSSE3 so we stick to that subset.
400 let lanes: [u64; 2] = core::mem::transmute(self);
401 if let Some(t) = f(0, lanes[0]) {
402 return Some(t);
403 }
404 if let Some(t) = f(1, lanes[1]) {
405 return Some(t);
406 }
407 None
408 }
409 }
410}
411
412#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
413mod x86_64_avx2 {
414 use core::arch::x86_64::*;
415
416 use crate::util::int::{I32, I64, I8};
417
418 use super::{FatVector, Vector};
419
420 impl Vector for __m256i {
421 const BITS: usize = 256;
422 const BYTES: usize = 32;
423
424 #[inline(always)]
425 unsafe fn splat(byte: u8) -> __m256i {
426 _mm256_set1_epi8(i8::from_bits(byte))
427 }
428
429 #[inline(always)]
430 unsafe fn load_unaligned(data: *const u8) -> __m256i {
431 _mm256_loadu_si256(data.cast::<__m256i>())
432 }
433
434 #[inline(always)]
435 unsafe fn is_zero(self) -> bool {
436 let cmp = self.cmpeq(Self::splat(0));
437 _mm256_movemask_epi8(cmp).to_bits() == 0xFFFFFFFF
438 }
439
440 #[inline(always)]
441 unsafe fn cmpeq(self, vector2: Self) -> __m256i {
442 _mm256_cmpeq_epi8(self, vector2)
443 }
444
445 #[inline(always)]
446 unsafe fn and(self, vector2: Self) -> __m256i {
447 _mm256_and_si256(self, vector2)
448 }
449
450 #[inline(always)]
451 unsafe fn or(self, vector2: Self) -> __m256i {
452 _mm256_or_si256(self, vector2)
453 }
454
455 #[inline(always)]
456 unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
457 let lomask = Self::splat(0xF);
458 _mm256_srli_epi16(self, BITS).and(lomask)
459 }
460
461 #[inline(always)]
462 unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
463 // Credit goes to jneem for figuring this out:
464 // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
465 //
466 // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
467 // PALIGNR instructions, which is not what we want, so we need to
468 // do some extra shuffling.
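            //
            // Concretely, the permute with control 0x21 builds a vector
            // whose low 128 bits are the high 128 bits of `vector2` and
            // whose high 128 bits are the low 128 bits of `self`. That way,
            // each 128-bit PALIGNR sees the 16 bytes that immediately
            // precede the corresponding half of `self`.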
469 let v = _mm256_permute2x128_si256(vector2, self, 0x21);
470 _mm256_alignr_epi8(self, v, 15)
471 }
472
473 #[inline(always)]
474 unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
475 // Credit goes to jneem for figuring this out:
476 // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
477 //
478 // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
479 // PALIGNR instructions, which is not what we want, so we need to
480 // do some extra shuffling.
481 let v = _mm256_permute2x128_si256(vector2, self, 0x21);
482 _mm256_alignr_epi8(self, v, 14)
483 }
484
485 #[inline(always)]
486 unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
487 // Credit goes to jneem for figuring this out:
488 // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
489 //
490 // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
491 // PALIGNR instructions, which is not what we want, so we need to
492 // do some extra shuffling.
493 let v = _mm256_permute2x128_si256(vector2, self, 0x21);
494 _mm256_alignr_epi8(self, v, 13)
495 }
496
497 #[inline(always)]
498 unsafe fn shuffle_bytes(self, indices: Self) -> Self {
499 _mm256_shuffle_epi8(self, indices)
500 }
501
502 #[inline(always)]
503 unsafe fn for_each_64bit_lane<T>(
504 self,
505 mut f: impl FnMut(usize, u64) -> Option<T>,
506 ) -> Option<T> {
            // NOTE: At one point in the past, I used a transmute here to
            // get a [u64; 4], but it turned out to lead to worse codegen IIRC.
509 // I've tried it more recently, and it looks like that's no longer
510 // the case. But since there's no difference, we stick with the
511 // slightly more complicated but transmute-free version.
512 let lane = _mm256_extract_epi64(self, 0).to_bits();
513 if let Some(t) = f(0, lane) {
514 return Some(t);
515 }
516 let lane = _mm256_extract_epi64(self, 1).to_bits();
517 if let Some(t) = f(1, lane) {
518 return Some(t);
519 }
520 let lane = _mm256_extract_epi64(self, 2).to_bits();
521 if let Some(t) = f(2, lane) {
522 return Some(t);
523 }
524 let lane = _mm256_extract_epi64(self, 3).to_bits();
525 if let Some(t) = f(3, lane) {
526 return Some(t);
527 }
528 None
529 }
530 }
531
532 impl FatVector for __m256i {
533 type Half = __m128i;
534
535 #[inline(always)]
536 unsafe fn load_half_unaligned(data: *const u8) -> Self {
537 let half = Self::Half::load_unaligned(data);
538 _mm256_broadcastsi128_si256(half)
539 }
540
541 #[inline(always)]
542 unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self {
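            // Unlike `Vector::shift_in_one_byte` above, the per-128-bit-half
            // behavior of `_mm256_alignr_epi8` is exactly what Fat Teddy
            // wants here, so no extra shuffling is needed.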
543 _mm256_alignr_epi8(self, vector2, 15)
544 }
545
546 #[inline(always)]
547 unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self {
548 _mm256_alignr_epi8(self, vector2, 14)
549 }
550
551 #[inline(always)]
552 unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self {
553 _mm256_alignr_epi8(self, vector2, 13)
554 }
555
556 #[inline(always)]
557 unsafe fn swap_halves(self) -> Self {
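            // The control byte 0x4E is 0b01_00_11_10, i.e. it selects the
            // 64-bit lanes in the order [2, 3, 0, 1], which swaps the low
            // and high 128-bit halves.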
558 _mm256_permute4x64_epi64(self, 0x4E)
559 }
560
561 #[inline(always)]
562 unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self {
563 _mm256_unpacklo_epi8(self, vector2)
564 }
565
566 #[inline(always)]
567 unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self {
568 _mm256_unpackhi_epi8(self, vector2)
569 }
570
571 #[inline(always)]
572 unsafe fn for_each_low_64bit_lane<T>(
573 self,
574 vector2: Self,
575 mut f: impl FnMut(usize, u64) -> Option<T>,
576 ) -> Option<T> {
577 let lane = _mm256_extract_epi64(self, 0).to_bits();
578 if let Some(t) = f(0, lane) {
579 return Some(t);
580 }
581 let lane = _mm256_extract_epi64(self, 1).to_bits();
582 if let Some(t) = f(1, lane) {
583 return Some(t);
584 }
585 let lane = _mm256_extract_epi64(vector2, 0).to_bits();
586 if let Some(t) = f(2, lane) {
587 return Some(t);
588 }
589 let lane = _mm256_extract_epi64(vector2, 1).to_bits();
590 if let Some(t) = f(3, lane) {
591 return Some(t);
592 }
593 None
594 }
595 }
596}
597
598#[cfg(target_arch = "aarch64")]
599mod aarch64_neon {
600 use core::arch::aarch64::*;
601
602 use super::Vector;
603
604 impl Vector for uint8x16_t {
605 const BITS: usize = 128;
606 const BYTES: usize = 16;
607
608 #[inline(always)]
609 unsafe fn splat(byte: u8) -> uint8x16_t {
610 vdupq_n_u8(byte)
611 }
612
613 #[inline(always)]
614 unsafe fn load_unaligned(data: *const u8) -> uint8x16_t {
615 vld1q_u8(data)
616 }
617
618 #[inline(always)]
619 unsafe fn is_zero(self) -> bool {
620 // Could also use vmaxvq_u8.
621 // ... I tried that and couldn't observe any meaningful difference
622 // in benchmarks.
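            //
            // `vpmaxq_u8(self, self)` computes the pairwise max of adjacent
            // 8-bit lanes, so the low 64 bits of the result cover all 16
            // lanes of `self`. That 64-bit value is zero if and only if
            // every lane is zero.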
623 let maxes = vreinterpretq_u64_u8(vpmaxq_u8(self, self));
624 vgetq_lane_u64(maxes, 0) == 0
625 }
626
627 #[inline(always)]
628 unsafe fn cmpeq(self, vector2: Self) -> uint8x16_t {
629 vceqq_u8(self, vector2)
630 }
631
632 #[inline(always)]
633 unsafe fn and(self, vector2: Self) -> uint8x16_t {
634 vandq_u8(self, vector2)
635 }
636
637 #[inline(always)]
638 unsafe fn or(self, vector2: Self) -> uint8x16_t {
639 vorrq_u8(self, vector2)
640 }
641
642 #[inline(always)]
643 unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
644 debug_assert!(BITS <= 7);
645 vshrq_n_u8(self, BITS)
646 }
647
648 #[inline(always)]
649 unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
650 vextq_u8(vector2, self, 15)
651 }
652
653 #[inline(always)]
654 unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
655 vextq_u8(vector2, self, 14)
656 }
657
658 #[inline(always)]
659 unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
660 vextq_u8(vector2, self, 13)
661 }
662
663 #[inline(always)]
664 unsafe fn shuffle_bytes(self, indices: Self) -> Self {
665 vqtbl1q_u8(self, indices)
666 }
667
668 #[inline(always)]
669 unsafe fn for_each_64bit_lane<T>(
670 self,
671 mut f: impl FnMut(usize, u64) -> Option<T>,
672 ) -> Option<T> {
673 let this = vreinterpretq_u64_u8(self);
674 let lane = vgetq_lane_u64(this, 0);
675 if let Some(t) = f(0, lane) {
676 return Some(t);
677 }
678 let lane = vgetq_lane_u64(this, 1);
679 if let Some(t) = f(1, lane) {
680 return Some(t);
681 }
682 None
683 }
684 }
685}
686
687#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
688mod tests_x86_64_ssse3 {
689 use core::arch::x86_64::*;
690
691 use crate::util::int::{I32, U32};
692
693 use super::*;
694
695 fn is_runnable() -> bool {
696 std::is_x86_feature_detected!("ssse3")
697 }
698
699 #[target_feature(enable = "ssse3")]
700 unsafe fn load(lanes: [u8; 16]) -> __m128i {
701 __m128i::load_unaligned(&lanes as *const u8)
702 }
703
704 #[target_feature(enable = "ssse3")]
705 unsafe fn unload(v: __m128i) -> [u8; 16] {
706 [
707 _mm_extract_epi8(v, 0).to_bits().low_u8(),
708 _mm_extract_epi8(v, 1).to_bits().low_u8(),
709 _mm_extract_epi8(v, 2).to_bits().low_u8(),
710 _mm_extract_epi8(v, 3).to_bits().low_u8(),
711 _mm_extract_epi8(v, 4).to_bits().low_u8(),
712 _mm_extract_epi8(v, 5).to_bits().low_u8(),
713 _mm_extract_epi8(v, 6).to_bits().low_u8(),
714 _mm_extract_epi8(v, 7).to_bits().low_u8(),
715 _mm_extract_epi8(v, 8).to_bits().low_u8(),
716 _mm_extract_epi8(v, 9).to_bits().low_u8(),
717 _mm_extract_epi8(v, 10).to_bits().low_u8(),
718 _mm_extract_epi8(v, 11).to_bits().low_u8(),
719 _mm_extract_epi8(v, 12).to_bits().low_u8(),
720 _mm_extract_epi8(v, 13).to_bits().low_u8(),
721 _mm_extract_epi8(v, 14).to_bits().low_u8(),
722 _mm_extract_epi8(v, 15).to_bits().low_u8(),
723 ]
724 }
725
726 #[test]
727 fn vector_splat() {
728 #[target_feature(enable = "ssse3")]
729 unsafe fn test() {
730 let v = __m128i::splat(0xAF);
731 assert_eq!(
732 unload(v),
733 [
734 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
735 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF
736 ]
737 );
738 }
739 if !is_runnable() {
740 return;
741 }
742 unsafe { test() }
743 }
744
745 #[test]
746 fn vector_is_zero() {
747 #[target_feature(enable = "ssse3")]
748 unsafe fn test() {
749 let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
750 assert!(!v.is_zero());
751 let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
752 assert!(v.is_zero());
753 }
754 if !is_runnable() {
755 return;
756 }
757 unsafe { test() }
758 }
759
760 #[test]
761 fn vector_cmpeq() {
762 #[target_feature(enable = "ssse3")]
763 unsafe fn test() {
764 let v1 =
765 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]);
766 let v2 =
767 load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]);
768 assert_eq!(
769 unload(v1.cmpeq(v2)),
770 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF]
771 );
772 }
773 if !is_runnable() {
774 return;
775 }
776 unsafe { test() }
777 }
778
779 #[test]
780 fn vector_and() {
781 #[target_feature(enable = "ssse3")]
782 unsafe fn test() {
783 let v1 =
784 load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
785 let v2 =
786 load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
787 assert_eq!(
788 unload(v1.and(v2)),
789 [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
790 );
791 }
792 if !is_runnable() {
793 return;
794 }
795 unsafe { test() }
796 }
797
798 #[test]
799 fn vector_or() {
800 #[target_feature(enable = "ssse3")]
801 unsafe fn test() {
802 let v1 =
803 load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
804 let v2 =
805 load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
806 assert_eq!(
807 unload(v1.or(v2)),
808 [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
809 );
810 }
811 if !is_runnable() {
812 return;
813 }
814 unsafe { test() }
815 }
816
817 #[test]
818 fn vector_shift_8bit_lane_right() {
819 #[target_feature(enable = "ssse3")]
820 unsafe fn test() {
821 let v = load([
822 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
823 ]);
824 assert_eq!(
825 unload(v.shift_8bit_lane_right::<2>()),
826 [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
827 );
828 }
829 if !is_runnable() {
830 return;
831 }
832 unsafe { test() }
833 }
834
835 #[test]
836 fn vector_shift_in_one_byte() {
837 #[target_feature(enable = "ssse3")]
838 unsafe fn test() {
839 let v1 =
840 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
841 let v2 = load([
842 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
843 ]);
844 assert_eq!(
845 unload(v1.shift_in_one_byte(v2)),
846 [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
847 );
848 }
849 if !is_runnable() {
850 return;
851 }
852 unsafe { test() }
853 }
854
855 #[test]
856 fn vector_shift_in_two_bytes() {
857 #[target_feature(enable = "ssse3")]
858 unsafe fn test() {
859 let v1 =
860 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
861 let v2 = load([
862 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
863 ]);
864 assert_eq!(
865 unload(v1.shift_in_two_bytes(v2)),
866 [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
867 );
868 }
869 if !is_runnable() {
870 return;
871 }
872 unsafe { test() }
873 }
874
875 #[test]
876 fn vector_shift_in_three_bytes() {
877 #[target_feature(enable = "ssse3")]
878 unsafe fn test() {
879 let v1 =
880 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
881 let v2 = load([
882 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
883 ]);
884 assert_eq!(
885 unload(v1.shift_in_three_bytes(v2)),
886 [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
887 );
888 }
889 if !is_runnable() {
890 return;
891 }
892 unsafe { test() }
893 }
894
895 #[test]
896 fn vector_shuffle_bytes() {
897 #[target_feature(enable = "ssse3")]
898 unsafe fn test() {
899 let v1 =
900 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
901 let v2 =
902 load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]);
903 assert_eq!(
904 unload(v1.shuffle_bytes(v2)),
905 [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13],
906 );
907 }
908 if !is_runnable() {
909 return;
910 }
911 unsafe { test() }
912 }
913
914 #[test]
915 fn vector_for_each_64bit_lane() {
916 #[target_feature(enable = "ssse3")]
917 unsafe fn test() {
918 let v = load([
919 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
920 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
921 ]);
922 let mut lanes = [0u64; 2];
923 v.for_each_64bit_lane(|i, lane| {
924 lanes[i] = lane;
925 None::<()>
926 });
927 assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],);
928 }
929 if !is_runnable() {
930 return;
931 }
932 unsafe { test() }
933 }
934}
935
936#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
937mod tests_x86_64_avx2 {
938 use core::arch::x86_64::*;
939
940 use crate::util::int::{I32, U32};
941
942 use super::*;
943
944 fn is_runnable() -> bool {
945 std::is_x86_feature_detected!("avx2")
946 }
947
948 #[target_feature(enable = "avx2")]
949 unsafe fn load(lanes: [u8; 32]) -> __m256i {
950 __m256i::load_unaligned(&lanes as *const u8)
951 }
952
953 #[target_feature(enable = "avx2")]
954 unsafe fn load_half(lanes: [u8; 16]) -> __m256i {
955 __m256i::load_half_unaligned(&lanes as *const u8)
956 }
957
958 #[target_feature(enable = "avx2")]
959 unsafe fn unload(v: __m256i) -> [u8; 32] {
960 [
961 _mm256_extract_epi8(v, 0).to_bits().low_u8(),
962 _mm256_extract_epi8(v, 1).to_bits().low_u8(),
963 _mm256_extract_epi8(v, 2).to_bits().low_u8(),
964 _mm256_extract_epi8(v, 3).to_bits().low_u8(),
965 _mm256_extract_epi8(v, 4).to_bits().low_u8(),
966 _mm256_extract_epi8(v, 5).to_bits().low_u8(),
967 _mm256_extract_epi8(v, 6).to_bits().low_u8(),
968 _mm256_extract_epi8(v, 7).to_bits().low_u8(),
969 _mm256_extract_epi8(v, 8).to_bits().low_u8(),
970 _mm256_extract_epi8(v, 9).to_bits().low_u8(),
971 _mm256_extract_epi8(v, 10).to_bits().low_u8(),
972 _mm256_extract_epi8(v, 11).to_bits().low_u8(),
973 _mm256_extract_epi8(v, 12).to_bits().low_u8(),
974 _mm256_extract_epi8(v, 13).to_bits().low_u8(),
975 _mm256_extract_epi8(v, 14).to_bits().low_u8(),
976 _mm256_extract_epi8(v, 15).to_bits().low_u8(),
977 _mm256_extract_epi8(v, 16).to_bits().low_u8(),
978 _mm256_extract_epi8(v, 17).to_bits().low_u8(),
979 _mm256_extract_epi8(v, 18).to_bits().low_u8(),
980 _mm256_extract_epi8(v, 19).to_bits().low_u8(),
981 _mm256_extract_epi8(v, 20).to_bits().low_u8(),
982 _mm256_extract_epi8(v, 21).to_bits().low_u8(),
983 _mm256_extract_epi8(v, 22).to_bits().low_u8(),
984 _mm256_extract_epi8(v, 23).to_bits().low_u8(),
985 _mm256_extract_epi8(v, 24).to_bits().low_u8(),
986 _mm256_extract_epi8(v, 25).to_bits().low_u8(),
987 _mm256_extract_epi8(v, 26).to_bits().low_u8(),
988 _mm256_extract_epi8(v, 27).to_bits().low_u8(),
989 _mm256_extract_epi8(v, 28).to_bits().low_u8(),
990 _mm256_extract_epi8(v, 29).to_bits().low_u8(),
991 _mm256_extract_epi8(v, 30).to_bits().low_u8(),
992 _mm256_extract_epi8(v, 31).to_bits().low_u8(),
993 ]
994 }
995
996 #[test]
997 fn vector_splat() {
998 #[target_feature(enable = "avx2")]
999 unsafe fn test() {
1000 let v = __m256i::splat(0xAF);
1001 assert_eq!(
1002 unload(v),
1003 [
1004 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1005 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1006 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1007 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1008 ]
1009 );
1010 }
1011 if !is_runnable() {
1012 return;
1013 }
1014 unsafe { test() }
1015 }
1016
1017 #[test]
1018 fn vector_is_zero() {
1019 #[target_feature(enable = "avx2")]
1020 unsafe fn test() {
1021 let v = load([
1022 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1023 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1024 ]);
1025 assert!(!v.is_zero());
1026 let v = load([
1027 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1028 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1029 ]);
1030 assert!(v.is_zero());
1031 }
1032 if !is_runnable() {
1033 return;
1034 }
1035 unsafe { test() }
1036 }
1037
1038 #[test]
1039 fn vector_cmpeq() {
1040 #[target_feature(enable = "avx2")]
1041 unsafe fn test() {
1042 let v1 = load([
1043 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1044 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1,
1045 ]);
1046 let v2 = load([
1047 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
1048 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
1049 ]);
1050 assert_eq!(
1051 unload(v1.cmpeq(v2)),
1052 [
1053 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1054 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF
1055 ]
1056 );
1057 }
1058 if !is_runnable() {
1059 return;
1060 }
1061 unsafe { test() }
1062 }
1063
1064 #[test]
1065 fn vector_and() {
1066 #[target_feature(enable = "avx2")]
1067 unsafe fn test() {
1068 let v1 = load([
1069 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1070 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1071 ]);
1072 let v2 = load([
1073 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1074 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1075 ]);
1076 assert_eq!(
1077 unload(v1.and(v2)),
1078 [
1079 0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1080 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1081 ]
1082 );
1083 }
1084 if !is_runnable() {
1085 return;
1086 }
1087 unsafe { test() }
1088 }
1089
1090 #[test]
1091 fn vector_or() {
1092 #[target_feature(enable = "avx2")]
1093 unsafe fn test() {
1094 let v1 = load([
1095 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1096 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1097 ]);
1098 let v2 = load([
1099 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1101 ]);
1102 assert_eq!(
1103 unload(v1.or(v2)),
1104 [
1105 0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1107 ]
1108 );
1109 }
1110 if !is_runnable() {
1111 return;
1112 }
1113 unsafe { test() }
1114 }
1115
1116 #[test]
1117 fn vector_shift_8bit_lane_right() {
1118 #[target_feature(enable = "avx2")]
1119 unsafe fn test() {
1120 let v = load([
1121 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1123 ]);
1124 assert_eq!(
1125 unload(v.shift_8bit_lane_right::<2>()),
1126 [
1127 0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1129 ]
1130 );
1131 }
1132 if !is_runnable() {
1133 return;
1134 }
1135 unsafe { test() }
1136 }
1137
1138 #[test]
1139 fn vector_shift_in_one_byte() {
1140 #[target_feature(enable = "avx2")]
1141 unsafe fn test() {
1142 let v1 = load([
1143 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1144 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1145 ]);
1146 let v2 = load([
1147 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1148 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
1149 63, 64,
1150 ]);
1151 assert_eq!(
1152 unload(v1.shift_in_one_byte(v2)),
1153 [
1154 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1155 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
1156 31,
1157 ],
1158 );
1159 }
1160 if !is_runnable() {
1161 return;
1162 }
1163 unsafe { test() }
1164 }
1165
1166 #[test]
1167 fn vector_shift_in_two_bytes() {
1168 #[target_feature(enable = "avx2")]
1169 unsafe fn test() {
1170 let v1 = load([
1171 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1172 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1173 ]);
1174 let v2 = load([
1175 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1176 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
1177 63, 64,
1178 ]);
1179 assert_eq!(
1180 unload(v1.shift_in_two_bytes(v2)),
1181 [
1182 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1183 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
1184 30,
1185 ],
1186 );
1187 }
1188 if !is_runnable() {
1189 return;
1190 }
1191 unsafe { test() }
1192 }
1193
1194 #[test]
1195 fn vector_shift_in_three_bytes() {
1196 #[target_feature(enable = "avx2")]
1197 unsafe fn test() {
1198 let v1 = load([
1199 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1200 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1201 ]);
1202 let v2 = load([
1203 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1204 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
1205 63, 64,
1206 ]);
1207 assert_eq!(
1208 unload(v1.shift_in_three_bytes(v2)),
1209 [
1210 62, 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1211 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
1212 29,
1213 ],
1214 );
1215 }
1216 if !is_runnable() {
1217 return;
1218 }
1219 unsafe { test() }
1220 }
1221
1222 #[test]
1223 fn vector_shuffle_bytes() {
1224 #[target_feature(enable = "avx2")]
1225 unsafe fn test() {
1226 let v1 = load([
1227 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1228 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1229 ]);
1230 let v2 = load([
1231 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16,
1232 16, 16, 20, 20, 20, 20, 24, 24, 24, 24, 28, 28, 28, 28,
1233 ]);
1234 assert_eq!(
1235 unload(v1.shuffle_bytes(v2)),
1236 [
1237 1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13, 17,
1238 17, 17, 17, 21, 21, 21, 21, 25, 25, 25, 25, 29, 29, 29,
1239 29
1240 ],
1241 );
1242 }
1243 if !is_runnable() {
1244 return;
1245 }
1246 unsafe { test() }
1247 }
1248
1249 #[test]
1250 fn vector_for_each_64bit_lane() {
1251 #[target_feature(enable = "avx2")]
1252 unsafe fn test() {
1253 let v = load([
1254 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
1255 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
1256 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E,
1257 0x1F, 0x20,
1258 ]);
1259 let mut lanes = [0u64; 4];
1260 v.for_each_64bit_lane(|i, lane| {
1261 lanes[i] = lane;
1262 None::<()>
1263 });
1264 assert_eq!(
1265 lanes,
1266 [
1267 0x0807060504030201,
1268 0x100F0E0D0C0B0A09,
1269 0x1817161514131211,
1270 0x201F1E1D1C1B1A19
1271 ]
1272 );
1273 }
1274 if !is_runnable() {
1275 return;
1276 }
1277 unsafe { test() }
1278 }
1279
1280 #[test]
1281 fn fat_vector_half_shift_in_one_byte() {
1282 #[target_feature(enable = "avx2")]
1283 unsafe fn test() {
1284 let v1 = load_half([
1285 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1286 ]);
1287 let v2 = load_half([
1288 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1289 ]);
1290 assert_eq!(
1291 unload(v1.half_shift_in_one_byte(v2)),
1292 [
1293 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32,
1294 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
1295 ],
1296 );
1297 }
1298 if !is_runnable() {
1299 return;
1300 }
1301 unsafe { test() }
1302 }
1303
1304 #[test]
1305 fn fat_vector_half_shift_in_two_bytes() {
1306 #[target_feature(enable = "avx2")]
1307 unsafe fn test() {
1308 let v1 = load_half([
1309 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1310 ]);
1311 let v2 = load_half([
1312 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1313 ]);
1314 assert_eq!(
1315 unload(v1.half_shift_in_two_bytes(v2)),
1316 [
1317 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31,
1318 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1319 ],
1320 );
1321 }
1322 if !is_runnable() {
1323 return;
1324 }
1325 unsafe { test() }
1326 }
1327
1328 #[test]
1329 fn fat_vector_half_shift_in_three_bytes() {
1330 #[target_feature(enable = "avx2")]
1331 unsafe fn test() {
1332 let v1 = load_half([
1333 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1334 ]);
1335 let v2 = load_half([
1336 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1337 ]);
1338 assert_eq!(
1339 unload(v1.half_shift_in_three_bytes(v2)),
1340 [
1341 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 30,
1342 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
1343 ],
1344 );
1345 }
1346 if !is_runnable() {
1347 return;
1348 }
1349 unsafe { test() }
1350 }
1351
1352 #[test]
1353 fn fat_vector_swap_halves() {
1354 #[target_feature(enable = "avx2")]
1355 unsafe fn test() {
1356 let v = load([
1357 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1358 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1359 ]);
1360 assert_eq!(
1361 unload(v.swap_halves()),
1362 [
1363 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
1364 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1365 16,
1366 ],
1367 );
1368 }
1369 if !is_runnable() {
1370 return;
1371 }
1372 unsafe { test() }
1373 }
1374
1375 #[test]
1376 fn fat_vector_interleave_low_8bit_lanes() {
1377 #[target_feature(enable = "avx2")]
1378 unsafe fn test() {
1379 let v1 = load([
1380 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1381 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1382 ]);
1383 let v2 = load([
1384 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1385 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
1386 63, 64,
1387 ]);
1388 assert_eq!(
1389 unload(v1.interleave_low_8bit_lanes(v2)),
1390 [
1391 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40,
1392 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
1393 24, 56,
1394 ],
1395 );
1396 }
1397 if !is_runnable() {
1398 return;
1399 }
1400 unsafe { test() }
1401 }
1402
1403 #[test]
1404 fn fat_vector_interleave_high_8bit_lanes() {
1405 #[target_feature(enable = "avx2")]
1406 unsafe fn test() {
1407 let v1 = load([
1408 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1409 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1410 ]);
1411 let v2 = load([
1412 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1413 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
1414 63, 64,
1415 ]);
1416 assert_eq!(
1417 unload(v1.interleave_high_8bit_lanes(v2)),
1418 [
1419 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16,
1420 48, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31,
1421 63, 32, 64,
1422 ],
1423 );
1424 }
1425 if !is_runnable() {
1426 return;
1427 }
1428 unsafe { test() }
1429 }
1430
1431 #[test]
1432 fn fat_vector_for_each_low_64bit_lane() {
1433 #[target_feature(enable = "avx2")]
1434 unsafe fn test() {
1435 let v1 = load([
1436 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
1437 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
1438 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E,
1439 0x1F, 0x20,
1440 ]);
1441 let v2 = load([
1442 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A,
1443 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34,
1444 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E,
1445 0x3F, 0x40,
1446 ]);
1447 let mut lanes = [0u64; 4];
1448 v1.for_each_low_64bit_lane(v2, |i, lane| {
1449 lanes[i] = lane;
1450 None::<()>
1451 });
1452 assert_eq!(
1453 lanes,
1454 [
1455 0x0807060504030201,
1456 0x100F0E0D0C0B0A09,
1457 0x2827262524232221,
1458 0x302F2E2D2C2B2A29
1459 ]
1460 );
1461 }
1462 if !is_runnable() {
1463 return;
1464 }
1465 unsafe { test() }
1466 }
1467}
1468
1469#[cfg(all(test, target_arch = "aarch64", target_feature = "neon"))]
1470mod tests_aarch64_neon {
1471 use core::arch::aarch64::*;
1472
1473 use super::*;
1474
1475 #[target_feature(enable = "neon")]
1476 unsafe fn load(lanes: [u8; 16]) -> uint8x16_t {
1477 uint8x16_t::load_unaligned(&lanes as *const u8)
1478 }
1479
1480 #[target_feature(enable = "neon")]
1481 unsafe fn unload(v: uint8x16_t) -> [u8; 16] {
1482 [
1483 vgetq_lane_u8(v, 0),
1484 vgetq_lane_u8(v, 1),
1485 vgetq_lane_u8(v, 2),
1486 vgetq_lane_u8(v, 3),
1487 vgetq_lane_u8(v, 4),
1488 vgetq_lane_u8(v, 5),
1489 vgetq_lane_u8(v, 6),
1490 vgetq_lane_u8(v, 7),
1491 vgetq_lane_u8(v, 8),
1492 vgetq_lane_u8(v, 9),
1493 vgetq_lane_u8(v, 10),
1494 vgetq_lane_u8(v, 11),
1495 vgetq_lane_u8(v, 12),
1496 vgetq_lane_u8(v, 13),
1497 vgetq_lane_u8(v, 14),
1498 vgetq_lane_u8(v, 15),
1499 ]
1500 }
1501
1502 // Example functions. These don't test the Vector traits, but rather,
1503 // specific NEON instructions. They are basically little experiments I
1504 // wrote to figure out what an instruction does since their descriptions
1505 // are so dense. I decided to keep the experiments around as example tests
    // in case they're useful.
1507
1508 #[test]
1509 fn example_vmaxvq_u8_non_zero() {
1510 #[target_feature(enable = "neon")]
1511 unsafe fn example() {
1512 let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1513 assert_eq!(vmaxvq_u8(v), 1);
1514 }
1515 unsafe { example() }
1516 }
1517
1518 #[test]
1519 fn example_vmaxvq_u8_zero() {
1520 #[target_feature(enable = "neon")]
1521 unsafe fn example() {
1522 let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1523 assert_eq!(vmaxvq_u8(v), 0);
1524 }
1525 unsafe { example() }
1526 }
1527
1528 #[test]
1529 fn example_vpmaxq_u8_non_zero() {
1530 #[target_feature(enable = "neon")]
1531 unsafe fn example() {
1532 let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1533 let r = vpmaxq_u8(v, v);
1534 assert_eq!(
1535 unload(r),
1536 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
1537 );
1538 }
1539 unsafe { example() }
1540 }
1541
1542 #[test]
1543 fn example_vpmaxq_u8_self() {
1544 #[target_feature(enable = "neon")]
1545 unsafe fn example() {
1546 let v =
1547 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1548 let r = vpmaxq_u8(v, v);
1549 assert_eq!(
1550 unload(r),
1551 [2, 4, 6, 8, 10, 12, 14, 16, 2, 4, 6, 8, 10, 12, 14, 16]
1552 );
1553 }
1554 unsafe { example() }
1555 }
1556
1557 #[test]
1558 fn example_vpmaxq_u8_other() {
1559 #[target_feature(enable = "neon")]
1560 unsafe fn example() {
1561 let v1 =
1562 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1563 let v2 = load([
1564 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1565 ]);
1566 let r = vpmaxq_u8(v1, v2);
1567 assert_eq!(
1568 unload(r),
1569 [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32]
1570 );
1571 }
1572 unsafe { example() }
1573 }
1574
1575 // Now we test the actual methods on the Vector trait.
1576
1577 #[test]
1578 fn vector_splat() {
1579 #[target_feature(enable = "neon")]
1580 unsafe fn test() {
1581 let v = uint8x16_t::splat(0xAF);
1582 assert_eq!(
1583 unload(v),
1584 [
1585 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1586 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF
1587 ]
1588 );
1589 }
1590 unsafe { test() }
1591 }
1592
1593 #[test]
1594 fn vector_is_zero() {
1595 #[target_feature(enable = "neon")]
1596 unsafe fn test() {
1597 let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1598 assert!(!v.is_zero());
1599 let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1600 assert!(v.is_zero());
1601 }
1602 unsafe { test() }
1603 }
1604
1605 #[test]
1606 fn vector_cmpeq() {
1607 #[target_feature(enable = "neon")]
1608 unsafe fn test() {
1609 let v1 =
1610 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]);
1611 let v2 =
1612 load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]);
1613 assert_eq!(
1614 unload(v1.cmpeq(v2)),
1615 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF]
1616 );
1617 }
1618 unsafe { test() }
1619 }
1620
1621 #[test]
1622 fn vector_and() {
1623 #[target_feature(enable = "neon")]
1624 unsafe fn test() {
1625 let v1 =
1626 load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1627 let v2 =
1628 load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1629 assert_eq!(
1630 unload(v1.and(v2)),
1631 [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1632 );
1633 }
1634 unsafe { test() }
1635 }
1636
1637 #[test]
1638 fn vector_or() {
1639 #[target_feature(enable = "neon")]
1640 unsafe fn test() {
1641 let v1 =
1642 load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1643 let v2 =
1644 load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1645 assert_eq!(
1646 unload(v1.or(v2)),
1647 [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1648 );
1649 }
1650 unsafe { test() }
1651 }
1652
1653 #[test]
1654 fn vector_shift_8bit_lane_right() {
1655 #[target_feature(enable = "neon")]
1656 unsafe fn test() {
1657 let v = load([
1658 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1659 ]);
1660 assert_eq!(
1661 unload(v.shift_8bit_lane_right::<2>()),
1662 [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1663 );
1664 }
1665 unsafe { test() }
1666 }
1667
1668 #[test]
1669 fn vector_shift_in_one_byte() {
1670 #[target_feature(enable = "neon")]
1671 unsafe fn test() {
1672 let v1 =
1673 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1674 let v2 = load([
1675 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1676 ]);
1677 assert_eq!(
1678 unload(v1.shift_in_one_byte(v2)),
1679 [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
1680 );
1681 }
1682 unsafe { test() }
1683 }
1684
1685 #[test]
1686 fn vector_shift_in_two_bytes() {
1687 #[target_feature(enable = "neon")]
1688 unsafe fn test() {
1689 let v1 =
1690 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1691 let v2 = load([
1692 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1693 ]);
1694 assert_eq!(
1695 unload(v1.shift_in_two_bytes(v2)),
1696 [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
1697 );
1698 }
1699 unsafe { test() }
1700 }
1701
1702 #[test]
1703 fn vector_shift_in_three_bytes() {
1704 #[target_feature(enable = "neon")]
1705 unsafe fn test() {
1706 let v1 =
1707 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1708 let v2 = load([
1709 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1710 ]);
1711 assert_eq!(
1712 unload(v1.shift_in_three_bytes(v2)),
1713 [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
1714 );
1715 }
1716 unsafe { test() }
1717 }
1718
1719 #[test]
1720 fn vector_shuffle_bytes() {
1721 #[target_feature(enable = "neon")]
1722 unsafe fn test() {
1723 let v1 =
1724 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1725 let v2 =
1726 load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]);
1727 assert_eq!(
1728 unload(v1.shuffle_bytes(v2)),
1729 [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13],
1730 );
1731 }
1732 unsafe { test() }
1733 }
1734
1735 #[test]
1736 fn vector_for_each_64bit_lane() {
1737 #[target_feature(enable = "neon")]
1738 unsafe fn test() {
1739 let v = load([
1740 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
1741 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
1742 ]);
1743 let mut lanes = [0u64; 2];
1744 v.for_each_64bit_lane(|i, lane| {
1745 lanes[i] = lane;
1746 None::<()>
1747 });
1748 assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],);
1749 }
1750 unsafe { test() }
1751 }
1752}
1753