1// NOTE: The descriptions for each of the vector methods on the traits below
2// are pretty inscrutable. For this reason, there are tests for every method
// on every trait impl below. If you're confused about what an op does,
4// consult its test. (They probably should be doc tests, but I couldn't figure
5// out how to write them in a non-annoying way.)
6
7use core::{
8 fmt::Debug,
9 panic::{RefUnwindSafe, UnwindSafe},
10};
11
12/// A trait for describing vector operations used by vectorized searchers.
13///
14/// The trait is highly constrained to low level vector operations needed for
15/// the specific algorithms used in this crate. In general, it was invented
/// mostly to be generic over x86's __m128i and __m256i types. At the time of
/// writing, it also supports the wasm and aarch64 128-bit vector types.
18///
19/// # Safety
20///
/// All methods are `unsafe`, as they are intended to be implemented using
/// vendor intrinsics, which are themselves `unsafe`. Callers must ensure that
23/// the appropriate target features are enabled in the calling function,
24/// and that the current CPU supports them. All implementations should
25/// avoid marking the routines with `#[target_feature]` and instead mark
26/// them as `#[inline(always)]` to ensure they get appropriately inlined.
27/// (`inline(always)` cannot be used with target_feature.)
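///
/// # Example
///
/// A sketch of the expected calling convention (a hypothetical caller; the
/// block is marked `ignore` since, per the note at the top of this file,
/// these examples are not written as doc tests):
///
/// ```ignore
/// // Only call this after runtime detection confirms SSSE3 support.
/// #[target_feature(enable = "ssse3")]
/// unsafe fn has_zero_byte(chunk: &[u8; 16]) -> bool {
///     let v = __m128i::load_unaligned(chunk.as_ptr());
///     // Lanes equal to zero become 0xFF, so the comparison result is
///     // non-zero if and only if some byte in `chunk` is zero.
///     !v.cmpeq(__m128i::splat(0)).is_zero()
/// }
/// ```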
28pub(crate) trait Vector:
29 Copy + Debug + Send + Sync + UnwindSafe + RefUnwindSafe
30{
31 /// The number of bits in the vector.
32 const BITS: usize;
33 /// The number of bytes in the vector. That is, this is the size of the
34 /// vector in memory.
35 const BYTES: usize;
36
37 /// Create a vector with 8-bit lanes with the given byte repeated into each
38 /// lane.
39 ///
40 /// # Safety
41 ///
42 /// Callers must ensure that this is okay to call in the current target for
43 /// the current CPU.
44 unsafe fn splat(byte: u8) -> Self;
45
46 /// Read a vector-size number of bytes from the given pointer. The pointer
47 /// does not need to be aligned.
48 ///
49 /// # Safety
50 ///
51 /// Callers must ensure that this is okay to call in the current target for
52 /// the current CPU.
53 ///
54 /// Callers must guarantee that at least `BYTES` bytes are readable from
55 /// `data`.
56 unsafe fn load_unaligned(data: *const u8) -> Self;
57
58 /// Returns true if and only if this vector has zero in all of its lanes.
59 ///
60 /// # Safety
61 ///
62 /// Callers must ensure that this is okay to call in the current target for
63 /// the current CPU.
64 unsafe fn is_zero(self) -> bool;
65
66 /// Do an 8-bit pairwise equality check. If lane `i` is equal in this
67 /// vector and the one given, then lane `i` in the resulting vector is set
68 /// to `0xFF`. Otherwise, it is set to `0x00`.
69 ///
70 /// # Safety
71 ///
72 /// Callers must ensure that this is okay to call in the current target for
73 /// the current CPU.
74 unsafe fn cmpeq(self, vector2: Self) -> Self;
75
76 /// Perform a bitwise 'and' of this vector and the one given and return
77 /// the result.
78 ///
79 /// # Safety
80 ///
81 /// Callers must ensure that this is okay to call in the current target for
82 /// the current CPU.
83 unsafe fn and(self, vector2: Self) -> Self;
84
85 /// Perform a bitwise 'or' of this vector and the one given and return
86 /// the result.
87 ///
88 /// # Safety
89 ///
90 /// Callers must ensure that this is okay to call in the current target for
91 /// the current CPU.
92 #[allow(dead_code)] // unused, but useful enough to keep around?
93 unsafe fn or(self, vector2: Self) -> Self;
94
95 /// Shift each 8-bit lane in this vector to the right by the number of
    /// bits indicated by the `BITS` const parameter.
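    ///
    /// For example, shifting each lane right by 2 bits (as in the tests
    /// below):
    ///
    /// ```text
    /// 0b1011 -> 0b0010
    /// 0b0101 -> 0b0001
    /// ```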
97 ///
98 /// # Safety
99 ///
100 /// Callers must ensure that this is okay to call in the current target for
101 /// the current CPU.
102 unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self;
103
104 /// Shift this vector to the left by one byte and shift the most
105 /// significant byte of `vector2` into the least significant position of
106 /// this vector.
107 ///
108 /// Stated differently, this behaves as if `self` and `vector2` were
109 /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
110 /// right by `Self::BYTES - 1` bytes.
111 ///
112 /// With respect to the Teddy algorithm, `vector2` is usually a previous
113 /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last byte
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 1`
116 /// bytes from the current chunk. This permits aligning the result of
117 /// various shuffles so that they can be and-ed together and a possible
118 /// candidate discovered.
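    ///
    /// For example, with 128-bit vectors holding the byte values used in the
    /// tests below:
    ///
    /// ```text
    /// self    = [ 1,  2,  3, ..., 15, 16]
    /// vector2 = [17, 18, 19, ..., 31, 32]
    /// result  = [32,  1,  2, ..., 14, 15]
    /// ```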
119 ///
120 /// # Safety
121 ///
122 /// Callers must ensure that this is okay to call in the current target for
123 /// the current CPU.
124 unsafe fn shift_in_one_byte(self, vector2: Self) -> Self;
125
126 /// Shift this vector to the left by two bytes and shift the two most
127 /// significant bytes of `vector2` into the least significant position of
128 /// this vector.
129 ///
130 /// Stated differently, this behaves as if `self` and `vector2` were
131 /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
132 /// right by `Self::BYTES - 2` bytes.
133 ///
134 /// With respect to the Teddy algorithm, `vector2` is usually a previous
135 /// `Self::BYTES` chunk from the haystack and `self` is the chunk
136 /// immediately following it. This permits combining the last two bytes
137 /// from the previous chunk (`vector2`) with the first `Self::BYTES - 2`
138 /// bytes from the current chunk. This permits aligning the result of
139 /// various shuffles so that they can be and-ed together and a possible
140 /// candidate discovered.
141 ///
142 /// # Safety
143 ///
144 /// Callers must ensure that this is okay to call in the current target for
145 /// the current CPU.
146 unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self;
147
148 /// Shift this vector to the left by three bytes and shift the three most
149 /// significant bytes of `vector2` into the least significant position of
150 /// this vector.
151 ///
152 /// Stated differently, this behaves as if `self` and `vector2` were
153 /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
154 /// right by `Self::BYTES - 3` bytes.
155 ///
156 /// With respect to the Teddy algorithm, `vector2` is usually a previous
157 /// `Self::BYTES` chunk from the haystack and `self` is the chunk
158 /// immediately following it. This permits combining the last three bytes
159 /// from the previous chunk (`vector2`) with the first `Self::BYTES - 3`
160 /// bytes from the current chunk. This permits aligning the result of
161 /// various shuffles so that they can be and-ed together and a possible
162 /// candidate discovered.
163 ///
164 /// # Safety
165 ///
166 /// Callers must ensure that this is okay to call in the current target for
167 /// the current CPU.
168 unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self;
169
170 /// Shuffles the bytes in this vector according to the indices in each of
171 /// the corresponding lanes in `indices`.
172 ///
    /// If `i` is the index of a lane, `A` is this vector, `B` is `indices`
    /// and `C` is the resulting vector, then `C[i] = A[B[i]]`.
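    ///
    /// For example (the values used in the tests below):
    ///
    /// ```text
    /// self    = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
    /// indices = [0, 0, 0, 0, 4, 4, 4, 4, 8,  8,  8,  8, 12, 12, 12, 12]
    /// result  = [1, 1, 1, 1, 5, 5, 5, 5, 9,  9,  9,  9, 13, 13, 13, 13]
    /// ```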
175 ///
176 /// # Safety
177 ///
178 /// Callers must ensure that this is okay to call in the current target for
179 /// the current CPU.
180 unsafe fn shuffle_bytes(self, indices: Self) -> Self;
181
182 /// Call the provided function for each 64-bit lane in this vector. The
183 /// given function is provided the lane index and lane value as a `u64`.
184 ///
185 /// If `f` returns `Some`, then iteration over the lanes is stopped and the
186 /// value is returned. Otherwise, this returns `None`.
187 ///
188 /// # Notes
189 ///
190 /// Conceptually it would be nice if we could have a
191 /// `unpack64(self) -> [u64; BITS / 64]` method, but defining that is
192 /// tricky given Rust's [current support for const generics][support].
193 /// And even if we could, it would be tricky to write generic code over
194 /// it. (Not impossible. We could introduce another layer that requires
195 /// `AsRef<[u64]>` or something.)
196 ///
197 /// [support]: https://github.com/rust-lang/rust/issues/60551
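    ///
    /// # Example
    ///
    /// A sketch of typical usage (mirroring the tests below): collect every
    /// 64-bit lane into an array without terminating early.
    ///
    /// ```ignore
    /// let mut lanes = [0u64; 2];
    /// v.for_each_64bit_lane(|i, lane| {
    ///     lanes[i] = lane;
    ///     None::<()>
    /// });
    /// ```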
198 ///
199 /// # Safety
200 ///
201 /// Callers must ensure that this is okay to call in the current target for
202 /// the current CPU.
203 unsafe fn for_each_64bit_lane<T>(
204 self,
205 f: impl FnMut(usize, u64) -> Option<T>,
206 ) -> Option<T>;
207}
208
209/// This trait extends the `Vector` trait with additional operations to support
210/// Fat Teddy.
211///
/// Fat Teddy uses 16 buckets instead of 8, but reads only half a vector's
/// worth of bytes per iteration instead of a full vector's worth. For
/// example, when using a 256-bit vector, Slim Teddy reads 32 bytes at a time
/// but Fat Teddy reads 16 bytes at a time.
216///
217/// Fat Teddy is useful when searching for a large number of literals.
218/// The extra number of buckets spreads the literals out more and reduces
219/// verification time.
220///
221/// Currently we only implement this for AVX on x86_64. It would be nice to
222/// implement this for SSE on x86_64 and NEON on aarch64, with the latter two
/// only reading 8 bytes at a time. It's not clear how well it would work, and
/// there are some tricky things to figure out in terms of implementation. The
225/// `half_shift_in_{one,two,three}_bytes` methods in particular are probably
226/// the trickiest of the bunch. For AVX2, these are implemented by taking
227/// advantage of the fact that `_mm256_alignr_epi8` operates on each 128-bit
/// half instead of the full 256-bit vector. (Whereas `_mm_alignr_epi8`
229/// operates on the full 128-bit vector and not on each 64-bit half.) I didn't
230/// do a careful survey of NEON to see if it could easily support these
231/// operations.
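///
/// # Example
///
/// A sketch of how `load_half_unaligned` feeds Fat Teddy with a 256-bit
/// vector: the 16 bytes read are broadcast into both 128-bit halves (the
/// values are the ones used in the tests below).
///
/// ```text
/// data (16 bytes) = [1, 2, ..., 16]
/// loaded vector   = [1, 2, ..., 16, 1, 2, ..., 16]
/// ```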
232pub(crate) trait FatVector: Vector {
233 type Half: Vector;
234
235 /// Read a half-vector-size number of bytes from the given pointer, and
    /// broadcast it across both halves of a full vector. The pointer does not
237 /// need to be aligned.
238 ///
239 /// # Safety
240 ///
241 /// Callers must ensure that this is okay to call in the current target for
242 /// the current CPU.
243 ///
    /// Callers must guarantee that at least `Self::Half::BYTES` bytes are
245 /// readable from `data`.
246 unsafe fn load_half_unaligned(data: *const u8) -> Self;
247
248 /// Like `Vector::shift_in_one_byte`, except this is done for each half
249 /// of the vector instead.
250 ///
251 /// # Safety
252 ///
253 /// Callers must ensure that this is okay to call in the current target for
254 /// the current CPU.
255 unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self;
256
257 /// Like `Vector::shift_in_two_bytes`, except this is done for each half
258 /// of the vector instead.
259 ///
260 /// # Safety
261 ///
262 /// Callers must ensure that this is okay to call in the current target for
263 /// the current CPU.
264 unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self;
265
    /// Like `Vector::shift_in_three_bytes`, except this is done for each half
267 /// of the vector instead.
268 ///
269 /// # Safety
270 ///
271 /// Callers must ensure that this is okay to call in the current target for
272 /// the current CPU.
273 unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self;
274
275 /// Swap the 128-bit lanes in this vector.
276 ///
277 /// # Safety
278 ///
279 /// Callers must ensure that this is okay to call in the current target for
280 /// the current CPU.
281 unsafe fn swap_halves(self) -> Self;
282
283 /// Unpack and interleave the 8-bit lanes from the low 128 bits of each
284 /// vector and return the result.
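    ///
    /// For example, with the byte values used in the tests below:
    ///
    /// ```text
    /// self    = [1, 2, ..., 32]
    /// vector2 = [33, 34, ..., 64]
    /// result  = [1, 33, 2, 34, ..., 8, 40, 17, 49, 18, 50, ..., 24, 56]
    /// ```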
285 ///
286 /// # Safety
287 ///
288 /// Callers must ensure that this is okay to call in the current target for
289 /// the current CPU.
290 unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self;
291
292 /// Unpack and interleave the 8-bit lanes from the high 128 bits of each
293 /// vector and return the result.
294 ///
295 /// # Safety
296 ///
297 /// Callers must ensure that this is okay to call in the current target for
298 /// the current CPU.
299 unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self;
300
301 /// Call the provided function for each 64-bit lane in the lower half
302 /// of this vector and then in the other vector. The given function is
    /// provided the lane index and lane value as a `u64`. (The high 128 bits
304 /// of each vector are ignored.)
305 ///
306 /// If `f` returns `Some`, then iteration over the lanes is stopped and the
307 /// value is returned. Otherwise, this returns `None`.
308 ///
309 /// # Safety
310 ///
311 /// Callers must ensure that this is okay to call in the current target for
312 /// the current CPU.
313 unsafe fn for_each_low_64bit_lane<T>(
314 self,
315 vector2: Self,
316 f: impl FnMut(usize, u64) -> Option<T>,
317 ) -> Option<T>;
318}
319
320#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
321mod x86_64_ssse3 {
322 use core::arch::x86_64::*;
323
324 use crate::util::int::{I32, I8};
325
326 use super::Vector;
327
328 impl Vector for __m128i {
329 const BITS: usize = 128;
330 const BYTES: usize = 16;
331
332 #[inline(always)]
333 unsafe fn splat(byte: u8) -> __m128i {
334 _mm_set1_epi8(i8::from_bits(byte))
335 }
336
337 #[inline(always)]
338 unsafe fn load_unaligned(data: *const u8) -> __m128i {
339 _mm_loadu_si128(data.cast::<__m128i>())
340 }
341
342 #[inline(always)]
343 unsafe fn is_zero(self) -> bool {
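            // Compare every lane to zero and require that the resulting
            // 16-bit mask is all ones, i.e., every byte lane is zero.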
344 let cmp = self.cmpeq(Self::splat(0));
345 _mm_movemask_epi8(cmp).to_bits() == 0xFFFF
346 }
347
348 #[inline(always)]
349 unsafe fn cmpeq(self, vector2: Self) -> __m128i {
350 _mm_cmpeq_epi8(self, vector2)
351 }
352
353 #[inline(always)]
354 unsafe fn and(self, vector2: Self) -> __m128i {
355 _mm_and_si128(self, vector2)
356 }
357
358 #[inline(always)]
359 unsafe fn or(self, vector2: Self) -> __m128i {
360 _mm_or_si128(self, vector2)
361 }
362
363 #[inline(always)]
364 unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
365 // Apparently there is no _mm_srli_epi8, so we emulate it by
366 // shifting 16-bit integers and masking out the high nybble of each
367 // 8-bit lane (since that nybble will contain bits from the low
368 // nybble of the previous lane).
369 let lomask = Self::splat(0xF);
370 _mm_srli_epi16(self, BITS).and(lomask)
371 }
372
373 #[inline(always)]
374 unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
375 _mm_alignr_epi8(self, vector2, 15)
376 }
377
378 #[inline(always)]
379 unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
380 _mm_alignr_epi8(self, vector2, 14)
381 }
382
383 #[inline(always)]
384 unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
385 _mm_alignr_epi8(self, vector2, 13)
386 }
387
388 #[inline(always)]
389 unsafe fn shuffle_bytes(self, indices: Self) -> Self {
390 _mm_shuffle_epi8(self, indices)
391 }
392
393 #[inline(always)]
394 unsafe fn for_each_64bit_lane<T>(
395 self,
396 mut f: impl FnMut(usize, u64) -> Option<T>,
397 ) -> Option<T> {
398 // We could just use _mm_extract_epi64 here, but that requires
399 // SSE 4.1. It isn't necessarily a problem to just require SSE 4.1,
400 // but everything else works with SSSE3 so we stick to that subset.
401 let lanes: [u64; 2] = core::mem::transmute(self);
402 if let Some(t) = f(0, lanes[0]) {
403 return Some(t);
404 }
405 if let Some(t) = f(1, lanes[1]) {
406 return Some(t);
407 }
408 None
409 }
410 }
411}
412
413#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
414mod x86_64_avx2 {
415 use core::arch::x86_64::*;
416
417 use crate::util::int::{I32, I64, I8};
418
419 use super::{FatVector, Vector};
420
421 impl Vector for __m256i {
422 const BITS: usize = 256;
423 const BYTES: usize = 32;
424
425 #[inline(always)]
426 unsafe fn splat(byte: u8) -> __m256i {
427 _mm256_set1_epi8(i8::from_bits(byte))
428 }
429
430 #[inline(always)]
431 unsafe fn load_unaligned(data: *const u8) -> __m256i {
432 _mm256_loadu_si256(data.cast::<__m256i>())
433 }
434
435 #[inline(always)]
436 unsafe fn is_zero(self) -> bool {
437 let cmp = self.cmpeq(Self::splat(0));
438 _mm256_movemask_epi8(cmp).to_bits() == 0xFFFFFFFF
439 }
440
441 #[inline(always)]
442 unsafe fn cmpeq(self, vector2: Self) -> __m256i {
443 _mm256_cmpeq_epi8(self, vector2)
444 }
445
446 #[inline(always)]
447 unsafe fn and(self, vector2: Self) -> __m256i {
448 _mm256_and_si256(self, vector2)
449 }
450
451 #[inline(always)]
452 unsafe fn or(self, vector2: Self) -> __m256i {
453 _mm256_or_si256(self, vector2)
454 }
455
456 #[inline(always)]
457 unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
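            // Same trick as the SSSE3 implementation above: there is no
            // _mm256_srli_epi8, so shift 16-bit lanes and mask out the high
            // nybble of each 8-bit lane.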
458 let lomask = Self::splat(0xF);
459 _mm256_srli_epi16(self, BITS).and(lomask)
460 }
461
462 #[inline(always)]
463 unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
464 // Credit goes to jneem for figuring this out:
465 // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
466 //
467 // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
468 // PALIGNR instructions, which is not what we want, so we need to
469 // do some extra shuffling.
470 let v = _mm256_permute2x128_si256(vector2, self, 0x21);
471 _mm256_alignr_epi8(self, v, 15)
472 }
473
474 #[inline(always)]
475 unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
476 // Credit goes to jneem for figuring this out:
477 // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
478 //
479 // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
480 // PALIGNR instructions, which is not what we want, so we need to
481 // do some extra shuffling.
482 let v = _mm256_permute2x128_si256(vector2, self, 0x21);
483 _mm256_alignr_epi8(self, v, 14)
484 }
485
486 #[inline(always)]
487 unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
488 // Credit goes to jneem for figuring this out:
489 // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
490 //
491 // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
492 // PALIGNR instructions, which is not what we want, so we need to
493 // do some extra shuffling.
494 let v = _mm256_permute2x128_si256(vector2, self, 0x21);
495 _mm256_alignr_epi8(self, v, 13)
496 }
497
498 #[inline(always)]
499 unsafe fn shuffle_bytes(self, indices: Self) -> Self {
500 _mm256_shuffle_epi8(self, indices)
501 }
502
503 #[inline(always)]
504 unsafe fn for_each_64bit_lane<T>(
505 self,
506 mut f: impl FnMut(usize, u64) -> Option<T>,
507 ) -> Option<T> {
            // NOTE: At one point in the past, I used a transmute here to
            // get a [u64; 4], but it turned out to lead to worse codegen IIRC.
510 // I've tried it more recently, and it looks like that's no longer
511 // the case. But since there's no difference, we stick with the
512 // slightly more complicated but transmute-free version.
513 let lane = _mm256_extract_epi64(self, 0).to_bits();
514 if let Some(t) = f(0, lane) {
515 return Some(t);
516 }
517 let lane = _mm256_extract_epi64(self, 1).to_bits();
518 if let Some(t) = f(1, lane) {
519 return Some(t);
520 }
521 let lane = _mm256_extract_epi64(self, 2).to_bits();
522 if let Some(t) = f(2, lane) {
523 return Some(t);
524 }
525 let lane = _mm256_extract_epi64(self, 3).to_bits();
526 if let Some(t) = f(3, lane) {
527 return Some(t);
528 }
529 None
530 }
531 }
532
533 impl FatVector for __m256i {
534 type Half = __m128i;
535
536 #[inline(always)]
537 unsafe fn load_half_unaligned(data: *const u8) -> Self {
538 let half = Self::Half::load_unaligned(data);
539 _mm256_broadcastsi128_si256(half)
540 }
541
542 #[inline(always)]
543 unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self {
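            // _mm256_alignr_epi8 operates on each 128-bit half independently
            // (see the comments on `shift_in_one_byte` above), which is
            // exactly the per-half shift wanted here.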
544 _mm256_alignr_epi8(self, vector2, 15)
545 }
546
547 #[inline(always)]
548 unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self {
549 _mm256_alignr_epi8(self, vector2, 14)
550 }
551
552 #[inline(always)]
553 unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self {
554 _mm256_alignr_epi8(self, vector2, 13)
555 }
556
557 #[inline(always)]
558 unsafe fn swap_halves(self) -> Self {
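            // 0x4E is 0b01_00_11_10: select the 64-bit lanes in the order
            // [2, 3, 0, 1], which swaps the low and high 128-bit halves.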
559 _mm256_permute4x64_epi64(self, 0x4E)
560 }
561
562 #[inline(always)]
563 unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self {
564 _mm256_unpacklo_epi8(self, vector2)
565 }
566
567 #[inline(always)]
568 unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self {
569 _mm256_unpackhi_epi8(self, vector2)
570 }
571
572 #[inline(always)]
573 unsafe fn for_each_low_64bit_lane<T>(
574 self,
575 vector2: Self,
576 mut f: impl FnMut(usize, u64) -> Option<T>,
577 ) -> Option<T> {
578 let lane = _mm256_extract_epi64(self, 0).to_bits();
579 if let Some(t) = f(0, lane) {
580 return Some(t);
581 }
582 let lane = _mm256_extract_epi64(self, 1).to_bits();
583 if let Some(t) = f(1, lane) {
584 return Some(t);
585 }
586 let lane = _mm256_extract_epi64(vector2, 0).to_bits();
587 if let Some(t) = f(2, lane) {
588 return Some(t);
589 }
590 let lane = _mm256_extract_epi64(vector2, 1).to_bits();
591 if let Some(t) = f(3, lane) {
592 return Some(t);
593 }
594 None
595 }
596 }
597}
598
599#[cfg(all(
600 target_arch = "aarch64",
601 target_feature = "neon",
602 target_endian = "little"
603))]
604mod aarch64_neon {
605 use core::arch::aarch64::*;
606
607 use super::Vector;
608
609 impl Vector for uint8x16_t {
610 const BITS: usize = 128;
611 const BYTES: usize = 16;
612
613 #[inline(always)]
614 unsafe fn splat(byte: u8) -> uint8x16_t {
615 vdupq_n_u8(byte)
616 }
617
618 #[inline(always)]
619 unsafe fn load_unaligned(data: *const u8) -> uint8x16_t {
620 vld1q_u8(data)
621 }
622
623 #[inline(always)]
624 unsafe fn is_zero(self) -> bool {
625 // Could also use vmaxvq_u8.
626 // ... I tried that and couldn't observe any meaningful difference
627 // in benchmarks.
628 let maxes = vreinterpretq_u64_u8(vpmaxq_u8(self, self));
629 vgetq_lane_u64(maxes, 0) == 0
630 }
631
632 #[inline(always)]
633 unsafe fn cmpeq(self, vector2: Self) -> uint8x16_t {
634 vceqq_u8(self, vector2)
635 }
636
637 #[inline(always)]
638 unsafe fn and(self, vector2: Self) -> uint8x16_t {
639 vandq_u8(self, vector2)
640 }
641
642 #[inline(always)]
643 unsafe fn or(self, vector2: Self) -> uint8x16_t {
644 vorrq_u8(self, vector2)
645 }
646
647 #[inline(always)]
648 unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
649 debug_assert!(BITS <= 7);
650 vshrq_n_u8(self, BITS)
651 }
652
653 #[inline(always)]
654 unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
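            // vextq_u8 extracts 16 bytes starting at index 15 of the
            // concatenation `vector2 ++ self`: the last byte of `vector2`
            // followed by the first 15 bytes of `self`.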
655 vextq_u8(vector2, self, 15)
656 }
657
658 #[inline(always)]
659 unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
660 vextq_u8(vector2, self, 14)
661 }
662
663 #[inline(always)]
664 unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
665 vextq_u8(vector2, self, 13)
666 }
667
668 #[inline(always)]
669 unsafe fn shuffle_bytes(self, indices: Self) -> Self {
670 vqtbl1q_u8(self, indices)
671 }
672
673 #[inline(always)]
674 unsafe fn for_each_64bit_lane<T>(
675 self,
676 mut f: impl FnMut(usize, u64) -> Option<T>,
677 ) -> Option<T> {
678 let this = vreinterpretq_u64_u8(self);
679 let lane = vgetq_lane_u64(this, 0);
680 if let Some(t) = f(0, lane) {
681 return Some(t);
682 }
683 let lane = vgetq_lane_u64(this, 1);
684 if let Some(t) = f(1, lane) {
685 return Some(t);
686 }
687 None
688 }
689 }
690}
691
692#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
693mod tests_x86_64_ssse3 {
694 use core::arch::x86_64::*;
695
696 use crate::util::int::{I32, U32};
697
698 use super::*;
699
700 fn is_runnable() -> bool {
701 std::is_x86_feature_detected!("ssse3")
702 }
703
704 #[target_feature(enable = "ssse3")]
705 unsafe fn load(lanes: [u8; 16]) -> __m128i {
706 __m128i::load_unaligned(&lanes as *const u8)
707 }
708
709 #[target_feature(enable = "ssse3")]
710 unsafe fn unload(v: __m128i) -> [u8; 16] {
711 [
712 _mm_extract_epi8(v, 0).to_bits().low_u8(),
713 _mm_extract_epi8(v, 1).to_bits().low_u8(),
714 _mm_extract_epi8(v, 2).to_bits().low_u8(),
715 _mm_extract_epi8(v, 3).to_bits().low_u8(),
716 _mm_extract_epi8(v, 4).to_bits().low_u8(),
717 _mm_extract_epi8(v, 5).to_bits().low_u8(),
718 _mm_extract_epi8(v, 6).to_bits().low_u8(),
719 _mm_extract_epi8(v, 7).to_bits().low_u8(),
720 _mm_extract_epi8(v, 8).to_bits().low_u8(),
721 _mm_extract_epi8(v, 9).to_bits().low_u8(),
722 _mm_extract_epi8(v, 10).to_bits().low_u8(),
723 _mm_extract_epi8(v, 11).to_bits().low_u8(),
724 _mm_extract_epi8(v, 12).to_bits().low_u8(),
725 _mm_extract_epi8(v, 13).to_bits().low_u8(),
726 _mm_extract_epi8(v, 14).to_bits().low_u8(),
727 _mm_extract_epi8(v, 15).to_bits().low_u8(),
728 ]
729 }
730
731 #[test]
732 fn vector_splat() {
733 #[target_feature(enable = "ssse3")]
734 unsafe fn test() {
735 let v = __m128i::splat(0xAF);
736 assert_eq!(
737 unload(v),
738 [
739 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
740 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF
741 ]
742 );
743 }
744 if !is_runnable() {
745 return;
746 }
747 unsafe { test() }
748 }
749
750 #[test]
751 fn vector_is_zero() {
752 #[target_feature(enable = "ssse3")]
753 unsafe fn test() {
754 let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
755 assert!(!v.is_zero());
756 let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
757 assert!(v.is_zero());
758 }
759 if !is_runnable() {
760 return;
761 }
762 unsafe { test() }
763 }
764
765 #[test]
766 fn vector_cmpeq() {
767 #[target_feature(enable = "ssse3")]
768 unsafe fn test() {
769 let v1 =
770 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]);
771 let v2 =
772 load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]);
773 assert_eq!(
774 unload(v1.cmpeq(v2)),
775 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF]
776 );
777 }
778 if !is_runnable() {
779 return;
780 }
781 unsafe { test() }
782 }
783
784 #[test]
785 fn vector_and() {
786 #[target_feature(enable = "ssse3")]
787 unsafe fn test() {
788 let v1 =
789 load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
790 let v2 =
791 load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
792 assert_eq!(
793 unload(v1.and(v2)),
794 [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
795 );
796 }
797 if !is_runnable() {
798 return;
799 }
800 unsafe { test() }
801 }
802
803 #[test]
804 fn vector_or() {
805 #[target_feature(enable = "ssse3")]
806 unsafe fn test() {
807 let v1 =
808 load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
809 let v2 =
810 load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
811 assert_eq!(
812 unload(v1.or(v2)),
813 [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
814 );
815 }
816 if !is_runnable() {
817 return;
818 }
819 unsafe { test() }
820 }
821
822 #[test]
823 fn vector_shift_8bit_lane_right() {
824 #[target_feature(enable = "ssse3")]
825 unsafe fn test() {
826 let v = load([
827 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
828 ]);
829 assert_eq!(
830 unload(v.shift_8bit_lane_right::<2>()),
831 [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
832 );
833 }
834 if !is_runnable() {
835 return;
836 }
837 unsafe { test() }
838 }
839
840 #[test]
841 fn vector_shift_in_one_byte() {
842 #[target_feature(enable = "ssse3")]
843 unsafe fn test() {
844 let v1 =
845 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
846 let v2 = load([
847 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
848 ]);
849 assert_eq!(
850 unload(v1.shift_in_one_byte(v2)),
851 [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
852 );
853 }
854 if !is_runnable() {
855 return;
856 }
857 unsafe { test() }
858 }
859
860 #[test]
861 fn vector_shift_in_two_bytes() {
862 #[target_feature(enable = "ssse3")]
863 unsafe fn test() {
864 let v1 =
865 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
866 let v2 = load([
867 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
868 ]);
869 assert_eq!(
870 unload(v1.shift_in_two_bytes(v2)),
871 [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
872 );
873 }
874 if !is_runnable() {
875 return;
876 }
877 unsafe { test() }
878 }
879
880 #[test]
881 fn vector_shift_in_three_bytes() {
882 #[target_feature(enable = "ssse3")]
883 unsafe fn test() {
884 let v1 =
885 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
886 let v2 = load([
887 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
888 ]);
889 assert_eq!(
890 unload(v1.shift_in_three_bytes(v2)),
891 [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
892 );
893 }
894 if !is_runnable() {
895 return;
896 }
897 unsafe { test() }
898 }
899
900 #[test]
901 fn vector_shuffle_bytes() {
902 #[target_feature(enable = "ssse3")]
903 unsafe fn test() {
904 let v1 =
905 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
906 let v2 =
907 load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]);
908 assert_eq!(
909 unload(v1.shuffle_bytes(v2)),
910 [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13],
911 );
912 }
913 if !is_runnable() {
914 return;
915 }
916 unsafe { test() }
917 }
918
919 #[test]
920 fn vector_for_each_64bit_lane() {
921 #[target_feature(enable = "ssse3")]
922 unsafe fn test() {
923 let v = load([
924 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
925 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
926 ]);
927 let mut lanes = [0u64; 2];
928 v.for_each_64bit_lane(|i, lane| {
929 lanes[i] = lane;
930 None::<()>
931 });
932 assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],);
933 }
934 if !is_runnable() {
935 return;
936 }
937 unsafe { test() }
938 }
939}
940
941#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
942mod tests_x86_64_avx2 {
943 use core::arch::x86_64::*;
944
945 use crate::util::int::{I32, U32};
946
947 use super::*;
948
949 fn is_runnable() -> bool {
950 std::is_x86_feature_detected!("avx2")
951 }
952
953 #[target_feature(enable = "avx2")]
954 unsafe fn load(lanes: [u8; 32]) -> __m256i {
955 __m256i::load_unaligned(&lanes as *const u8)
956 }
957
958 #[target_feature(enable = "avx2")]
959 unsafe fn load_half(lanes: [u8; 16]) -> __m256i {
960 __m256i::load_half_unaligned(&lanes as *const u8)
961 }
962
963 #[target_feature(enable = "avx2")]
964 unsafe fn unload(v: __m256i) -> [u8; 32] {
965 [
966 _mm256_extract_epi8(v, 0).to_bits().low_u8(),
967 _mm256_extract_epi8(v, 1).to_bits().low_u8(),
968 _mm256_extract_epi8(v, 2).to_bits().low_u8(),
969 _mm256_extract_epi8(v, 3).to_bits().low_u8(),
970 _mm256_extract_epi8(v, 4).to_bits().low_u8(),
971 _mm256_extract_epi8(v, 5).to_bits().low_u8(),
972 _mm256_extract_epi8(v, 6).to_bits().low_u8(),
973 _mm256_extract_epi8(v, 7).to_bits().low_u8(),
974 _mm256_extract_epi8(v, 8).to_bits().low_u8(),
975 _mm256_extract_epi8(v, 9).to_bits().low_u8(),
976 _mm256_extract_epi8(v, 10).to_bits().low_u8(),
977 _mm256_extract_epi8(v, 11).to_bits().low_u8(),
978 _mm256_extract_epi8(v, 12).to_bits().low_u8(),
979 _mm256_extract_epi8(v, 13).to_bits().low_u8(),
980 _mm256_extract_epi8(v, 14).to_bits().low_u8(),
981 _mm256_extract_epi8(v, 15).to_bits().low_u8(),
982 _mm256_extract_epi8(v, 16).to_bits().low_u8(),
983 _mm256_extract_epi8(v, 17).to_bits().low_u8(),
984 _mm256_extract_epi8(v, 18).to_bits().low_u8(),
985 _mm256_extract_epi8(v, 19).to_bits().low_u8(),
986 _mm256_extract_epi8(v, 20).to_bits().low_u8(),
987 _mm256_extract_epi8(v, 21).to_bits().low_u8(),
988 _mm256_extract_epi8(v, 22).to_bits().low_u8(),
989 _mm256_extract_epi8(v, 23).to_bits().low_u8(),
990 _mm256_extract_epi8(v, 24).to_bits().low_u8(),
991 _mm256_extract_epi8(v, 25).to_bits().low_u8(),
992 _mm256_extract_epi8(v, 26).to_bits().low_u8(),
993 _mm256_extract_epi8(v, 27).to_bits().low_u8(),
994 _mm256_extract_epi8(v, 28).to_bits().low_u8(),
995 _mm256_extract_epi8(v, 29).to_bits().low_u8(),
996 _mm256_extract_epi8(v, 30).to_bits().low_u8(),
997 _mm256_extract_epi8(v, 31).to_bits().low_u8(),
998 ]
999 }
1000
1001 #[test]
1002 fn vector_splat() {
1003 #[target_feature(enable = "avx2")]
1004 unsafe fn test() {
1005 let v = __m256i::splat(0xAF);
1006 assert_eq!(
1007 unload(v),
1008 [
1009 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1010 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1011 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1012 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1013 ]
1014 );
1015 }
1016 if !is_runnable() {
1017 return;
1018 }
1019 unsafe { test() }
1020 }
1021
1022 #[test]
1023 fn vector_is_zero() {
1024 #[target_feature(enable = "avx2")]
1025 unsafe fn test() {
1026 let v = load([
1027 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1028 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1029 ]);
1030 assert!(!v.is_zero());
1031 let v = load([
1032 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1033 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1034 ]);
1035 assert!(v.is_zero());
1036 }
1037 if !is_runnable() {
1038 return;
1039 }
1040 unsafe { test() }
1041 }
1042
1043 #[test]
1044 fn vector_cmpeq() {
1045 #[target_feature(enable = "avx2")]
1046 unsafe fn test() {
1047 let v1 = load([
1048 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1049 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1,
1050 ]);
1051 let v2 = load([
1052 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
1053 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
1054 ]);
1055 assert_eq!(
1056 unload(v1.cmpeq(v2)),
1057 [
1058 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1059 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF
1060 ]
1061 );
1062 }
1063 if !is_runnable() {
1064 return;
1065 }
1066 unsafe { test() }
1067 }
1068
1069 #[test]
1070 fn vector_and() {
1071 #[target_feature(enable = "avx2")]
1072 unsafe fn test() {
1073 let v1 = load([
1074 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1075 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1076 ]);
1077 let v2 = load([
1078 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1079 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1080 ]);
1081 assert_eq!(
1082 unload(v1.and(v2)),
1083 [
1084 0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1085 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1086 ]
1087 );
1088 }
1089 if !is_runnable() {
1090 return;
1091 }
1092 unsafe { test() }
1093 }
1094
1095 #[test]
1096 fn vector_or() {
1097 #[target_feature(enable = "avx2")]
1098 unsafe fn test() {
1099 let v1 = load([
1100 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1102 ]);
1103 let v2 = load([
1104 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1106 ]);
1107 assert_eq!(
1108 unload(v1.or(v2)),
1109 [
1110 0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1111 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1112 ]
1113 );
1114 }
1115 if !is_runnable() {
1116 return;
1117 }
1118 unsafe { test() }
1119 }
1120
1121 #[test]
1122 fn vector_shift_8bit_lane_right() {
1123 #[target_feature(enable = "avx2")]
1124 unsafe fn test() {
1125 let v = load([
1126 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1127 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1128 ]);
1129 assert_eq!(
1130 unload(v.shift_8bit_lane_right::<2>()),
1131 [
1132 0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1133 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1134 ]
1135 );
1136 }
1137 if !is_runnable() {
1138 return;
1139 }
1140 unsafe { test() }
1141 }
1142
1143 #[test]
1144 fn vector_shift_in_one_byte() {
1145 #[target_feature(enable = "avx2")]
1146 unsafe fn test() {
1147 let v1 = load([
1148 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1149 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1150 ]);
1151 let v2 = load([
1152 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1153 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
1154 63, 64,
1155 ]);
1156 assert_eq!(
1157 unload(v1.shift_in_one_byte(v2)),
1158 [
1159 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1160 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
1161 31,
1162 ],
1163 );
1164 }
1165 if !is_runnable() {
1166 return;
1167 }
1168 unsafe { test() }
1169 }
1170
1171 #[test]
1172 fn vector_shift_in_two_bytes() {
1173 #[target_feature(enable = "avx2")]
1174 unsafe fn test() {
1175 let v1 = load([
1176 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1177 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1178 ]);
1179 let v2 = load([
1180 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1181 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
1182 63, 64,
1183 ]);
1184 assert_eq!(
1185 unload(v1.shift_in_two_bytes(v2)),
1186 [
1187 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1188 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
1189 30,
1190 ],
1191 );
1192 }
1193 if !is_runnable() {
1194 return;
1195 }
1196 unsafe { test() }
1197 }
1198
1199 #[test]
1200 fn vector_shift_in_three_bytes() {
1201 #[target_feature(enable = "avx2")]
1202 unsafe fn test() {
1203 let v1 = load([
1204 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1205 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1206 ]);
1207 let v2 = load([
1208 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1209 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
1210 63, 64,
1211 ]);
1212 assert_eq!(
1213 unload(v1.shift_in_three_bytes(v2)),
1214 [
1215 62, 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1216 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
1217 29,
1218 ],
1219 );
1220 }
1221 if !is_runnable() {
1222 return;
1223 }
1224 unsafe { test() }
1225 }
1226
1227 #[test]
1228 fn vector_shuffle_bytes() {
1229 #[target_feature(enable = "avx2")]
1230 unsafe fn test() {
1231 let v1 = load([
1232 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1233 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1234 ]);
1235 let v2 = load([
1236 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16,
1237 16, 16, 20, 20, 20, 20, 24, 24, 24, 24, 28, 28, 28, 28,
1238 ]);
1239 assert_eq!(
1240 unload(v1.shuffle_bytes(v2)),
1241 [
1242 1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13, 17,
1243 17, 17, 17, 21, 21, 21, 21, 25, 25, 25, 25, 29, 29, 29,
1244 29
1245 ],
1246 );
1247 }
1248 if !is_runnable() {
1249 return;
1250 }
1251 unsafe { test() }
1252 }
1253
1254 #[test]
1255 fn vector_for_each_64bit_lane() {
1256 #[target_feature(enable = "avx2")]
1257 unsafe fn test() {
1258 let v = load([
1259 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
1260 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
1261 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E,
1262 0x1F, 0x20,
1263 ]);
1264 let mut lanes = [0u64; 4];
1265 v.for_each_64bit_lane(|i, lane| {
1266 lanes[i] = lane;
1267 None::<()>
1268 });
1269 assert_eq!(
1270 lanes,
1271 [
1272 0x0807060504030201,
1273 0x100F0E0D0C0B0A09,
1274 0x1817161514131211,
1275 0x201F1E1D1C1B1A19
1276 ]
1277 );
1278 }
1279 if !is_runnable() {
1280 return;
1281 }
1282 unsafe { test() }
1283 }
1284
1285 #[test]
1286 fn fat_vector_half_shift_in_one_byte() {
1287 #[target_feature(enable = "avx2")]
1288 unsafe fn test() {
1289 let v1 = load_half([
1290 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1291 ]);
1292 let v2 = load_half([
1293 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1294 ]);
1295 assert_eq!(
1296 unload(v1.half_shift_in_one_byte(v2)),
1297 [
1298 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32,
1299 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
1300 ],
1301 );
1302 }
1303 if !is_runnable() {
1304 return;
1305 }
1306 unsafe { test() }
1307 }
1308
1309 #[test]
1310 fn fat_vector_half_shift_in_two_bytes() {
1311 #[target_feature(enable = "avx2")]
1312 unsafe fn test() {
1313 let v1 = load_half([
1314 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1315 ]);
1316 let v2 = load_half([
1317 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1318 ]);
1319 assert_eq!(
1320 unload(v1.half_shift_in_two_bytes(v2)),
1321 [
1322 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31,
1323 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1324 ],
1325 );
1326 }
1327 if !is_runnable() {
1328 return;
1329 }
1330 unsafe { test() }
1331 }
1332
1333 #[test]
1334 fn fat_vector_half_shift_in_three_bytes() {
1335 #[target_feature(enable = "avx2")]
1336 unsafe fn test() {
1337 let v1 = load_half([
1338 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1339 ]);
1340 let v2 = load_half([
1341 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1342 ]);
1343 assert_eq!(
1344 unload(v1.half_shift_in_three_bytes(v2)),
1345 [
1346 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 30,
1347 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
1348 ],
1349 );
1350 }
1351 if !is_runnable() {
1352 return;
1353 }
1354 unsafe { test() }
1355 }
1356
1357 #[test]
1358 fn fat_vector_swap_halves() {
1359 #[target_feature(enable = "avx2")]
1360 unsafe fn test() {
1361 let v = load([
1362 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1363 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1364 ]);
1365 assert_eq!(
1366 unload(v.swap_halves()),
1367 [
1368 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
1369 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1370 16,
1371 ],
1372 );
1373 }
1374 if !is_runnable() {
1375 return;
1376 }
1377 unsafe { test() }
1378 }
1379
1380 #[test]
1381 fn fat_vector_interleave_low_8bit_lanes() {
1382 #[target_feature(enable = "avx2")]
1383 unsafe fn test() {
1384 let v1 = load([
1385 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1386 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1387 ]);
1388 let v2 = load([
1389 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1390 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
1391 63, 64,
1392 ]);
1393 assert_eq!(
1394 unload(v1.interleave_low_8bit_lanes(v2)),
1395 [
1396 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40,
1397 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
1398 24, 56,
1399 ],
1400 );
1401 }
1402 if !is_runnable() {
1403 return;
1404 }
1405 unsafe { test() }
1406 }
1407
1408 #[test]
1409 fn fat_vector_interleave_high_8bit_lanes() {
1410 #[target_feature(enable = "avx2")]
1411 unsafe fn test() {
1412 let v1 = load([
1413 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1414 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1415 ]);
1416 let v2 = load([
1417 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1418 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
1419 63, 64,
1420 ]);
1421 assert_eq!(
1422 unload(v1.interleave_high_8bit_lanes(v2)),
1423 [
1424 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16,
1425 48, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31,
1426 63, 32, 64,
1427 ],
1428 );
1429 }
1430 if !is_runnable() {
1431 return;
1432 }
1433 unsafe { test() }
1434 }
1435
1436 #[test]
1437 fn fat_vector_for_each_low_64bit_lane() {
1438 #[target_feature(enable = "avx2")]
1439 unsafe fn test() {
1440 let v1 = load([
1441 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
1442 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
1443 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E,
1444 0x1F, 0x20,
1445 ]);
1446 let v2 = load([
1447 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A,
1448 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34,
1449 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E,
1450 0x3F, 0x40,
1451 ]);
1452 let mut lanes = [0u64; 4];
1453 v1.for_each_low_64bit_lane(v2, |i, lane| {
1454 lanes[i] = lane;
1455 None::<()>
1456 });
1457 assert_eq!(
1458 lanes,
1459 [
1460 0x0807060504030201,
1461 0x100F0E0D0C0B0A09,
1462 0x2827262524232221,
1463 0x302F2E2D2C2B2A29
1464 ]
1465 );
1466 }
1467 if !is_runnable() {
1468 return;
1469 }
1470 unsafe { test() }
1471 }
1472}
1473
1474#[cfg(all(test, target_arch = "aarch64", target_feature = "neon"))]
1475mod tests_aarch64_neon {
1476 use core::arch::aarch64::*;
1477
1478 use super::*;
1479
1480 #[target_feature(enable = "neon")]
1481 unsafe fn load(lanes: [u8; 16]) -> uint8x16_t {
1482 uint8x16_t::load_unaligned(&lanes as *const u8)
1483 }
1484
1485 #[target_feature(enable = "neon")]
1486 unsafe fn unload(v: uint8x16_t) -> [u8; 16] {
1487 [
1488 vgetq_lane_u8(v, 0),
1489 vgetq_lane_u8(v, 1),
1490 vgetq_lane_u8(v, 2),
1491 vgetq_lane_u8(v, 3),
1492 vgetq_lane_u8(v, 4),
1493 vgetq_lane_u8(v, 5),
1494 vgetq_lane_u8(v, 6),
1495 vgetq_lane_u8(v, 7),
1496 vgetq_lane_u8(v, 8),
1497 vgetq_lane_u8(v, 9),
1498 vgetq_lane_u8(v, 10),
1499 vgetq_lane_u8(v, 11),
1500 vgetq_lane_u8(v, 12),
1501 vgetq_lane_u8(v, 13),
1502 vgetq_lane_u8(v, 14),
1503 vgetq_lane_u8(v, 15),
1504 ]
1505 }
1506
1507 // Example functions. These don't test the Vector traits, but rather,
1508 // specific NEON instructions. They are basically little experiments I
1509 // wrote to figure out what an instruction does since their descriptions
1510 // are so dense. I decided to keep the experiments around as example tests
    // in case they're useful.
1512
1513 #[test]
1514 fn example_vmaxvq_u8_non_zero() {
1515 #[target_feature(enable = "neon")]
1516 unsafe fn example() {
1517 let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1518 assert_eq!(vmaxvq_u8(v), 1);
1519 }
1520 unsafe { example() }
1521 }
1522
1523 #[test]
1524 fn example_vmaxvq_u8_zero() {
1525 #[target_feature(enable = "neon")]
1526 unsafe fn example() {
1527 let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1528 assert_eq!(vmaxvq_u8(v), 0);
1529 }
1530 unsafe { example() }
1531 }
1532
1533 #[test]
1534 fn example_vpmaxq_u8_non_zero() {
1535 #[target_feature(enable = "neon")]
1536 unsafe fn example() {
1537 let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1538 let r = vpmaxq_u8(v, v);
1539 assert_eq!(
1540 unload(r),
1541 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
1542 );
1543 }
1544 unsafe { example() }
1545 }
1546
1547 #[test]
1548 fn example_vpmaxq_u8_self() {
1549 #[target_feature(enable = "neon")]
1550 unsafe fn example() {
1551 let v =
1552 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1553 let r = vpmaxq_u8(v, v);
1554 assert_eq!(
1555 unload(r),
1556 [2, 4, 6, 8, 10, 12, 14, 16, 2, 4, 6, 8, 10, 12, 14, 16]
1557 );
1558 }
1559 unsafe { example() }
1560 }
1561
1562 #[test]
1563 fn example_vpmaxq_u8_other() {
1564 #[target_feature(enable = "neon")]
1565 unsafe fn example() {
1566 let v1 =
1567 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1568 let v2 = load([
1569 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1570 ]);
1571 let r = vpmaxq_u8(v1, v2);
1572 assert_eq!(
1573 unload(r),
1574 [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32]
1575 );
1576 }
1577 unsafe { example() }
1578 }
1579
1580 // Now we test the actual methods on the Vector trait.
1581
1582 #[test]
1583 fn vector_splat() {
1584 #[target_feature(enable = "neon")]
1585 unsafe fn test() {
1586 let v = uint8x16_t::splat(0xAF);
1587 assert_eq!(
1588 unload(v),
1589 [
1590 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1591 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF
1592 ]
1593 );
1594 }
1595 unsafe { test() }
1596 }
1597
1598 #[test]
1599 fn vector_is_zero() {
1600 #[target_feature(enable = "neon")]
1601 unsafe fn test() {
1602 let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1603 assert!(!v.is_zero());
1604 let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1605 assert!(v.is_zero());
1606 }
1607 unsafe { test() }
1608 }
1609
1610 #[test]
1611 fn vector_cmpeq() {
1612 #[target_feature(enable = "neon")]
1613 unsafe fn test() {
1614 let v1 =
1615 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]);
1616 let v2 =
1617 load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]);
1618 assert_eq!(
1619 unload(v1.cmpeq(v2)),
1620 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF]
1621 );
1622 }
1623 unsafe { test() }
1624 }
1625
1626 #[test]
1627 fn vector_and() {
1628 #[target_feature(enable = "neon")]
1629 unsafe fn test() {
1630 let v1 =
1631 load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1632 let v2 =
1633 load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1634 assert_eq!(
1635 unload(v1.and(v2)),
1636 [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1637 );
1638 }
1639 unsafe { test() }
1640 }
1641
1642 #[test]
1643 fn vector_or() {
1644 #[target_feature(enable = "neon")]
1645 unsafe fn test() {
1646 let v1 =
1647 load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1648 let v2 =
1649 load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
1650 assert_eq!(
1651 unload(v1.or(v2)),
1652 [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1653 );
1654 }
1655 unsafe { test() }
1656 }
1657
1658 #[test]
1659 fn vector_shift_8bit_lane_right() {
1660 #[target_feature(enable = "neon")]
1661 unsafe fn test() {
1662 let v = load([
1663 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1664 ]);
1665 assert_eq!(
1666 unload(v.shift_8bit_lane_right::<2>()),
1667 [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1668 );
1669 }
1670 unsafe { test() }
1671 }
1672
1673 #[test]
1674 fn vector_shift_in_one_byte() {
1675 #[target_feature(enable = "neon")]
1676 unsafe fn test() {
1677 let v1 =
1678 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1679 let v2 = load([
1680 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1681 ]);
1682 assert_eq!(
1683 unload(v1.shift_in_one_byte(v2)),
1684 [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
1685 );
1686 }
1687 unsafe { test() }
1688 }
1689
1690 #[test]
1691 fn vector_shift_in_two_bytes() {
1692 #[target_feature(enable = "neon")]
1693 unsafe fn test() {
1694 let v1 =
1695 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1696 let v2 = load([
1697 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1698 ]);
1699 assert_eq!(
1700 unload(v1.shift_in_two_bytes(v2)),
1701 [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
1702 );
1703 }
1704 unsafe { test() }
1705 }
1706
1707 #[test]
1708 fn vector_shift_in_three_bytes() {
1709 #[target_feature(enable = "neon")]
1710 unsafe fn test() {
1711 let v1 =
1712 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1713 let v2 = load([
1714 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1715 ]);
1716 assert_eq!(
1717 unload(v1.shift_in_three_bytes(v2)),
1718 [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
1719 );
1720 }
1721 unsafe { test() }
1722 }
1723
1724 #[test]
1725 fn vector_shuffle_bytes() {
1726 #[target_feature(enable = "neon")]
1727 unsafe fn test() {
1728 let v1 =
1729 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
1730 let v2 =
1731 load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]);
1732 assert_eq!(
1733 unload(v1.shuffle_bytes(v2)),
1734 [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13],
1735 );
1736 }
1737 unsafe { test() }
1738 }
1739
1740 #[test]
1741 fn vector_for_each_64bit_lane() {
1742 #[target_feature(enable = "neon")]
1743 unsafe fn test() {
1744 let v = load([
1745 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
1746 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
1747 ]);
1748 let mut lanes = [0u64; 2];
1749 v.for_each_64bit_lane(|i, lane| {
1750 lanes[i] = lane;
1751 None::<()>
1752 });
1753 assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],);
1754 }
1755 unsafe { test() }
1756 }
1757}
1758