use crate::simd::{LaneCount, Simd, SupportedLaneCount};
use core::mem;

impl<const N: usize> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    /// Swizzle a vector of bytes according to the index vector.
    /// Indices within range select the appropriate byte.
    /// Indices "out of bounds" instead select 0.
    ///
    /// Note that the current implementation is selected at build time
    /// of the standard library, so `cargo build -Zbuild-std` may be necessary
    /// to unlock better performance, especially for larger vectors.
    /// A planned compiler improvement will enable using `#[target_feature]` instead.
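    ///
    /// # Examples
    ///
    /// A small illustration of the semantics above; note that the index 4 is
    /// out of bounds for a 4-lane vector and so selects 0:
    ///
    /// ```
    /// # #![feature(portable_simd)]
    /// # use core::simd::Simd;
    /// let src = Simd::from_array([5, 9, 1, 2]);
    /// let idxs = Simd::from_array([4, 0, 2, 0]);
    /// assert_eq!(src.swizzle_dyn(idxs).to_array(), [0, 5, 1, 5]);
    /// ```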
    #[inline]
    pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
        #![allow(unused_imports, unused_unsafe)]
        #[cfg(all(
            any(target_arch = "aarch64", target_arch = "arm64ec"),
            target_endian = "little"
        ))]
        use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
        #[cfg(all(
            target_arch = "arm",
            target_feature = "v7",
            target_feature = "neon",
            target_endian = "little"
        ))]
        use core::arch::arm::{uint8x8_t, vtbl1_u8};
        #[cfg(target_arch = "wasm32")]
        use core::arch::wasm32 as wasm;
        #[cfg(target_arch = "x86")]
        use core::arch::x86;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64 as x86;
        // SAFETY: Intrinsics covered by cfg
        unsafe {
            match N {
                #[cfg(all(
                    any(
                        target_arch = "aarch64",
                        target_arch = "arm64ec",
                        all(target_arch = "arm", target_feature = "v7")
                    ),
                    target_feature = "neon",
                    target_endian = "little"
                ))]
                8 => transize(vtbl1_u8, self, idxs),
                #[cfg(target_feature = "ssse3")]
                16 => transize(x86::_mm_shuffle_epi8, self, zeroing_idxs(idxs)),
                #[cfg(target_feature = "simd128")]
                16 => transize(wasm::i8x16_swizzle, self, idxs),
                #[cfg(all(
                    any(target_arch = "aarch64", target_arch = "arm64ec"),
                    target_feature = "neon",
                    target_endian = "little"
                ))]
                16 => transize(vqtbl1q_u8, self, idxs),
                #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                32 => transize(avx2_pshufb, self, idxs),
                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
                32 => transize(x86::_mm256_permutexvar_epi8, zeroing_idxs(idxs), self),
                // Notable absence: avx512bw shuffle
                // If avx512bw is available, odds of avx512vbmi are good
                // FIXME: initial AVX512VBMI variant didn't actually pass muster
                // #[cfg(target_feature = "avx512vbmi")]
                // 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
                _ => {
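                    // Fallback for lane counts and targets with no matching
                    // intrinsic arm above: a scalar gather over the index
                    // vector, relying on the zero-initialization of `array`
                    // to supply the "out of bounds selects 0" behavior.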
                    let mut array = [0; N];
                    for (i, k) in idxs.to_array().into_iter().enumerate() {
                        if (k as usize) < N {
                            array[i] = self[k as usize];
                        }
                    }
                    array.into()
                }
            }
        }
    }
}

/// "vpshufb like it was meant to be" on AVX2
///
/// # Safety
/// This requires AVX2 to work
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[allow(unused)]
#[inline]
#[allow(clippy::let_and_return)]
unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
    use crate::simd::cmp::SimdPartialOrd;
    #[cfg(target_arch = "x86")]
    use core::arch::x86;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64 as x86;
    use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle;
    use x86::_mm256_shuffle_epi8 as avx2_half_pshufb;
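    // `_mm256_shuffle_epi8` (vpshufb) shuffles each 128-bit half of the vector
    // independently, so on its own it cannot move bytes across halves. The
    // cross-lane `vperm2i128` duplications below are what let every index
    // select from all 32 bytes.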
    let mid = Simd::splat(16u8);
    let high = mid + mid;
    // SAFETY: Caller promised AVX2
    unsafe {
        // This is ordering-sensitive, and LLVM will order these how you put them.
        // Most AVX2 impls use ~5 "ports", and only 1 or 2 are capable of permutes.
        // But the "compose" step will lower to ops that can also use at least 1 other port.
        // So this tries to break up permutes so composition flows through "open" ports.
        // Comparative benches should be done on multiple AVX2 CPUs before reordering this.

        let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into());
        let hi_shuf = Simd::from(avx2_half_pshufb(
            hihi,        // duplicate the vector's top half
            idxs.into(), // so that using only 4 bits of an index still picks bytes 16-31
        ));
        // A zero-fill during the compose step gives the "all-Neon-like" OOB-is-0 semantics
        let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0));
        let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into());
        let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into()));
        // Repeat, then pick indices < 16, overwriting indices 0-15 from the previous compose step
        let compose = idxs.simd_lt(mid).select(lo_shuf, compose);
        compose
    }
}

/// This sets up a call to an architecture-specific function, and in doing so
/// it persuades rustc that everything is the correct size. Which it is.
/// This would not be needed if one could convince Rust that, by matching on N,
/// N is that value, and thus it would be valid to substitute e.g. 16.
///
/// # Safety
/// The correctness of this function hinges on the sizes agreeing in actuality.
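///
/// For example, in the `N == 16` NEON arm above, `f` is `vqtbl1q_u8`: its
/// `uint8x16_t` arguments are 16 bytes, the same size as `Simd<u8, 16>`, so
/// each `transmute_copy` merely reinterprets the same bit pattern.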
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    a: Simd<u8, N>,
    b: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&a), mem::transmute_copy(&b))) }
}

/// Make indices that yield 0 for x86
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[allow(unused)]
#[inline(always)]
fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    use crate::simd::cmp::SimdPartialOrd;
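    // `pshufb` (`_mm_shuffle_epi8`) zeroes an output byte whenever the high
    // bit of the corresponding index byte is set, so mapping every
    // out-of-range index to `u8::MAX` (high bit set) produces the
    // "out of bounds selects 0" behavior documented above.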
    idxs.simd_lt(Simd::splat(N as u8))
        .select(idxs, Simd::splat(u8::MAX))
}