use crate::simd::{LaneCount, Simd, SupportedLaneCount};
use core::mem;

impl<const N: usize> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    /// Swizzle a vector of bytes according to the index vector.
    /// Indices within range select the appropriate byte.
    /// Indices "out of bounds" instead select 0.
    ///
    /// Note that the current implementation is selected during build-time
    /// of the standard library, so `cargo build -Zbuild-std` may be necessary
    /// to unlock better performance, especially for larger vectors.
    /// A planned compiler improvement will enable using `#[target_feature]` instead.
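    ///
    /// # Example
    ///
    /// A minimal sketch of the contract (assuming a nightly toolchain with the
    /// `portable_simd` feature enabled):
    ///
    /// ```
    /// #![feature(portable_simd)]
    /// use core::simd::Simd;
    ///
    /// let bytes = Simd::from_array([10u8, 20, 30, 40]);
    /// let idxs = Simd::from_array([3u8, 0, 1, 9]); // 9 is out of bounds for N = 4
    /// // In-range indices gather the selected bytes; out-of-range lanes yield 0.
    /// assert_eq!(bytes.swizzle_dyn(idxs).to_array(), [40, 10, 20, 0]);
    /// ```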
    #[inline]
    pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
        #![allow(unused_imports, unused_unsafe)]
        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
        use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
        #[cfg(all(
            target_arch = "arm",
            target_feature = "v7",
            target_feature = "neon",
            target_endian = "little"
        ))]
        use core::arch::arm::{uint8x8_t, vtbl1_u8};
        #[cfg(target_arch = "wasm32")]
        use core::arch::wasm32 as wasm;
        #[cfg(target_arch = "x86")]
        use core::arch::x86;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64 as x86;
        // SAFETY: Intrinsics covered by cfg
        unsafe {
            match N {
                #[cfg(all(
                    any(
                        target_arch = "aarch64",
                        all(target_arch = "arm", target_feature = "v7")
                    ),
                    target_feature = "neon",
                    target_endian = "little"
                ))]
                8 => transize(vtbl1_u8, self, idxs),
                #[cfg(target_feature = "ssse3")]
                16 => transize(x86::_mm_shuffle_epi8, self, idxs),
                #[cfg(target_feature = "simd128")]
                16 => transize(wasm::i8x16_swizzle, self, idxs),
                #[cfg(all(
                    target_arch = "aarch64",
                    target_feature = "neon",
                    target_endian = "little"
                ))]
                16 => transize(vqtbl1q_u8, self, idxs),
                #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                32 => transize_raw(avx2_pshufb, self, idxs),
                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
                32 => transize(x86::_mm256_permutexvar_epi8, self, idxs),
                // Notable absence: avx512bw shuffle
                // If avx512bw is available, odds of avx512vbmi are good
                // FIXME: initial AVX512VBMI variant didn't actually pass muster
                // #[cfg(target_feature = "avx512vbmi")]
                // 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
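                // Fallback for any width without a dedicated intrinsic path:
                // a scalar gather with an explicit per-lane bounds check.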
                _ => {
                    let mut array = [0; N];
                    for (i, k) in idxs.to_array().into_iter().enumerate() {
                        if (k as usize) < N {
                            array[i] = self[k as usize];
                        }
                    }
                    array.into()
                }
            }
        }
    }
}

/// "vpshufb like it was meant to be" on AVX2
///
/// # Safety
/// This requires AVX2 to work
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[allow(unused)]
#[inline]
#[allow(clippy::let_and_return)]
unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
    use crate::simd::cmp::SimdPartialOrd;
    #[cfg(target_arch = "x86")]
    use core::arch::x86;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64 as x86;
    use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle;
    use x86::_mm256_shuffle_epi8 as avx2_half_pshufb;
    let mid = Simd::splat(16u8);
    let high = mid + mid;
    // SAFETY: Caller promised AVX2
    unsafe {
        // This is ordering sensitive, and LLVM will order these how you put them.
        // Most AVX2 impls use ~5 "ports", and only 1 or 2 are capable of permutes.
        // But the "compose" step will lower to ops that can also use at least 1 other port.
        // So this tries to break up permutes so composition flows through "open" ports.
        // Comparative benches should be done on multiple AVX2 CPUs before reordering this

        let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into());
        let hi_shuf = Simd::from(avx2_half_pshufb(
            hihi,        // duplicate the vector's top half
            idxs.into(), // so that using only 4 bits of an index still picks bytes 16-31
        ));
        // A zero-fill during the compose step gives the "all-Neon-like" OOB-is-0 semantics
        let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0));
        let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into());
        let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into()));
        // Repeat, then pick indices < 16, overwriting indices 0-15 from previous compose step
        let compose = idxs.simd_lt(mid).select(lo_shuf, compose);
        compose
    }
}
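
// A scalar reference model of the routine above, handy for spot-checking the
// lane-crossing composition (a hypothetical helper, not part of this module's API).
// `_mm256_shuffle_epi8` only shuffles within each 128-bit half, so `avx2_pshufb`
// duplicates each half across the whole vector and then selects by index range;
// the net effect should match this per-lane loop.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[allow(unused)]
fn avx2_pshufb_reference(bytes: [u8; 32], idxs: [u8; 32]) -> [u8; 32] {
    let mut out = [0; 32];
    for (o, &k) in out.iter_mut().zip(idxs.iter()) {
        if (k as usize) < 32 {
            // In-range index: gather across the full 32-byte vector.
            *o = bytes[k as usize];
        }
        // Out-of-range index: leave the zero fill in place.
    }
    out
}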

/// This sets up a call to an architecture-specific function, and in doing so
/// it persuades rustc that everything is the correct size. Which it is.
/// This would not be needed if one could convince Rust that, by matching on N,
/// N is that value, and thus it would be valid to substitute e.g. 16.
///
/// # Safety
/// The correctness of this function hinges on the sizes agreeing in actuality.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    let idxs = zeroing_idxs(idxs);
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
}
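
// For illustration, the `16 => transize(x86::_mm_shuffle_epi8, ...)` arm above
// monomorphizes roughly to the sketch below: `T` is `x86::__m128i`, which has
// the same 16-byte size as `Simd<u8, 16>`, so each `transmute_copy` is the
// size agreement the safety comment relies on.
//
//     let idxs = zeroing_idxs(idxs);
//     let raw_bytes: x86::__m128i = mem::transmute_copy(&bytes);
//     let raw_idxs: x86::__m128i = mem::transmute_copy(&idxs);
//     let shuffled = x86::_mm_shuffle_epi8(raw_bytes, raw_idxs);
//     mem::transmute_copy::<x86::__m128i, Simd<u8, 16>>(&shuffled)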

/// Make indices that yield 0 for this architecture
#[inline(always)]
fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // On x86, make sure the top bit is set.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    let idxs = {
        use crate::simd::cmp::SimdPartialOrd;
        idxs.simd_lt(Simd::splat(N as u8))
            .select(idxs, Simd::splat(u8::MAX))
    };
    // Simply do nothing on most architectures.
    idxs
}
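
// Why the top bit: `pshufb` zeroes an output byte whenever the corresponding
// index byte has its high bit set. So with N = 16 on x86, an out-of-range index
// such as 20 is rewritten to 255 (top bit set), and the shuffle then writes 0
// to that lane, matching the "out of bounds selects 0" contract on `swizzle_dyn`.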

/// As `transize`, but with no implicit call to `zeroing_idxs`: the callee is
/// expected to handle out-of-range indices itself (as `avx2_pshufb` does).
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize_raw<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
}
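
// A minimal sanity check of the OOB-is-zero contract, sketched here assuming a
// test harness with `portable_simd` available; it exercises whichever
// implementation the cfg selection above picked for N = 16.
#[cfg(test)]
mod swizzle_dyn_sketch {
    use super::*;

    #[test]
    fn out_of_bounds_selects_zero() {
        // bytes = [1, 2, ..., 16]
        let bytes = Simd::<u8, 16>::from_array(core::array::from_fn(|i| i as u8 + 1));
        let idxs =
            Simd::<u8, 16>::from_array([0, 15, 16, 255, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]);
        let out = bytes.swizzle_dyn(idxs).to_array();
        assert_eq!(out[0], 1); // in range: selects bytes[0]
        assert_eq!(out[1], 16); // in range: selects bytes[15]
        assert_eq!(out[2], 0); // 16 is out of bounds for N = 16
        assert_eq!(out[3], 0); // 255 is far out of bounds
    }
}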