use crate::simd::{LaneCount, Simd, SupportedLaneCount};
use core::mem;

impl<const N: usize> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    /// Swizzle a vector of bytes according to the index vector.
    /// Indices within range select the appropriate byte.
    /// Indices "out of bounds" instead select 0.
    ///
    /// Note that the current implementation is selected when the standard
    /// library is built, so `cargo build -Zbuild-std` may be necessary
    /// to unlock better performance, especially for larger vectors.
    /// A planned compiler improvement will enable using `#[target_feature]` instead.
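    ///
    /// # Examples
    /// An illustrative example of the out-of-bounds-selects-zero behavior
    /// (the values are chosen purely for demonstration):
    /// ```
    /// # #![feature(portable_simd)]
    /// # use core::simd::Simd;
    /// let src = Simd::from_array([5u8, 9, 6, 2]);
    /// let idxs = Simd::from_array([4u8, 0, 2, 0]); // index 4 is out of bounds for N = 4
    /// assert_eq!(src.swizzle_dyn(idxs).to_array(), [0, 5, 6, 5]);
    /// ```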
    #[inline]
    pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
        #![allow(unused_imports, unused_unsafe)]
        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
        use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
        #[cfg(all(
            target_arch = "arm",
            target_feature = "v7",
            target_feature = "neon",
            target_endian = "little"
        ))]
        use core::arch::arm::{uint8x8_t, vtbl1_u8};
        #[cfg(target_arch = "wasm32")]
        use core::arch::wasm32 as wasm;
        #[cfg(target_arch = "x86")]
        use core::arch::x86;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64 as x86;
        // SAFETY: Intrinsics covered by cfg
        unsafe {
            match N {
                #[cfg(all(
                    any(
                        target_arch = "aarch64",
                        all(target_arch = "arm", target_feature = "v7")
                    ),
                    target_feature = "neon",
                    target_endian = "little"
                ))]
                8 => transize(vtbl1_u8, self, idxs),
                #[cfg(target_feature = "ssse3")]
                16 => transize(x86::_mm_shuffle_epi8, self, idxs),
                #[cfg(target_feature = "simd128")]
                16 => transize(wasm::i8x16_swizzle, self, idxs),
                #[cfg(all(
                    target_arch = "aarch64",
                    target_feature = "neon",
                    target_endian = "little"
                ))]
                16 => transize(vqtbl1q_u8, self, idxs),
                #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                32 => transize_raw(avx2_pshufb, self, idxs),
                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
                32 => {
                    use crate::simd::cmp::SimdPartialOrd;
                    // `vpermb` takes the indices as its *first* operand, uses only the low
                    // five bits of each index, and never zeroes a lane, so swap the argument
                    // order and zero out-of-range lanes explicitly afterwards.
                    let permuted: Self = transize_raw(x86::_mm256_permutexvar_epi8, idxs, self);
                    idxs.simd_lt(Simd::splat(N as u8)).select(permuted, Simd::splat(0))
                }
                // Notable absence: avx512bw shuffle
                // If avx512bw is available, odds of avx512vbmi are good
                // FIXME: initial AVX512VBMI variant didn't actually pass muster
                // #[cfg(target_feature = "avx512vbmi")]
                // 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
                _ => {
                    let mut array = [0; N];
                    for (i, k) in idxs.to_array().into_iter().enumerate() {
                        if (k as usize) < N {
                            array[i] = self[k as usize];
                        };
                    }
                    array.into()
                }
            }
        }
    }
}

/// "vpshufb like it was meant to be" on AVX2
///
/// # Safety
/// This requires AVX2 to work
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[allow(unused)]
#[inline]
#[allow(clippy::let_and_return)]
unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
    use crate::simd::cmp::SimdPartialOrd;
    #[cfg(target_arch = "x86")]
    use core::arch::x86;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64 as x86;
    use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle;
    use x86::_mm256_shuffle_epi8 as avx2_half_pshufb;
    let mid = Simd::splat(16u8);
    let high = mid + mid;
    // SAFETY: Caller promised AVX2
    unsafe {
        // This is ordering sensitive, and LLVM will order these how you put them.
        // Most AVX2 impls use ~5 "ports", and only 1 or 2 are capable of permutes.
        // But the "compose" step will lower to ops that can also use at least 1 other port.
        // So this tries to break up permutes so composition flows through "open" ports.
        // Comparative benches should be done on multiple AVX2 CPUs before reordering this
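        //
        // Worked example (illustrative): for an index byte of 20, the first compose step
        // below keeps `hi_shuf`, since 20 < 32, and `hi_shuf` looked up 20 & 0x0F == 4 in
        // the duplicated top half, i.e. original byte 16 + 4 == 20. For an index byte of 3,
        // the second compose step overwrites that lane with `lo_shuf`, which looked up
        // byte 3 of the duplicated bottom half. Index bytes >= 32 are zeroed by the first
        // compose step, giving the out-of-bounds-is-0 semantics.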

        let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into());
        let hi_shuf = Simd::from(avx2_half_pshufb(
            hihi,        // duplicate the vector's top half
            idxs.into(), // so that using only 4 bits of an index still picks bytes 16-31
        ));
        // A zero-fill during the compose step gives the "all-Neon-like" OOB-is-0 semantics
        let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0));
        let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into());
        let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into()));
        // Repeat, then pick indices < 16, overwriting indices 0-15 from previous compose step
        let compose = idxs.simd_lt(mid).select(lo_shuf, compose);
        compose
    }
}

/// This sets up a call to an architecture-specific function, and in doing so
/// it persuades rustc that everything is the correct size. Which it is.
/// This would not be needed if one could convince Rust that, by matching on N,
/// N is that value, and thus it would be valid to substitute e.g. 16.
///
/// # Safety
/// The correctness of this function hinges on the sizes agreeing in actuality.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    let idxs = zeroing_idxs(idxs);
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
}

/// Map out-of-range indices to values that yield 0 on this architecture.
#[inline(always)]
fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // On x86, make sure the top bit is set: `pshufb` writes 0 to any output byte whose
    // index byte has its top bit set, so mapping out-of-range indices to `u8::MAX`
    // produces the OOB-is-0 semantics.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    let idxs = {
        use crate::simd::cmp::SimdPartialOrd;
        idxs.simd_lt(Simd::splat(N as u8))
            .select(idxs, Simd::splat(u8::MAX))
    };
    // Simply do nothing on most architectures.
    idxs
}

/// As `transize`, but without the implicit call to `zeroing_idxs`.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize_raw<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
}
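
// A minimal test sketch (not part of the upstream module): it checks `swizzle_dyn`
// against a scalar reference and the out-of-range index mapping of `zeroing_idxs`.
// It assumes this file can be exercised with the usual `cargo test` harness, e.g.
// when the portable SIMD code is built as a standalone crate.
#[cfg(test)]
mod swizzle_dyn_tests {
    use super::*;

    /// Scalar reference: in-range indices pick the corresponding byte,
    /// out-of-range indices pick 0.
    fn scalar_swizzle<const N: usize>(bytes: [u8; N], idxs: [u8; N]) -> [u8; N] {
        let mut out = [0; N];
        for (o, &k) in out.iter_mut().zip(idxs.iter()) {
            if (k as usize) < N {
                *o = bytes[k as usize];
            }
        }
        out
    }

    #[test]
    fn swizzle_dyn_matches_scalar_reference() {
        let bytes: [u8; 16] = core::array::from_fn(|i| (i as u8) * 3 + 1);
        let idxs: [u8; 16] = [0, 15, 3, 200, 7, 16, 1, 255, 2, 2, 9, 31, 4, 128, 5, 6];
        let got = Simd::from_array(bytes).swizzle_dyn(Simd::from_array(idxs));
        assert_eq!(got.to_array(), scalar_swizzle(bytes, idxs));
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn zeroing_idxs_maps_out_of_range_to_max() {
        let idxs =
            Simd::<u8, 16>::from_array([0, 1, 15, 16, 31, 255, 7, 8, 9, 10, 11, 12, 13, 14, 200, 3]);
        let mapped = zeroing_idxs(idxs).to_array();
        // In-range indices are unchanged; anything >= N becomes u8::MAX (top bit set).
        assert_eq!(mapped[0], 0);
        assert_eq!(mapped[2], 15);
        assert_eq!(mapped[3], u8::MAX);
        assert_eq!(mapped[5], u8::MAX);
    }
}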