1 | // crate minimums: sse2, x86_64 |
2 | |
3 | use crate::types::*; |
4 | use core::arch::x86_64::{__m128i, __m256i}; |
5 | use zerocopy::{AsBytes, FromBytes, FromZeroes}; |
6 | |
7 | mod sse2; |
8 | |
/// Type-level flag for the `S3` parameter of `SseMachine`: the SSSE3-enabled choice.
/// Used by the `SSSE3`, `SSE41` and `AVX` aliases in this module.
#[derive(Clone, Copy)]
pub struct YesS3;
/// Type-level flag for the `S3` parameter of `SseMachine`: the non-SSSE3 choice.
/// Used by the `SSE2` alias.
#[derive(Clone, Copy)]
pub struct NoS3;

/// Type-level flag for the `S4` parameter of `SseMachine`: the SSE4.1-enabled choice.
/// Used by the `SSE41` and `AVX` aliases.
#[derive(Clone, Copy)]
pub struct YesS4;
/// Type-level flag for the `S4` parameter of `SseMachine`: the non-SSE4.1 choice.
/// Used by the `SSE2` and `SSSE3` aliases.
#[derive(Clone, Copy)]
pub struct NoS4;

/// Type-level flag; not referenced elsewhere in this module.
/// NOTE(review): presumably "AVX available" - confirm against the `sse2` submodule.
#[derive(Clone, Copy)]
pub struct YesA1;
/// Negative counterpart of `YesA1`; not referenced elsewhere in this module.
#[derive(Clone, Copy)]
pub struct NoA1;

/// Type-level flag; not referenced elsewhere in this module.
/// NOTE(review): presumably "AVX2 available" - confirm against the `sse2` submodule.
#[derive(Clone, Copy)]
pub struct YesA2;
/// Negative counterpart of `YesA2`; not referenced elsewhere in this module.
#[derive(Clone, Copy)]
pub struct NoA2;

/// Type-level flag for the `NI` parameter of `SseMachine` / `Avx2Machine`.
/// No alias in this module selects it.
/// NOTE(review): presumably AES-NI support - confirm.
#[derive(Clone, Copy)]
pub struct YesNI;
/// Type-level flag for the `NI` parameter; the choice made by every alias in this module.
#[derive(Clone, Copy)]
pub struct NoNI;
33 | |
34 | use core::marker::PhantomData; |
35 | |
36 | #[derive (Copy, Clone)] |
37 | pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>); |
38 | impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI> |
39 | where |
40 | sse2::u128x1_sse2<S3, S4, NI>: Swap64, |
41 | sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, |
42 | sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, |
43 | sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4, |
44 | sse2::u128x1_sse2<S3, S4, NI>: BSwap, |
45 | sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>, |
46 | sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>, |
47 | sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>, |
48 | sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>, |
49 | sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>, |
50 | { |
51 | type u32x4 = sse2::u32x4_sse2<S3, S4, NI>; |
52 | type u64x2 = sse2::u64x2_sse2<S3, S4, NI>; |
53 | type u128x1 = sse2::u128x1_sse2<S3, S4, NI>; |
54 | |
55 | type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>; |
56 | type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>; |
57 | type u64x4 = sse2::u64x4_sse2<S3, S4, NI>; |
58 | type u128x2 = sse2::u128x2_sse2<S3, S4, NI>; |
59 | |
60 | type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>; |
61 | type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>; |
62 | type u128x4 = sse2::u128x4_sse2<S3, S4, NI>; |
63 | |
64 | #[inline (always)] |
65 | unsafe fn instance() -> Self { |
66 | SseMachine(PhantomData) |
67 | } |
68 | } |
69 | |
70 | #[derive (Copy, Clone)] |
71 | pub struct Avx2Machine<NI>(PhantomData<NI>); |
72 | impl<NI: Copy> Machine for Avx2Machine<NI> |
73 | where |
74 | sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64, |
75 | sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, |
76 | sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, |
77 | sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4, |
78 | { |
79 | type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>; |
80 | type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>; |
81 | type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>; |
82 | |
83 | type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>; |
84 | type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>; |
85 | type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>; |
86 | type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>; |
87 | |
88 | type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>; |
89 | type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>; |
90 | type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>; |
91 | |
92 | #[inline (always)] |
93 | unsafe fn instance() -> Self { |
94 | Avx2Machine(PhantomData) |
95 | } |
96 | } |
97 | |
/// Baseline machine: `SseMachine` with every optional feature marker set to `No*`.
pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
/// `SseMachine` with the `S3` (SSSE3) marker enabled.
pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
/// `SseMachine` with the `S3` (SSSE3) and `S4` (SSE4.1) markers enabled.
pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
/// to avoid expensive SSE/VEX conflicts.
///
/// This alias names exactly the same type as `SSE41`. Any difference in generated code
/// therefore has to come from the `#[target_feature(enable = "avx")]` wrapper that the
/// dispatch macros put around calls using this machine.
pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
/// AVX2 machine; see `Avx2Machine` for which vector types have dedicated implementations.
pub type AVX2 = Avx2Machine<NoNI>;
105 | |
106 | /// Generic wrapper for unparameterized storage of any of the possible impls. |
107 | /// Converting into and out of this type should be essentially free, although it may be more |
108 | /// aligned than a particular impl requires. |
109 | #[allow (non_camel_case_types)] |
110 | #[derive (Copy, Clone, FromBytes, AsBytes, FromZeroes)] |
111 | #[repr (C)] |
112 | pub union vec128_storage { |
113 | u32x4: [u32; 4], |
114 | u64x2: [u64; 2], |
115 | u128x1: [u128; 1], |
116 | sse2: __m128i, |
117 | } |
118 | impl Store<vec128_storage> for vec128_storage { |
119 | #[inline (always)] |
120 | unsafe fn unpack(p: vec128_storage) -> Self { |
121 | p |
122 | } |
123 | } |
124 | impl<'a> From<&'a vec128_storage> for &'a [u32; 4] { |
125 | #[inline (always)] |
126 | fn from(x: &'a vec128_storage) -> Self { |
127 | unsafe { &x.u32x4 } |
128 | } |
129 | } |
130 | impl From<[u32; 4]> for vec128_storage { |
131 | #[inline (always)] |
132 | fn from(u32x4: [u32; 4]) -> Self { |
133 | vec128_storage { u32x4 } |
134 | } |
135 | } |
136 | impl Default for vec128_storage { |
137 | #[inline (always)] |
138 | fn default() -> Self { |
139 | vec128_storage { u128x1: [0] } |
140 | } |
141 | } |
142 | impl Eq for vec128_storage {} |
143 | impl PartialEq for vec128_storage { |
144 | #[inline (always)] |
145 | fn eq(&self, rhs: &Self) -> bool { |
146 | unsafe { self.u128x1 == rhs.u128x1 } |
147 | } |
148 | } |
149 | |
150 | #[allow (non_camel_case_types)] |
151 | #[derive (Copy, Clone)] |
152 | pub union vec256_storage { |
153 | u32x8: [u32; 8], |
154 | u64x4: [u64; 4], |
155 | u128x2: [u128; 2], |
156 | sse2: [vec128_storage; 2], |
157 | avx: __m256i, |
158 | } |
159 | impl From<[u64; 4]> for vec256_storage { |
160 | #[inline (always)] |
161 | fn from(u64x4: [u64; 4]) -> Self { |
162 | vec256_storage { u64x4 } |
163 | } |
164 | } |
165 | impl Default for vec256_storage { |
166 | #[inline (always)] |
167 | fn default() -> Self { |
168 | vec256_storage { u128x2: [0, 0] } |
169 | } |
170 | } |
171 | impl vec256_storage { |
172 | #[inline (always)] |
173 | pub fn new128(xs: [vec128_storage; 2]) -> Self { |
174 | Self { sse2: xs } |
175 | } |
176 | #[inline (always)] |
177 | pub fn split128(self) -> [vec128_storage; 2] { |
178 | unsafe { self.sse2 } |
179 | } |
180 | } |
181 | impl Eq for vec256_storage {} |
182 | impl PartialEq for vec256_storage { |
183 | #[inline (always)] |
184 | fn eq(&self, rhs: &Self) -> bool { |
185 | unsafe { self.sse2 == rhs.sse2 } |
186 | } |
187 | } |
188 | |
189 | #[allow (non_camel_case_types)] |
190 | #[derive (Copy, Clone)] |
191 | pub union vec512_storage { |
192 | u32x16: [u32; 16], |
193 | u64x8: [u64; 8], |
194 | u128x4: [u128; 4], |
195 | sse2: [vec128_storage; 4], |
196 | avx: [vec256_storage; 2], |
197 | } |
198 | impl Default for vec512_storage { |
199 | #[inline (always)] |
200 | fn default() -> Self { |
201 | vec512_storage { |
202 | u128x4: [0, 0, 0, 0], |
203 | } |
204 | } |
205 | } |
206 | impl vec512_storage { |
207 | #[inline (always)] |
208 | pub fn new128(xs: [vec128_storage; 4]) -> Self { |
209 | Self { sse2: xs } |
210 | } |
211 | #[inline (always)] |
212 | pub fn split128(self) -> [vec128_storage; 4] { |
213 | unsafe { self.sse2 } |
214 | } |
215 | } |
216 | impl Eq for vec512_storage {} |
217 | impl PartialEq for vec512_storage { |
218 | #[inline (always)] |
219 | fn eq(&self, rhs: &Self) -> bool { |
220 | unsafe { self.avx == rhs.avx } |
221 | } |
222 | } |
223 | |
224 | macro_rules! impl_into { |
225 | ($storage:ident, $array:ty, $name:ident) => { |
226 | impl From<$storage> for $array { |
227 | #[inline(always)] |
228 | fn from(vec: $storage) -> Self { |
229 | unsafe { vec.$name } |
230 | } |
231 | } |
232 | }; |
233 | } |
234 | impl_into!(vec128_storage, [u32; 4], u32x4); |
235 | impl_into!(vec128_storage, [u64; 2], u64x2); |
236 | impl_into!(vec128_storage, [u128; 1], u128x1); |
237 | impl_into!(vec256_storage, [u32; 8], u32x8); |
238 | impl_into!(vec256_storage, [u64; 4], u64x4); |
239 | impl_into!(vec256_storage, [u128; 2], u128x2); |
240 | impl_into!(vec512_storage, [u32; 16], u32x16); |
241 | impl_into!(vec512_storage, [u64; 8], u64x8); |
242 | impl_into!(vec512_storage, [u128; 4], u128x4); |
243 | |
/// Generate the full set of optimized implementations to take advantage of the most important
/// hardware feature sets.
///
/// This dispatcher is suitable for maximizing throughput.
///
/// Usage: `dispatch!(m, M, { [vis] fn name(arg: Ty, ...) -> Ret { body } })`. Here `m` is the
/// identifier the body uses for the machine value and `M` the identifier for its type. The
/// bracketed visibility and the return type are optional.
///
/// * With the `std` feature, the generated function checks CPU features at run time, in the
///   order AVX2, AVX, SSE4.1, SSSE3, SSE2, and runs the body with the first matching machine.
///   It hits `unimplemented!()` if none is detected.
/// * Without it, the machine is chosen at compile time from `cfg!(target_feature = ...)`,
///   falling back to SSE2.
#[macro_export]
macro_rules! dispatch {
    // Function with an explicit return type.
    (
        $m:ident, $M:ident,
        { $([$vis:tt $(($scope:tt))*])* fn $name:ident($($p:ident: $pty:ty),* $(,)*) -> $out:ty $body:block }
    ) => {
        #[cfg(feature = "std")]
        $($vis $(($scope))*)* fn $name($($p: $pty),*) -> $out {
            #[inline(always)]
            fn fn_impl<$M: $crate::Machine>($m: $M, $($p: $pty),*) -> $out $body
            use std::arch::x86_64::*;

            // One monomorphic entry point per feature level, so `fn_impl` is inlined and
            // compiled with that level's instructions enabled.
            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($p: $pty),*) -> $out {
                let out = fn_impl($crate::x86_64::AVX2::instance(), $($p),*);
                _mm256_zeroupper();
                out
            }
            #[target_feature(enable = "avx,sse4.1,ssse3")]
            unsafe fn impl_avx($($p: $pty),*) -> $out {
                let out = fn_impl($crate::x86_64::AVX::instance(), $($p),*);
                _mm256_zeroupper();
                out
            }
            #[target_feature(enable = "sse4.1,ssse3")]
            unsafe fn impl_sse41($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::SSE41::instance(), $($p),*)
            }
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::SSSE3::instance(), $($p),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::SSE2::instance(), $($p),*)
            }

            // SAFETY (all calls below): each `#[target_feature]` function is only reached
            // after the matching feature has been detected on the running CPU.
            if is_x86_feature_detected!("avx2") {
                return unsafe { impl_avx2($($p),*) };
            }
            if is_x86_feature_detected!("avx") {
                return unsafe { impl_avx($($p),*) };
            }
            if is_x86_feature_detected!("sse4.1") {
                return unsafe { impl_sse41($($p),*) };
            }
            if is_x86_feature_detected!("ssse3") {
                return unsafe { impl_ssse3($($p),*) };
            }
            if is_x86_feature_detected!("sse2") {
                return unsafe { impl_sse2($($p),*) };
            }
            unimplemented!()
        }

        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($vis $(($scope))*)* fn $name($($p: $pty),*) -> $out {
            unsafe fn fn_impl<$M: $crate::Machine>($m: $M, $($p: $pty),*) -> $out $body

            // SAFETY (all calls below): a machine is only instantiated when its feature is
            // enabled at compile time; SSE2 is the crate minimum.
            if cfg!(target_feature = "avx2") {
                return unsafe { fn_impl($crate::x86_64::AVX2::instance(), $($p),*) };
            }
            if cfg!(target_feature = "avx") {
                return unsafe { fn_impl($crate::x86_64::AVX::instance(), $($p),*) };
            }
            if cfg!(target_feature = "sse4.1") {
                return unsafe { fn_impl($crate::x86_64::SSE41::instance(), $($p),*) };
            }
            if cfg!(target_feature = "ssse3") {
                return unsafe { fn_impl($crate::x86_64::SSSE3::instance(), $($p),*) };
            }
            unsafe { fn_impl($crate::x86_64::SSE2::instance(), $($p),*) }
        }
    };
    // Function without a return type: forward to the arm above with `-> ()`.
    (
        $m:ident, $M:ident,
        { $([$vis:tt $(($scope:tt))*])* fn $name:ident($($p:ident: $pty:ty),* $(,)*) $body:block }
    ) => {
        dispatch!($m, $M, {
            $([$vis $(($scope))*])* fn $name($($p: $pty),*) -> () $body
        });
    };
}
324 | |
/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// Usage mirrors `dispatch!`: `dispatch_light128!(m, M, { [vis] fn name(args) -> Ret { body } })`
/// with the bracketed visibility and the return type optional.
///
/// * With the `std` feature, the generated function checks for AVX and then SSE2 at run time,
///   and hits `unimplemented!()` if neither is detected.
/// * Without it, the machine is chosen at compile time from `cfg!(target_feature = ...)`
///   (AVX2, AVX, SSE4.1, SSSE3, else SSE2).
#[macro_export]
macro_rules! dispatch_light128 {
    // Function with an explicit return type.
    (
        $m:ident, $M:ident,
        { $([$vis:tt $(($scope:tt))*])* fn $name:ident($($p:ident: $pty:ty),* $(,)*) -> $out:ty $body:block }
    ) => {
        #[cfg(feature = "std")]
        $($vis $(($scope))*)* fn $name($($p: $pty),*) -> $out {
            #[inline(always)]
            fn fn_impl<$M: $crate::Machine>($m: $M, $($p: $pty),*) -> $out $body
            use std::arch::x86_64::*;

            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::AVX::instance(), $($p),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::SSE2::instance(), $($p),*)
            }

            // SAFETY (both calls below): each `#[target_feature]` function is only reached
            // after the matching feature has been detected on the running CPU.
            if is_x86_feature_detected!("avx") {
                return unsafe { impl_avx($($p),*) };
            }
            if is_x86_feature_detected!("sse2") {
                return unsafe { impl_sse2($($p),*) };
            }
            unimplemented!()
        }

        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($vis $(($scope))*)* fn $name($($p: $pty),*) -> $out {
            unsafe fn fn_impl<$M: $crate::Machine>($m: $M, $($p: $pty),*) -> $out $body

            // SAFETY (all calls below): a machine is only instantiated when its feature is
            // enabled at compile time; SSE2 is the crate minimum.
            if cfg!(target_feature = "avx2") {
                return unsafe { fn_impl($crate::x86_64::AVX2::instance(), $($p),*) };
            }
            if cfg!(target_feature = "avx") {
                return unsafe { fn_impl($crate::x86_64::AVX::instance(), $($p),*) };
            }
            if cfg!(target_feature = "sse4.1") {
                return unsafe { fn_impl($crate::x86_64::SSE41::instance(), $($p),*) };
            }
            if cfg!(target_feature = "ssse3") {
                return unsafe { fn_impl($crate::x86_64::SSSE3::instance(), $($p),*) };
            }
            unsafe { fn_impl($crate::x86_64::SSE2::instance(), $($p),*) }
        }
    };
    // Function without a return type: forward to the arm above with `-> ()`.
    (
        $m:ident, $M:ident,
        { $([$vis:tt $(($scope:tt))*])* fn $name:ident($($p:ident: $pty:ty),* $(,)*) $body:block }
    ) => {
        dispatch_light128!($m, $M, {
            $([$vis $(($scope))*])* fn $name($($p: $pty),*) -> () $body
        });
    };
}
382 | |
/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// Usage mirrors `dispatch!`: `dispatch_light256!(m, M, { [vis] fn name(args) -> Ret { body } })`
/// with the bracketed visibility and the return type optional. The visibility is emitted on the
/// generated function *without* its brackets in both the `std` and non-`std` variants.
///
/// NOTE(review): with the `std` feature the run-time dispatch below only distinguishes AVX and
/// SSE2. AVX2 is only ever selected by the compile-time (`not(std)`) variant, which does not
/// match the summary above - confirm whether a run-time AVX2 path is intended.
#[macro_export]
macro_rules! dispatch_light256 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        // The matcher strips the `[...]` around the visibility, so it is transcribed bare here;
        // re-emitting the brackets would produce `[pub] fn ...`, which is not a valid item.
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            // SAFETY: each `#[target_feature]` function is only called after the matching
            // feature has been detected on the running CPU.
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            // SAFETY: a machine is only instantiated when its feature is enabled at compile
            // time; SSE2 is the crate minimum.
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Function without a return type: forward to the arm above with `-> ()`. The brackets are
    // intentionally re-added here because the recursive invocation expects the `[vis]` form.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
440 | |