// crate minimums: sse2, x86_64

use crate::types::*;
use core::arch::x86_64::{__m128i, __m256i};

mod sse2;

#[derive(Copy, Clone)]
pub struct YesS3;
#[derive(Copy, Clone)]
pub struct NoS3;

#[derive(Copy, Clone)]
pub struct YesS4;
#[derive(Copy, Clone)]
pub struct NoS4;

#[derive(Copy, Clone)]
pub struct YesA1;
#[derive(Copy, Clone)]
pub struct NoA1;

#[derive(Copy, Clone)]
pub struct YesA2;
#[derive(Copy, Clone)]
pub struct NoA2;

#[derive(Copy, Clone)]
pub struct YesNI;
#[derive(Copy, Clone)]
pub struct NoNI;

use core::marker::PhantomData;

#[derive(Copy, Clone)]
pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
where
    sse2::u128x1_sse2<S3, S4, NI>: Swap64,
    sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
    sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
    sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
    sse2::u128x1_sse2<S3, S4, NI>: BSwap,
    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
{
    type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
    type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
    type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;

    type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
    type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
    type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
    type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;

    type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
    type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
    type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;

    #[inline(always)]
    unsafe fn instance() -> Self {
        SseMachine(PhantomData)
    }
}

#[derive(Copy, Clone)]
pub struct Avx2Machine<NI>(PhantomData<NI>);
impl<NI: Copy> Machine for Avx2Machine<NI>
where
    sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
    sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
    sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
    sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
{
    type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
    type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
    type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;

    type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>;
    type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
    type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
    type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;

    type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
    type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
    type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;

    #[inline(always)]
    unsafe fn instance() -> Self {
        Avx2Machine(PhantomData)
    }
}

pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
/// to avoid expensive SSE/VEX conflicts.
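///
/// A minimal usage sketch, not a definitive API: the same `SseMachine` code, when monomorphized
/// inside an `avx`-enabled function (as the `dispatch!` macros below arrange), is emitted with
/// VEX encodings. The `kernel` function and its body are hypothetical.
///
/// ```ignore
/// #[target_feature(enable = "avx")]
/// unsafe fn kernel() {
///     let m = AVX::instance();
///     // ... use `m` like any other Machine; the compiler emits VEX forms here ...
/// }
/// ```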
pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
pub type AVX2 = Avx2Machine<NoNI>;

zerocopy::cryptocorrosion_derive_traits! {
    #[repr(C)]
    /// Generic wrapper for unparameterized storage of any of the possible impls.
    /// Converting into and out of this type should be essentially free, although it may be more
    /// aligned than a particular impl requires.
    #[allow(non_camel_case_types)]
    #[derive(Copy, Clone)]
    pub union vec128_storage {
        u32x4: [u32; 4],
        u64x2: [u64; 2],
        u128x1: [u128; 1],
        sse2: __m128i,
    }
}

impl Store<vec128_storage> for vec128_storage {
    #[inline(always)]
    unsafe fn unpack(p: vec128_storage) -> Self {
        p
    }
}
impl<'a> From<&'a vec128_storage> for &'a [u32; 4] {
    #[inline(always)]
    fn from(x: &'a vec128_storage) -> Self {
        unsafe { &x.u32x4 }
    }
}
impl From<[u32; 4]> for vec128_storage {
    #[inline(always)]
    fn from(u32x4: [u32; 4]) -> Self {
        vec128_storage { u32x4 }
    }
}
impl Default for vec128_storage {
    #[inline(always)]
    fn default() -> Self {
        vec128_storage { u128x1: [0] }
    }
}
impl Eq for vec128_storage {}
impl PartialEq for vec128_storage {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { self.u128x1 == rhs.u128x1 }
    }
}

#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec256_storage {
    u32x8: [u32; 8],
    u64x4: [u64; 4],
    u128x2: [u128; 2],
    sse2: [vec128_storage; 2],
    avx: __m256i,
}
impl From<[u64; 4]> for vec256_storage {
    #[inline(always)]
    fn from(u64x4: [u64; 4]) -> Self {
        vec256_storage { u64x4 }
    }
}
impl Default for vec256_storage {
    #[inline(always)]
    fn default() -> Self {
        vec256_storage { u128x2: [0, 0] }
    }
}
impl vec256_storage {
    #[inline(always)]
    pub fn new128(xs: [vec128_storage; 2]) -> Self {
        Self { sse2: xs }
    }
    #[inline(always)]
    pub fn split128(self) -> [vec128_storage; 2] {
        unsafe { self.sse2 }
    }
}
impl Eq for vec256_storage {}
impl PartialEq for vec256_storage {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { self.sse2 == rhs.sse2 }
    }
}

#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec512_storage {
    u32x16: [u32; 16],
    u64x8: [u64; 8],
    u128x4: [u128; 4],
    sse2: [vec128_storage; 4],
    avx: [vec256_storage; 2],
}
impl Default for vec512_storage {
    #[inline(always)]
    fn default() -> Self {
        vec512_storage {
            u128x4: [0, 0, 0, 0],
        }
    }
}
impl vec512_storage {
    #[inline(always)]
    pub fn new128(xs: [vec128_storage; 4]) -> Self {
        Self { sse2: xs }
    }
    #[inline(always)]
    pub fn split128(self) -> [vec128_storage; 4] {
        unsafe { self.sse2 }
    }
}
impl Eq for vec512_storage {}
impl PartialEq for vec512_storage {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { self.avx == rhs.avx }
    }
}

macro_rules! impl_into {
    ($storage:ident, $array:ty, $name:ident) => {
        impl From<$storage> for $array {
            #[inline(always)]
            fn from(vec: $storage) -> Self {
                unsafe { vec.$name }
            }
        }
    };
}
impl_into!(vec128_storage, [u32; 4], u32x4);
impl_into!(vec128_storage, [u64; 2], u64x2);
impl_into!(vec128_storage, [u128; 1], u128x1);
impl_into!(vec256_storage, [u32; 8], u32x8);
impl_into!(vec256_storage, [u64; 4], u64x4);
impl_into!(vec256_storage, [u128; 2], u128x2);
impl_into!(vec512_storage, [u32; 16], u32x16);
impl_into!(vec512_storage, [u64; 8], u64x8);
impl_into!(vec512_storage, [u128; 4], u128x4);
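
// A minimal sketch of the conversions above, written as a test module; the module name and
// test bodies are illustrative additions, not part of the original source.
#[cfg(test)]
mod storage_round_trip {
    use super::*;

    #[test]
    fn vec128_storage_u32x4() {
        // From<[u32; 4]> packs the words; the impl_into! conversion unpacks them again.
        let words = [1u32, 2, 3, 4];
        let storage: vec128_storage = words.into();
        let back: [u32; 4] = storage.into();
        assert_eq!(words, back);
    }

    #[test]
    fn vec256_storage_split_and_rejoin() {
        // new128/split128 reinterpret a 256-bit value as two 128-bit halves.
        let halves = [vec128_storage::default(); 2];
        let joined = vec256_storage::new128(halves);
        let split = joined.split128();
        assert!(split[0] == halves[0] && split[1] == halves[1]);
    }
}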

/// Generate the full set of optimized implementations to take advantage of the most important
/// hardware feature sets.
///
/// This dispatcher is suitable for maximizing throughput.
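///
/// A minimal usage sketch; the names `process_block` and `process_block_impl` below are
/// hypothetical, not part of this crate. The first two identifiers bind the machine value and
/// its type parameter inside the body.
///
/// ```ignore
/// dispatch!(mach, M, {
///     fn process_block(block: &mut [u32; 16]) {
///         // `mach: M` where `M: Machine`; forward to a generic helper.
///         process_block_impl(mach, block)
///     }
/// });
/// ```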
#[macro_export]
macro_rules! dispatch {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
                let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
                _mm256_zeroupper();
                ret
            }
            #[target_feature(enable = "avx")]
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
                _mm256_zeroupper();
                ret
            }
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
            }
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx2") {
                    impl_avx2($($arg),*)
                } else if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse4.1") {
                    impl_sse41($($arg),*)
                } else if is_x86_feature_detected!("ssse3") {
                    impl_ssse3($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}

/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
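///
/// Invoked with the same syntax as `dispatch!`; a minimal sketch with hypothetical names:
///
/// ```ignore
/// dispatch_light128!(mach, M, {
///     fn xor_block(a: &mut [u32; 4], b: &[u32; 4]) {
///         xor_block_impl(mach, a, b)
///     }
/// });
/// ```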
#[macro_export]
macro_rules! dispatch_light128 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light128!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}

/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
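///
/// Invoked with the same syntax as `dispatch!` and `dispatch_light128!`.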
#[macro_export]
macro_rules! dispatch_light256 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}