1 | use crate::soft::{x2, x4}; |
2 | use crate::types::*; |
3 | use crate::vec128_storage; |
4 | use crate::x86_64::Avx2Machine; |
5 | use crate::x86_64::SseMachine as Machine86; |
6 | use crate::x86_64::{NoS3, NoS4, YesS3, YesS4}; |
7 | use core::arch::x86_64::*; |
8 | use core::marker::PhantomData; |
9 | use core::ops::{ |
10 | Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not, |
11 | }; |
12 | use zerocopy::transmute; |
13 | |
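// Helper macros: `impl_binop!` wraps a raw SSE intrinsic as a binary operator impl on a
// vector newtype, and `impl_binop_assign!` derives the corresponding `*Assign` operator
// from it. The S3/S4/NI type parameters used throughout are marker types recording which
// instruction-set extensions are available (S3 ~ SSSE3, S4 ~ SSE4.1; NI presumably tracks
// AES-NI), so the same wrapper struct can carry feature information in its type.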
14 | macro_rules! impl_binop { |
15 | ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => { |
16 | impl<S3, S4, NI> $trait for $vec<S3, S4, NI> { |
17 | type Output = Self; |
18 | #[inline(always)] |
19 | fn $fn(self, rhs: Self) -> Self::Output { |
20 | Self::new(unsafe { $impl_fn(self.x, rhs.x) }) |
21 | } |
22 | } |
23 | }; |
24 | } |
25 | |
26 | macro_rules! impl_binop_assign { |
27 | ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => { |
28 | impl<S3, S4, NI> $trait for $vec<S3, S4, NI> |
29 | where |
30 | $vec<S3, S4, NI>: Copy, |
31 | { |
32 | #[inline(always)] |
33 | fn $fn_assign(&mut self, rhs: Self) { |
34 | *self = self.$fn(rhs); |
35 | } |
36 | } |
37 | }; |
38 | } |
39 | |
40 | macro_rules! def_vec { |
41 | ($vec:ident, $word:ident) => { |
42 | zerocopy::cryptocorrosion_derive_traits! { |
43 | #[repr(transparent)] |
44 | #[allow(non_camel_case_types)] |
45 | #[derive(Copy, Clone)] |
46 | pub struct $vec<S3, S4, NI> { |
47 | x: __m128i, |
48 | s3: PhantomData<S3>, |
49 | s4: PhantomData<S4>, |
50 | ni: PhantomData<NI>, |
51 | } |
52 | } |
53 | |
54 | impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> { |
55 | #[inline(always)] |
56 | unsafe fn unpack(x: vec128_storage) -> Self { |
57 | Self::new(x.sse2) |
58 | } |
59 | } |
60 | impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage { |
61 | #[inline(always)] |
62 | fn from(x: $vec<S3, S4, NI>) -> Self { |
63 | vec128_storage { sse2: x.x } |
64 | } |
65 | } |
66 | impl<S3, S4, NI> $vec<S3, S4, NI> { |
67 | #[inline(always)] |
68 | fn new(x: __m128i) -> Self { |
69 | $vec { |
70 | x, |
71 | s3: PhantomData, |
72 | s4: PhantomData, |
73 | ni: PhantomData, |
74 | } |
75 | } |
76 | } |
77 | |
78 | impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI> |
79 | where |
80 | Self: BSwap, |
81 | { |
82 | #[inline(always)] |
83 | unsafe fn unsafe_read_le(input: &[u8]) -> Self { |
84 | assert_eq!(input.len(), 16); |
85 | Self::new(_mm_loadu_si128(input.as_ptr() as *const _)) |
86 | } |
87 | #[inline(always)] |
88 | unsafe fn unsafe_read_be(input: &[u8]) -> Self { |
89 | assert_eq!(input.len(), 16); |
90 | Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap() |
91 | } |
92 | #[inline(always)] |
93 | fn write_le(self, out: &mut [u8]) { |
94 | assert_eq!(out.len(), 16); |
95 | unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) } |
96 | } |
97 | #[inline(always)] |
98 | fn write_be(self, out: &mut [u8]) { |
99 | assert_eq!(out.len(), 16); |
100 | let x = self.bswap().x; |
101 | unsafe { |
102 | _mm_storeu_si128(out.as_mut_ptr() as *mut _, x); |
103 | } |
104 | } |
105 | } |
106 | |
107 | impl<S3, S4, NI> Default for $vec<S3, S4, NI> { |
108 | #[inline(always)] |
109 | fn default() -> Self { |
110 | Self::new(unsafe { _mm_setzero_si128() }) |
111 | } |
112 | } |
113 | |
114 | impl<S3, S4, NI> Not for $vec<S3, S4, NI> { |
115 | type Output = Self; |
116 | #[inline(always)] |
117 | fn not(self) -> Self::Output { |
118 | unsafe { |
119 | let ff = _mm_set1_epi64x(-1i64); |
120 | self ^ Self::new(ff) |
121 | } |
122 | } |
123 | } |
124 | |
125 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {} |
126 | impl_binop!($vec, BitAnd, bitand, _mm_and_si128); |
127 | impl_binop!($vec, BitOr, bitor, _mm_or_si128); |
128 | impl_binop!($vec, BitXor, bitxor, _mm_xor_si128); |
129 | impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand); |
130 | impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor); |
131 | impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor); |
132 | impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> { |
133 | type Output = Self; |
134 | #[inline(always)] |
135 | fn andnot(self, rhs: Self) -> Self { |
136 | Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) }) |
137 | } |
138 | } |
139 | }; |
140 | } |
141 | |
142 | macro_rules! impl_bitops32 { |
143 | ($vec:ident) => { |
144 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where |
145 | $vec<S3, S4, NI>: RotateEachWord32 |
146 | { |
147 | } |
148 | }; |
149 | } |
150 | |
151 | macro_rules! impl_bitops64 { |
152 | ($vec:ident) => { |
153 | impl_bitops32!($vec); |
154 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where |
155 | $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 |
156 | { |
157 | } |
158 | }; |
159 | } |
160 | |
161 | macro_rules! impl_bitops128 { |
162 | ($vec:ident) => { |
163 | impl_bitops64!($vec); |
164 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where |
165 | $vec<S3, S4, NI>: RotateEachWord128 |
166 | { |
167 | } |
168 | }; |
169 | } |
170 | |
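// Two word-rotation strategies follow. With SSSE3 (`rotr_32_s3`), rotations by a whole
// number of bytes (8, 16, 24 bits) are done with a single `_mm_shuffle_epi8` whose control
// mask lists, for each destination byte, which source byte to take; e.g. the
// rotate-right-16 mask moves bytes 2,3 of each 32-bit word down to positions 0,1 and wraps
// bytes 0,1 up to positions 2,3. Without SSSE3 (`rotr_32`), a rotation by `i` is the
// classic `(x >> i) | (x << (32 - i))` built from two shifts and an OR.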
171 | macro_rules! rotr_32_s3 { |
172 | ($name:ident, $k0:expr, $k1:expr) => { |
173 | #[inline(always)] |
174 | fn $name(self) -> Self { |
175 | Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) |
176 | } |
177 | }; |
178 | } |
179 | macro_rules! rotr_32 { |
180 | ($name:ident, $i:expr) => { |
181 | #[inline(always)] |
182 | fn $name(self) -> Self { |
183 | Self::new(unsafe { |
184 | _mm_or_si128( |
185 | _mm_srli_epi32(self.x, $i as i32), |
186 | _mm_slli_epi32(self.x, 32 - $i as i32), |
187 | ) |
188 | }) |
189 | } |
190 | }; |
191 | } |
192 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> { |
193 | rotr_32!(rotate_each_word_right7, 7); |
194 | rotr_32_s3!( |
195 | rotate_each_word_right8, |
196 | 0x0c0f_0e0d_080b_0a09, |
197 | 0x0407_0605_0003_0201 |
198 | ); |
199 | rotr_32!(rotate_each_word_right11, 11); |
200 | rotr_32!(rotate_each_word_right12, 12); |
201 | rotr_32_s3!( |
202 | rotate_each_word_right16, |
203 | 0x0d0c_0f0e_0908_0b0a, |
204 | 0x0504_0706_0100_0302 |
205 | ); |
206 | rotr_32!(rotate_each_word_right20, 20); |
207 | rotr_32_s3!( |
208 | rotate_each_word_right24, |
209 | 0x0e0d_0c0f_0a09_080b, |
210 | 0x0605_0407_0201_0003 |
211 | ); |
212 | rotr_32!(rotate_each_word_right25, 25); |
213 | } |
214 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> { |
215 | rotr_32!(rotate_each_word_right7, 7); |
216 | rotr_32!(rotate_each_word_right8, 8); |
217 | rotr_32!(rotate_each_word_right11, 11); |
218 | rotr_32!(rotate_each_word_right12, 12); |
219 | #[inline (always)] |
220 | fn rotate_each_word_right16(self) -> Self { |
221 | Self::new(swap16_s2(self.x)) |
222 | } |
223 | rotr_32!(rotate_each_word_right20, 20); |
224 | rotr_32!(rotate_each_word_right24, 24); |
225 | rotr_32!(rotate_each_word_right25, 25); |
226 | } |
227 | |
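// The 64-bit variants below mirror the 32-bit ones: byte-granular rotations use a
// `_mm_shuffle_epi8` mask spanning each 64-bit word, everything else uses
// `(x >> i) | (x << (64 - i))` on 64-bit lanes.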
228 | macro_rules! rotr_64_s3 { |
229 | ($name:ident, $k0:expr, $k1:expr) => { |
230 | #[inline(always)] |
231 | fn $name(self) -> Self { |
232 | Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) |
233 | } |
234 | }; |
235 | } |
236 | macro_rules! rotr_64 { |
237 | ($name:ident, $i:expr) => { |
238 | #[inline(always)] |
239 | fn $name(self) -> Self { |
240 | Self::new(unsafe { |
241 | _mm_or_si128( |
242 | _mm_srli_epi64(self.x, $i as i32), |
243 | _mm_slli_epi64(self.x, 64 - $i as i32), |
244 | ) |
245 | }) |
246 | } |
247 | }; |
248 | } |
249 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> { |
250 | rotr_64!(rotate_each_word_right7, 7); |
251 | rotr_64_s3!( |
252 | rotate_each_word_right8, |
253 | 0x080f_0e0d_0c0b_0a09, |
254 | 0x0007_0605_0403_0201 |
255 | ); |
256 | rotr_64!(rotate_each_word_right11, 11); |
257 | rotr_64!(rotate_each_word_right12, 12); |
258 | rotr_64_s3!( |
259 | rotate_each_word_right16, |
260 | 0x0908_0f0e_0d0c_0b0a, |
261 | 0x0100_0706_0504_0302 |
262 | ); |
263 | rotr_64!(rotate_each_word_right20, 20); |
264 | rotr_64_s3!( |
265 | rotate_each_word_right24, |
266 | 0x0a09_080f_0e0d_0c0b, |
267 | 0x0201_0007_0605_0403 |
268 | ); |
269 | rotr_64!(rotate_each_word_right25, 25); |
270 | } |
271 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> { |
272 | rotr_64!(rotate_each_word_right7, 7); |
273 | rotr_64!(rotate_each_word_right8, 8); |
274 | rotr_64!(rotate_each_word_right11, 11); |
275 | rotr_64!(rotate_each_word_right12, 12); |
276 | #[inline (always)] |
277 | fn rotate_each_word_right16(self) -> Self { |
278 | Self::new(swap16_s2(self.x)) |
279 | } |
280 | rotr_64!(rotate_each_word_right20, 20); |
281 | rotr_64!(rotate_each_word_right24, 24); |
282 | rotr_64!(rotate_each_word_right25, 25); |
283 | } |
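// Rotating each 64-bit word by exactly 32 bits is just swapping its two 32-bit halves,
// which `_mm_shuffle_epi32` with control 0b10_11_00_01 does in one instruction.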
284 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> { |
285 | #[inline (always)] |
286 | fn rotate_each_word_right32(self) -> Self { |
287 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) }) |
288 | } |
289 | } |
290 | |
291 | macro_rules! rotr_128 { |
292 | ($name:ident, $i:expr) => { |
293 | #[inline(always)] |
294 | fn $name(self) -> Self { |
295 | Self::new(unsafe { |
296 | _mm_or_si128( |
297 | _mm_srli_si128(self.x, $i as i32), |
298 | _mm_slli_si128(self.x, 128 - $i as i32), |
299 | ) |
300 | }) |
301 | } |
302 | }; |
303 | } |
304 | // TODO: completely unoptimized |
305 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> { |
306 | rotr_128!(rotate_each_word_right7, 7); |
307 | rotr_128!(rotate_each_word_right8, 8); |
308 | rotr_128!(rotate_each_word_right11, 11); |
309 | rotr_128!(rotate_each_word_right12, 12); |
310 | rotr_128!(rotate_each_word_right16, 16); |
311 | rotr_128!(rotate_each_word_right20, 20); |
312 | rotr_128!(rotate_each_word_right24, 24); |
313 | rotr_128!(rotate_each_word_right25, 25); |
314 | } |
315 | // TODO: completely unoptimized |
316 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> { |
317 | rotr_128!(rotate_each_word_right32, 32); |
318 | } |
319 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {} |
320 | |
321 | def_vec!(u32x4_sse2, u32); |
322 | def_vec!(u64x2_sse2, u64); |
323 | def_vec!(u128x1_sse2, u128); |
324 | |
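// Lane packing/unpacking comes in two flavours per type: an SSE4.1 path (`YesS4`) that can
// use `_mm_extract_epi64`/`_mm_insert_epi64`, and a plain-SSE2 path (`NoS4`) that reaches
// the upper lanes with shuffles and byte shifts instead.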
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!("{:?}", xs)
    }
}
411 | |
impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
{
    #[inline(always)]
    fn to_lanes(self) -> [u64; 4] {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        [a[0], a[1], b[0], b[1]]
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 4]) -> Self {
        let (a, b) = (
            u64x2_sse2::from_lanes([xs[0], xs[1]]),
            u64x2_sse2::from_lanes([xs[2], xs[3]]),
        );
        x2::new([a, b])
    }
}
430 | |
431 | macro_rules! impl_into { |
432 | ($from:ident, $to:ident) => { |
433 | impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> { |
434 | #[inline(always)] |
435 | fn from(x: $from<S3, S4, NI>) -> Self { |
436 | $to::new(x.x) |
437 | } |
438 | } |
439 | }; |
440 | } |
441 | |
442 | impl_into!(u128x1_sse2, u32x4_sse2); |
443 | impl_into!(u128x1_sse2, u64x2_sse2); |
444 | |
445 | impl_bitops32!(u32x4_sse2); |
446 | impl_bitops64!(u64x2_sse2); |
447 | impl_bitops128!(u128x1_sse2); |
448 | |
449 | impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where |
450 | u32x4_sse2<S3, S4, NI>: BSwap |
451 | { |
452 | } |
453 | impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where |
454 | u64x2_sse2<S3, S4, NI>: BSwap |
455 | { |
456 | } |
457 | impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32); |
458 | impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64); |
459 | impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add); |
460 | impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add); |
461 | |
462 | impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI> |
463 | where |
464 | u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>, |
465 | Machine86<S3, S4, NI>: Machine, |
466 | { |
467 | } |
468 | impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI> |
469 | where |
470 | u64x2_sse2<S3, S4, NI>: |
471 | RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>, |
472 | Machine86<S3, S4, NI>: Machine, |
473 | { |
474 | } |
475 | impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI> |
476 | where |
477 | u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap, |
478 | Machine86<S3, S4, NI>: Machine, |
479 | u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
480 | u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>, |
481 | { |
482 | } |
483 | |
484 | impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI> |
485 | where |
486 | u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>, |
487 | Machine86<YesS3, YesS4, NI>: Machine, |
488 | { |
489 | } |
490 | impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI> |
491 | where |
492 | u64x2_sse2<YesS3, YesS4, NI>: |
493 | RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>, |
494 | Machine86<YesS3, YesS4, NI>: Machine, |
495 | { |
496 | } |
497 | impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI> |
498 | where |
499 | u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap, |
500 | Machine86<YesS3, YesS4, NI>: Machine, |
501 | u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>, |
502 | u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>, |
503 | { |
504 | } |
505 | |
impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}
517 | |
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
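// Without SSE4.1 there is no `_mm_insert_epi32`, so `insert` below is emulated: for lane 0
// the old word is masked off and the new value OR'd in; for the other lanes the vector is
// shuffled and byte-shifted so the target lane lands in position 0, the value is OR'd in,
// and a final shuffle restores the original word order.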
539 | impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI> |
540 | where |
541 | Self: MultiLane<[u32; 4]>, |
542 | { |
543 | #[inline (always)] |
544 | fn extract(self, i: u32) -> u32 { |
545 | self.to_lanes()[i as usize] |
546 | } |
547 | #[inline (always)] |
548 | fn insert(self, v: u32, i: u32) -> Self { |
549 | Self::new(unsafe { |
550 | match i { |
551 | 0 => { |
552 | let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x); |
553 | _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)) |
554 | } |
555 | 1 => { |
556 | let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000); |
557 | x = _mm_slli_si128(x, 4); |
558 | x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); |
559 | _mm_shuffle_epi32(x, 0b1110_0001) |
560 | } |
561 | 2 => { |
562 | let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100); |
563 | x = _mm_slli_si128(x, 4); |
564 | x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); |
565 | _mm_shuffle_epi32(x, 0b1100_1001) |
566 | } |
567 | 3 => { |
568 | let mut x = _mm_slli_si128(self.x, 4); |
569 | x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); |
570 | _mm_shuffle_epi32(x, 0b0011_1001) |
571 | } |
572 | _ => unreachable!(), |
573 | } |
574 | }) |
575 | } |
576 | } |
577 | |
578 | impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> { |
579 | #[inline (always)] |
580 | fn shuffle_lane_words2301(self) -> Self { |
581 | self.shuffle2301() |
582 | } |
583 | #[inline (always)] |
584 | fn shuffle_lane_words1230(self) -> Self { |
585 | self.shuffle1230() |
586 | } |
587 | #[inline (always)] |
588 | fn shuffle_lane_words3012(self) -> Self { |
589 | self.shuffle3012() |
590 | } |
591 | } |
592 | |
593 | impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> { |
594 | #[inline (always)] |
595 | fn shuffle2301(self) -> Self { |
596 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) |
597 | } |
598 | #[inline (always)] |
599 | fn shuffle1230(self) -> Self { |
600 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) }) |
601 | } |
602 | #[inline (always)] |
603 | fn shuffle3012(self) -> Self { |
604 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) }) |
605 | } |
606 | } |
607 | |
608 | impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> { |
609 | #[inline (always)] |
610 | fn shuffle2301(self) -> Self { |
611 | x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)]) |
612 | } |
613 | #[inline (always)] |
614 | fn shuffle3012(self) -> Self { |
615 | unsafe { |
616 | x2::new([ |
617 | u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)), |
618 | u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)), |
619 | ]) |
620 | } |
621 | } |
622 | #[inline (always)] |
623 | fn shuffle1230(self) -> Self { |
624 | unsafe { |
625 | x2::new([ |
626 | u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)), |
627 | u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)), |
628 | ]) |
629 | } |
630 | } |
631 | } |
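// Without SSSE3 there is no `_mm_alignr_epi8`, so the cross-half shuffles below are built
// from 8-byte shifts of each half OR'd back together.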
632 | impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> { |
633 | #[inline (always)] |
634 | fn shuffle2301(self) -> Self { |
635 | x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)]) |
636 | } |
637 | #[inline (always)] |
638 | fn shuffle3012(self) -> Self { |
639 | unsafe { |
640 | let a = _mm_srli_si128(self.0[0].x, 8); |
641 | let b = _mm_slli_si128(self.0[0].x, 8); |
642 | let c = _mm_srli_si128(self.0[1].x, 8); |
643 | let d = _mm_slli_si128(self.0[1].x, 8); |
644 | let da = _mm_or_si128(d, a); |
645 | let bc = _mm_or_si128(b, c); |
646 | x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)]) |
647 | } |
648 | } |
649 | #[inline (always)] |
650 | fn shuffle1230(self) -> Self { |
651 | unsafe { |
652 | let a = _mm_srli_si128(self.0[0].x, 8); |
653 | let b = _mm_slli_si128(self.0[0].x, 8); |
654 | let c = _mm_srli_si128(self.0[1].x, 8); |
655 | let d = _mm_slli_si128(self.0[1].x, 8); |
656 | let da = _mm_or_si128(d, a); |
657 | let bc = _mm_or_si128(b, c); |
658 | x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)]) |
659 | } |
660 | } |
661 | } |
662 | |
impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}
669 | |
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
692 | impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> { |
693 | #[inline (always)] |
694 | fn extract(self, i: u32) -> u64 { |
695 | unsafe { |
696 | match i { |
697 | 0 => _mm_cvtsi128_si64(self.x) as u64, |
698 | 1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64, |
699 | _ => unreachable!(), |
700 | } |
701 | } |
702 | } |
703 | #[inline (always)] |
704 | fn insert(self, x: u64, i: u32) -> Self { |
705 | Self::new(unsafe { |
706 | match i { |
707 | 0 => _mm_or_si128( |
708 | _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x), |
709 | _mm_cvtsi64_si128(x as i64), |
710 | ), |
711 | 1 => _mm_or_si128( |
712 | _mm_move_epi64(self.x), |
713 | _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8), |
714 | ), |
715 | _ => unreachable!(), |
716 | } |
717 | }) |
718 | } |
719 | } |
720 | |
impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
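// SSE2-only byte swap of each 32-bit word (no `pshufb`): widen the bytes to 16-bit lanes
// with unpack, reverse each group of four lanes with shufflelo/shufflehi, then pack the
// lanes back down to bytes.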
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
    }
}
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}
748 | |
impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}
764 | |
impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        unimplemented!()
    }
}
780 | |
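// `swapi!` swaps adjacent $i-bit groups within every word: the mask $k (0xaa, 0xcc, 0xf0)
// selects the high group of each pair, which is shifted down, while the low group is
// shifted up and re-masked, and the two results are OR'd together.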
781 | macro_rules! swapi { |
782 | ($x:expr, $i:expr, $k:expr) => { |
783 | unsafe { |
784 | const K: u8 = $k; |
785 | let k = _mm_set1_epi8(K as i8); |
786 | u128x1_sse2::new(_mm_or_si128( |
787 | _mm_srli_epi16(_mm_and_si128($x.x, k), $i), |
788 | _mm_and_si128(_mm_slli_epi16($x.x, $i), k), |
789 | )) |
790 | } |
791 | }; |
792 | } |
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
797 | impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> { |
798 | #[inline (always)] |
799 | fn swap1(self) -> Self { |
800 | swapi!(self, 1, 0xaa) |
801 | } |
802 | #[inline (always)] |
803 | fn swap2(self) -> Self { |
804 | swapi!(self, 2, 0xcc) |
805 | } |
806 | #[inline (always)] |
807 | fn swap4(self) -> Self { |
808 | swapi!(self, 4, 0xf0) |
809 | } |
810 | #[inline (always)] |
811 | fn swap8(self) -> Self { |
812 | u128x1_sse2::new(unsafe { |
813 | let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001); |
814 | _mm_shuffle_epi8(self.x, k) |
815 | }) |
816 | } |
817 | #[inline (always)] |
818 | fn swap16(self) -> Self { |
819 | u128x1_sse2::new(unsafe { |
820 | let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302); |
821 | _mm_shuffle_epi8(self.x, k) |
822 | }) |
823 | } |
824 | #[inline (always)] |
825 | fn swap32(self) -> Self { |
826 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) }) |
827 | } |
828 | #[inline (always)] |
829 | fn swap64(self) -> Self { |
830 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) |
831 | } |
832 | } |
833 | impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> { |
834 | #[inline (always)] |
835 | fn swap1(self) -> Self { |
836 | swapi!(self, 1, 0xaa) |
837 | } |
838 | #[inline (always)] |
839 | fn swap2(self) -> Self { |
840 | swapi!(self, 2, 0xcc) |
841 | } |
842 | #[inline (always)] |
843 | fn swap4(self) -> Self { |
844 | swapi!(self, 4, 0xf0) |
845 | } |
846 | #[inline (always)] |
847 | fn swap8(self) -> Self { |
848 | u128x1_sse2::new(unsafe { |
849 | _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8)) |
850 | }) |
851 | } |
852 | #[inline (always)] |
853 | fn swap16(self) -> Self { |
854 | u128x1_sse2::new(swap16_s2(self.x)) |
855 | } |
856 | #[inline (always)] |
857 | fn swap32(self) -> Self { |
858 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) }) |
859 | } |
860 | #[inline (always)] |
861 | fn swap64(self) -> Self { |
862 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) |
863 | } |
864 | } |
865 | |
866 | #[derive (Copy, Clone)] |
867 | pub struct G0; |
868 | #[derive (Copy, Clone)] |
869 | pub struct G1; |
870 | |
871 | #[allow (non_camel_case_types)] |
872 | pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>; |
873 | #[allow (non_camel_case_types)] |
874 | pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>; |
875 | #[allow (non_camel_case_types)] |
876 | pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>; |
877 | #[allow (non_camel_case_types)] |
878 | pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>; |
879 | |
880 | #[allow (non_camel_case_types)] |
881 | pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>; |
882 | #[allow (non_camel_case_types)] |
883 | pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>; |
884 | #[allow (non_camel_case_types)] |
885 | pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>; |
886 | |
887 | impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> { |
888 | #[inline (always)] |
889 | fn to_scalars(self) -> [u32; 16] { |
890 | transmute!(self) |
891 | } |
892 | } |
893 | |
894 | impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI> |
895 | where |
896 | u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap, |
897 | Machine86<S3, S4, NI>: Machine, |
898 | u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>, |
899 | u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
900 | { |
901 | } |
902 | impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI> |
903 | where |
904 | u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
905 | Machine86<S3, S4, NI>: Machine, |
906 | u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>, |
907 | u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>, |
908 | { |
909 | } |
910 | impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI> |
911 | where |
912 | u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
913 | Machine86<S3, S4, NI>: Machine, |
914 | u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4, |
915 | { |
916 | } |
917 | impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI> |
918 | where |
919 | u128x1_sse2<S3, S4, NI>: Swap64 + BSwap, |
920 | Machine86<S3, S4, NI>: Machine, |
921 | u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>, |
922 | u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>, |
923 | u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>, |
924 | u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>, |
925 | u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>, |
926 | { |
927 | } |
928 | |
929 | impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI> |
930 | where |
931 | u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap, |
932 | Avx2Machine<NI>: Machine, |
933 | u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>, |
934 | u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>, |
935 | { |
936 | } |
937 | impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI> |
938 | where |
939 | u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
940 | Avx2Machine<NI>: Machine, |
941 | u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>, |
942 | u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>, |
943 | { |
944 | } |
945 | impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI> |
946 | where |
947 | u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
948 | Avx2Machine<NI>: Machine, |
949 | u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4, |
950 | { |
951 | } |
952 | impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI> |
953 | where |
954 | u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap, |
955 | Avx2Machine<NI>: Machine, |
956 | u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>, |
957 | u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>, |
958 | u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>, |
959 | u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>, |
960 | u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>, |
961 | { |
962 | } |
963 | |
impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        match i {
            0 => self.0[0].extract(0),
            1 => self.0[0].extract(1),
            2 => self.0[1].extract(0),
            3 => self.0[1].extract(1),
            _ => panic!(),
        }
    }
    #[inline(always)]
    fn insert(mut self, w: u64, i: u32) -> Self {
        match i {
            0 => self.0[0] = self.0[0].insert(w, 0),
            1 => self.0[0] = self.0[0].insert(w, 1),
            2 => self.0[1] = self.0[1].insert(w, 0),
            3 => self.0[1] = self.0[1].insert(w, 1),
            _ => panic!(),
        };
        self
    }
}
990 | |
991 | impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI> |
992 | where |
993 | u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap, |
994 | Machine86<S3, S4, NI>: Machine, |
995 | u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>, |
996 | u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
997 | u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
998 | u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>, |
999 | { |
1000 | } |
1001 | impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI> |
1002 | where |
1003 | u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
1004 | Machine86<S3, S4, NI>: Machine, |
1005 | u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>, |
1006 | u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>, |
1007 | { |
1008 | } |
1009 | impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI> |
1010 | where |
1011 | u128x1_sse2<S3, S4, NI>: Swap64 + BSwap, |
1012 | Machine86<S3, S4, NI>: Machine, |
1013 | u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>, |
1014 | u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>, |
1015 | u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>, |
1016 | u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>, |
1017 | { |
1018 | } |
1019 | |
1020 | impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI> |
1021 | where |
1022 | u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
1023 | Avx2Machine<NI>: Machine, |
1024 | u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>, |
1025 | u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>, |
1026 | { |
1027 | } |
1028 | impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI> |
1029 | where |
1030 | u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap, |
1031 | Avx2Machine<NI>: Machine, |
1032 | u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>, |
1033 | u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>, |
1034 | u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>, |
1035 | u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>, |
1036 | { |
1037 | } |
1038 | |
1039 | macro_rules! impl_into_x { |
1040 | ($from:ident, $to:ident) => { |
1041 | impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>> |
1042 | for x2<$to<S3, S4, NI>, Gt> |
1043 | { |
1044 | #[inline(always)] |
1045 | fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self { |
1046 | x2::new([$to::from(x.0[0]), $to::from(x.0[1])]) |
1047 | } |
1048 | } |
1049 | impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> { |
1050 | #[inline(always)] |
1051 | fn from(x: x4<$from<S3, S4, NI>>) -> Self { |
1052 | x4::new([ |
1053 | $to::from(x.0[0]), |
1054 | $to::from(x.0[1]), |
1055 | $to::from(x.0[2]), |
1056 | $to::from(x.0[3]), |
1057 | ]) |
1058 | } |
1059 | } |
1060 | }; |
1061 | } |
1062 | impl_into_x!(u128x1_sse2, u64x2_sse2); |
1063 | impl_into_x!(u128x1_sse2, u32x4_sse2); |
1064 | |
1065 | ///// Debugging |
1066 | |
1067 | use core::fmt::{Debug, Formatter, Result}; |
1068 | |
1069 | impl<W: PartialEq, G> PartialEq for x2<W, G> { |
1070 | #[inline (always)] |
1071 | fn eq(&self, rhs: &Self) -> bool { |
1072 | self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1] |
1073 | } |
1074 | } |
1075 | |
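// 128-bit equality helpers: `eq128_s2` compares 32-bit lanes and requires both 64-bit
// halves of the comparison mask to be all-ones; `eq128_s4` performs the same check
// starting from an SSE4.1 64-bit lane compare.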
#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}

#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}
1090 | |
impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u32; 4]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
    }
}
1106 | |
impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
    }
}
1133 | |
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
1136 | mod test { |
1137 | use super::*; |
1138 | use crate::x86_64::{SSE2, SSE41, SSSE3}; |
1139 | use crate::Machine; |
1140 | |
1141 | #[test ] |
1142 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1143 | fn test_bswap32_s2_vs_s3() { |
1144 | let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100]; |
1145 | let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203]; |
1146 | |
1147 | let s2 = unsafe { SSE2::instance() }; |
1148 | let s3 = unsafe { SSSE3::instance() }; |
1149 | |
1150 | let x_s2 = { |
1151 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1152 | x_s2.bswap() |
1153 | }; |
1154 | |
1155 | let x_s3 = { |
1156 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1157 | x_s3.bswap() |
1158 | }; |
1159 | |
1160 | assert_eq!(x_s2, transmute!(x_s3)); |
1161 | assert_eq!(x_s2, s2.vec(ys)); |
1162 | } |
1163 | |
1164 | #[test ] |
1165 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1166 | fn test_bswap64_s2_vs_s3() { |
1167 | let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100]; |
1168 | let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607]; |
1169 | |
1170 | let s2 = unsafe { SSE2::instance() }; |
1171 | let s3 = unsafe { SSSE3::instance() }; |
1172 | |
1173 | let x_s2 = { |
1174 | let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); |
1175 | x_s2.bswap() |
1176 | }; |
1177 | |
1178 | let x_s3 = { |
1179 | let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs); |
1180 | x_s3.bswap() |
1181 | }; |
1182 | |
1183 | assert_eq!(x_s2, s2.vec(ys)); |
1184 | assert_eq!(x_s3, transmute!(x_s3)); |
1185 | } |
1186 | |
1187 | #[test ] |
1188 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1189 | fn test_shuffle32_s2_vs_s3() { |
1190 | let xs = [0x0, 0x1, 0x2, 0x3]; |
1191 | let ys = [0x2, 0x3, 0x0, 0x1]; |
1192 | let zs = [0x1, 0x2, 0x3, 0x0]; |
1193 | |
1194 | let s2 = unsafe { SSE2::instance() }; |
1195 | let s3 = unsafe { SSSE3::instance() }; |
1196 | |
1197 | let x_s2 = { |
1198 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1199 | x_s2.shuffle2301() |
1200 | }; |
1201 | let x_s3 = { |
1202 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1203 | x_s3.shuffle2301() |
1204 | }; |
1205 | assert_eq!(x_s2, s2.vec(ys)); |
1206 | assert_eq!(x_s3, transmute!(x_s3)); |
1207 | |
1208 | let x_s2 = { |
1209 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1210 | x_s2.shuffle3012() |
1211 | }; |
1212 | let x_s3 = { |
1213 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1214 | x_s3.shuffle3012() |
1215 | }; |
1216 | assert_eq!(x_s2, s2.vec(zs)); |
1217 | assert_eq!(x_s3, transmute!(x_s3)); |
1218 | |
1219 | let x_s2 = x_s2.shuffle1230(); |
1220 | let x_s3 = x_s3.shuffle1230(); |
1221 | assert_eq!(x_s2, s2.vec(xs)); |
1222 | assert_eq!(x_s3, transmute!(x_s3)); |
1223 | } |
1224 | |
1225 | #[test ] |
1226 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1227 | fn test_shuffle64_s2_vs_s3() { |
1228 | let xs = [0x0, 0x1, 0x2, 0x3]; |
1229 | let ys = [0x2, 0x3, 0x0, 0x1]; |
1230 | let zs = [0x1, 0x2, 0x3, 0x0]; |
1231 | |
1232 | let s2 = unsafe { SSE2::instance() }; |
1233 | let s3 = unsafe { SSSE3::instance() }; |
1234 | |
1235 | let x_s2 = { |
1236 | let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs); |
1237 | x_s2.shuffle2301() |
1238 | }; |
1239 | let x_s3 = { |
1240 | let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs); |
1241 | x_s3.shuffle2301() |
1242 | }; |
1243 | assert_eq!(x_s2, s2.vec(ys)); |
1244 | assert_eq!(x_s3, transmute!(x_s3)); |
1245 | |
1246 | let x_s2 = { |
1247 | let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs); |
1248 | x_s2.shuffle3012() |
1249 | }; |
1250 | let x_s3 = { |
1251 | let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs); |
1252 | x_s3.shuffle3012() |
1253 | }; |
1254 | assert_eq!(x_s2, s2.vec(zs)); |
1255 | assert_eq!(x_s3, transmute!(x_s3)); |
1256 | |
1257 | let x_s2 = x_s2.shuffle1230(); |
1258 | let x_s3 = x_s3.shuffle1230(); |
1259 | assert_eq!(x_s2, s2.vec(xs)); |
1260 | assert_eq!(x_s3, transmute!(x_s3)); |
1261 | } |
1262 | |
1263 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1264 | #[test ] |
1265 | fn test_lanes_u32x4() { |
1266 | let xs = [0x1, 0x2, 0x3, 0x4]; |
1267 | |
1268 | let s2 = unsafe { SSE2::instance() }; |
1269 | let s3 = unsafe { SSSE3::instance() }; |
1270 | let s4 = unsafe { SSE41::instance() }; |
1271 | |
1272 | { |
1273 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1274 | let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs); |
1275 | assert_eq!(x_s2, y_s2); |
1276 | assert_eq!(xs, y_s2.to_lanes()); |
1277 | } |
1278 | |
1279 | { |
1280 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1281 | let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs); |
1282 | assert_eq!(x_s3, y_s3); |
1283 | assert_eq!(xs, y_s3.to_lanes()); |
1284 | } |
1285 | |
1286 | { |
1287 | let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs); |
1288 | let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs); |
1289 | assert_eq!(x_s4, y_s4); |
1290 | assert_eq!(xs, y_s4.to_lanes()); |
1291 | } |
1292 | } |
1293 | |
1294 | #[test ] |
1295 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1296 | fn test_lanes_u64x2() { |
1297 | let xs = [0x1, 0x2]; |
1298 | |
1299 | let s2 = unsafe { SSE2::instance() }; |
1300 | let s3 = unsafe { SSSE3::instance() }; |
1301 | let s4 = unsafe { SSE41::instance() }; |
1302 | |
1303 | { |
1304 | let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); |
1305 | let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs); |
1306 | assert_eq!(x_s2, y_s2); |
1307 | assert_eq!(xs, y_s2.to_lanes()); |
1308 | } |
1309 | |
1310 | { |
1311 | let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs); |
1312 | let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs); |
1313 | assert_eq!(x_s3, y_s3); |
1314 | assert_eq!(xs, y_s3.to_lanes()); |
1315 | } |
1316 | |
1317 | { |
1318 | let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs); |
1319 | let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs); |
1320 | assert_eq!(x_s4, y_s4); |
1321 | assert_eq!(xs, y_s4.to_lanes()); |
1322 | } |
1323 | } |
1324 | |
1325 | #[test ] |
1326 | fn test_vec4_u32x4_s2() { |
1327 | let xs = [1, 2, 3, 4]; |
1328 | let s2 = unsafe { SSE2::instance() }; |
1329 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1330 | assert_eq!(x_s2.extract(0), 1); |
1331 | assert_eq!(x_s2.extract(1), 2); |
1332 | assert_eq!(x_s2.extract(2), 3); |
1333 | assert_eq!(x_s2.extract(3), 4); |
1334 | assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4])); |
1335 | assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4])); |
1336 | assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4])); |
1337 | assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf])); |
1338 | } |
1339 | |
1340 | #[test ] |
1341 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1342 | fn test_vec4_u32x4_s4() { |
1343 | let xs = [1, 2, 3, 4]; |
1344 | let s4 = unsafe { SSE41::instance() }; |
1345 | let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs); |
1346 | assert_eq!(x_s4.extract(0), 1); |
1347 | assert_eq!(x_s4.extract(1), 2); |
1348 | assert_eq!(x_s4.extract(2), 3); |
1349 | assert_eq!(x_s4.extract(3), 4); |
1350 | assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4])); |
1351 | assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4])); |
1352 | assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4])); |
1353 | assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf])); |
1354 | } |
1355 | |
1356 | #[test ] |
1357 | fn test_vec2_u64x2_s2() { |
1358 | let xs = [0x1, 0x2]; |
1359 | let s2 = unsafe { SSE2::instance() }; |
1360 | let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); |
1361 | assert_eq!(x_s2.extract(0), 1); |
1362 | assert_eq!(x_s2.extract(1), 2); |
1363 | assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2])); |
1364 | assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf])); |
1365 | } |
1366 | |
1367 | #[test ] |
1368 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1369 | fn test_vec4_u64x2_s4() { |
1370 | let xs = [0x1, 0x2]; |
1371 | let s4 = unsafe { SSE41::instance() }; |
1372 | let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs); |
1373 | assert_eq!(x_s4.extract(0), 1); |
1374 | assert_eq!(x_s4.extract(1), 2); |
1375 | assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2])); |
1376 | assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf])); |
1377 | } |
1378 | } |
1379 | |
1380 | pub mod avx2 { |
1381 | #![allow (non_camel_case_types)] |
1382 | use crate::soft::{x2, x4}; |
1383 | use crate::types::*; |
1384 | use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0}; |
1385 | use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4}; |
1386 | use core::arch::x86_64::*; |
1387 | use core::marker::PhantomData; |
1388 | use core::ops::*; |
1389 | use zerocopy::transmute; |
1390 | |
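    // AVX2 wrappers: a single __m256i holds two of the 128-bit SSE2 vectors above, so these
    // types only exist for the fully featured SSSE3 + SSE4.1 configuration (YesS3, YesS4).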
1391 | zerocopy::cryptocorrosion_derive_traits! { |
1392 | #[repr(transparent)] |
1393 | #[derive (Copy, Clone)] |
1394 | pub struct u32x4x2_avx2<NI> { |
1395 | x: __m256i, |
1396 | ni: PhantomData<NI>, |
1397 | } |
1398 | } |
1399 | |
1400 | impl<NI> u32x4x2_avx2<NI> { |
1401 | #[inline (always)] |
1402 | fn new(x: __m256i) -> Self { |
1403 | Self { x, ni: PhantomData } |
1404 | } |
1405 | } |
1406 | |
1407 | impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {} |
1408 | impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> { |
1409 | #[inline (always)] |
1410 | unsafe fn unpack(p: vec256_storage) -> Self { |
1411 | Self::new(p.avx) |
1412 | } |
1413 | } |
1414 | impl<NI> StoreBytes for u32x4x2_avx2<NI> { |
1415 | #[inline (always)] |
1416 | unsafe fn unsafe_read_le(input: &[u8]) -> Self { |
1417 | assert_eq!(input.len(), 32); |
1418 | Self::new(_mm256_loadu_si256(input.as_ptr() as *const _)) |
1419 | } |
1420 | #[inline (always)] |
1421 | unsafe fn unsafe_read_be(input: &[u8]) -> Self { |
1422 | Self::unsafe_read_le(input).bswap() |
1423 | } |
1424 | #[inline (always)] |
1425 | fn write_le(self, out: &mut [u8]) { |
1426 | unsafe { |
1427 | assert_eq!(out.len(), 32); |
1428 | _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x) |
1429 | } |
1430 | } |
1431 | #[inline (always)] |
1432 | fn write_be(self, out: &mut [u8]) { |
1433 | self.bswap().write_le(out) |
1434 | } |
1435 | } |
1436 | impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> { |
1437 | #[inline (always)] |
1438 | fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] { |
1439 | unsafe { |
1440 | [ |
1441 | u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), |
1442 | u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), |
1443 | ] |
1444 | } |
1445 | } |
1446 | #[inline (always)] |
1447 | fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self { |
1448 | Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) }) |
1449 | } |
1450 | } |
1451 | impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> { |
1452 | #[inline (always)] |
1453 | fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { |
1454 | unsafe { |
1455 | match i { |
1456 | 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), |
1457 | 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), |
1458 | _ => panic!(), |
1459 | } |
1460 | } |
1461 | } |
1462 | #[inline (always)] |
1463 | fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { |
1464 | Self::new(unsafe { |
1465 | match i { |
1466 | 0 => _mm256_inserti128_si256(self.x, w.x, 0), |
1467 | 1 => _mm256_inserti128_si256(self.x, w.x, 1), |
1468 | _ => panic!(), |
1469 | } |
1470 | }) |
1471 | } |
1472 | } |
1473 | impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {} |
1474 | impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {} |
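    // Word rotations for the 256-bit type mirror the 128-bit ones: `shuf_lane_bytes` runs
    // the same pshufb byte mask in both 128-bit lanes of `_mm256_shuffle_epi8`, and
    // `rotr_32` falls back to `(x >> i) | (x << (32 - i))` on 32-bit lanes.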
1475 | macro_rules! shuf_lane_bytes { |
1476 | ($name:ident, $k0:expr, $k1:expr) => { |
1477 | #[inline(always)] |
1478 | fn $name(self) -> Self { |
1479 | Self::new(unsafe { |
1480 | _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1)) |
1481 | }) |
1482 | } |
1483 | }; |
1484 | } |
1485 | macro_rules! rotr_32 { |
1486 | ($name:ident, $i:expr) => { |
1487 | #[inline(always)] |
1488 | fn $name(self) -> Self { |
1489 | Self::new(unsafe { |
1490 | _mm256_or_si256( |
1491 | _mm256_srli_epi32(self.x, $i as i32), |
1492 | _mm256_slli_epi32(self.x, 32 - $i as i32), |
1493 | ) |
1494 | }) |
1495 | } |
1496 | }; |
1497 | } |
1498 | impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> { |
1499 | rotr_32!(rotate_each_word_right7, 7); |
1500 | shuf_lane_bytes!( |
1501 | rotate_each_word_right8, |
1502 | 0x0c0f_0e0d_080b_0a09, |
1503 | 0x0407_0605_0003_0201 |
1504 | ); |
1505 | rotr_32!(rotate_each_word_right11, 11); |
1506 | rotr_32!(rotate_each_word_right12, 12); |
1507 | shuf_lane_bytes!( |
1508 | rotate_each_word_right16, |
1509 | 0x0d0c_0f0e_0908_0b0a, |
1510 | 0x0504_0706_0100_0302 |
1511 | ); |
1512 | rotr_32!(rotate_each_word_right20, 20); |
1513 | shuf_lane_bytes!( |
1514 | rotate_each_word_right24, |
1515 | 0x0e0d_0c0f_0a09_080b, |
1516 | 0x0605_0407_0201_0003 |
1517 | ); |
1518 | rotr_32!(rotate_each_word_right25, 25); |
1519 | } |
1520 | impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {} |
1521 | impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage { |
1522 | #[inline (always)] |
1523 | fn from(x: u32x4x2_avx2<NI>) -> Self { |
1524 | Self { avx: x.x } |
1525 | } |
1526 | } |
1527 | |
1528 | macro_rules! impl_assign { |
1529 | ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => { |
1530 | impl<NI> $Assign for $vec<NI> |
1531 | where |
1532 | NI: Copy, |
1533 | { |
1534 | #[inline(always)] |
1535 | fn $assign_fn(&mut self, rhs: Self) { |
1536 | *self = self.$bin_fn(rhs); |
1537 | } |
1538 | } |
1539 | }; |
1540 | } |
1541 | impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor); |
1542 | impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor); |
1543 | impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand); |
1544 | impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add); |
1545 | |
1546 | macro_rules! impl_bitop { |
1547 | ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => { |
1548 | impl<NI> $Op for $vec<NI> { |
1549 | type Output = Self; |
1550 | #[inline(always)] |
1551 | fn $op_fn(self, rhs: Self) -> Self::Output { |
1552 | Self::new(unsafe { $impl_fn(self.x, rhs.x) }) |
1553 | } |
1554 | } |
1555 | }; |
1556 | } |
1557 | impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256); |
1558 | impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256); |
1559 | impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256); |
1560 | impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256); |
1561 | impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32); |
1562 | |
1563 | impl<NI> Not for u32x4x2_avx2<NI> { |
1564 | type Output = Self; |
        #[inline(always)]
        fn not(self) -> Self::Output {
            unsafe {
                let f = _mm256_set1_epi64x(-1);
                Self::new(f) ^ self
            }
        }
1572 | } |
1573 | |
1574 | impl<NI> BSwap for u32x4x2_avx2<NI> { |
1575 | shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203); |
1576 | } |
1577 | |
1578 | impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI> |
1579 | where |
1580 | NI: Copy, |
1581 | { |
1582 | #[inline (always)] |
1583 | fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self { |
1584 | Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) }) |
1585 | } |
1586 | } |
1587 | |
1588 | impl<NI> LaneWords4 for u32x4x2_avx2<NI> { |
1589 | #[inline (always)] |
1590 | fn shuffle_lane_words1230(self) -> Self { |
1591 | Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) }) |
1592 | } |
1593 | #[inline (always)] |
1594 | fn shuffle_lane_words2301(self) -> Self { |
1595 | Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) }) |
1596 | } |
1597 | #[inline (always)] |
1598 | fn shuffle_lane_words3012(self) -> Self { |
1599 | Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) }) |
1600 | } |
1601 | } |
1602 | |
1603 | /////////////////////////////////////////////////////////////////////////////////////////// |
1604 | |
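    // A 512-bit group is modelled as two 256-bit halves via the generic x2 wrapper; the
    // transpose4 impl further down interleaves 128-bit halves with _mm256_permute2x128_si256.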
1605 | pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>; |
1606 | impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {} |
1607 | |
1608 | impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> { |
1609 | #[inline (always)] |
1610 | unsafe fn unpack(p: vec512_storage) -> Self { |
1611 | Self::new([ |
1612 | u32x4x2_avx2::unpack(p.avx[0]), |
1613 | u32x4x2_avx2::unpack(p.avx[1]), |
1614 | ]) |
1615 | } |
1616 | } |
1617 | impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> { |
1618 | #[inline (always)] |
1619 | fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] { |
1620 | let [a, b] = self.0[0].to_lanes(); |
1621 | let [c, d] = self.0[1].to_lanes(); |
1622 | [a, b, c, d] |
1623 | } |
1624 | #[inline (always)] |
1625 | fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self { |
1626 | let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]); |
1627 | let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]); |
1628 | Self::new([ab, cd]) |
1629 | } |
1630 | } |
1631 | impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { |
1632 | #[inline (always)] |
1633 | fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { |
1634 | match i { |
1635 | 0 => self.0[0].extract(0), |
1636 | 1 => self.0[0].extract(1), |
1637 | 2 => self.0[1].extract(0), |
1638 | 3 => self.0[1].extract(1), |
1639 | _ => panic!(), |
1640 | } |
1641 | } |
1642 | #[inline (always)] |
1643 | fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { |
1644 | Self::new(match i { |
1645 | 0 | 1 => [self.0[0].insert(w, i), self.0[1]], |
1646 | 2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)], |
1647 | _ => panic!(), |
1648 | }) |
1649 | } |
1650 | } |
1651 | impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { |
1652 | #[inline (always)] |
1653 | fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { |
1654 | /* |
1655 | * a00:a01 a10:a11 |
1656 | * b00:b01 b10:b11 |
1657 | * c00:c01 c10:c11 |
1658 | * d00:d01 d10:d11 |
1659 | * => |
1660 | * a00:b00 c00:d00 |
1661 | * a01:b01 c01:d01 |
1662 | * a10:b10 c10:d10 |
1663 | * a11:b11 c11:d11 |
1664 | */ |
1665 | unsafe { |
1666 | let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20)); |
1667 | let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31)); |
1668 | let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20)); |
1669 | let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31)); |
1670 | let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20)); |
1671 | let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31)); |
1672 | let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20)); |
1673 | let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31)); |
1674 | ( |
1675 | Self::new([ab00, cd00]), |
1676 | Self::new([ab01, cd01]), |
1677 | Self::new([ab10, cd10]), |
1678 | Self::new([ab11, cd11]), |
1679 | ) |
1680 | } |
1681 | } |
1682 | } |
1683 | impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> { |
1684 | #[inline (always)] |
1685 | fn to_scalars(self) -> [u32; 16] { |
1686 | transmute!(self) |
1687 | } |
1688 | } |
1689 | impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage { |
1690 | #[inline (always)] |
1691 | fn from(x: u32x4x4_avx2<NI>) -> Self { |
1692 | Self { |
1693 | avx: [ |
1694 | vec256_storage { avx: x.0[0].x }, |
1695 | vec256_storage { avx: x.0[1].x }, |
1696 | ], |
1697 | } |
1698 | } |
1699 | } |
1700 | impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> { |
1701 | #[inline (always)] |
1702 | fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self { |
1703 | Self::new(unsafe { |
1704 | [ |
1705 | u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)), |
1706 | u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)), |
1707 | ] |
1708 | }) |
1709 | } |
1710 | } |
1711 | } |
1712 | |