1 | use crate::soft::{x2, x4}; |
2 | use crate::types::*; |
3 | use crate::vec128_storage; |
4 | use crate::x86_64::Avx2Machine; |
5 | use crate::x86_64::SseMachine as Machine86; |
6 | use crate::x86_64::{NoS3, NoS4, YesS3, YesS4}; |
7 | use core::arch::x86_64::*; |
8 | use core::marker::PhantomData; |
9 | use core::ops::{ |
10 | Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not, |
11 | }; |
12 | use zerocopy::{transmute, AsBytes, FromBytes, FromZeroes}; |
13 | |
14 | macro_rules! impl_binop { |
15 | ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => { |
16 | impl<S3, S4, NI> $trait for $vec<S3, S4, NI> { |
17 | type Output = Self; |
18 | #[inline(always)] |
19 | fn $fn(self, rhs: Self) -> Self::Output { |
20 | Self::new(unsafe { $impl_fn(self.x, rhs.x) }) |
21 | } |
22 | } |
23 | }; |
24 | } |
25 | |
26 | macro_rules! impl_binop_assign { |
27 | ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => { |
28 | impl<S3, S4, NI> $trait for $vec<S3, S4, NI> |
29 | where |
30 | $vec<S3, S4, NI>: Copy, |
31 | { |
32 | #[inline(always)] |
33 | fn $fn_assign(&mut self, rhs: Self) { |
34 | *self = self.$fn(rhs); |
35 | } |
36 | } |
37 | }; |
38 | } |
39 | |
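// def_vec! wraps a single `__m128i` in a newtype. `S3`, `S4` and `NI` are zero-sized marker
// types (`YesS3`/`NoS3`, ...) recording at the type level whether SSSE3 (`S3`), SSE4.1 (`S4`)
// and additional native instructions (`NI`, e.g. AES-NI) may be used.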
40 | macro_rules! def_vec { |
41 | ($vec:ident, $word:ident) => { |
42 | #[allow(non_camel_case_types)] |
43 | #[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)] |
44 | #[repr(transparent)] |
45 | pub struct $vec<S3, S4, NI> { |
46 | x: __m128i, |
47 | s3: PhantomData<S3>, |
48 | s4: PhantomData<S4>, |
49 | ni: PhantomData<NI>, |
50 | } |
51 | |
52 | impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> { |
53 | #[inline(always)] |
54 | unsafe fn unpack(x: vec128_storage) -> Self { |
55 | Self::new(x.sse2) |
56 | } |
57 | } |
58 | impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage { |
59 | #[inline(always)] |
60 | fn from(x: $vec<S3, S4, NI>) -> Self { |
61 | vec128_storage { sse2: x.x } |
62 | } |
63 | } |
64 | impl<S3, S4, NI> $vec<S3, S4, NI> { |
65 | #[inline(always)] |
66 | fn new(x: __m128i) -> Self { |
67 | $vec { |
68 | x, |
69 | s3: PhantomData, |
70 | s4: PhantomData, |
71 | ni: PhantomData, |
72 | } |
73 | } |
74 | } |
75 | |
76 | impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI> |
77 | where |
78 | Self: BSwap, |
79 | { |
80 | #[inline(always)] |
81 | unsafe fn unsafe_read_le(input: &[u8]) -> Self { |
82 | assert_eq!(input.len(), 16); |
83 | Self::new(_mm_loadu_si128(input.as_ptr() as *const _)) |
84 | } |
85 | #[inline(always)] |
86 | unsafe fn unsafe_read_be(input: &[u8]) -> Self { |
87 | assert_eq!(input.len(), 16); |
88 | Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap() |
89 | } |
90 | #[inline(always)] |
91 | fn write_le(self, out: &mut [u8]) { |
92 | assert_eq!(out.len(), 16); |
93 | unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) } |
94 | } |
95 | #[inline(always)] |
96 | fn write_be(self, out: &mut [u8]) { |
97 | assert_eq!(out.len(), 16); |
98 | let x = self.bswap().x; |
99 | unsafe { |
100 | _mm_storeu_si128(out.as_mut_ptr() as *mut _, x); |
101 | } |
102 | } |
103 | } |
104 | |
105 | impl<S3, S4, NI> Default for $vec<S3, S4, NI> { |
106 | #[inline(always)] |
107 | fn default() -> Self { |
108 | Self::new(unsafe { _mm_setzero_si128() }) |
109 | } |
110 | } |
111 | |
112 | impl<S3, S4, NI> Not for $vec<S3, S4, NI> { |
113 | type Output = Self; |
114 | #[inline(always)] |
115 | fn not(self) -> Self::Output { |
116 | unsafe { |
117 | let ff = _mm_set1_epi64x(-1i64); |
118 | self ^ Self::new(ff) |
119 | } |
120 | } |
121 | } |
122 | |
123 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {} |
124 | impl_binop!($vec, BitAnd, bitand, _mm_and_si128); |
125 | impl_binop!($vec, BitOr, bitor, _mm_or_si128); |
126 | impl_binop!($vec, BitXor, bitxor, _mm_xor_si128); |
127 | impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand); |
128 | impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor); |
129 | impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor); |
130 | impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> { |
131 | type Output = Self; |
132 | #[inline(always)] |
133 | fn andnot(self, rhs: Self) -> Self { |
134 | Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) }) |
135 | } |
136 | } |
137 | }; |
138 | } |
139 | |
140 | macro_rules! impl_bitops32 { |
141 | ($vec:ident) => { |
142 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where |
143 | $vec<S3, S4, NI>: RotateEachWord32 |
144 | { |
145 | } |
146 | }; |
147 | } |
148 | |
149 | macro_rules! impl_bitops64 { |
150 | ($vec:ident) => { |
151 | impl_bitops32!($vec); |
152 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where |
153 | $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 |
154 | { |
155 | } |
156 | }; |
157 | } |
158 | |
159 | macro_rules! impl_bitops128 { |
160 | ($vec:ident) => { |
161 | impl_bitops64!($vec); |
162 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where |
163 | $vec<S3, S4, NI>: RotateEachWord128 |
164 | { |
165 | } |
166 | }; |
167 | } |
168 | |
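// Per-lane right rotations for u32x4: amounts that are a whole number of bytes use a single
// SSSE3 byte shuffle (rotr_32_s3); all other amounts use a shift/shift/or sequence (rotr_32).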
169 | macro_rules! rotr_32_s3 { |
170 | ($name:ident, $k0:expr, $k1:expr) => { |
171 | #[inline(always)] |
172 | fn $name(self) -> Self { |
173 | Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) |
174 | } |
175 | }; |
176 | } |
177 | macro_rules! rotr_32 { |
178 | ($name:ident, $i:expr) => { |
179 | #[inline(always)] |
180 | fn $name(self) -> Self { |
181 | Self::new(unsafe { |
182 | _mm_or_si128( |
183 | _mm_srli_epi32(self.x, $i as i32), |
184 | _mm_slli_epi32(self.x, 32 - $i as i32), |
185 | ) |
186 | }) |
187 | } |
188 | }; |
189 | } |
190 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> { |
191 | rotr_32!(rotate_each_word_right7, 7); |
192 | rotr_32_s3!( |
193 | rotate_each_word_right8, |
194 | 0x0c0f_0e0d_080b_0a09, |
195 | 0x0407_0605_0003_0201 |
196 | ); |
197 | rotr_32!(rotate_each_word_right11, 11); |
198 | rotr_32!(rotate_each_word_right12, 12); |
199 | rotr_32_s3!( |
200 | rotate_each_word_right16, |
201 | 0x0d0c_0f0e_0908_0b0a, |
202 | 0x0504_0706_0100_0302 |
203 | ); |
204 | rotr_32!(rotate_each_word_right20, 20); |
205 | rotr_32_s3!( |
206 | rotate_each_word_right24, |
207 | 0x0e0d_0c0f_0a09_080b, |
208 | 0x0605_0407_0201_0003 |
209 | ); |
210 | rotr_32!(rotate_each_word_right25, 25); |
211 | } |
212 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> { |
213 | rotr_32!(rotate_each_word_right7, 7); |
214 | rotr_32!(rotate_each_word_right8, 8); |
215 | rotr_32!(rotate_each_word_right11, 11); |
216 | rotr_32!(rotate_each_word_right12, 12); |
217 | #[inline (always)] |
218 | fn rotate_each_word_right16(self) -> Self { |
219 | Self::new(swap16_s2(self.x)) |
220 | } |
221 | rotr_32!(rotate_each_word_right20, 20); |
222 | rotr_32!(rotate_each_word_right24, 24); |
223 | rotr_32!(rotate_each_word_right25, 25); |
224 | } |
225 | |
226 | macro_rules! rotr_64_s3 { |
227 | ($name:ident, $k0:expr, $k1:expr) => { |
228 | #[inline(always)] |
229 | fn $name(self) -> Self { |
230 | Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) |
231 | } |
232 | }; |
233 | } |
234 | macro_rules! rotr_64 { |
235 | ($name:ident, $i:expr) => { |
236 | #[inline(always)] |
237 | fn $name(self) -> Self { |
238 | Self::new(unsafe { |
239 | _mm_or_si128( |
240 | _mm_srli_epi64(self.x, $i as i32), |
241 | _mm_slli_epi64(self.x, 64 - $i as i32), |
242 | ) |
243 | }) |
244 | } |
245 | }; |
246 | } |
247 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> { |
248 | rotr_64!(rotate_each_word_right7, 7); |
249 | rotr_64_s3!( |
250 | rotate_each_word_right8, |
251 | 0x080f_0e0d_0c0b_0a09, |
252 | 0x0007_0605_0403_0201 |
253 | ); |
254 | rotr_64!(rotate_each_word_right11, 11); |
255 | rotr_64!(rotate_each_word_right12, 12); |
256 | rotr_64_s3!( |
257 | rotate_each_word_right16, |
258 | 0x0908_0f0e_0d0c_0b0a, |
259 | 0x0100_0706_0504_0302 |
260 | ); |
261 | rotr_64!(rotate_each_word_right20, 20); |
262 | rotr_64_s3!( |
263 | rotate_each_word_right24, |
264 | 0x0a09_080f_0e0d_0c0b, |
265 | 0x0201_0007_0605_0403 |
266 | ); |
267 | rotr_64!(rotate_each_word_right25, 25); |
268 | } |
269 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> { |
270 | rotr_64!(rotate_each_word_right7, 7); |
271 | rotr_64!(rotate_each_word_right8, 8); |
272 | rotr_64!(rotate_each_word_right11, 11); |
273 | rotr_64!(rotate_each_word_right12, 12); |
274 | #[inline (always)] |
275 | fn rotate_each_word_right16(self) -> Self { |
276 | Self::new(swap16_s2(self.x)) |
277 | } |
278 | rotr_64!(rotate_each_word_right20, 20); |
279 | rotr_64!(rotate_each_word_right24, 24); |
280 | rotr_64!(rotate_each_word_right25, 25); |
281 | } |
282 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> { |
283 | #[inline (always)] |
284 | fn rotate_each_word_right32(self) -> Self { |
285 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) }) |
286 | } |
287 | } |
288 | |
289 | macro_rules! rotr_128 { |
290 | ($name:ident, $i:expr) => { |
291 | #[inline(always)] |
292 | fn $name(self) -> Self { |
293 | Self::new(unsafe { |
294 | _mm_or_si128( |
295 | _mm_srli_si128(self.x, $i as i32), |
296 | _mm_slli_si128(self.x, 128 - $i as i32), |
297 | ) |
298 | }) |
299 | } |
300 | }; |
301 | } |
302 | // TODO: completely unoptimized |
303 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> { |
304 | rotr_128!(rotate_each_word_right7, 7); |
305 | rotr_128!(rotate_each_word_right8, 8); |
306 | rotr_128!(rotate_each_word_right11, 11); |
307 | rotr_128!(rotate_each_word_right12, 12); |
308 | rotr_128!(rotate_each_word_right16, 16); |
309 | rotr_128!(rotate_each_word_right20, 20); |
310 | rotr_128!(rotate_each_word_right24, 24); |
311 | rotr_128!(rotate_each_word_right25, 25); |
312 | } |
313 | // TODO: completely unoptimized |
314 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> { |
315 | rotr_128!(rotate_each_word_right32, 32); |
316 | } |
317 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {} |
318 | |
319 | def_vec!(u32x4_sse2, u32); |
320 | def_vec!(u64x2_sse2, u64); |
321 | def_vec!(u128x1_sse2, u128); |
322 | |
323 | impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> { |
324 | #[inline (always)] |
325 | fn to_lanes(self) -> [u32; 4] { |
326 | unsafe { |
327 | let x: u64 = _mm_cvtsi128_si64(self.x) as u64; |
328 | let y: u64 = _mm_extract_epi64(self.x, 1) as u64; |
329 | [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32] |
330 | } |
331 | } |
332 | #[inline (always)] |
333 | fn from_lanes(xs: [u32; 4]) -> Self { |
334 | unsafe { |
335 | let mut x: __m128i = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64); |
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
337 | Self::new(x) |
338 | } |
339 | } |
340 | } |
341 | impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> { |
342 | #[inline (always)] |
343 | fn to_lanes(self) -> [u32; 4] { |
344 | unsafe { |
345 | let x: u64 = _mm_cvtsi128_si64(self.x) as u64; |
346 | let y: u64 = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64; |
347 | [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32] |
348 | } |
349 | } |
350 | #[inline (always)] |
351 | fn from_lanes(xs: [u32; 4]) -> Self { |
352 | unsafe { |
353 | let x: i64 = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64; |
354 | let y: i64 = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64; |
355 | let x: __m128i = _mm_cvtsi64_si128(x); |
356 | let y: __m128i = _mm_slli_si128(_mm_cvtsi64_si128(y), 8); |
            Self::new(_mm_or_si128(x, y))
358 | } |
359 | } |
360 | } |
361 | impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> { |
362 | #[inline (always)] |
363 | fn to_lanes(self) -> [u64; 2] { |
364 | unsafe { |
365 | [ |
366 | _mm_cvtsi128_si64(self.x) as u64, |
367 | _mm_extract_epi64(self.x, 1) as u64, |
368 | ] |
369 | } |
370 | } |
371 | #[inline (always)] |
372 | fn from_lanes(xs: [u64; 2]) -> Self { |
373 | unsafe { |
374 | let mut x: __m128i = _mm_cvtsi64_si128(xs[0] as i64); |
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
376 | Self::new(x) |
377 | } |
378 | } |
379 | } |
380 | impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> { |
381 | #[inline (always)] |
382 | fn to_lanes(self) -> [u64; 2] { |
383 | unsafe { |
384 | [ |
385 | _mm_cvtsi128_si64(self.x) as u64, |
386 | _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64, |
387 | ] |
388 | } |
389 | } |
390 | #[inline (always)] |
391 | fn from_lanes(xs: [u64; 2]) -> Self { |
392 | unsafe { |
393 | let x: __m128i = _mm_cvtsi64_si128(xs[0] as i64); |
394 | let y: __m128i = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8); |
            Self::new(_mm_or_si128(x, y))
396 | } |
397 | } |
398 | } |
399 | impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> { |
400 | #[inline (always)] |
401 | fn to_lanes(self) -> [u128; 1] { |
402 | unimplemented!() |
403 | } |
404 | #[inline (always)] |
405 | fn from_lanes(xs: [u128; 1]) -> Self { |
        unimplemented!("{:?}", xs)
407 | } |
408 | } |
409 | |
410 | impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI> |
411 | where |
412 | u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy, |
413 | { |
414 | #[inline (always)] |
415 | fn to_lanes(self) -> [u64; 4] { |
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
417 | [a[0], a[1], b[0], b[1]] |
418 | } |
419 | #[inline (always)] |
420 | fn from_lanes(xs: [u64; 4]) -> Self { |
        let (a, b) = (
422 | u64x2_sse2::from_lanes([xs[0], xs[1]]), |
423 | u64x2_sse2::from_lanes([xs[2], xs[3]]), |
424 | ); |
425 | x2::new([a, b]) |
426 | } |
427 | } |
428 | |
429 | macro_rules! impl_into { |
430 | ($from:ident, $to:ident) => { |
431 | impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> { |
432 | #[inline(always)] |
433 | fn from(x: $from<S3, S4, NI>) -> Self { |
434 | $to::new(x.x) |
435 | } |
436 | } |
437 | }; |
438 | } |
439 | |
440 | impl_into!(u128x1_sse2, u32x4_sse2); |
441 | impl_into!(u128x1_sse2, u64x2_sse2); |
442 | |
443 | impl_bitops32!(u32x4_sse2); |
444 | impl_bitops64!(u64x2_sse2); |
445 | impl_bitops128!(u128x1_sse2); |
446 | |
447 | impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where |
448 | u32x4_sse2<S3, S4, NI>: BSwap |
449 | { |
450 | } |
451 | impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where |
452 | u64x2_sse2<S3, S4, NI>: BSwap |
453 | { |
454 | } |
455 | impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32); |
456 | impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64); |
457 | impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add); |
458 | impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add); |
459 | |
460 | impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI> |
461 | where |
462 | u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>, |
463 | Machine86<S3, S4, NI>: Machine, |
464 | { |
465 | } |
466 | impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI> |
467 | where |
468 | u64x2_sse2<S3, S4, NI>: |
469 | RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>, |
470 | Machine86<S3, S4, NI>: Machine, |
471 | { |
472 | } |
473 | impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI> |
474 | where |
475 | u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap, |
476 | Machine86<S3, S4, NI>: Machine, |
477 | u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
478 | u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>, |
479 | { |
480 | } |
481 | |
482 | impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI> |
483 | where |
484 | u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>, |
485 | Machine86<YesS3, YesS4, NI>: Machine, |
486 | { |
487 | } |
488 | impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI> |
489 | where |
490 | u64x2_sse2<YesS3, YesS4, NI>: |
491 | RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>, |
492 | Machine86<YesS3, YesS4, NI>: Machine, |
493 | { |
494 | } |
495 | impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI> |
496 | where |
497 | u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap, |
498 | Machine86<YesS3, YesS4, NI>: Machine, |
499 | u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>, |
500 | u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>, |
501 | { |
502 | } |
503 | |
504 | impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> { |
505 | #[inline (always)] |
506 | unsafe fn unsafe_from(xs: [u32; 4]) -> Self { |
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
513 | } |
514 | } |
515 | |
516 | impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI> |
517 | where |
518 | Self: MultiLane<[u32; 4]>, |
519 | { |
520 | #[inline (always)] |
521 | fn extract(self, i: u32) -> u32 { |
522 | self.to_lanes()[i as usize] |
523 | } |
524 | #[inline (always)] |
525 | fn insert(self, v: u32, i: u32) -> Self { |
526 | Self::new(unsafe { |
527 | match i { |
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
532 | _ => unreachable!(), |
533 | } |
534 | }) |
535 | } |
536 | } |
537 | impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI> |
538 | where |
539 | Self: MultiLane<[u32; 4]>, |
540 | { |
541 | #[inline (always)] |
542 | fn extract(self, i: u32) -> u32 { |
543 | self.to_lanes()[i as usize] |
544 | } |
545 | #[inline (always)] |
546 | fn insert(self, v: u32, i: u32) -> Self { |
547 | Self::new(unsafe { |
548 | match i { |
549 | 0 => { |
550 | let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x); |
551 | _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)) |
552 | } |
553 | 1 => { |
554 | let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000); |
555 | x = _mm_slli_si128(x, 4); |
556 | x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); |
557 | _mm_shuffle_epi32(x, 0b1110_0001) |
558 | } |
559 | 2 => { |
560 | let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100); |
561 | x = _mm_slli_si128(x, 4); |
562 | x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); |
563 | _mm_shuffle_epi32(x, 0b1100_1001) |
564 | } |
565 | 3 => { |
566 | let mut x = _mm_slli_si128(self.x, 4); |
567 | x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); |
568 | _mm_shuffle_epi32(x, 0b0011_1001) |
569 | } |
570 | _ => unreachable!(), |
571 | } |
572 | }) |
573 | } |
574 | } |
575 | |
576 | impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> { |
577 | #[inline (always)] |
578 | fn shuffle_lane_words2301(self) -> Self { |
579 | self.shuffle2301() |
580 | } |
581 | #[inline (always)] |
582 | fn shuffle_lane_words1230(self) -> Self { |
583 | self.shuffle1230() |
584 | } |
585 | #[inline (always)] |
586 | fn shuffle_lane_words3012(self) -> Self { |
587 | self.shuffle3012() |
588 | } |
589 | } |
590 | |
591 | impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> { |
592 | #[inline (always)] |
593 | fn shuffle2301(self) -> Self { |
594 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) |
595 | } |
596 | #[inline (always)] |
597 | fn shuffle1230(self) -> Self { |
598 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) }) |
599 | } |
600 | #[inline (always)] |
601 | fn shuffle3012(self) -> Self { |
602 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) }) |
603 | } |
604 | } |
605 | |
606 | impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> { |
607 | #[inline (always)] |
608 | fn shuffle2301(self) -> Self { |
609 | x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)]) |
610 | } |
611 | #[inline (always)] |
612 | fn shuffle3012(self) -> Self { |
613 | unsafe { |
614 | x2::new([ |
615 | u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)), |
616 | u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)), |
617 | ]) |
618 | } |
619 | } |
620 | #[inline (always)] |
621 | fn shuffle1230(self) -> Self { |
622 | unsafe { |
623 | x2::new([ |
624 | u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)), |
625 | u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)), |
626 | ]) |
627 | } |
628 | } |
629 | } |
630 | impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> { |
631 | #[inline (always)] |
632 | fn shuffle2301(self) -> Self { |
633 | x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)]) |
634 | } |
635 | #[inline (always)] |
636 | fn shuffle3012(self) -> Self { |
637 | unsafe { |
638 | let a = _mm_srli_si128(self.0[0].x, 8); |
639 | let b = _mm_slli_si128(self.0[0].x, 8); |
640 | let c = _mm_srli_si128(self.0[1].x, 8); |
641 | let d = _mm_slli_si128(self.0[1].x, 8); |
642 | let da = _mm_or_si128(d, a); |
643 | let bc = _mm_or_si128(b, c); |
644 | x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)]) |
645 | } |
646 | } |
647 | #[inline (always)] |
648 | fn shuffle1230(self) -> Self { |
649 | unsafe { |
650 | let a = _mm_srli_si128(self.0[0].x, 8); |
651 | let b = _mm_slli_si128(self.0[0].x, 8); |
652 | let c = _mm_srli_si128(self.0[1].x, 8); |
653 | let d = _mm_slli_si128(self.0[1].x, 8); |
654 | let da = _mm_or_si128(d, a); |
655 | let bc = _mm_or_si128(b, c); |
656 | x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)]) |
657 | } |
658 | } |
659 | } |
660 | |
661 | impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> { |
662 | #[inline (always)] |
663 | unsafe fn unsafe_from(xs: [u64; 2]) -> Self { |
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
665 | } |
666 | } |
667 | |
668 | impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> { |
669 | #[inline (always)] |
670 | fn extract(self, i: u32) -> u64 { |
671 | unsafe { |
672 | match i { |
673 | 0 => _mm_cvtsi128_si64(self.x) as u64, |
674 | 1 => _mm_extract_epi64(self.x, 1) as u64, |
675 | _ => unreachable!(), |
676 | } |
677 | } |
678 | } |
679 | #[inline (always)] |
680 | fn insert(self, x: u64, i: u32) -> Self { |
681 | Self::new(unsafe { |
682 | match i { |
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
685 | _ => unreachable!(), |
686 | } |
687 | }) |
688 | } |
689 | } |
690 | impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> { |
691 | #[inline (always)] |
692 | fn extract(self, i: u32) -> u64 { |
693 | unsafe { |
694 | match i { |
695 | 0 => _mm_cvtsi128_si64(self.x) as u64, |
696 | 1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64, |
697 | _ => unreachable!(), |
698 | } |
699 | } |
700 | } |
701 | #[inline (always)] |
702 | fn insert(self, x: u64, i: u32) -> Self { |
703 | Self::new(unsafe { |
704 | match i { |
705 | 0 => _mm_or_si128( |
706 | _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x), |
707 | _mm_cvtsi64_si128(x as i64), |
708 | ), |
709 | 1 => _mm_or_si128( |
710 | _mm_move_epi64(self.x), |
711 | _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8), |
712 | ), |
713 | _ => unreachable!(), |
714 | } |
715 | }) |
716 | } |
717 | } |
718 | |
719 | impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> { |
720 | #[inline (always)] |
721 | fn bswap(self) -> Self { |
722 | Self::new(unsafe { |
            let k: __m128i = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
725 | }) |
726 | } |
727 | } |
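// SSE2 fallback byte swap: widen each byte to a 16-bit lane, reverse the lanes of every
// 32-bit group with word shuffles, then pack back down to bytes.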
728 | #[inline (always)] |
729 | fn bswap32_s2(x: __m128i) -> __m128i { |
730 | unsafe { |
        let mut y: __m128i = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z: __m128i = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
738 | } |
739 | } |
740 | impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> { |
741 | #[inline (always)] |
742 | fn bswap(self) -> Self { |
743 | Self::new(bswap32_s2(self.x)) |
744 | } |
745 | } |
746 | |
747 | impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> { |
748 | #[inline (always)] |
749 | fn bswap(self) -> Self { |
750 | Self::new(unsafe { |
            let k: __m128i = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
753 | }) |
754 | } |
755 | } |
756 | impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> { |
757 | #[inline (always)] |
758 | fn bswap(self) -> Self { |
759 | Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) }) |
760 | } |
761 | } |
762 | |
763 | impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> { |
764 | #[inline (always)] |
765 | fn bswap(self) -> Self { |
766 | Self::new(unsafe { |
            let k: __m128i = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
            _mm_shuffle_epi8(self.x, k)
769 | }) |
770 | } |
771 | } |
772 | impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> { |
773 | #[inline (always)] |
774 | fn bswap(self) -> Self { |
775 | unimplemented!() |
776 | } |
777 | } |
778 | |
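// swapi! swaps adjacent $i-bit groups within each 16-bit lane: $k masks out the high group
// of each pair, which is shifted down while the low group is shifted up into its place.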
779 | macro_rules! swapi { |
780 | ($x:expr, $i:expr, $k:expr) => { |
781 | unsafe { |
782 | const K: u8 = $k; |
783 | let k = _mm_set1_epi8(K as i8); |
784 | u128x1_sse2::new(_mm_or_si128( |
785 | _mm_srli_epi16(_mm_and_si128($x.x, k), $i), |
786 | _mm_and_si128(_mm_slli_epi16($x.x, $i), k), |
787 | )) |
788 | } |
789 | }; |
790 | } |
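// Swap the two 16-bit halves of each 32-bit lane using only SSE2 word shuffles.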
791 | #[inline (always)] |
792 | fn swap16_s2(x: __m128i) -> __m128i { |
793 | unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) } |
794 | } |
795 | impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> { |
796 | #[inline (always)] |
797 | fn swap1(self) -> Self { |
798 | swapi!(self, 1, 0xaa) |
799 | } |
800 | #[inline (always)] |
801 | fn swap2(self) -> Self { |
802 | swapi!(self, 2, 0xcc) |
803 | } |
804 | #[inline (always)] |
805 | fn swap4(self) -> Self { |
806 | swapi!(self, 4, 0xf0) |
807 | } |
808 | #[inline (always)] |
809 | fn swap8(self) -> Self { |
810 | u128x1_sse2::new(unsafe { |
811 | let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001); |
812 | _mm_shuffle_epi8(self.x, k) |
813 | }) |
814 | } |
815 | #[inline (always)] |
816 | fn swap16(self) -> Self { |
817 | u128x1_sse2::new(unsafe { |
818 | let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302); |
819 | _mm_shuffle_epi8(self.x, k) |
820 | }) |
821 | } |
822 | #[inline (always)] |
823 | fn swap32(self) -> Self { |
824 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) }) |
825 | } |
826 | #[inline (always)] |
827 | fn swap64(self) -> Self { |
828 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) |
829 | } |
830 | } |
831 | impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> { |
832 | #[inline (always)] |
833 | fn swap1(self) -> Self { |
834 | swapi!(self, 1, 0xaa) |
835 | } |
836 | #[inline (always)] |
837 | fn swap2(self) -> Self { |
838 | swapi!(self, 2, 0xcc) |
839 | } |
840 | #[inline (always)] |
841 | fn swap4(self) -> Self { |
842 | swapi!(self, 4, 0xf0) |
843 | } |
844 | #[inline (always)] |
845 | fn swap8(self) -> Self { |
846 | u128x1_sse2::new(unsafe { |
847 | _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8)) |
848 | }) |
849 | } |
850 | #[inline (always)] |
851 | fn swap16(self) -> Self { |
852 | u128x1_sse2::new(swap16_s2(self.x)) |
853 | } |
854 | #[inline (always)] |
855 | fn swap32(self) -> Self { |
856 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) }) |
857 | } |
858 | #[inline (always)] |
859 | fn swap64(self) -> Self { |
860 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) |
861 | } |
862 | } |
863 | |
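// G0 and G1 are phantom tags that keep structurally identical `x2` wrappers (e.g.
// u64x2x2_sse2 vs. u64x4_sse2) distinct at the type level.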
864 | #[derive (Copy, Clone)] |
865 | pub struct G0; |
866 | #[derive (Copy, Clone)] |
867 | pub struct G1; |
868 | |
869 | #[allow (non_camel_case_types)] |
870 | pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>; |
871 | #[allow (non_camel_case_types)] |
872 | pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>; |
873 | #[allow (non_camel_case_types)] |
874 | pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>; |
875 | #[allow (non_camel_case_types)] |
876 | pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>; |
877 | |
878 | #[allow (non_camel_case_types)] |
879 | pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>; |
880 | #[allow (non_camel_case_types)] |
881 | pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>; |
882 | #[allow (non_camel_case_types)] |
883 | pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>; |
884 | |
885 | impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> { |
886 | #[inline (always)] |
887 | fn to_scalars(self) -> [u32; 16] { |
888 | transmute!(self) |
889 | } |
890 | } |
891 | |
892 | impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI> |
893 | where |
894 | u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap, |
895 | Machine86<S3, S4, NI>: Machine, |
896 | u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>, |
897 | u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
898 | { |
899 | } |
900 | impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI> |
901 | where |
902 | u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
903 | Machine86<S3, S4, NI>: Machine, |
904 | u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>, |
905 | u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>, |
906 | { |
907 | } |
908 | impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI> |
909 | where |
910 | u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
911 | Machine86<S3, S4, NI>: Machine, |
912 | u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4, |
913 | { |
914 | } |
915 | impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI> |
916 | where |
917 | u128x1_sse2<S3, S4, NI>: Swap64 + BSwap, |
918 | Machine86<S3, S4, NI>: Machine, |
919 | u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>, |
920 | u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>, |
921 | u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>, |
922 | u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>, |
923 | u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>, |
924 | { |
925 | } |
926 | |
927 | impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI> |
928 | where |
929 | u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap, |
930 | Avx2Machine<NI>: Machine, |
931 | u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>, |
932 | u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>, |
933 | { |
934 | } |
935 | impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI> |
936 | where |
937 | u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
938 | Avx2Machine<NI>: Machine, |
939 | u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>, |
940 | u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>, |
941 | { |
942 | } |
943 | impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI> |
944 | where |
945 | u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
946 | Avx2Machine<NI>: Machine, |
947 | u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4, |
948 | { |
949 | } |
950 | impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI> |
951 | where |
952 | u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap, |
953 | Avx2Machine<NI>: Machine, |
954 | u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>, |
955 | u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>, |
956 | u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>, |
957 | u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>, |
958 | u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>, |
959 | { |
960 | } |
961 | |
962 | impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI> |
963 | where |
964 | u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>, |
965 | { |
966 | #[inline (always)] |
967 | fn extract(self, i: u32) -> u64 { |
968 | match i { |
969 | 0 => self.0[0].extract(0), |
970 | 1 => self.0[0].extract(1), |
971 | 2 => self.0[1].extract(0), |
972 | 3 => self.0[1].extract(1), |
973 | _ => panic!(), |
974 | } |
975 | } |
976 | #[inline (always)] |
977 | fn insert(mut self, w: u64, i: u32) -> Self { |
978 | match i { |
            0 => self.0[0] = self.0[0].insert(w, 0),
            1 => self.0[0] = self.0[0].insert(w, 1),
            2 => self.0[1] = self.0[1].insert(w, 0),
            3 => self.0[1] = self.0[1].insert(w, 1),
983 | _ => panic!(), |
984 | }; |
985 | self |
986 | } |
987 | } |
988 | |
989 | impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI> |
990 | where |
991 | u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap, |
992 | Machine86<S3, S4, NI>: Machine, |
993 | u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>, |
994 | u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
995 | u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
996 | u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>, |
997 | { |
998 | } |
999 | impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI> |
1000 | where |
1001 | u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
1002 | Machine86<S3, S4, NI>: Machine, |
1003 | u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>, |
1004 | u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>, |
1005 | { |
1006 | } |
1007 | impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI> |
1008 | where |
1009 | u128x1_sse2<S3, S4, NI>: Swap64 + BSwap, |
1010 | Machine86<S3, S4, NI>: Machine, |
1011 | u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>, |
1012 | u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>, |
1013 | u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>, |
1014 | u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>, |
1015 | { |
1016 | } |
1017 | |
1018 | impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI> |
1019 | where |
1020 | u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
1021 | Avx2Machine<NI>: Machine, |
1022 | u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>, |
1023 | u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>, |
1024 | { |
1025 | } |
1026 | impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI> |
1027 | where |
1028 | u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap, |
1029 | Avx2Machine<NI>: Machine, |
1030 | u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>, |
1031 | u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>, |
1032 | u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>, |
1033 | u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>, |
1034 | { |
1035 | } |
1036 | |
1037 | macro_rules! impl_into_x { |
1038 | ($from:ident, $to:ident) => { |
1039 | impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>> |
1040 | for x2<$to<S3, S4, NI>, Gt> |
1041 | { |
1042 | #[inline(always)] |
1043 | fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self { |
1044 | x2::new([$to::from(x.0[0]), $to::from(x.0[1])]) |
1045 | } |
1046 | } |
1047 | impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> { |
1048 | #[inline(always)] |
1049 | fn from(x: x4<$from<S3, S4, NI>>) -> Self { |
1050 | x4::new([ |
1051 | $to::from(x.0[0]), |
1052 | $to::from(x.0[1]), |
1053 | $to::from(x.0[2]), |
1054 | $to::from(x.0[3]), |
1055 | ]) |
1056 | } |
1057 | } |
1058 | }; |
1059 | } |
1060 | impl_into_x!(u128x1_sse2, u64x2_sse2); |
1061 | impl_into_x!(u128x1_sse2, u32x4_sse2); |
1062 | |
1063 | ///// Debugging |
1064 | |
1065 | use core::fmt::{Debug, Formatter, Result}; |
1066 | |
1067 | impl<W: PartialEq, G> PartialEq for x2<W, G> { |
1068 | #[inline (always)] |
1069 | fn eq(&self, rhs: &Self) -> bool { |
1070 | self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1] |
1071 | } |
1072 | } |
1073 | |
1074 | #[allow (unused)] |
1075 | #[inline (always)] |
1076 | unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool { |
    let q: __m128i = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
1078 | _mm_cvtsi128_si64(q) == -1 |
1079 | } |
1080 | |
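// SSE2 equality: compare 32-bit lanes, then require both 64-bit halves of the resulting
// mask to be all ones.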
1081 | #[inline (always)] |
1082 | unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool { |
    let q: __m128i = _mm_cmpeq_epi32(x, y);
1084 | let p: i64 = _mm_cvtsi128_si64(_mm_srli_si128(q, 8)); |
1085 | let q: i64 = _mm_cvtsi128_si64(q); |
1086 | (p & q) == -1 |
1087 | } |
1088 | |
1089 | impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> { |
1090 | #[inline (always)] |
1091 | fn eq(&self, rhs: &Self) -> bool { |
        unsafe { eq128_s2(self.x, rhs.x) }
1093 | } |
1094 | } |
1095 | impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI> |
1096 | where |
1097 | Self: Copy + MultiLane<[u32; 4]>, |
1098 | { |
1099 | #[cold ] |
1100 | fn fmt(&self, fmt: &mut Formatter) -> Result { |
        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
1102 | } |
1103 | } |
1104 | |
1105 | impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> { |
1106 | #[inline (always)] |
1107 | fn eq(&self, rhs: &Self) -> bool { |
        unsafe { eq128_s2(self.x, rhs.x) }
1109 | } |
1110 | } |
1111 | impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI> |
1112 | where |
1113 | Self: Copy + MultiLane<[u64; 2]>, |
1114 | { |
1115 | #[cold ] |
1116 | fn fmt(&self, fmt: &mut Formatter) -> Result { |
        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
1118 | } |
1119 | } |
1120 | |
1121 | impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI> |
1122 | where |
1123 | u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>, |
1124 | { |
1125 | #[cold ] |
1126 | fn fmt(&self, fmt: &mut Formatter) -> Result { |
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
1129 | } |
1130 | } |
1131 | |
1132 | #[cfg (test)] |
1133 | #[cfg (target_arch = "x86_64" )] |
1134 | mod test { |
1135 | use super::*; |
1136 | use crate::x86_64::{SSE2, SSE41, SSSE3}; |
1137 | use crate::Machine; |
1138 | |
1139 | #[test ] |
1140 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1141 | fn test_bswap32_s2_vs_s3() { |
1142 | let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100]; |
1143 | let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203]; |
1144 | |
1145 | let s2 = unsafe { SSE2::instance() }; |
1146 | let s3 = unsafe { SSSE3::instance() }; |
1147 | |
1148 | let x_s2 = { |
1149 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1150 | x_s2.bswap() |
1151 | }; |
1152 | |
1153 | let x_s3 = { |
1154 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1155 | x_s3.bswap() |
1156 | }; |
1157 | |
1158 | assert_eq!(x_s2, transmute!(x_s3)); |
1159 | assert_eq!(x_s2, s2.vec(ys)); |
1160 | } |
1161 | |
1162 | #[test ] |
1163 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1164 | fn test_bswap64_s2_vs_s3() { |
1165 | let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100]; |
1166 | let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607]; |
1167 | |
1168 | let s2 = unsafe { SSE2::instance() }; |
1169 | let s3 = unsafe { SSSE3::instance() }; |
1170 | |
1171 | let x_s2 = { |
1172 | let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); |
1173 | x_s2.bswap() |
1174 | }; |
1175 | |
1176 | let x_s3 = { |
1177 | let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs); |
1178 | x_s3.bswap() |
1179 | }; |
1180 | |
1181 | assert_eq!(x_s2, s2.vec(ys)); |
1182 | assert_eq!(x_s3, transmute!(x_s3)); |
1183 | } |
1184 | |
1185 | #[test ] |
1186 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1187 | fn test_shuffle32_s2_vs_s3() { |
1188 | let xs = [0x0, 0x1, 0x2, 0x3]; |
1189 | let ys = [0x2, 0x3, 0x0, 0x1]; |
1190 | let zs = [0x1, 0x2, 0x3, 0x0]; |
1191 | |
1192 | let s2 = unsafe { SSE2::instance() }; |
1193 | let s3 = unsafe { SSSE3::instance() }; |
1194 | |
1195 | let x_s2 = { |
1196 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1197 | x_s2.shuffle2301() |
1198 | }; |
1199 | let x_s3 = { |
1200 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1201 | x_s3.shuffle2301() |
1202 | }; |
1203 | assert_eq!(x_s2, s2.vec(ys)); |
1204 | assert_eq!(x_s3, transmute!(x_s3)); |
1205 | |
1206 | let x_s2 = { |
1207 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1208 | x_s2.shuffle3012() |
1209 | }; |
1210 | let x_s3 = { |
1211 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1212 | x_s3.shuffle3012() |
1213 | }; |
1214 | assert_eq!(x_s2, s2.vec(zs)); |
1215 | assert_eq!(x_s3, transmute!(x_s3)); |
1216 | |
1217 | let x_s2 = x_s2.shuffle1230(); |
1218 | let x_s3 = x_s3.shuffle1230(); |
1219 | assert_eq!(x_s2, s2.vec(xs)); |
1220 | assert_eq!(x_s3, transmute!(x_s3)); |
1221 | } |
1222 | |
1223 | #[test ] |
1224 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1225 | fn test_shuffle64_s2_vs_s3() { |
1226 | let xs = [0x0, 0x1, 0x2, 0x3]; |
1227 | let ys = [0x2, 0x3, 0x0, 0x1]; |
1228 | let zs = [0x1, 0x2, 0x3, 0x0]; |
1229 | |
1230 | let s2 = unsafe { SSE2::instance() }; |
1231 | let s3 = unsafe { SSSE3::instance() }; |
1232 | |
1233 | let x_s2 = { |
1234 | let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs); |
1235 | x_s2.shuffle2301() |
1236 | }; |
1237 | let x_s3 = { |
1238 | let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs); |
1239 | x_s3.shuffle2301() |
1240 | }; |
1241 | assert_eq!(x_s2, s2.vec(ys)); |
1242 | assert_eq!(x_s3, transmute!(x_s3)); |
1243 | |
1244 | let x_s2 = { |
1245 | let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs); |
1246 | x_s2.shuffle3012() |
1247 | }; |
1248 | let x_s3 = { |
1249 | let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs); |
1250 | x_s3.shuffle3012() |
1251 | }; |
1252 | assert_eq!(x_s2, s2.vec(zs)); |
1253 | assert_eq!(x_s3, transmute!(x_s3)); |
1254 | |
1255 | let x_s2 = x_s2.shuffle1230(); |
1256 | let x_s3 = x_s3.shuffle1230(); |
1257 | assert_eq!(x_s2, s2.vec(xs)); |
1258 | assert_eq!(x_s3, transmute!(x_s3)); |
1259 | } |
1260 | |
1261 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1262 | #[test ] |
1263 | fn test_lanes_u32x4() { |
1264 | let xs = [0x1, 0x2, 0x3, 0x4]; |
1265 | |
1266 | let s2 = unsafe { SSE2::instance() }; |
1267 | let s3 = unsafe { SSSE3::instance() }; |
1268 | let s4 = unsafe { SSE41::instance() }; |
1269 | |
1270 | { |
1271 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1272 | let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs); |
1273 | assert_eq!(x_s2, y_s2); |
1274 | assert_eq!(xs, y_s2.to_lanes()); |
1275 | } |
1276 | |
1277 | { |
1278 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1279 | let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs); |
1280 | assert_eq!(x_s3, y_s3); |
1281 | assert_eq!(xs, y_s3.to_lanes()); |
1282 | } |
1283 | |
1284 | { |
1285 | let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs); |
1286 | let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs); |
1287 | assert_eq!(x_s4, y_s4); |
1288 | assert_eq!(xs, y_s4.to_lanes()); |
1289 | } |
1290 | } |
1291 | |
1292 | #[test ] |
1293 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1294 | fn test_lanes_u64x2() { |
1295 | let xs = [0x1, 0x2]; |
1296 | |
1297 | let s2 = unsafe { SSE2::instance() }; |
1298 | let s3 = unsafe { SSSE3::instance() }; |
1299 | let s4 = unsafe { SSE41::instance() }; |
1300 | |
1301 | { |
1302 | let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); |
1303 | let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs); |
1304 | assert_eq!(x_s2, y_s2); |
1305 | assert_eq!(xs, y_s2.to_lanes()); |
1306 | } |
1307 | |
1308 | { |
1309 | let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs); |
1310 | let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs); |
1311 | assert_eq!(x_s3, y_s3); |
1312 | assert_eq!(xs, y_s3.to_lanes()); |
1313 | } |
1314 | |
1315 | { |
1316 | let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs); |
1317 | let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs); |
1318 | assert_eq!(x_s4, y_s4); |
1319 | assert_eq!(xs, y_s4.to_lanes()); |
1320 | } |
1321 | } |
1322 | |
1323 | #[test ] |
1324 | fn test_vec4_u32x4_s2() { |
1325 | let xs = [1, 2, 3, 4]; |
1326 | let s2 = unsafe { SSE2::instance() }; |
1327 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1328 | assert_eq!(x_s2.extract(0), 1); |
1329 | assert_eq!(x_s2.extract(1), 2); |
1330 | assert_eq!(x_s2.extract(2), 3); |
1331 | assert_eq!(x_s2.extract(3), 4); |
1332 | assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4])); |
1333 | assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4])); |
1334 | assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4])); |
1335 | assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf])); |
1336 | } |
1337 | |
1338 | #[test ] |
1339 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1340 | fn test_vec4_u32x4_s4() { |
1341 | let xs = [1, 2, 3, 4]; |
1342 | let s4 = unsafe { SSE41::instance() }; |
1343 | let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs); |
1344 | assert_eq!(x_s4.extract(0), 1); |
1345 | assert_eq!(x_s4.extract(1), 2); |
1346 | assert_eq!(x_s4.extract(2), 3); |
1347 | assert_eq!(x_s4.extract(3), 4); |
1348 | assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4])); |
1349 | assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4])); |
1350 | assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4])); |
1351 | assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf])); |
1352 | } |
1353 | |
1354 | #[test ] |
1355 | fn test_vec2_u64x2_s2() { |
1356 | let xs = [0x1, 0x2]; |
1357 | let s2 = unsafe { SSE2::instance() }; |
1358 | let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); |
1359 | assert_eq!(x_s2.extract(0), 1); |
1360 | assert_eq!(x_s2.extract(1), 2); |
1361 | assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2])); |
1362 | assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf])); |
1363 | } |
1364 | |
1365 | #[test ] |
1366 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1367 | fn test_vec4_u64x2_s4() { |
1368 | let xs = [0x1, 0x2]; |
1369 | let s4 = unsafe { SSE41::instance() }; |
1370 | let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs); |
1371 | assert_eq!(x_s4.extract(0), 1); |
1372 | assert_eq!(x_s4.extract(1), 2); |
1373 | assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2])); |
1374 | assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf])); |
1375 | } |
1376 | } |
1377 | |
1378 | pub mod avx2 { |
1379 | #![allow (non_camel_case_types)] |
1380 | use crate::soft::{x2, x4}; |
1381 | use crate::types::*; |
1382 | use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0}; |
1383 | use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4}; |
1384 | use core::arch::x86_64::*; |
1385 | use core::marker::PhantomData; |
1386 | use core::ops::*; |
1387 | use zerocopy::{transmute, AsBytes, FromBytes, FromZeroes}; |
1388 | |
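    // One AVX2 register holds two of the 128-bit u32x4 lanes defined in the parent module.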
1389 | #[derive (Copy, Clone, FromBytes, AsBytes, FromZeroes)] |
1390 | #[repr (transparent)] |
1391 | pub struct u32x4x2_avx2<NI> { |
1392 | x: __m256i, |
1393 | ni: PhantomData<NI>, |
1394 | } |
1395 | |
1396 | impl<NI> u32x4x2_avx2<NI> { |
1397 | #[inline (always)] |
1398 | fn new(x: __m256i) -> Self { |
1399 | Self { x, ni: PhantomData } |
1400 | } |
1401 | } |
1402 | |
1403 | impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {} |
1404 | impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> { |
1405 | #[inline (always)] |
1406 | unsafe fn unpack(p: vec256_storage) -> Self { |
1407 | Self::new(p.avx) |
1408 | } |
1409 | } |
1410 | impl<NI> StoreBytes for u32x4x2_avx2<NI> { |
1411 | #[inline (always)] |
1412 | unsafe fn unsafe_read_le(input: &[u8]) -> Self { |
1413 | assert_eq!(input.len(), 32); |
1414 | Self::new(_mm256_loadu_si256(input.as_ptr() as *const _)) |
1415 | } |
1416 | #[inline (always)] |
1417 | unsafe fn unsafe_read_be(input: &[u8]) -> Self { |
1418 | Self::unsafe_read_le(input).bswap() |
1419 | } |
1420 | #[inline (always)] |
1421 | fn write_le(self, out: &mut [u8]) { |
1422 | unsafe { |
1423 | assert_eq!(out.len(), 32); |
1424 | _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x) |
1425 | } |
1426 | } |
1427 | #[inline (always)] |
1428 | fn write_be(self, out: &mut [u8]) { |
1429 | self.bswap().write_le(out) |
1430 | } |
1431 | } |
1432 | impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> { |
1433 | #[inline (always)] |
1434 | fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] { |
1435 | unsafe { |
1436 | [ |
1437 | u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), |
1438 | u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), |
1439 | ] |
1440 | } |
1441 | } |
1442 | #[inline (always)] |
1443 | fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self { |
1444 | Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) }) |
1445 | } |
1446 | } |
1447 | impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> { |
1448 | #[inline (always)] |
1449 | fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { |
1450 | unsafe { |
1451 | match i { |
1452 | 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), |
1453 | 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), |
1454 | _ => panic!(), |
1455 | } |
1456 | } |
1457 | } |
1458 | #[inline (always)] |
1459 | fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { |
1460 | Self::new(unsafe { |
1461 | match i { |
1462 | 0 => _mm256_inserti128_si256(self.x, w.x, 0), |
1463 | 1 => _mm256_inserti128_si256(self.x, w.x, 1), |
1464 | _ => panic!(), |
1465 | } |
1466 | }) |
1467 | } |
1468 | } |
1469 | impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {} |
1470 | impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {} |
1471 | macro_rules! shuf_lane_bytes { |
1472 | ($name:ident, $k0:expr, $k1:expr) => { |
1473 | #[inline(always)] |
1474 | fn $name(self) -> Self { |
1475 | Self::new(unsafe { |
1476 | _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1)) |
1477 | }) |
1478 | } |
1479 | }; |
1480 | } |
1481 | macro_rules! rotr_32 { |
1482 | ($name:ident, $i:expr) => { |
1483 | #[inline(always)] |
1484 | fn $name(self) -> Self { |
1485 | Self::new(unsafe { |
1486 | _mm256_or_si256( |
1487 | _mm256_srli_epi32(self.x, $i as i32), |
1488 | _mm256_slli_epi32(self.x, 32 - $i as i32), |
1489 | ) |
1490 | }) |
1491 | } |
1492 | }; |
1493 | } |
1494 | impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> { |
1495 | rotr_32!(rotate_each_word_right7, 7); |
1496 | shuf_lane_bytes!( |
1497 | rotate_each_word_right8, |
1498 | 0x0c0f_0e0d_080b_0a09, |
1499 | 0x0407_0605_0003_0201 |
1500 | ); |
1501 | rotr_32!(rotate_each_word_right11, 11); |
1502 | rotr_32!(rotate_each_word_right12, 12); |
1503 | shuf_lane_bytes!( |
1504 | rotate_each_word_right16, |
1505 | 0x0d0c_0f0e_0908_0b0a, |
1506 | 0x0504_0706_0100_0302 |
1507 | ); |
1508 | rotr_32!(rotate_each_word_right20, 20); |
1509 | shuf_lane_bytes!( |
1510 | rotate_each_word_right24, |
1511 | 0x0e0d_0c0f_0a09_080b, |
1512 | 0x0605_0407_0201_0003 |
1513 | ); |
1514 | rotr_32!(rotate_each_word_right25, 25); |
1515 | } |
1516 | impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {} |
1517 | impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage { |
1518 | #[inline (always)] |
1519 | fn from(x: u32x4x2_avx2<NI>) -> Self { |
1520 | Self { avx: x.x } |
1521 | } |
1522 | } |
1523 | |
1524 | macro_rules! impl_assign { |
1525 | ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => { |
1526 | impl<NI> $Assign for $vec<NI> |
1527 | where |
1528 | NI: Copy, |
1529 | { |
1530 | #[inline(always)] |
1531 | fn $assign_fn(&mut self, rhs: Self) { |
1532 | *self = self.$bin_fn(rhs); |
1533 | } |
1534 | } |
1535 | }; |
1536 | } |
1537 | impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor); |
1538 | impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor); |
1539 | impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand); |
1540 | impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add); |
1541 | |
1542 | macro_rules! impl_bitop { |
1543 | ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => { |
1544 | impl<NI> $Op for $vec<NI> { |
1545 | type Output = Self; |
1546 | #[inline(always)] |
1547 | fn $op_fn(self, rhs: Self) -> Self::Output { |
1548 | Self::new(unsafe { $impl_fn(self.x, rhs.x) }) |
1549 | } |
1550 | } |
1551 | }; |
1552 | } |
1553 | impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256); |
1554 | impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256); |
1555 | impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256); |
1556 | impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256); |
1557 | impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32); |
1558 | |
1559 | impl<NI> Not for u32x4x2_avx2<NI> { |
1560 | type Output = Self; |
1561 | #[inline (always)] |
1562 | fn not(self) -> Self::Output { |
1563 | unsafe { |
1564 | let f = _mm256_set1_epi8(-0x7f); |
1565 | Self::new(f) ^ self |
1566 | } |
1567 | } |
1568 | } |
1569 | |
1570 | impl<NI> BSwap for u32x4x2_avx2<NI> { |
1571 | shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203); |
1572 | } |
1573 | |
1574 | impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI> |
1575 | where |
1576 | NI: Copy, |
1577 | { |
1578 | #[inline (always)] |
1579 | fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self { |
1580 | Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) }) |
1581 | } |
1582 | } |
1583 | |
1584 | impl<NI> LaneWords4 for u32x4x2_avx2<NI> { |
1585 | #[inline (always)] |
1586 | fn shuffle_lane_words1230(self) -> Self { |
1587 | Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) }) |
1588 | } |
1589 | #[inline (always)] |
1590 | fn shuffle_lane_words2301(self) -> Self { |
1591 | Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) }) |
1592 | } |
1593 | #[inline (always)] |
1594 | fn shuffle_lane_words3012(self) -> Self { |
1595 | Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) }) |
1596 | } |
1597 | } |
1598 | |
1599 | /////////////////////////////////////////////////////////////////////////////////////////// |
1600 | |
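    // A 512-bit vector modelled as two 256-bit halves; the generic `x2` wrapper from
    // crate::soft supplies the lane-wise operator impls.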
1601 | pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>; |
1602 | impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {} |
1603 | |
1604 | impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> { |
1605 | #[inline (always)] |
1606 | unsafe fn unpack(p: vec512_storage) -> Self { |
1607 | Self::new([ |
1608 | u32x4x2_avx2::unpack(p.avx[0]), |
1609 | u32x4x2_avx2::unpack(p.avx[1]), |
1610 | ]) |
1611 | } |
1612 | } |
1613 | impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> { |
1614 | #[inline (always)] |
1615 | fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] { |
1616 | let [a, b] = self.0[0].to_lanes(); |
1617 | let [c, d] = self.0[1].to_lanes(); |
1618 | [a, b, c, d] |
1619 | } |
1620 | #[inline (always)] |
1621 | fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self { |
1622 | let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]); |
1623 | let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]); |
1624 | Self::new([ab, cd]) |
1625 | } |
1626 | } |
1627 | impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { |
1628 | #[inline (always)] |
1629 | fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { |
1630 | match i { |
1631 | 0 => self.0[0].extract(0), |
1632 | 1 => self.0[0].extract(1), |
1633 | 2 => self.0[1].extract(0), |
1634 | 3 => self.0[1].extract(1), |
1635 | _ => panic!(), |
1636 | } |
1637 | } |
1638 | #[inline (always)] |
1639 | fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { |
1640 | Self::new(match i { |
1641 | 0 | 1 => [self.0[0].insert(w, i), self.0[1]], |
1642 | 2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)], |
1643 | _ => panic!(), |
1644 | }) |
1645 | } |
1646 | } |
1647 | impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { |
1648 | #[inline (always)] |
1649 | fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { |
1650 | /* |
1651 | * a00:a01 a10:a11 |
1652 | * b00:b01 b10:b11 |
1653 | * c00:c01 c10:c11 |
1654 | * d00:d01 d10:d11 |
1655 | * => |
1656 | * a00:b00 c00:d00 |
1657 | * a01:b01 c01:d01 |
1658 | * a10:b10 c10:d10 |
1659 | * a11:b11 c11:d11 |
1660 | */ |
1661 | unsafe { |
1662 | let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20)); |
1663 | let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31)); |
1664 | let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20)); |
1665 | let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31)); |
1666 | let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20)); |
1667 | let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31)); |
1668 | let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20)); |
1669 | let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31)); |
1670 | ( |
1671 | Self::new([ab00, cd00]), |
1672 | Self::new([ab01, cd01]), |
1673 | Self::new([ab10, cd10]), |
1674 | Self::new([ab11, cd11]), |
1675 | ) |
1676 | } |
1677 | } |
1678 | } |
1679 | impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> { |
1680 | #[inline (always)] |
1681 | fn to_scalars(self) -> [u32; 16] { |
1682 | transmute!(self) |
1683 | } |
1684 | } |
1685 | impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage { |
1686 | #[inline (always)] |
1687 | fn from(x: u32x4x4_avx2<NI>) -> Self { |
1688 | Self { |
1689 | avx: [ |
1690 | vec256_storage { avx: x.0[0].x }, |
1691 | vec256_storage { avx: x.0[1].x }, |
1692 | ], |
1693 | } |
1694 | } |
1695 | } |
1696 | impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> { |
1697 | #[inline (always)] |
1698 | fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self { |
1699 | Self::new(unsafe { |
1700 | [ |
1701 | u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)), |
1702 | u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)), |
1703 | ] |
1704 | }) |
1705 | } |
1706 | } |
1707 | } |
1708 | |