1 | use crate::soft::{x2, x4}; |
2 | use crate::types::*; |
3 | use crate::vec128_storage; |
4 | use crate::x86_64::Avx2Machine; |
5 | use crate::x86_64::SseMachine as Machine86; |
6 | use crate::x86_64::{NoS3, NoS4, YesS3, YesS4}; |
7 | use core::arch::x86_64::*; |
8 | use core::marker::PhantomData; |
9 | use core::ops::{ |
10 | Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not, |
11 | }; |
12 | use zerocopy::transmute; |
13 | |
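// Helper macros: `impl_binop!` wraps a raw SSE intrinsic as a binary operator impl on a
// vector newtype, and `impl_binop_assign!` derives the corresponding `*Assign` operator
// from it. The S3/S4/NI type parameters used throughout are marker types recording which
// instruction-set extensions are available (S3 ~ SSSE3, S4 ~ SSE4.1; NI presumably tracks
// AES-NI), so the same wrapper struct can carry feature information in its type.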
14 | macro_rules! impl_binop { |
15 | ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => { |
16 | impl<S3, S4, NI> $trait for $vec<S3, S4, NI> { |
17 | type Output = Self; |
18 | #[inline(always)] |
19 | fn $fn(self, rhs: Self) -> Self::Output { |
20 | Self::new(unsafe { $impl_fn(self.x, rhs.x) }) |
21 | } |
22 | } |
23 | }; |
24 | } |
25 | |
26 | macro_rules! impl_binop_assign { |
27 | ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => { |
28 | impl<S3, S4, NI> $trait for $vec<S3, S4, NI> |
29 | where |
30 | $vec<S3, S4, NI>: Copy, |
31 | { |
32 | #[inline(always)] |
33 | fn $fn_assign(&mut self, rhs: Self) { |
34 | *self = self.$fn(rhs); |
35 | } |
36 | } |
37 | }; |
38 | } |
39 | |
40 | macro_rules! def_vec { |
41 | ($vec:ident, $word:ident) => { |
42 | zerocopy::cryptocorrosion_derive_traits! { |
43 | #[repr(transparent)] |
44 | #[allow(non_camel_case_types)] |
45 | #[derive(Copy, Clone)] |
46 | pub struct $vec<S3, S4, NI> { |
47 | x: __m128i, |
48 | s3: PhantomData<S3>, |
49 | s4: PhantomData<S4>, |
50 | ni: PhantomData<NI>, |
51 | } |
52 | } |
53 | |
54 | impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> { |
55 | #[inline(always)] |
56 | unsafe fn unpack(x: vec128_storage) -> Self { |
57 | Self::new(x.sse2) |
58 | } |
59 | } |
60 | impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage { |
61 | #[inline(always)] |
62 | fn from(x: $vec<S3, S4, NI>) -> Self { |
63 | vec128_storage { sse2: x.x } |
64 | } |
65 | } |
66 | impl<S3, S4, NI> $vec<S3, S4, NI> { |
67 | #[inline(always)] |
68 | fn new(x: __m128i) -> Self { |
69 | $vec { |
70 | x, |
71 | s3: PhantomData, |
72 | s4: PhantomData, |
73 | ni: PhantomData, |
74 | } |
75 | } |
76 | } |
77 | |
78 | impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI> |
79 | where |
80 | Self: BSwap, |
81 | { |
82 | #[inline(always)] |
83 | unsafe fn unsafe_read_le(input: &[u8]) -> Self { |
84 | assert_eq!(input.len(), 16); |
85 | Self::new(_mm_loadu_si128(input.as_ptr() as *const _)) |
86 | } |
87 | #[inline(always)] |
88 | unsafe fn unsafe_read_be(input: &[u8]) -> Self { |
89 | assert_eq!(input.len(), 16); |
90 | Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap() |
91 | } |
92 | #[inline(always)] |
93 | fn write_le(self, out: &mut [u8]) { |
94 | assert_eq!(out.len(), 16); |
95 | unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) } |
96 | } |
97 | #[inline(always)] |
98 | fn write_be(self, out: &mut [u8]) { |
99 | assert_eq!(out.len(), 16); |
100 | let x = self.bswap().x; |
101 | unsafe { |
102 | _mm_storeu_si128(out.as_mut_ptr() as *mut _, x); |
103 | } |
104 | } |
105 | } |
106 | |
107 | impl<S3, S4, NI> Default for $vec<S3, S4, NI> { |
108 | #[inline(always)] |
109 | fn default() -> Self { |
110 | Self::new(unsafe { _mm_setzero_si128() }) |
111 | } |
112 | } |
113 | |
114 | impl<S3, S4, NI> Not for $vec<S3, S4, NI> { |
115 | type Output = Self; |
116 | #[inline(always)] |
117 | fn not(self) -> Self::Output { |
118 | unsafe { |
119 | let ff = _mm_set1_epi64x(-1i64); |
120 | self ^ Self::new(ff) |
121 | } |
122 | } |
123 | } |
124 | |
125 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {} |
126 | impl_binop!($vec, BitAnd, bitand, _mm_and_si128); |
127 | impl_binop!($vec, BitOr, bitor, _mm_or_si128); |
128 | impl_binop!($vec, BitXor, bitxor, _mm_xor_si128); |
129 | impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand); |
130 | impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor); |
131 | impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor); |
132 | impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> { |
133 | type Output = Self; |
134 | #[inline(always)] |
135 | fn andnot(self, rhs: Self) -> Self { |
136 | Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) }) |
137 | } |
138 | } |
139 | }; |
140 | } |
141 | |
142 | macro_rules! impl_bitops32 { |
143 | ($vec:ident) => { |
144 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where |
145 | $vec<S3, S4, NI>: RotateEachWord32 |
146 | { |
147 | } |
148 | }; |
149 | } |
150 | |
151 | macro_rules! impl_bitops64 { |
152 | ($vec:ident) => { |
153 | impl_bitops32!($vec); |
154 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where |
155 | $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 |
156 | { |
157 | } |
158 | }; |
159 | } |
160 | |
161 | macro_rules! impl_bitops128 { |
162 | ($vec:ident) => { |
163 | impl_bitops64!($vec); |
164 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where |
165 | $vec<S3, S4, NI>: RotateEachWord128 |
166 | { |
167 | } |
168 | }; |
169 | } |
170 | |
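// Two word-rotation strategies follow. With SSSE3 (`rotr_32_s3`), rotations by a whole
// number of bytes (8, 16, 24 bits) are done with a single `_mm_shuffle_epi8` whose control
// mask lists, for each destination byte, which source byte to take; e.g. the
// rotate-right-16 mask moves bytes 2,3 of each 32-bit word down to positions 0,1 and wraps
// bytes 0,1 up to positions 2,3. Without SSSE3 (`rotr_32`), a rotation by `i` is the
// classic `(x >> i) | (x << (32 - i))` built from two shifts and an OR.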
171 | macro_rules! rotr_32_s3 { |
172 | ($name:ident, $k0:expr, $k1:expr) => { |
173 | #[inline(always)] |
174 | fn $name(self) -> Self { |
175 | Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) |
176 | } |
177 | }; |
178 | } |
179 | macro_rules! rotr_32 { |
180 | ($name:ident, $i:expr) => { |
181 | #[inline(always)] |
182 | fn $name(self) -> Self { |
183 | Self::new(unsafe { |
184 | _mm_or_si128( |
185 | _mm_srli_epi32(self.x, $i as i32), |
186 | _mm_slli_epi32(self.x, 32 - $i as i32), |
187 | ) |
188 | }) |
189 | } |
190 | }; |
191 | } |
192 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> { |
193 | rotr_32!(rotate_each_word_right7, 7); |
194 | rotr_32_s3!( |
195 | rotate_each_word_right8, |
196 | 0x0c0f_0e0d_080b_0a09, |
197 | 0x0407_0605_0003_0201 |
198 | ); |
199 | rotr_32!(rotate_each_word_right11, 11); |
200 | rotr_32!(rotate_each_word_right12, 12); |
201 | rotr_32_s3!( |
202 | rotate_each_word_right16, |
203 | 0x0d0c_0f0e_0908_0b0a, |
204 | 0x0504_0706_0100_0302 |
205 | ); |
206 | rotr_32!(rotate_each_word_right20, 20); |
207 | rotr_32_s3!( |
208 | rotate_each_word_right24, |
209 | 0x0e0d_0c0f_0a09_080b, |
210 | 0x0605_0407_0201_0003 |
211 | ); |
212 | rotr_32!(rotate_each_word_right25, 25); |
213 | } |
214 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> { |
215 | rotr_32!(rotate_each_word_right7, 7); |
216 | rotr_32!(rotate_each_word_right8, 8); |
217 | rotr_32!(rotate_each_word_right11, 11); |
218 | rotr_32!(rotate_each_word_right12, 12); |
219 | #[inline (always)] |
220 | fn rotate_each_word_right16(self) -> Self { |
221 | Self::new(swap16_s2(self.x)) |
222 | } |
223 | rotr_32!(rotate_each_word_right20, 20); |
224 | rotr_32!(rotate_each_word_right24, 24); |
225 | rotr_32!(rotate_each_word_right25, 25); |
226 | } |
227 | |
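// The 64-bit variants below mirror the 32-bit ones: byte-granular rotations use a
// `_mm_shuffle_epi8` mask spanning each 64-bit word, everything else uses
// `(x >> i) | (x << (64 - i))` on 64-bit lanes.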
228 | macro_rules! rotr_64_s3 { |
229 | ($name:ident, $k0:expr, $k1:expr) => { |
230 | #[inline(always)] |
231 | fn $name(self) -> Self { |
232 | Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) |
233 | } |
234 | }; |
235 | } |
236 | macro_rules! rotr_64 { |
237 | ($name:ident, $i:expr) => { |
238 | #[inline(always)] |
239 | fn $name(self) -> Self { |
240 | Self::new(unsafe { |
241 | _mm_or_si128( |
242 | _mm_srli_epi64(self.x, $i as i32), |
243 | _mm_slli_epi64(self.x, 64 - $i as i32), |
244 | ) |
245 | }) |
246 | } |
247 | }; |
248 | } |
249 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> { |
250 | rotr_64!(rotate_each_word_right7, 7); |
251 | rotr_64_s3!( |
252 | rotate_each_word_right8, |
253 | 0x080f_0e0d_0c0b_0a09, |
254 | 0x0007_0605_0403_0201 |
255 | ); |
256 | rotr_64!(rotate_each_word_right11, 11); |
257 | rotr_64!(rotate_each_word_right12, 12); |
258 | rotr_64_s3!( |
259 | rotate_each_word_right16, |
260 | 0x0908_0f0e_0d0c_0b0a, |
261 | 0x0100_0706_0504_0302 |
262 | ); |
263 | rotr_64!(rotate_each_word_right20, 20); |
264 | rotr_64_s3!( |
265 | rotate_each_word_right24, |
266 | 0x0a09_080f_0e0d_0c0b, |
267 | 0x0201_0007_0605_0403 |
268 | ); |
269 | rotr_64!(rotate_each_word_right25, 25); |
270 | } |
271 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> { |
272 | rotr_64!(rotate_each_word_right7, 7); |
273 | rotr_64!(rotate_each_word_right8, 8); |
274 | rotr_64!(rotate_each_word_right11, 11); |
275 | rotr_64!(rotate_each_word_right12, 12); |
276 | #[inline (always)] |
277 | fn rotate_each_word_right16(self) -> Self { |
278 | Self::new(swap16_s2(self.x)) |
279 | } |
280 | rotr_64!(rotate_each_word_right20, 20); |
281 | rotr_64!(rotate_each_word_right24, 24); |
282 | rotr_64!(rotate_each_word_right25, 25); |
283 | } |
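// Rotating each 64-bit word by exactly 32 bits is just swapping its two 32-bit halves,
// which `_mm_shuffle_epi32` with control 0b10_11_00_01 does in one instruction.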
284 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> { |
285 | #[inline (always)] |
286 | fn rotate_each_word_right32(self) -> Self { |
287 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) }) |
288 | } |
289 | } |
290 | |
291 | macro_rules! rotr_128 { |
292 | ($name:ident, $i:expr) => { |
293 | #[inline(always)] |
294 | fn $name(self) -> Self { |
295 | Self::new(unsafe { |
296 | _mm_or_si128( |
297 | _mm_srli_si128(self.x, $i as i32), |
298 | _mm_slli_si128(self.x, 128 - $i as i32), |
299 | ) |
300 | }) |
301 | } |
302 | }; |
303 | } |
304 | // TODO: completely unoptimized |
305 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> { |
306 | rotr_128!(rotate_each_word_right7, 7); |
307 | rotr_128!(rotate_each_word_right8, 8); |
308 | rotr_128!(rotate_each_word_right11, 11); |
309 | rotr_128!(rotate_each_word_right12, 12); |
310 | rotr_128!(rotate_each_word_right16, 16); |
311 | rotr_128!(rotate_each_word_right20, 20); |
312 | rotr_128!(rotate_each_word_right24, 24); |
313 | rotr_128!(rotate_each_word_right25, 25); |
314 | } |
315 | // TODO: completely unoptimized |
316 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> { |
317 | rotr_128!(rotate_each_word_right32, 32); |
318 | } |
319 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {} |
320 | |
321 | def_vec!(u32x4_sse2, u32); |
322 | def_vec!(u64x2_sse2, u64); |
323 | def_vec!(u128x1_sse2, u128); |
324 | |
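// Lane packing/unpacking comes in two flavours per type: an SSE4.1 path (`YesS4`) that can
// use `_mm_extract_epi64`/`_mm_insert_epi64`, and a plain-SSE2 path (`NoS4`) that reaches
// the upper lanes with shuffles and byte shifts instead.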
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!("{:?}", xs)
    }
}
411 | |
impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
{
    #[inline(always)]
    fn to_lanes(self) -> [u64; 4] {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        [a[0], a[1], b[0], b[1]]
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 4]) -> Self {
        let (a, b) = (
            u64x2_sse2::from_lanes([xs[0], xs[1]]),
            u64x2_sse2::from_lanes([xs[2], xs[3]]),
        );
        x2::new([a, b])
    }
}
430 | |
431 | macro_rules! impl_into { |
432 | ($from:ident, $to:ident) => { |
433 | impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> { |
434 | #[inline(always)] |
435 | fn from(x: $from<S3, S4, NI>) -> Self { |
436 | $to::new(x.x) |
437 | } |
438 | } |
439 | }; |
440 | } |
441 | |
442 | impl_into!(u128x1_sse2, u32x4_sse2); |
443 | impl_into!(u128x1_sse2, u64x2_sse2); |
444 | |
445 | impl_bitops32!(u32x4_sse2); |
446 | impl_bitops64!(u64x2_sse2); |
447 | impl_bitops128!(u128x1_sse2); |
448 | |
449 | impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where |
450 | u32x4_sse2<S3, S4, NI>: BSwap |
451 | { |
452 | } |
453 | impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where |
454 | u64x2_sse2<S3, S4, NI>: BSwap |
455 | { |
456 | } |
457 | impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32); |
458 | impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64); |
459 | impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add); |
460 | impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add); |
461 | |
462 | impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI> |
463 | where |
464 | u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>, |
465 | Machine86<S3, S4, NI>: Machine, |
466 | { |
467 | } |
468 | impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI> |
469 | where |
470 | u64x2_sse2<S3, S4, NI>: |
471 | RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>, |
472 | Machine86<S3, S4, NI>: Machine, |
473 | { |
474 | } |
475 | impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI> |
476 | where |
477 | u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap, |
478 | Machine86<S3, S4, NI>: Machine, |
479 | u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
480 | u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>, |
481 | { |
482 | } |
483 | |
484 | impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI> |
485 | where |
486 | u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>, |
487 | Machine86<YesS3, YesS4, NI>: Machine, |
488 | { |
489 | } |
490 | impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI> |
491 | where |
492 | u64x2_sse2<YesS3, YesS4, NI>: |
493 | RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>, |
494 | Machine86<YesS3, YesS4, NI>: Machine, |
495 | { |
496 | } |
497 | impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI> |
498 | where |
499 | u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap, |
500 | Machine86<YesS3, YesS4, NI>: Machine, |
501 | u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>, |
502 | u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>, |
503 | { |
504 | } |
505 | |
impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}
517 | |
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
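// Without SSE4.1 there is no `_mm_insert_epi32`, so `insert` below is emulated: for lane 0
// the old word is masked off and the new value OR'd in; for the other lanes the vector is
// shuffled and byte-shifted so the target lane lands in position 0, the value is OR'd in,
// and a final shuffle restores the original word order.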
539 | impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI> |
540 | where |
541 | Self: MultiLane<[u32; 4]>, |
542 | { |
543 | #[inline (always)] |
544 | fn extract(self, i: u32) -> u32 { |
545 | self.to_lanes()[i as usize] |
546 | } |
547 | #[inline (always)] |
548 | fn insert(self, v: u32, i: u32) -> Self { |
549 | Self::new(unsafe { |
550 | match i { |
551 | 0 => { |
552 | let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x); |
553 | _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)) |
554 | } |
555 | 1 => { |
556 | let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000); |
557 | x = _mm_slli_si128(x, 4); |
558 | x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); |
559 | _mm_shuffle_epi32(x, 0b1110_0001) |
560 | } |
561 | 2 => { |
562 | let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100); |
563 | x = _mm_slli_si128(x, 4); |
564 | x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); |
565 | _mm_shuffle_epi32(x, 0b1100_1001) |
566 | } |
567 | 3 => { |
568 | let mut x = _mm_slli_si128(self.x, 4); |
569 | x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); |
570 | _mm_shuffle_epi32(x, 0b0011_1001) |
571 | } |
572 | _ => unreachable!(), |
573 | } |
574 | }) |
575 | } |
576 | } |
577 | |
578 | impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> { |
579 | #[inline (always)] |
580 | fn shuffle_lane_words2301(self) -> Self { |
581 | self.shuffle2301() |
582 | } |
583 | #[inline (always)] |
584 | fn shuffle_lane_words1230(self) -> Self { |
585 | self.shuffle1230() |
586 | } |
587 | #[inline (always)] |
588 | fn shuffle_lane_words3012(self) -> Self { |
589 | self.shuffle3012() |
590 | } |
591 | } |
592 | |
593 | impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> { |
594 | #[inline (always)] |
595 | fn shuffle2301(self) -> Self { |
596 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) |
597 | } |
598 | #[inline (always)] |
599 | fn shuffle1230(self) -> Self { |
600 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) }) |
601 | } |
602 | #[inline (always)] |
603 | fn shuffle3012(self) -> Self { |
604 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) }) |
605 | } |
606 | } |
607 | |
608 | impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> { |
609 | #[inline (always)] |
610 | fn shuffle2301(self) -> Self { |
611 | x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)]) |
612 | } |
613 | #[inline (always)] |
614 | fn shuffle3012(self) -> Self { |
615 | unsafe { |
616 | x2::new([ |
617 | u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)), |
618 | u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)), |
619 | ]) |
620 | } |
621 | } |
622 | #[inline (always)] |
623 | fn shuffle1230(self) -> Self { |
624 | unsafe { |
625 | x2::new([ |
626 | u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)), |
627 | u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)), |
628 | ]) |
629 | } |
630 | } |
631 | } |
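// Without SSSE3 there is no `_mm_alignr_epi8`, so the cross-half shuffles below are built
// from 8-byte shifts of each half OR'd back together.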
632 | impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> { |
633 | #[inline (always)] |
634 | fn shuffle2301(self) -> Self { |
635 | x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)]) |
636 | } |
637 | #[inline (always)] |
638 | fn shuffle3012(self) -> Self { |
639 | unsafe { |
640 | let a = _mm_srli_si128(self.0[0].x, 8); |
641 | let b = _mm_slli_si128(self.0[0].x, 8); |
642 | let c = _mm_srli_si128(self.0[1].x, 8); |
643 | let d = _mm_slli_si128(self.0[1].x, 8); |
644 | let da = _mm_or_si128(d, a); |
645 | let bc = _mm_or_si128(b, c); |
646 | x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)]) |
647 | } |
648 | } |
649 | #[inline (always)] |
650 | fn shuffle1230(self) -> Self { |
651 | unsafe { |
652 | let a = _mm_srli_si128(self.0[0].x, 8); |
653 | let b = _mm_slli_si128(self.0[0].x, 8); |
654 | let c = _mm_srli_si128(self.0[1].x, 8); |
655 | let d = _mm_slli_si128(self.0[1].x, 8); |
656 | let da = _mm_or_si128(d, a); |
657 | let bc = _mm_or_si128(b, c); |
658 | x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)]) |
659 | } |
660 | } |
661 | } |
662 | |
impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}
669 | |
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
692 | impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> { |
693 | #[inline (always)] |
694 | fn extract(self, i: u32) -> u64 { |
695 | unsafe { |
696 | match i { |
697 | 0 => _mm_cvtsi128_si64(self.x) as u64, |
698 | 1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64, |
699 | _ => unreachable!(), |
700 | } |
701 | } |
702 | } |
703 | #[inline (always)] |
704 | fn insert(self, x: u64, i: u32) -> Self { |
705 | Self::new(unsafe { |
706 | match i { |
707 | 0 => _mm_or_si128( |
708 | _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x), |
709 | _mm_cvtsi64_si128(x as i64), |
710 | ), |
711 | 1 => _mm_or_si128( |
712 | _mm_move_epi64(self.x), |
713 | _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8), |
714 | ), |
715 | _ => unreachable!(), |
716 | } |
717 | }) |
718 | } |
719 | } |
720 | |
impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
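// SSE2-only byte swap of each 32-bit word (no `pshufb`): widen the bytes to 16-bit lanes
// with unpack, reverse each group of four lanes with shufflelo/shufflehi, then pack the
// lanes back down to bytes.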
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
    }
}
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}
748 | |
impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}
764 | |
impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        unimplemented!()
    }
}
780 | |
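// `swapi!` swaps adjacent $i-bit groups within every word: the mask $k (0xaa, 0xcc, 0xf0)
// selects the high group of each pair, which is shifted down, while the low group is
// shifted up and re-masked, and the two results are OR'd together.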
781 | macro_rules! swapi { |
782 | ($x:expr, $i:expr, $k:expr) => { |
783 | unsafe { |
784 | const K: u8 = $k; |
785 | let k = _mm_set1_epi8(K as i8); |
786 | u128x1_sse2::new(_mm_or_si128( |
787 | _mm_srli_epi16(_mm_and_si128($x.x, k), $i), |
788 | _mm_and_si128(_mm_slli_epi16($x.x, $i), k), |
789 | )) |
790 | } |
791 | }; |
792 | } |
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
797 | impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> { |
798 | #[inline (always)] |
799 | fn swap1(self) -> Self { |
800 | swapi!(self, 1, 0xaa) |
801 | } |
802 | #[inline (always)] |
803 | fn swap2(self) -> Self { |
804 | swapi!(self, 2, 0xcc) |
805 | } |
806 | #[inline (always)] |
807 | fn swap4(self) -> Self { |
808 | swapi!(self, 4, 0xf0) |
809 | } |
810 | #[inline (always)] |
811 | fn swap8(self) -> Self { |
812 | u128x1_sse2::new(unsafe { |
813 | let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001); |
814 | _mm_shuffle_epi8(self.x, k) |
815 | }) |
816 | } |
817 | #[inline (always)] |
818 | fn swap16(self) -> Self { |
819 | u128x1_sse2::new(unsafe { |
820 | let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302); |
821 | _mm_shuffle_epi8(self.x, k) |
822 | }) |
823 | } |
824 | #[inline (always)] |
825 | fn swap32(self) -> Self { |
826 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) }) |
827 | } |
828 | #[inline (always)] |
829 | fn swap64(self) -> Self { |
830 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) |
831 | } |
832 | } |
833 | impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> { |
834 | #[inline (always)] |
835 | fn swap1(self) -> Self { |
836 | swapi!(self, 1, 0xaa) |
837 | } |
838 | #[inline (always)] |
839 | fn swap2(self) -> Self { |
840 | swapi!(self, 2, 0xcc) |
841 | } |
842 | #[inline (always)] |
843 | fn swap4(self) -> Self { |
844 | swapi!(self, 4, 0xf0) |
845 | } |
846 | #[inline (always)] |
847 | fn swap8(self) -> Self { |
848 | u128x1_sse2::new(unsafe { |
849 | _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8)) |
850 | }) |
851 | } |
852 | #[inline (always)] |
853 | fn swap16(self) -> Self { |
854 | u128x1_sse2::new(swap16_s2(self.x)) |
855 | } |
856 | #[inline (always)] |
857 | fn swap32(self) -> Self { |
858 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) }) |
859 | } |
860 | #[inline (always)] |
861 | fn swap64(self) -> Self { |
862 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) |
863 | } |
864 | } |
865 | |
866 | #[derive (Copy, Clone)] |
867 | pub struct G0; |
868 | #[derive (Copy, Clone)] |
869 | pub struct G1; |
870 | |
871 | #[allow (non_camel_case_types)] |
872 | pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>; |
873 | #[allow (non_camel_case_types)] |
874 | pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>; |
875 | #[allow (non_camel_case_types)] |
876 | pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>; |
877 | #[allow (non_camel_case_types)] |
878 | pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>; |
879 | |
880 | #[allow (non_camel_case_types)] |
881 | pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>; |
882 | #[allow (non_camel_case_types)] |
883 | pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>; |
884 | #[allow (non_camel_case_types)] |
885 | pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>; |
886 | |
887 | impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> { |
888 | #[inline (always)] |
889 | fn to_scalars(self) -> [u32; 16] { |
890 | transmute!(self) |
891 | } |
892 | } |
893 | |
894 | impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI> |
895 | where |
896 | u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap, |
897 | Machine86<S3, S4, NI>: Machine, |
898 | u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>, |
899 | u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
900 | { |
901 | } |
902 | impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI> |
903 | where |
904 | u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
905 | Machine86<S3, S4, NI>: Machine, |
906 | u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>, |
907 | u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>, |
908 | { |
909 | } |
910 | impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI> |
911 | where |
912 | u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
913 | Machine86<S3, S4, NI>: Machine, |
914 | u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4, |
915 | { |
916 | } |
917 | impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI> |
918 | where |
919 | u128x1_sse2<S3, S4, NI>: Swap64 + BSwap, |
920 | Machine86<S3, S4, NI>: Machine, |
921 | u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>, |
922 | u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>, |
923 | u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>, |
924 | u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>, |
925 | u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>, |
926 | { |
927 | } |
928 | |
929 | impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI> |
930 | where |
931 | u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap, |
932 | Avx2Machine<NI>: Machine, |
933 | u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>, |
934 | u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>, |
935 | { |
936 | } |
937 | impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI> |
938 | where |
939 | u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
940 | Avx2Machine<NI>: Machine, |
941 | u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>, |
942 | u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>, |
943 | { |
944 | } |
945 | impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI> |
946 | where |
947 | u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
948 | Avx2Machine<NI>: Machine, |
949 | u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4, |
950 | { |
951 | } |
952 | impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI> |
953 | where |
954 | u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap, |
955 | Avx2Machine<NI>: Machine, |
956 | u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>, |
957 | u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>, |
958 | u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>, |
959 | u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>, |
960 | u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>, |
961 | { |
962 | } |
963 | |
impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        match i {
            0 => self.0[0].extract(0),
            1 => self.0[0].extract(1),
            2 => self.0[1].extract(0),
            3 => self.0[1].extract(1),
            _ => panic!(),
        }
    }
    #[inline(always)]
    fn insert(mut self, w: u64, i: u32) -> Self {
        match i {
            0 => self.0[0] = self.0[0].insert(w, 0),
            1 => self.0[0] = self.0[0].insert(w, 1),
            2 => self.0[1] = self.0[1].insert(w, 0),
            3 => self.0[1] = self.0[1].insert(w, 1),
            _ => panic!(),
        };
        self
    }
}
990 | |
991 | impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI> |
992 | where |
993 | u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap, |
994 | Machine86<S3, S4, NI>: Machine, |
995 | u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>, |
996 | u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
997 | u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
998 | u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>, |
999 | { |
1000 | } |
1001 | impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI> |
1002 | where |
1003 | u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
1004 | Machine86<S3, S4, NI>: Machine, |
1005 | u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>, |
1006 | u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>, |
1007 | { |
1008 | } |
1009 | impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI> |
1010 | where |
1011 | u128x1_sse2<S3, S4, NI>: Swap64 + BSwap, |
1012 | Machine86<S3, S4, NI>: Machine, |
1013 | u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>, |
1014 | u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>, |
1015 | u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>, |
1016 | u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>, |
1017 | { |
1018 | } |
1019 | |
1020 | impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI> |
1021 | where |
1022 | u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
1023 | Avx2Machine<NI>: Machine, |
1024 | u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>, |
1025 | u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>, |
1026 | { |
1027 | } |
1028 | impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI> |
1029 | where |
1030 | u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap, |
1031 | Avx2Machine<NI>: Machine, |
1032 | u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>, |
1033 | u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>, |
1034 | u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>, |
1035 | u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>, |
1036 | { |
1037 | } |
1038 | |
1039 | macro_rules! impl_into_x { |
1040 | ($from:ident, $to:ident) => { |
1041 | impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>> |
1042 | for x2<$to<S3, S4, NI>, Gt> |
1043 | { |
1044 | #[inline(always)] |
1045 | fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self { |
1046 | x2::new([$to::from(x.0[0]), $to::from(x.0[1])]) |
1047 | } |
1048 | } |
1049 | impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> { |
1050 | #[inline(always)] |
1051 | fn from(x: x4<$from<S3, S4, NI>>) -> Self { |
1052 | x4::new([ |
1053 | $to::from(x.0[0]), |
1054 | $to::from(x.0[1]), |
1055 | $to::from(x.0[2]), |
1056 | $to::from(x.0[3]), |
1057 | ]) |
1058 | } |
1059 | } |
1060 | }; |
1061 | } |
1062 | impl_into_x!(u128x1_sse2, u64x2_sse2); |
1063 | impl_into_x!(u128x1_sse2, u32x4_sse2); |
1064 | |
1065 | ///// Debugging |
1066 | |
1067 | use core::fmt::{Debug, Formatter, Result}; |
1068 | |
1069 | impl<W: PartialEq, G> PartialEq for x2<W, G> { |
1070 | #[inline (always)] |
1071 | fn eq(&self, rhs: &Self) -> bool { |
1072 | self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1] |
1073 | } |
1074 | } |
1075 | |
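// 128-bit equality helpers: `eq128_s2` compares 32-bit lanes and requires both 64-bit
// halves of the comparison mask to be all-ones; `eq128_s4` performs the same check
// starting from an SSE4.1 64-bit lane compare.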
#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}

#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}
1090 | |
impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u32; 4]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
    }
}
1106 | |
impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
    }
}
1133 | |
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
1136 | mod test { |
1137 | use super::*; |
1138 | use crate::x86_64::{SSE2, SSE41, SSSE3}; |
1139 | use crate::Machine; |
1140 | |
1141 | #[test ] |
1142 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1143 | fn test_bswap32_s2_vs_s3() { |
1144 | let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100]; |
1145 | let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203]; |
1146 | |
1147 | let s2 = unsafe { SSE2::instance() }; |
1148 | let s3 = unsafe { SSSE3::instance() }; |
1149 | |
1150 | let x_s2 = { |
1151 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1152 | x_s2.bswap() |
1153 | }; |
1154 | |
1155 | let x_s3 = { |
1156 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1157 | x_s3.bswap() |
1158 | }; |
1159 | |
1160 | assert_eq!(x_s2, transmute!(x_s3)); |
1161 | assert_eq!(x_s2, s2.vec(ys)); |
1162 | } |
1163 | |
1164 | #[test ] |
1165 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1166 | fn test_bswap64_s2_vs_s3() { |
1167 | let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100]; |
1168 | let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607]; |
1169 | |
1170 | let s2 = unsafe { SSE2::instance() }; |
1171 | let s3 = unsafe { SSSE3::instance() }; |
1172 | |
1173 | let x_s2 = { |
1174 | let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); |
1175 | x_s2.bswap() |
1176 | }; |
1177 | |
1178 | let x_s3 = { |
1179 | let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs); |
1180 | x_s3.bswap() |
1181 | }; |
1182 | |
1183 | assert_eq!(x_s2, s2.vec(ys)); |
1184 | assert_eq!(x_s3, transmute!(x_s3)); |
1185 | } |
1186 | |
1187 | #[test ] |
1188 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1189 | fn test_shuffle32_s2_vs_s3() { |
1190 | let xs = [0x0, 0x1, 0x2, 0x3]; |
1191 | let ys = [0x2, 0x3, 0x0, 0x1]; |
1192 | let zs = [0x1, 0x2, 0x3, 0x0]; |
1193 | |
1194 | let s2 = unsafe { SSE2::instance() }; |
1195 | let s3 = unsafe { SSSE3::instance() }; |
1196 | |
1197 | let x_s2 = { |
1198 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1199 | x_s2.shuffle2301() |
1200 | }; |
1201 | let x_s3 = { |
1202 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1203 | x_s3.shuffle2301() |
1204 | }; |
1205 | assert_eq!(x_s2, s2.vec(ys)); |
1206 | assert_eq!(x_s3, transmute!(x_s3)); |
1207 | |
1208 | let x_s2 = { |
1209 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1210 | x_s2.shuffle3012() |
1211 | }; |
1212 | let x_s3 = { |
1213 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1214 | x_s3.shuffle3012() |
1215 | }; |
1216 | assert_eq!(x_s2, s2.vec(zs)); |
1217 | assert_eq!(x_s3, transmute!(x_s3)); |
1218 | |
1219 | let x_s2 = x_s2.shuffle1230(); |
1220 | let x_s3 = x_s3.shuffle1230(); |
1221 | assert_eq!(x_s2, s2.vec(xs)); |
1222 | assert_eq!(x_s3, transmute!(x_s3)); |
1223 | } |
1224 | |
1225 | #[test ] |
1226 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1227 | fn test_shuffle64_s2_vs_s3() { |
1228 | let xs = [0x0, 0x1, 0x2, 0x3]; |
1229 | let ys = [0x2, 0x3, 0x0, 0x1]; |
1230 | let zs = [0x1, 0x2, 0x3, 0x0]; |
1231 | |
1232 | let s2 = unsafe { SSE2::instance() }; |
1233 | let s3 = unsafe { SSSE3::instance() }; |
1234 | |
1235 | let x_s2 = { |
1236 | let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs); |
1237 | x_s2.shuffle2301() |
1238 | }; |
1239 | let x_s3 = { |
1240 | let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs); |
1241 | x_s3.shuffle2301() |
1242 | }; |
1243 | assert_eq!(x_s2, s2.vec(ys)); |
1244 | assert_eq!(x_s3, transmute!(x_s3)); |
1245 | |
1246 | let x_s2 = { |
1247 | let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs); |
1248 | x_s2.shuffle3012() |
1249 | }; |
1250 | let x_s3 = { |
1251 | let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs); |
1252 | x_s3.shuffle3012() |
1253 | }; |
1254 | assert_eq!(x_s2, s2.vec(zs)); |
1255 | assert_eq!(x_s3, transmute!(x_s3)); |
1256 | |
1257 | let x_s2 = x_s2.shuffle1230(); |
1258 | let x_s3 = x_s3.shuffle1230(); |
1259 | assert_eq!(x_s2, s2.vec(xs)); |
1260 | assert_eq!(x_s3, transmute!(x_s3)); |
1261 | } |
1262 | |
1263 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1264 | #[test ] |
1265 | fn test_lanes_u32x4() { |
1266 | let xs = [0x1, 0x2, 0x3, 0x4]; |
1267 | |
1268 | let s2 = unsafe { SSE2::instance() }; |
1269 | let s3 = unsafe { SSSE3::instance() }; |
1270 | let s4 = unsafe { SSE41::instance() }; |
1271 | |
1272 | { |
1273 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1274 | let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs); |
1275 | assert_eq!(x_s2, y_s2); |
1276 | assert_eq!(xs, y_s2.to_lanes()); |
1277 | } |
1278 | |
1279 | { |
1280 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1281 | let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs); |
1282 | assert_eq!(x_s3, y_s3); |
1283 | assert_eq!(xs, y_s3.to_lanes()); |
1284 | } |
1285 | |
1286 | { |
1287 | let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs); |
1288 | let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs); |
1289 | assert_eq!(x_s4, y_s4); |
1290 | assert_eq!(xs, y_s4.to_lanes()); |
1291 | } |
1292 | } |
1293 | |
1294 | #[test ] |
1295 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1296 | fn test_lanes_u64x2() { |
1297 | let xs = [0x1, 0x2]; |
1298 | |
1299 | let s2 = unsafe { SSE2::instance() }; |
1300 | let s3 = unsafe { SSSE3::instance() }; |
1301 | let s4 = unsafe { SSE41::instance() }; |
1302 | |
1303 | { |
1304 | let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); |
1305 | let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs); |
1306 | assert_eq!(x_s2, y_s2); |
1307 | assert_eq!(xs, y_s2.to_lanes()); |
1308 | } |
1309 | |
1310 | { |
1311 | let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs); |
1312 | let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs); |
1313 | assert_eq!(x_s3, y_s3); |
1314 | assert_eq!(xs, y_s3.to_lanes()); |
1315 | } |
1316 | |
1317 | { |
1318 | let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs); |
1319 | let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs); |
1320 | assert_eq!(x_s4, y_s4); |
1321 | assert_eq!(xs, y_s4.to_lanes()); |
1322 | } |
1323 | } |
1324 | |
1325 | #[test ] |
1326 | fn test_vec4_u32x4_s2() { |
1327 | let xs = [1, 2, 3, 4]; |
1328 | let s2 = unsafe { SSE2::instance() }; |
1329 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1330 | assert_eq!(x_s2.extract(0), 1); |
1331 | assert_eq!(x_s2.extract(1), 2); |
1332 | assert_eq!(x_s2.extract(2), 3); |
1333 | assert_eq!(x_s2.extract(3), 4); |
1334 | assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4])); |
1335 | assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4])); |
1336 | assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4])); |
1337 | assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf])); |
1338 | } |
1339 | |
1340 | #[test ] |
1341 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1342 | fn test_vec4_u32x4_s4() { |
1343 | let xs = [1, 2, 3, 4]; |
1344 | let s4 = unsafe { SSE41::instance() }; |
1345 | let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs); |
1346 | assert_eq!(x_s4.extract(0), 1); |
1347 | assert_eq!(x_s4.extract(1), 2); |
1348 | assert_eq!(x_s4.extract(2), 3); |
1349 | assert_eq!(x_s4.extract(3), 4); |
1350 | assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4])); |
1351 | assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4])); |
1352 | assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4])); |
1353 | assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf])); |
1354 | } |
1355 | |
1356 | #[test ] |
1357 | fn test_vec2_u64x2_s2() { |
1358 | let xs = [0x1, 0x2]; |
1359 | let s2 = unsafe { SSE2::instance() }; |
1360 | let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); |
1361 | assert_eq!(x_s2.extract(0), 1); |
1362 | assert_eq!(x_s2.extract(1), 2); |
1363 | assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2])); |
1364 | assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf])); |
1365 | } |
1366 | |
1367 | #[test ] |
1368 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1369 | fn test_vec4_u64x2_s4() { |
1370 | let xs = [0x1, 0x2]; |
1371 | let s4 = unsafe { SSE41::instance() }; |
1372 | let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs); |
1373 | assert_eq!(x_s4.extract(0), 1); |
1374 | assert_eq!(x_s4.extract(1), 2); |
1375 | assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2])); |
1376 | assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf])); |
1377 | } |
1378 | } |
1379 | |
1380 | pub mod avx2 { |
1381 | #![allow (non_camel_case_types)] |
1382 | use crate::soft::{x2, x4}; |
1383 | use crate::types::*; |
1384 | use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0}; |
1385 | use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4}; |
1386 | use core::arch::x86_64::*; |
1387 | use core::marker::PhantomData; |
1388 | use core::ops::*; |
1389 | use zerocopy::transmute; |
1390 | |
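    // AVX2 wrappers: a single __m256i holds two of the 128-bit SSE2 vectors above, so these
    // types only exist for the fully featured SSSE3 + SSE4.1 configuration (YesS3, YesS4).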
1391 | zerocopy::cryptocorrosion_derive_traits! { |
1392 | #[repr(transparent)] |
1393 | #[derive (Copy, Clone)] |
1394 | pub struct u32x4x2_avx2<NI> { |
1395 | x: __m256i, |
1396 | ni: PhantomData<NI>, |
1397 | } |
1398 | } |
1399 | |
1400 | impl<NI> u32x4x2_avx2<NI> { |
1401 | #[inline (always)] |
1402 | fn new(x: __m256i) -> Self { |
1403 | Self { x, ni: PhantomData } |
1404 | } |
1405 | } |
1406 | |
1407 | impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {} |
1408 | impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> { |
1409 | #[inline (always)] |
1410 | unsafe fn unpack(p: vec256_storage) -> Self { |
1411 | Self::new(p.avx) |
1412 | } |
1413 | } |
1414 | impl<NI> StoreBytes for u32x4x2_avx2<NI> { |
1415 | #[inline (always)] |
1416 | unsafe fn unsafe_read_le(input: &[u8]) -> Self { |
1417 | assert_eq!(input.len(), 32); |
1418 | Self::new(_mm256_loadu_si256(input.as_ptr() as *const _)) |
1419 | } |
1420 | #[inline (always)] |
1421 | unsafe fn unsafe_read_be(input: &[u8]) -> Self { |
1422 | Self::unsafe_read_le(input).bswap() |
1423 | } |
1424 | #[inline (always)] |
1425 | fn write_le(self, out: &mut [u8]) { |
1426 | unsafe { |
1427 | assert_eq!(out.len(), 32); |
1428 | _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x) |
1429 | } |
1430 | } |
1431 | #[inline (always)] |
1432 | fn write_be(self, out: &mut [u8]) { |
1433 | self.bswap().write_le(out) |
1434 | } |
1435 | } |
1436 | impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> { |
1437 | #[inline (always)] |
1438 | fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] { |
1439 | unsafe { |
1440 | [ |
1441 | u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), |
1442 | u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), |
1443 | ] |
1444 | } |
1445 | } |
1446 | #[inline (always)] |
1447 | fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self { |
1448 | Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) }) |
1449 | } |
1450 | } |
1451 | impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> { |
1452 | #[inline (always)] |
1453 | fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { |
1454 | unsafe { |
1455 | match i { |
1456 | 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), |
1457 | 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), |
1458 | _ => panic!(), |
1459 | } |
1460 | } |
1461 | } |
1462 | #[inline (always)] |
1463 | fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { |
1464 | Self::new(unsafe { |
1465 | match i { |
1466 | 0 => _mm256_inserti128_si256(self.x, w.x, 0), |
1467 | 1 => _mm256_inserti128_si256(self.x, w.x, 1), |
1468 | _ => panic!(), |
1469 | } |
1470 | }) |
1471 | } |
1472 | } |
1473 | impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {} |
1474 | impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {} |
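    // Word rotations for the 256-bit type mirror the 128-bit ones: `shuf_lane_bytes` runs
    // the same pshufb byte mask in both 128-bit lanes of `_mm256_shuffle_epi8`, and
    // `rotr_32` falls back to `(x >> i) | (x << (32 - i))` on 32-bit lanes.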
1475 | macro_rules! shuf_lane_bytes { |
1476 | ($name:ident, $k0:expr, $k1:expr) => { |
1477 | #[inline(always)] |
1478 | fn $name(self) -> Self { |
1479 | Self::new(unsafe { |
1480 | _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1)) |
1481 | }) |
1482 | } |
1483 | }; |
1484 | } |
1485 | macro_rules! rotr_32 { |
1486 | ($name:ident, $i:expr) => { |
1487 | #[inline(always)] |
1488 | fn $name(self) -> Self { |
1489 | Self::new(unsafe { |
1490 | _mm256_or_si256( |
1491 | _mm256_srli_epi32(self.x, $i as i32), |
1492 | _mm256_slli_epi32(self.x, 32 - $i as i32), |
1493 | ) |
1494 | }) |
1495 | } |
1496 | }; |
1497 | } |
1498 | impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> { |
1499 | rotr_32!(rotate_each_word_right7, 7); |
1500 | shuf_lane_bytes!( |
1501 | rotate_each_word_right8, |
1502 | 0x0c0f_0e0d_080b_0a09, |
1503 | 0x0407_0605_0003_0201 |
1504 | ); |
1505 | rotr_32!(rotate_each_word_right11, 11); |
1506 | rotr_32!(rotate_each_word_right12, 12); |
1507 | shuf_lane_bytes!( |
1508 | rotate_each_word_right16, |
1509 | 0x0d0c_0f0e_0908_0b0a, |
1510 | 0x0504_0706_0100_0302 |
1511 | ); |
1512 | rotr_32!(rotate_each_word_right20, 20); |
1513 | shuf_lane_bytes!( |
1514 | rotate_each_word_right24, |
1515 | 0x0e0d_0c0f_0a09_080b, |
1516 | 0x0605_0407_0201_0003 |
1517 | ); |
1518 | rotr_32!(rotate_each_word_right25, 25); |
1519 | } |
1520 | impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {} |
1521 | impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage { |
1522 | #[inline (always)] |
1523 | fn from(x: u32x4x2_avx2<NI>) -> Self { |
1524 | Self { avx: x.x } |
1525 | } |
1526 | } |
1527 | |
1528 | macro_rules! impl_assign { |
1529 | ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => { |
1530 | impl<NI> $Assign for $vec<NI> |
1531 | where |
1532 | NI: Copy, |
1533 | { |
1534 | #[inline(always)] |
1535 | fn $assign_fn(&mut self, rhs: Self) { |
1536 | *self = self.$bin_fn(rhs); |
1537 | } |
1538 | } |
1539 | }; |
1540 | } |
1541 | impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor); |
1542 | impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor); |
1543 | impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand); |
1544 | impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add); |
1545 | |
1546 | macro_rules! impl_bitop { |
1547 | ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => { |
1548 | impl<NI> $Op for $vec<NI> { |
1549 | type Output = Self; |
1550 | #[inline(always)] |
1551 | fn $op_fn(self, rhs: Self) -> Self::Output { |
1552 | Self::new(unsafe { $impl_fn(self.x, rhs.x) }) |
1553 | } |
1554 | } |
1555 | }; |
1556 | } |
1557 | impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256); |
1558 | impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256); |
1559 | impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256); |
1560 | impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256); |
1561 | impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32); |
1562 | |
1563 | impl<NI> Not for u32x4x2_avx2<NI> { |
1564 | type Output = Self; |
        #[inline(always)]
        fn not(self) -> Self::Output {
            unsafe {
                let f = _mm256_set1_epi64x(-1);
                Self::new(f) ^ self
            }
        }
1572 | } |
1573 | |
1574 | impl<NI> BSwap for u32x4x2_avx2<NI> { |
1575 | shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203); |
1576 | } |
1577 | |
1578 | impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI> |
1579 | where |
1580 | NI: Copy, |
1581 | { |
1582 | #[inline (always)] |
1583 | fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self { |
1584 | Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) }) |
1585 | } |
1586 | } |
1587 | |
1588 | impl<NI> LaneWords4 for u32x4x2_avx2<NI> { |
1589 | #[inline (always)] |
1590 | fn shuffle_lane_words1230(self) -> Self { |
1591 | Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) }) |
1592 | } |
1593 | #[inline (always)] |
1594 | fn shuffle_lane_words2301(self) -> Self { |
1595 | Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) }) |
1596 | } |
1597 | #[inline (always)] |
1598 | fn shuffle_lane_words3012(self) -> Self { |
1599 | Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) }) |
1600 | } |
1601 | } |
1602 | |
1603 | /////////////////////////////////////////////////////////////////////////////////////////// |
1604 | |
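    // A 512-bit group is modelled as two 256-bit halves via the generic x2 wrapper; the
    // transpose4 impl further down interleaves 128-bit halves with _mm256_permute2x128_si256.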
1605 | pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>; |
1606 | impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {} |
1607 | |
1608 | impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> { |
1609 | #[inline (always)] |
1610 | unsafe fn unpack(p: vec512_storage) -> Self { |
1611 | Self::new([ |
1612 | u32x4x2_avx2::unpack(p.avx[0]), |
1613 | u32x4x2_avx2::unpack(p.avx[1]), |
1614 | ]) |
1615 | } |
1616 | } |
1617 | impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> { |
1618 | #[inline (always)] |
1619 | fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] { |
1620 | let [a, b] = self.0[0].to_lanes(); |
1621 | let [c, d] = self.0[1].to_lanes(); |
1622 | [a, b, c, d] |
1623 | } |
1624 | #[inline (always)] |
1625 | fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self { |
1626 | let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]); |
1627 | let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]); |
1628 | Self::new([ab, cd]) |
1629 | } |
1630 | } |
1631 | impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { |
1632 | #[inline (always)] |
1633 | fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { |
1634 | match i { |
1635 | 0 => self.0[0].extract(0), |
1636 | 1 => self.0[0].extract(1), |
1637 | 2 => self.0[1].extract(0), |
1638 | 3 => self.0[1].extract(1), |
1639 | _ => panic!(), |
1640 | } |
1641 | } |
1642 | #[inline (always)] |
1643 | fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { |
1644 | Self::new(match i { |
1645 | 0 | 1 => [self.0[0].insert(w, i), self.0[1]], |
1646 | 2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)], |
1647 | _ => panic!(), |
1648 | }) |
1649 | } |
1650 | } |
1651 | impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { |
1652 | #[inline (always)] |
1653 | fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { |
1654 | /* |
1655 | * a00:a01 a10:a11 |
1656 | * b00:b01 b10:b11 |
1657 | * c00:c01 c10:c11 |
1658 | * d00:d01 d10:d11 |
1659 | * => |
1660 | * a00:b00 c00:d00 |
1661 | * a01:b01 c01:d01 |
1662 | * a10:b10 c10:d10 |
1663 | * a11:b11 c11:d11 |
1664 | */ |
1665 | unsafe { |
1666 | let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20)); |
1667 | let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31)); |
1668 | let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20)); |
1669 | let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31)); |
1670 | let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20)); |
1671 | let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31)); |
1672 | let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20)); |
1673 | let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31)); |
1674 | ( |
1675 | Self::new([ab00, cd00]), |
1676 | Self::new([ab01, cd01]), |
1677 | Self::new([ab10, cd10]), |
1678 | Self::new([ab11, cd11]), |
1679 | ) |
1680 | } |
1681 | } |
1682 | } |
1683 | impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> { |
1684 | #[inline (always)] |
1685 | fn to_scalars(self) -> [u32; 16] { |
1686 | transmute!(self) |
1687 | } |
1688 | } |
1689 | impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage { |
1690 | #[inline (always)] |
1691 | fn from(x: u32x4x4_avx2<NI>) -> Self { |
1692 | Self { |
1693 | avx: [ |
1694 | vec256_storage { avx: x.0[0].x }, |
1695 | vec256_storage { avx: x.0[1].x }, |
1696 | ], |
1697 | } |
1698 | } |
1699 | } |
1700 | impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> { |
1701 | #[inline (always)] |
1702 | fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self { |
1703 | Self::new(unsafe { |
1704 | [ |
1705 | u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)), |
1706 | u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)), |
1707 | ] |
1708 | }) |
1709 | } |
1710 | } |
1711 | } |
1712 | |