1 | use crate::soft::{x2, x4}; |
2 | use crate::types::*; |
3 | use crate::vec128_storage; |
4 | use crate::x86_64::Avx2Machine; |
5 | use crate::x86_64::SseMachine as Machine86; |
6 | use crate::x86_64::{NoS3, NoS4, YesS3, YesS4}; |
7 | use core::arch::x86_64::*; |
8 | use core::marker::PhantomData; |
9 | use core::ops::{ |
10 | Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not, |
11 | }; |
12 | use zerocopy::{transmute, AsBytes, FromBytes, FromZeroes}; |
13 | |
14 | macro_rules! impl_binop { |
15 | ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => { |
16 | impl<S3, S4, NI> $trait for $vec<S3, S4, NI> { |
17 | type Output = Self; |
18 | #[inline(always)] |
19 | fn $fn(self, rhs: Self) -> Self::Output { |
20 | Self::new(unsafe { $impl_fn(self.x, rhs.x) }) |
21 | } |
22 | } |
23 | }; |
24 | } |
25 | |
26 | macro_rules! impl_binop_assign { |
27 | ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => { |
28 | impl<S3, S4, NI> $trait for $vec<S3, S4, NI> |
29 | where |
30 | $vec<S3, S4, NI>: Copy, |
31 | { |
32 | #[inline(always)] |
33 | fn $fn_assign(&mut self, rhs: Self) { |
34 | *self = self.$fn(rhs); |
35 | } |
36 | } |
37 | }; |
38 | } |
39 | |
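// def_vec! wraps a single `__m128i` in a newtype. `S3`, `S4` and `NI` are zero-sized marker
// types (`YesS3`/`NoS3`, ...) recording at the type level whether SSSE3 (`S3`), SSE4.1 (`S4`)
// and additional native instructions (`NI`, e.g. AES-NI) may be used.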
40 | macro_rules! def_vec { |
41 | ($vec:ident, $word:ident) => { |
42 | #[allow(non_camel_case_types)] |
43 | #[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)] |
44 | #[repr(transparent)] |
45 | pub struct $vec<S3, S4, NI> { |
46 | x: __m128i, |
47 | s3: PhantomData<S3>, |
48 | s4: PhantomData<S4>, |
49 | ni: PhantomData<NI>, |
50 | } |
51 | |
52 | impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> { |
53 | #[inline(always)] |
54 | unsafe fn unpack(x: vec128_storage) -> Self { |
55 | Self::new(x.sse2) |
56 | } |
57 | } |
58 | impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage { |
59 | #[inline(always)] |
60 | fn from(x: $vec<S3, S4, NI>) -> Self { |
61 | vec128_storage { sse2: x.x } |
62 | } |
63 | } |
64 | impl<S3, S4, NI> $vec<S3, S4, NI> { |
65 | #[inline(always)] |
66 | fn new(x: __m128i) -> Self { |
67 | $vec { |
68 | x, |
69 | s3: PhantomData, |
70 | s4: PhantomData, |
71 | ni: PhantomData, |
72 | } |
73 | } |
74 | } |
75 | |
76 | impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI> |
77 | where |
78 | Self: BSwap, |
79 | { |
80 | #[inline(always)] |
81 | unsafe fn unsafe_read_le(input: &[u8]) -> Self { |
82 | assert_eq!(input.len(), 16); |
83 | Self::new(_mm_loadu_si128(input.as_ptr() as *const _)) |
84 | } |
85 | #[inline(always)] |
86 | unsafe fn unsafe_read_be(input: &[u8]) -> Self { |
87 | assert_eq!(input.len(), 16); |
88 | Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap() |
89 | } |
90 | #[inline(always)] |
91 | fn write_le(self, out: &mut [u8]) { |
92 | assert_eq!(out.len(), 16); |
93 | unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) } |
94 | } |
95 | #[inline(always)] |
96 | fn write_be(self, out: &mut [u8]) { |
97 | assert_eq!(out.len(), 16); |
98 | let x = self.bswap().x; |
99 | unsafe { |
100 | _mm_storeu_si128(out.as_mut_ptr() as *mut _, x); |
101 | } |
102 | } |
103 | } |
104 | |
105 | impl<S3, S4, NI> Default for $vec<S3, S4, NI> { |
106 | #[inline(always)] |
107 | fn default() -> Self { |
108 | Self::new(unsafe { _mm_setzero_si128() }) |
109 | } |
110 | } |
111 | |
112 | impl<S3, S4, NI> Not for $vec<S3, S4, NI> { |
113 | type Output = Self; |
114 | #[inline(always)] |
115 | fn not(self) -> Self::Output { |
116 | unsafe { |
117 | let ff = _mm_set1_epi64x(-1i64); |
118 | self ^ Self::new(ff) |
119 | } |
120 | } |
121 | } |
122 | |
123 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {} |
124 | impl_binop!($vec, BitAnd, bitand, _mm_and_si128); |
125 | impl_binop!($vec, BitOr, bitor, _mm_or_si128); |
126 | impl_binop!($vec, BitXor, bitxor, _mm_xor_si128); |
127 | impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand); |
128 | impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor); |
129 | impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor); |
130 | impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> { |
131 | type Output = Self; |
132 | #[inline(always)] |
133 | fn andnot(self, rhs: Self) -> Self { |
134 | Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) }) |
135 | } |
136 | } |
137 | }; |
138 | } |
139 | |
140 | macro_rules! impl_bitops32 { |
141 | ($vec:ident) => { |
142 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where |
143 | $vec<S3, S4, NI>: RotateEachWord32 |
144 | { |
145 | } |
146 | }; |
147 | } |
148 | |
149 | macro_rules! impl_bitops64 { |
150 | ($vec:ident) => { |
151 | impl_bitops32!($vec); |
152 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where |
153 | $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 |
154 | { |
155 | } |
156 | }; |
157 | } |
158 | |
159 | macro_rules! impl_bitops128 { |
160 | ($vec:ident) => { |
161 | impl_bitops64!($vec); |
162 | impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where |
163 | $vec<S3, S4, NI>: RotateEachWord128 |
164 | { |
165 | } |
166 | }; |
167 | } |
168 | |
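// Per-lane right rotations for u32x4: amounts that are a whole number of bytes use a single
// SSSE3 byte shuffle (rotr_32_s3); all other amounts use a shift/shift/or sequence (rotr_32).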
169 | macro_rules! rotr_32_s3 { |
170 | ($name:ident, $k0:expr, $k1:expr) => { |
171 | #[inline(always)] |
172 | fn $name(self) -> Self { |
173 | Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) |
174 | } |
175 | }; |
176 | } |
177 | macro_rules! rotr_32 { |
178 | ($name:ident, $i:expr) => { |
179 | #[inline(always)] |
180 | fn $name(self) -> Self { |
181 | Self::new(unsafe { |
182 | _mm_or_si128( |
183 | _mm_srli_epi32(self.x, $i as i32), |
184 | _mm_slli_epi32(self.x, 32 - $i as i32), |
185 | ) |
186 | }) |
187 | } |
188 | }; |
189 | } |
190 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> { |
191 | rotr_32!(rotate_each_word_right7, 7); |
192 | rotr_32_s3!( |
193 | rotate_each_word_right8, |
194 | 0x0c0f_0e0d_080b_0a09, |
195 | 0x0407_0605_0003_0201 |
196 | ); |
197 | rotr_32!(rotate_each_word_right11, 11); |
198 | rotr_32!(rotate_each_word_right12, 12); |
199 | rotr_32_s3!( |
200 | rotate_each_word_right16, |
201 | 0x0d0c_0f0e_0908_0b0a, |
202 | 0x0504_0706_0100_0302 |
203 | ); |
204 | rotr_32!(rotate_each_word_right20, 20); |
205 | rotr_32_s3!( |
206 | rotate_each_word_right24, |
207 | 0x0e0d_0c0f_0a09_080b, |
208 | 0x0605_0407_0201_0003 |
209 | ); |
210 | rotr_32!(rotate_each_word_right25, 25); |
211 | } |
212 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> { |
213 | rotr_32!(rotate_each_word_right7, 7); |
214 | rotr_32!(rotate_each_word_right8, 8); |
215 | rotr_32!(rotate_each_word_right11, 11); |
216 | rotr_32!(rotate_each_word_right12, 12); |
217 | #[inline (always)] |
218 | fn rotate_each_word_right16(self) -> Self { |
219 | Self::new(swap16_s2(self.x)) |
220 | } |
221 | rotr_32!(rotate_each_word_right20, 20); |
222 | rotr_32!(rotate_each_word_right24, 24); |
223 | rotr_32!(rotate_each_word_right25, 25); |
224 | } |
225 | |
226 | macro_rules! rotr_64_s3 { |
227 | ($name:ident, $k0:expr, $k1:expr) => { |
228 | #[inline(always)] |
229 | fn $name(self) -> Self { |
230 | Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) |
231 | } |
232 | }; |
233 | } |
234 | macro_rules! rotr_64 { |
235 | ($name:ident, $i:expr) => { |
236 | #[inline(always)] |
237 | fn $name(self) -> Self { |
238 | Self::new(unsafe { |
239 | _mm_or_si128( |
240 | _mm_srli_epi64(self.x, $i as i32), |
241 | _mm_slli_epi64(self.x, 64 - $i as i32), |
242 | ) |
243 | }) |
244 | } |
245 | }; |
246 | } |
247 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> { |
248 | rotr_64!(rotate_each_word_right7, 7); |
249 | rotr_64_s3!( |
250 | rotate_each_word_right8, |
251 | 0x080f_0e0d_0c0b_0a09, |
252 | 0x0007_0605_0403_0201 |
253 | ); |
254 | rotr_64!(rotate_each_word_right11, 11); |
255 | rotr_64!(rotate_each_word_right12, 12); |
256 | rotr_64_s3!( |
257 | rotate_each_word_right16, |
258 | 0x0908_0f0e_0d0c_0b0a, |
259 | 0x0100_0706_0504_0302 |
260 | ); |
261 | rotr_64!(rotate_each_word_right20, 20); |
262 | rotr_64_s3!( |
263 | rotate_each_word_right24, |
264 | 0x0a09_080f_0e0d_0c0b, |
265 | 0x0201_0007_0605_0403 |
266 | ); |
267 | rotr_64!(rotate_each_word_right25, 25); |
268 | } |
269 | impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> { |
270 | rotr_64!(rotate_each_word_right7, 7); |
271 | rotr_64!(rotate_each_word_right8, 8); |
272 | rotr_64!(rotate_each_word_right11, 11); |
273 | rotr_64!(rotate_each_word_right12, 12); |
274 | #[inline (always)] |
275 | fn rotate_each_word_right16(self) -> Self { |
276 | Self::new(swap16_s2(self.x)) |
277 | } |
278 | rotr_64!(rotate_each_word_right20, 20); |
279 | rotr_64!(rotate_each_word_right24, 24); |
280 | rotr_64!(rotate_each_word_right25, 25); |
281 | } |
282 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> { |
283 | #[inline (always)] |
284 | fn rotate_each_word_right32(self) -> Self { |
285 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) }) |
286 | } |
287 | } |
288 | |
289 | macro_rules! rotr_128 { |
290 | ($name:ident, $i:expr) => { |
291 | #[inline(always)] |
292 | fn $name(self) -> Self { |
293 | Self::new(unsafe { |
294 | _mm_or_si128( |
295 | _mm_srli_si128(self.x, $i as i32), |
296 | _mm_slli_si128(self.x, 128 - $i as i32), |
297 | ) |
298 | }) |
299 | } |
300 | }; |
301 | } |
302 | // TODO: completely unoptimized |
303 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> { |
304 | rotr_128!(rotate_each_word_right7, 7); |
305 | rotr_128!(rotate_each_word_right8, 8); |
306 | rotr_128!(rotate_each_word_right11, 11); |
307 | rotr_128!(rotate_each_word_right12, 12); |
308 | rotr_128!(rotate_each_word_right16, 16); |
309 | rotr_128!(rotate_each_word_right20, 20); |
310 | rotr_128!(rotate_each_word_right24, 24); |
311 | rotr_128!(rotate_each_word_right25, 25); |
312 | } |
313 | // TODO: completely unoptimized |
314 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> { |
315 | rotr_128!(rotate_each_word_right32, 32); |
316 | } |
317 | impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {} |
318 | |
319 | def_vec!(u32x4_sse2, u32); |
320 | def_vec!(u64x2_sse2, u64); |
321 | def_vec!(u128x1_sse2, u128); |
322 | |
323 | impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> { |
324 | #[inline (always)] |
325 | fn to_lanes(self) -> [u32; 4] { |
326 | unsafe { |
327 | let x: u64 = _mm_cvtsi128_si64(self.x) as u64; |
328 | let y: u64 = _mm_extract_epi64(self.x, 1) as u64; |
329 | [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32] |
330 | } |
331 | } |
332 | #[inline (always)] |
333 | fn from_lanes(xs: [u32; 4]) -> Self { |
334 | unsafe { |
335 | let mut x: __m128i = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64); |
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
337 | Self::new(x) |
338 | } |
339 | } |
340 | } |
341 | impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> { |
342 | #[inline (always)] |
343 | fn to_lanes(self) -> [u32; 4] { |
344 | unsafe { |
345 | let x: u64 = _mm_cvtsi128_si64(self.x) as u64; |
346 | let y: u64 = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64; |
347 | [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32] |
348 | } |
349 | } |
350 | #[inline (always)] |
351 | fn from_lanes(xs: [u32; 4]) -> Self { |
352 | unsafe { |
353 | let x: i64 = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64; |
354 | let y: i64 = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64; |
355 | let x: __m128i = _mm_cvtsi64_si128(x); |
356 | let y: __m128i = _mm_slli_si128(_mm_cvtsi64_si128(y), 8); |
            Self::new(_mm_or_si128(x, y))
358 | } |
359 | } |
360 | } |
361 | impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> { |
362 | #[inline (always)] |
363 | fn to_lanes(self) -> [u64; 2] { |
364 | unsafe { |
365 | [ |
366 | _mm_cvtsi128_si64(self.x) as u64, |
367 | _mm_extract_epi64(self.x, 1) as u64, |
368 | ] |
369 | } |
370 | } |
371 | #[inline (always)] |
372 | fn from_lanes(xs: [u64; 2]) -> Self { |
373 | unsafe { |
374 | let mut x: __m128i = _mm_cvtsi64_si128(xs[0] as i64); |
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
376 | Self::new(x) |
377 | } |
378 | } |
379 | } |
380 | impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> { |
381 | #[inline (always)] |
382 | fn to_lanes(self) -> [u64; 2] { |
383 | unsafe { |
384 | [ |
385 | _mm_cvtsi128_si64(self.x) as u64, |
386 | _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64, |
387 | ] |
388 | } |
389 | } |
390 | #[inline (always)] |
391 | fn from_lanes(xs: [u64; 2]) -> Self { |
392 | unsafe { |
393 | let x: __m128i = _mm_cvtsi64_si128(xs[0] as i64); |
394 | let y: __m128i = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8); |
            Self::new(_mm_or_si128(x, y))
396 | } |
397 | } |
398 | } |
399 | impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> { |
400 | #[inline (always)] |
401 | fn to_lanes(self) -> [u128; 1] { |
402 | unimplemented!() |
403 | } |
404 | #[inline (always)] |
405 | fn from_lanes(xs: [u128; 1]) -> Self { |
        unimplemented!("{:?}", xs)
407 | } |
408 | } |
409 | |
410 | impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI> |
411 | where |
412 | u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy, |
413 | { |
414 | #[inline (always)] |
415 | fn to_lanes(self) -> [u64; 4] { |
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
417 | [a[0], a[1], b[0], b[1]] |
418 | } |
419 | #[inline (always)] |
420 | fn from_lanes(xs: [u64; 4]) -> Self { |
        let (a, b) = (
422 | u64x2_sse2::from_lanes([xs[0], xs[1]]), |
423 | u64x2_sse2::from_lanes([xs[2], xs[3]]), |
424 | ); |
425 | x2::new([a, b]) |
426 | } |
427 | } |
428 | |
429 | macro_rules! impl_into { |
430 | ($from:ident, $to:ident) => { |
431 | impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> { |
432 | #[inline(always)] |
433 | fn from(x: $from<S3, S4, NI>) -> Self { |
434 | $to::new(x.x) |
435 | } |
436 | } |
437 | }; |
438 | } |
439 | |
440 | impl_into!(u128x1_sse2, u32x4_sse2); |
441 | impl_into!(u128x1_sse2, u64x2_sse2); |
442 | |
443 | impl_bitops32!(u32x4_sse2); |
444 | impl_bitops64!(u64x2_sse2); |
445 | impl_bitops128!(u128x1_sse2); |
446 | |
447 | impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where |
448 | u32x4_sse2<S3, S4, NI>: BSwap |
449 | { |
450 | } |
451 | impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where |
452 | u64x2_sse2<S3, S4, NI>: BSwap |
453 | { |
454 | } |
455 | impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32); |
456 | impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64); |
457 | impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add); |
458 | impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add); |
459 | |
460 | impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI> |
461 | where |
462 | u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>, |
463 | Machine86<S3, S4, NI>: Machine, |
464 | { |
465 | } |
466 | impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI> |
467 | where |
468 | u64x2_sse2<S3, S4, NI>: |
469 | RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>, |
470 | Machine86<S3, S4, NI>: Machine, |
471 | { |
472 | } |
473 | impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI> |
474 | where |
475 | u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap, |
476 | Machine86<S3, S4, NI>: Machine, |
477 | u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
478 | u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>, |
479 | { |
480 | } |
481 | |
482 | impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI> |
483 | where |
484 | u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>, |
485 | Machine86<YesS3, YesS4, NI>: Machine, |
486 | { |
487 | } |
488 | impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI> |
489 | where |
490 | u64x2_sse2<YesS3, YesS4, NI>: |
491 | RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>, |
492 | Machine86<YesS3, YesS4, NI>: Machine, |
493 | { |
494 | } |
495 | impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI> |
496 | where |
497 | u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap, |
498 | Machine86<YesS3, YesS4, NI>: Machine, |
499 | u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>, |
500 | u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>, |
501 | { |
502 | } |
503 | |
504 | impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> { |
505 | #[inline (always)] |
506 | unsafe fn unsafe_from(xs: [u32; 4]) -> Self { |
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
513 | } |
514 | } |
515 | |
516 | impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI> |
517 | where |
518 | Self: MultiLane<[u32; 4]>, |
519 | { |
520 | #[inline (always)] |
521 | fn extract(self, i: u32) -> u32 { |
522 | self.to_lanes()[i as usize] |
523 | } |
524 | #[inline (always)] |
525 | fn insert(self, v: u32, i: u32) -> Self { |
526 | Self::new(unsafe { |
527 | match i { |
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
532 | _ => unreachable!(), |
533 | } |
534 | }) |
535 | } |
536 | } |
537 | impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI> |
538 | where |
539 | Self: MultiLane<[u32; 4]>, |
540 | { |
541 | #[inline (always)] |
542 | fn extract(self, i: u32) -> u32 { |
543 | self.to_lanes()[i as usize] |
544 | } |
545 | #[inline (always)] |
546 | fn insert(self, v: u32, i: u32) -> Self { |
547 | Self::new(unsafe { |
548 | match i { |
549 | 0 => { |
550 | let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x); |
551 | _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)) |
552 | } |
553 | 1 => { |
554 | let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000); |
555 | x = _mm_slli_si128(x, 4); |
556 | x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); |
557 | _mm_shuffle_epi32(x, 0b1110_0001) |
558 | } |
559 | 2 => { |
560 | let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100); |
561 | x = _mm_slli_si128(x, 4); |
562 | x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); |
563 | _mm_shuffle_epi32(x, 0b1100_1001) |
564 | } |
565 | 3 => { |
566 | let mut x = _mm_slli_si128(self.x, 4); |
567 | x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); |
568 | _mm_shuffle_epi32(x, 0b0011_1001) |
569 | } |
570 | _ => unreachable!(), |
571 | } |
572 | }) |
573 | } |
574 | } |
575 | |
576 | impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> { |
577 | #[inline (always)] |
578 | fn shuffle_lane_words2301(self) -> Self { |
579 | self.shuffle2301() |
580 | } |
581 | #[inline (always)] |
582 | fn shuffle_lane_words1230(self) -> Self { |
583 | self.shuffle1230() |
584 | } |
585 | #[inline (always)] |
586 | fn shuffle_lane_words3012(self) -> Self { |
587 | self.shuffle3012() |
588 | } |
589 | } |
590 | |
591 | impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> { |
592 | #[inline (always)] |
593 | fn shuffle2301(self) -> Self { |
594 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) |
595 | } |
596 | #[inline (always)] |
597 | fn shuffle1230(self) -> Self { |
598 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) }) |
599 | } |
600 | #[inline (always)] |
601 | fn shuffle3012(self) -> Self { |
602 | Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) }) |
603 | } |
604 | } |
605 | |
606 | impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> { |
607 | #[inline (always)] |
608 | fn shuffle2301(self) -> Self { |
609 | x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)]) |
610 | } |
611 | #[inline (always)] |
612 | fn shuffle3012(self) -> Self { |
613 | unsafe { |
614 | x2::new([ |
615 | u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)), |
616 | u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)), |
617 | ]) |
618 | } |
619 | } |
620 | #[inline (always)] |
621 | fn shuffle1230(self) -> Self { |
622 | unsafe { |
623 | x2::new([ |
624 | u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)), |
625 | u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)), |
626 | ]) |
627 | } |
628 | } |
629 | } |
630 | impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> { |
631 | #[inline (always)] |
632 | fn shuffle2301(self) -> Self { |
633 | x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)]) |
634 | } |
635 | #[inline (always)] |
636 | fn shuffle3012(self) -> Self { |
637 | unsafe { |
638 | let a = _mm_srli_si128(self.0[0].x, 8); |
639 | let b = _mm_slli_si128(self.0[0].x, 8); |
640 | let c = _mm_srli_si128(self.0[1].x, 8); |
641 | let d = _mm_slli_si128(self.0[1].x, 8); |
642 | let da = _mm_or_si128(d, a); |
643 | let bc = _mm_or_si128(b, c); |
644 | x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)]) |
645 | } |
646 | } |
647 | #[inline (always)] |
648 | fn shuffle1230(self) -> Self { |
649 | unsafe { |
650 | let a = _mm_srli_si128(self.0[0].x, 8); |
651 | let b = _mm_slli_si128(self.0[0].x, 8); |
652 | let c = _mm_srli_si128(self.0[1].x, 8); |
653 | let d = _mm_slli_si128(self.0[1].x, 8); |
654 | let da = _mm_or_si128(d, a); |
655 | let bc = _mm_or_si128(b, c); |
656 | x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)]) |
657 | } |
658 | } |
659 | } |
660 | |
661 | impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> { |
662 | #[inline (always)] |
663 | unsafe fn unsafe_from(xs: [u64; 2]) -> Self { |
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
665 | } |
666 | } |
667 | |
668 | impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> { |
669 | #[inline (always)] |
670 | fn extract(self, i: u32) -> u64 { |
671 | unsafe { |
672 | match i { |
673 | 0 => _mm_cvtsi128_si64(self.x) as u64, |
674 | 1 => _mm_extract_epi64(self.x, 1) as u64, |
675 | _ => unreachable!(), |
676 | } |
677 | } |
678 | } |
679 | #[inline (always)] |
680 | fn insert(self, x: u64, i: u32) -> Self { |
681 | Self::new(unsafe { |
682 | match i { |
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
685 | _ => unreachable!(), |
686 | } |
687 | }) |
688 | } |
689 | } |
690 | impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> { |
691 | #[inline (always)] |
692 | fn extract(self, i: u32) -> u64 { |
693 | unsafe { |
694 | match i { |
695 | 0 => _mm_cvtsi128_si64(self.x) as u64, |
696 | 1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64, |
697 | _ => unreachable!(), |
698 | } |
699 | } |
700 | } |
701 | #[inline (always)] |
702 | fn insert(self, x: u64, i: u32) -> Self { |
703 | Self::new(unsafe { |
704 | match i { |
705 | 0 => _mm_or_si128( |
706 | _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x), |
707 | _mm_cvtsi64_si128(x as i64), |
708 | ), |
709 | 1 => _mm_or_si128( |
710 | _mm_move_epi64(self.x), |
711 | _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8), |
712 | ), |
713 | _ => unreachable!(), |
714 | } |
715 | }) |
716 | } |
717 | } |
718 | |
719 | impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> { |
720 | #[inline (always)] |
721 | fn bswap(self) -> Self { |
722 | Self::new(unsafe { |
            let k: __m128i = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
725 | }) |
726 | } |
727 | } |
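// SSE2 fallback byte swap: widen each byte to a 16-bit lane, reverse the lanes of every
// 32-bit group with word shuffles, then pack back down to bytes.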
728 | #[inline (always)] |
729 | fn bswap32_s2(x: __m128i) -> __m128i { |
730 | unsafe { |
        let mut y: __m128i = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z: __m128i = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
738 | } |
739 | } |
740 | impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> { |
741 | #[inline (always)] |
742 | fn bswap(self) -> Self { |
743 | Self::new(bswap32_s2(self.x)) |
744 | } |
745 | } |
746 | |
747 | impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> { |
748 | #[inline (always)] |
749 | fn bswap(self) -> Self { |
750 | Self::new(unsafe { |
            let k: __m128i = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
753 | }) |
754 | } |
755 | } |
756 | impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> { |
757 | #[inline (always)] |
758 | fn bswap(self) -> Self { |
759 | Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) }) |
760 | } |
761 | } |
762 | |
763 | impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> { |
764 | #[inline (always)] |
765 | fn bswap(self) -> Self { |
766 | Self::new(unsafe { |
            let k: __m128i = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
            _mm_shuffle_epi8(self.x, k)
769 | }) |
770 | } |
771 | } |
772 | impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> { |
773 | #[inline (always)] |
774 | fn bswap(self) -> Self { |
775 | unimplemented!() |
776 | } |
777 | } |
778 | |
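// swapi! swaps adjacent $i-bit groups within each 16-bit lane: $k masks out the high group
// of each pair, which is shifted down while the low group is shifted up into its place.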
779 | macro_rules! swapi { |
780 | ($x:expr, $i:expr, $k:expr) => { |
781 | unsafe { |
782 | const K: u8 = $k; |
783 | let k = _mm_set1_epi8(K as i8); |
784 | u128x1_sse2::new(_mm_or_si128( |
785 | _mm_srli_epi16(_mm_and_si128($x.x, k), $i), |
786 | _mm_and_si128(_mm_slli_epi16($x.x, $i), k), |
787 | )) |
788 | } |
789 | }; |
790 | } |
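// Swap the two 16-bit halves of each 32-bit lane using only SSE2 word shuffles.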
791 | #[inline (always)] |
792 | fn swap16_s2(x: __m128i) -> __m128i { |
793 | unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) } |
794 | } |
795 | impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> { |
796 | #[inline (always)] |
797 | fn swap1(self) -> Self { |
798 | swapi!(self, 1, 0xaa) |
799 | } |
800 | #[inline (always)] |
801 | fn swap2(self) -> Self { |
802 | swapi!(self, 2, 0xcc) |
803 | } |
804 | #[inline (always)] |
805 | fn swap4(self) -> Self { |
806 | swapi!(self, 4, 0xf0) |
807 | } |
808 | #[inline (always)] |
809 | fn swap8(self) -> Self { |
810 | u128x1_sse2::new(unsafe { |
811 | let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001); |
812 | _mm_shuffle_epi8(self.x, k) |
813 | }) |
814 | } |
815 | #[inline (always)] |
816 | fn swap16(self) -> Self { |
817 | u128x1_sse2::new(unsafe { |
818 | let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302); |
819 | _mm_shuffle_epi8(self.x, k) |
820 | }) |
821 | } |
822 | #[inline (always)] |
823 | fn swap32(self) -> Self { |
824 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) }) |
825 | } |
826 | #[inline (always)] |
827 | fn swap64(self) -> Self { |
828 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) |
829 | } |
830 | } |
831 | impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> { |
832 | #[inline (always)] |
833 | fn swap1(self) -> Self { |
834 | swapi!(self, 1, 0xaa) |
835 | } |
836 | #[inline (always)] |
837 | fn swap2(self) -> Self { |
838 | swapi!(self, 2, 0xcc) |
839 | } |
840 | #[inline (always)] |
841 | fn swap4(self) -> Self { |
842 | swapi!(self, 4, 0xf0) |
843 | } |
844 | #[inline (always)] |
845 | fn swap8(self) -> Self { |
846 | u128x1_sse2::new(unsafe { |
847 | _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8)) |
848 | }) |
849 | } |
850 | #[inline (always)] |
851 | fn swap16(self) -> Self { |
852 | u128x1_sse2::new(swap16_s2(self.x)) |
853 | } |
854 | #[inline (always)] |
855 | fn swap32(self) -> Self { |
856 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) }) |
857 | } |
858 | #[inline (always)] |
859 | fn swap64(self) -> Self { |
860 | u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) |
861 | } |
862 | } |
863 | |
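// G0 and G1 are phantom tags that keep structurally identical `x2` wrappers (e.g.
// u64x2x2_sse2 vs. u64x4_sse2) distinct at the type level.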
864 | #[derive (Copy, Clone)] |
865 | pub struct G0; |
866 | #[derive (Copy, Clone)] |
867 | pub struct G1; |
868 | |
869 | #[allow (non_camel_case_types)] |
870 | pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>; |
871 | #[allow (non_camel_case_types)] |
872 | pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>; |
873 | #[allow (non_camel_case_types)] |
874 | pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>; |
875 | #[allow (non_camel_case_types)] |
876 | pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>; |
877 | |
878 | #[allow (non_camel_case_types)] |
879 | pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>; |
880 | #[allow (non_camel_case_types)] |
881 | pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>; |
882 | #[allow (non_camel_case_types)] |
883 | pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>; |
884 | |
885 | impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> { |
886 | #[inline (always)] |
887 | fn to_scalars(self) -> [u32; 16] { |
888 | transmute!(self) |
889 | } |
890 | } |
891 | |
892 | impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI> |
893 | where |
894 | u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap, |
895 | Machine86<S3, S4, NI>: Machine, |
896 | u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>, |
897 | u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
898 | { |
899 | } |
900 | impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI> |
901 | where |
902 | u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
903 | Machine86<S3, S4, NI>: Machine, |
904 | u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>, |
905 | u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>, |
906 | { |
907 | } |
908 | impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI> |
909 | where |
910 | u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
911 | Machine86<S3, S4, NI>: Machine, |
912 | u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4, |
913 | { |
914 | } |
915 | impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI> |
916 | where |
917 | u128x1_sse2<S3, S4, NI>: Swap64 + BSwap, |
918 | Machine86<S3, S4, NI>: Machine, |
919 | u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>, |
920 | u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>, |
921 | u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>, |
922 | u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>, |
923 | u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>, |
924 | { |
925 | } |
926 | |
927 | impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI> |
928 | where |
929 | u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap, |
930 | Avx2Machine<NI>: Machine, |
931 | u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>, |
932 | u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>, |
933 | { |
934 | } |
935 | impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI> |
936 | where |
937 | u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
938 | Avx2Machine<NI>: Machine, |
939 | u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>, |
940 | u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>, |
941 | { |
942 | } |
943 | impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI> |
944 | where |
945 | u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
946 | Avx2Machine<NI>: Machine, |
947 | u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4, |
948 | { |
949 | } |
950 | impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI> |
951 | where |
952 | u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap, |
953 | Avx2Machine<NI>: Machine, |
954 | u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>, |
955 | u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>, |
956 | u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>, |
957 | u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>, |
958 | u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>, |
959 | { |
960 | } |
961 | |
962 | impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI> |
963 | where |
964 | u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>, |
965 | { |
966 | #[inline (always)] |
967 | fn extract(self, i: u32) -> u64 { |
968 | match i { |
969 | 0 => self.0[0].extract(0), |
970 | 1 => self.0[0].extract(1), |
971 | 2 => self.0[1].extract(0), |
972 | 3 => self.0[1].extract(1), |
973 | _ => panic!(), |
974 | } |
975 | } |
976 | #[inline (always)] |
977 | fn insert(mut self, w: u64, i: u32) -> Self { |
978 | match i { |
            0 => self.0[0] = self.0[0].insert(w, 0),
            1 => self.0[0] = self.0[0].insert(w, 1),
            2 => self.0[1] = self.0[1].insert(w, 0),
            3 => self.0[1] = self.0[1].insert(w, 1),
983 | _ => panic!(), |
984 | }; |
985 | self |
986 | } |
987 | } |
988 | |
989 | impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI> |
990 | where |
991 | u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap, |
992 | Machine86<S3, S4, NI>: Machine, |
993 | u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>, |
994 | u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
995 | u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>, |
996 | u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>, |
997 | { |
998 | } |
999 | impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI> |
1000 | where |
1001 | u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
1002 | Machine86<S3, S4, NI>: Machine, |
1003 | u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>, |
1004 | u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>, |
1005 | { |
1006 | } |
1007 | impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI> |
1008 | where |
1009 | u128x1_sse2<S3, S4, NI>: Swap64 + BSwap, |
1010 | Machine86<S3, S4, NI>: Machine, |
1011 | u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>, |
1012 | u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>, |
1013 | u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>, |
1014 | u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>, |
1015 | { |
1016 | } |
1017 | |
1018 | impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI> |
1019 | where |
1020 | u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, |
1021 | Avx2Machine<NI>: Machine, |
1022 | u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>, |
1023 | u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>, |
1024 | { |
1025 | } |
1026 | impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI> |
1027 | where |
1028 | u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap, |
1029 | Avx2Machine<NI>: Machine, |
1030 | u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>, |
1031 | u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>, |
1032 | u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>, |
1033 | u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>, |
1034 | { |
1035 | } |
1036 | |
1037 | macro_rules! impl_into_x { |
1038 | ($from:ident, $to:ident) => { |
1039 | impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>> |
1040 | for x2<$to<S3, S4, NI>, Gt> |
1041 | { |
1042 | #[inline(always)] |
1043 | fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self { |
1044 | x2::new([$to::from(x.0[0]), $to::from(x.0[1])]) |
1045 | } |
1046 | } |
1047 | impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> { |
1048 | #[inline(always)] |
1049 | fn from(x: x4<$from<S3, S4, NI>>) -> Self { |
1050 | x4::new([ |
1051 | $to::from(x.0[0]), |
1052 | $to::from(x.0[1]), |
1053 | $to::from(x.0[2]), |
1054 | $to::from(x.0[3]), |
1055 | ]) |
1056 | } |
1057 | } |
1058 | }; |
1059 | } |
1060 | impl_into_x!(u128x1_sse2, u64x2_sse2); |
1061 | impl_into_x!(u128x1_sse2, u32x4_sse2); |
1062 | |
1063 | ///// Debugging |
1064 | |
1065 | use core::fmt::{Debug, Formatter, Result}; |
1066 | |
1067 | impl<W: PartialEq, G> PartialEq for x2<W, G> { |
1068 | #[inline (always)] |
1069 | fn eq(&self, rhs: &Self) -> bool { |
1070 | self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1] |
1071 | } |
1072 | } |
1073 | |
1074 | #[allow (unused)] |
1075 | #[inline (always)] |
1076 | unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool { |
    let q: __m128i = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
1078 | _mm_cvtsi128_si64(q) == -1 |
1079 | } |
1080 | |
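// SSE2 equality: compare 32-bit lanes, then require both 64-bit halves of the resulting
// mask to be all ones.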
1081 | #[inline (always)] |
1082 | unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool { |
    let q: __m128i = _mm_cmpeq_epi32(x, y);
1084 | let p: i64 = _mm_cvtsi128_si64(_mm_srli_si128(q, 8)); |
1085 | let q: i64 = _mm_cvtsi128_si64(q); |
1086 | (p & q) == -1 |
1087 | } |
1088 | |
1089 | impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> { |
1090 | #[inline (always)] |
1091 | fn eq(&self, rhs: &Self) -> bool { |
        unsafe { eq128_s2(self.x, rhs.x) }
1093 | } |
1094 | } |
1095 | impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI> |
1096 | where |
1097 | Self: Copy + MultiLane<[u32; 4]>, |
1098 | { |
1099 | #[cold ] |
1100 | fn fmt(&self, fmt: &mut Formatter) -> Result { |
        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
1102 | } |
1103 | } |
1104 | |
1105 | impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> { |
1106 | #[inline (always)] |
1107 | fn eq(&self, rhs: &Self) -> bool { |
        unsafe { eq128_s2(self.x, rhs.x) }
1109 | } |
1110 | } |
1111 | impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI> |
1112 | where |
1113 | Self: Copy + MultiLane<[u64; 2]>, |
1114 | { |
1115 | #[cold ] |
1116 | fn fmt(&self, fmt: &mut Formatter) -> Result { |
        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
1118 | } |
1119 | } |
1120 | |
1121 | impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI> |
1122 | where |
1123 | u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>, |
1124 | { |
1125 | #[cold ] |
1126 | fn fmt(&self, fmt: &mut Formatter) -> Result { |
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
1129 | } |
1130 | } |
1131 | |
1132 | #[cfg (test)] |
1133 | #[cfg (target_arch = "x86_64" )] |
1134 | mod test { |
1135 | use super::*; |
1136 | use crate::x86_64::{SSE2, SSE41, SSSE3}; |
1137 | use crate::Machine; |
1138 | |
1139 | #[test ] |
1140 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1141 | fn test_bswap32_s2_vs_s3() { |
1142 | let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100]; |
1143 | let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203]; |
1144 | |
1145 | let s2 = unsafe { SSE2::instance() }; |
1146 | let s3 = unsafe { SSSE3::instance() }; |
1147 | |
1148 | let x_s2 = { |
1149 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1150 | x_s2.bswap() |
1151 | }; |
1152 | |
1153 | let x_s3 = { |
1154 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1155 | x_s3.bswap() |
1156 | }; |
1157 | |
1158 | assert_eq!(x_s2, transmute!(x_s3)); |
1159 | assert_eq!(x_s2, s2.vec(ys)); |
1160 | } |
1161 | |
1162 | #[test ] |
1163 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1164 | fn test_bswap64_s2_vs_s3() { |
1165 | let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100]; |
1166 | let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607]; |
1167 | |
1168 | let s2 = unsafe { SSE2::instance() }; |
1169 | let s3 = unsafe { SSSE3::instance() }; |
1170 | |
1171 | let x_s2 = { |
1172 | let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); |
1173 | x_s2.bswap() |
1174 | }; |
1175 | |
1176 | let x_s3 = { |
1177 | let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs); |
1178 | x_s3.bswap() |
1179 | }; |
1180 | |
1181 | assert_eq!(x_s2, s2.vec(ys)); |
1182 | assert_eq!(x_s3, transmute!(x_s3)); |
1183 | } |
1184 | |
1185 | #[test ] |
1186 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1187 | fn test_shuffle32_s2_vs_s3() { |
1188 | let xs = [0x0, 0x1, 0x2, 0x3]; |
1189 | let ys = [0x2, 0x3, 0x0, 0x1]; |
1190 | let zs = [0x1, 0x2, 0x3, 0x0]; |
1191 | |
1192 | let s2 = unsafe { SSE2::instance() }; |
1193 | let s3 = unsafe { SSSE3::instance() }; |
1194 | |
1195 | let x_s2 = { |
1196 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1197 | x_s2.shuffle2301() |
1198 | }; |
1199 | let x_s3 = { |
1200 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1201 | x_s3.shuffle2301() |
1202 | }; |
1203 | assert_eq!(x_s2, s2.vec(ys)); |
1204 | assert_eq!(x_s3, transmute!(x_s3)); |
1205 | |
1206 | let x_s2 = { |
1207 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1208 | x_s2.shuffle3012() |
1209 | }; |
1210 | let x_s3 = { |
1211 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1212 | x_s3.shuffle3012() |
1213 | }; |
1214 | assert_eq!(x_s2, s2.vec(zs)); |
1215 | assert_eq!(x_s3, transmute!(x_s3)); |
1216 | |
1217 | let x_s2 = x_s2.shuffle1230(); |
1218 | let x_s3 = x_s3.shuffle1230(); |
1219 | assert_eq!(x_s2, s2.vec(xs)); |
1220 | assert_eq!(x_s3, transmute!(x_s3)); |
1221 | } |
1222 | |
1223 | #[test ] |
1224 | #[cfg_attr (not(target_feature = "ssse3" ), ignore)] |
1225 | fn test_shuffle64_s2_vs_s3() { |
1226 | let xs = [0x0, 0x1, 0x2, 0x3]; |
1227 | let ys = [0x2, 0x3, 0x0, 0x1]; |
1228 | let zs = [0x1, 0x2, 0x3, 0x0]; |
1229 | |
1230 | let s2 = unsafe { SSE2::instance() }; |
1231 | let s3 = unsafe { SSSE3::instance() }; |
1232 | |
1233 | let x_s2 = { |
1234 | let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs); |
1235 | x_s2.shuffle2301() |
1236 | }; |
1237 | let x_s3 = { |
1238 | let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs); |
1239 | x_s3.shuffle2301() |
1240 | }; |
1241 | assert_eq!(x_s2, s2.vec(ys)); |
1242 | assert_eq!(x_s3, transmute!(x_s3)); |
1243 | |
1244 | let x_s2 = { |
1245 | let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs); |
1246 | x_s2.shuffle3012() |
1247 | }; |
1248 | let x_s3 = { |
1249 | let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs); |
1250 | x_s3.shuffle3012() |
1251 | }; |
1252 | assert_eq!(x_s2, s2.vec(zs)); |
1253 | assert_eq!(x_s3, transmute!(x_s3)); |
1254 | |
1255 | let x_s2 = x_s2.shuffle1230(); |
1256 | let x_s3 = x_s3.shuffle1230(); |
1257 | assert_eq!(x_s2, s2.vec(xs)); |
1258 | assert_eq!(x_s3, transmute!(x_s3)); |
1259 | } |
1260 | |
1261 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1262 | #[test ] |
1263 | fn test_lanes_u32x4() { |
1264 | let xs = [0x1, 0x2, 0x3, 0x4]; |
1265 | |
1266 | let s2 = unsafe { SSE2::instance() }; |
1267 | let s3 = unsafe { SSSE3::instance() }; |
1268 | let s4 = unsafe { SSE41::instance() }; |
1269 | |
1270 | { |
1271 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1272 | let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs); |
1273 | assert_eq!(x_s2, y_s2); |
1274 | assert_eq!(xs, y_s2.to_lanes()); |
1275 | } |
1276 | |
1277 | { |
1278 | let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); |
1279 | let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs); |
1280 | assert_eq!(x_s3, y_s3); |
1281 | assert_eq!(xs, y_s3.to_lanes()); |
1282 | } |
1283 | |
1284 | { |
1285 | let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs); |
1286 | let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs); |
1287 | assert_eq!(x_s4, y_s4); |
1288 | assert_eq!(xs, y_s4.to_lanes()); |
1289 | } |
1290 | } |
1291 | |
1292 | #[test ] |
1293 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1294 | fn test_lanes_u64x2() { |
1295 | let xs = [0x1, 0x2]; |
1296 | |
1297 | let s2 = unsafe { SSE2::instance() }; |
1298 | let s3 = unsafe { SSSE3::instance() }; |
1299 | let s4 = unsafe { SSE41::instance() }; |
1300 | |
1301 | { |
1302 | let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); |
1303 | let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs); |
1304 | assert_eq!(x_s2, y_s2); |
1305 | assert_eq!(xs, y_s2.to_lanes()); |
1306 | } |
1307 | |
1308 | { |
1309 | let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs); |
1310 | let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs); |
1311 | assert_eq!(x_s3, y_s3); |
1312 | assert_eq!(xs, y_s3.to_lanes()); |
1313 | } |
1314 | |
1315 | { |
1316 | let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs); |
1317 | let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs); |
1318 | assert_eq!(x_s4, y_s4); |
1319 | assert_eq!(xs, y_s4.to_lanes()); |
1320 | } |
1321 | } |
1322 | |
1323 | #[test ] |
1324 | fn test_vec4_u32x4_s2() { |
1325 | let xs = [1, 2, 3, 4]; |
1326 | let s2 = unsafe { SSE2::instance() }; |
1327 | let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); |
1328 | assert_eq!(x_s2.extract(0), 1); |
1329 | assert_eq!(x_s2.extract(1), 2); |
1330 | assert_eq!(x_s2.extract(2), 3); |
1331 | assert_eq!(x_s2.extract(3), 4); |
1332 | assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4])); |
1333 | assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4])); |
1334 | assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4])); |
1335 | assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf])); |
1336 | } |
1337 | |
1338 | #[test ] |
1339 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1340 | fn test_vec4_u32x4_s4() { |
1341 | let xs = [1, 2, 3, 4]; |
1342 | let s4 = unsafe { SSE41::instance() }; |
1343 | let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs); |
1344 | assert_eq!(x_s4.extract(0), 1); |
1345 | assert_eq!(x_s4.extract(1), 2); |
1346 | assert_eq!(x_s4.extract(2), 3); |
1347 | assert_eq!(x_s4.extract(3), 4); |
1348 | assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4])); |
1349 | assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4])); |
1350 | assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4])); |
1351 | assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf])); |
1352 | } |
1353 | |
1354 | #[test ] |
1355 | fn test_vec2_u64x2_s2() { |
1356 | let xs = [0x1, 0x2]; |
1357 | let s2 = unsafe { SSE2::instance() }; |
1358 | let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); |
1359 | assert_eq!(x_s2.extract(0), 1); |
1360 | assert_eq!(x_s2.extract(1), 2); |
1361 | assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2])); |
1362 | assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf])); |
1363 | } |
1364 | |
1365 | #[test ] |
1366 | #[cfg_attr (not(all(target_feature = "ssse3" , target_feature = "sse4.1" )), ignore)] |
1367 | fn test_vec4_u64x2_s4() { |
1368 | let xs = [0x1, 0x2]; |
1369 | let s4 = unsafe { SSE41::instance() }; |
1370 | let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs); |
1371 | assert_eq!(x_s4.extract(0), 1); |
1372 | assert_eq!(x_s4.extract(1), 2); |
1373 | assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2])); |
1374 | assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf])); |
1375 | } |
1376 | } |
1377 | |
1378 | pub mod avx2 { |
1379 | #![allow (non_camel_case_types)] |
1380 | use crate::soft::{x2, x4}; |
1381 | use crate::types::*; |
1382 | use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0}; |
1383 | use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4}; |
1384 | use core::arch::x86_64::*; |
1385 | use core::marker::PhantomData; |
1386 | use core::ops::*; |
1387 | use zerocopy::{transmute, AsBytes, FromBytes, FromZeroes}; |
1388 | |
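    // One AVX2 register holds two of the 128-bit u32x4 lanes defined in the parent module.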
1389 | #[derive (Copy, Clone, FromBytes, AsBytes, FromZeroes)] |
1390 | #[repr (transparent)] |
1391 | pub struct u32x4x2_avx2<NI> { |
1392 | x: __m256i, |
1393 | ni: PhantomData<NI>, |
1394 | } |
1395 | |
1396 | impl<NI> u32x4x2_avx2<NI> { |
1397 | #[inline (always)] |
1398 | fn new(x: __m256i) -> Self { |
1399 | Self { x, ni: PhantomData } |
1400 | } |
1401 | } |
1402 | |
1403 | impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {} |
1404 | impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> { |
1405 | #[inline (always)] |
1406 | unsafe fn unpack(p: vec256_storage) -> Self { |
1407 | Self::new(p.avx) |
1408 | } |
1409 | } |
1410 | impl<NI> StoreBytes for u32x4x2_avx2<NI> { |
1411 | #[inline (always)] |
1412 | unsafe fn unsafe_read_le(input: &[u8]) -> Self { |
1413 | assert_eq!(input.len(), 32); |
1414 | Self::new(_mm256_loadu_si256(input.as_ptr() as *const _)) |
1415 | } |
1416 | #[inline (always)] |
1417 | unsafe fn unsafe_read_be(input: &[u8]) -> Self { |
1418 | Self::unsafe_read_le(input).bswap() |
1419 | } |
1420 | #[inline (always)] |
1421 | fn write_le(self, out: &mut [u8]) { |
1422 | unsafe { |
1423 | assert_eq!(out.len(), 32); |
1424 | _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x) |
1425 | } |
1426 | } |
1427 | #[inline (always)] |
1428 | fn write_be(self, out: &mut [u8]) { |
1429 | self.bswap().write_le(out) |
1430 | } |
1431 | } |
1432 | impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> { |
1433 | #[inline (always)] |
1434 | fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] { |
1435 | unsafe { |
1436 | [ |
1437 | u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), |
1438 | u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), |
1439 | ] |
1440 | } |
1441 | } |
1442 | #[inline (always)] |
1443 | fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self { |
1444 | Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) }) |
1445 | } |
1446 | } |
1447 | impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> { |
1448 | #[inline (always)] |
1449 | fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { |
1450 | unsafe { |
1451 | match i { |
1452 | 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), |
1453 | 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), |
1454 | _ => panic!(), |
1455 | } |
1456 | } |
1457 | } |
1458 | #[inline (always)] |
1459 | fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { |
1460 | Self::new(unsafe { |
1461 | match i { |
1462 | 0 => _mm256_inserti128_si256(self.x, w.x, 0), |
1463 | 1 => _mm256_inserti128_si256(self.x, w.x, 1), |
1464 | _ => panic!(), |
1465 | } |
1466 | }) |
1467 | } |
1468 | } |
1469 | impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {} |
1470 | impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {} |
1471 | macro_rules! shuf_lane_bytes { |
1472 | ($name:ident, $k0:expr, $k1:expr) => { |
1473 | #[inline(always)] |
1474 | fn $name(self) -> Self { |
1475 | Self::new(unsafe { |
1476 | _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1)) |
1477 | }) |
1478 | } |
1479 | }; |
1480 | } |
1481 | macro_rules! rotr_32 { |
1482 | ($name:ident, $i:expr) => { |
1483 | #[inline(always)] |
1484 | fn $name(self) -> Self { |
1485 | Self::new(unsafe { |
1486 | _mm256_or_si256( |
1487 | _mm256_srli_epi32(self.x, $i as i32), |
1488 | _mm256_slli_epi32(self.x, 32 - $i as i32), |
1489 | ) |
1490 | }) |
1491 | } |
1492 | }; |
1493 | } |
1494 | impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> { |
1495 | rotr_32!(rotate_each_word_right7, 7); |
1496 | shuf_lane_bytes!( |
1497 | rotate_each_word_right8, |
1498 | 0x0c0f_0e0d_080b_0a09, |
1499 | 0x0407_0605_0003_0201 |
1500 | ); |
1501 | rotr_32!(rotate_each_word_right11, 11); |
1502 | rotr_32!(rotate_each_word_right12, 12); |
1503 | shuf_lane_bytes!( |
1504 | rotate_each_word_right16, |
1505 | 0x0d0c_0f0e_0908_0b0a, |
1506 | 0x0504_0706_0100_0302 |
1507 | ); |
1508 | rotr_32!(rotate_each_word_right20, 20); |
1509 | shuf_lane_bytes!( |
1510 | rotate_each_word_right24, |
1511 | 0x0e0d_0c0f_0a09_080b, |
1512 | 0x0605_0407_0201_0003 |
1513 | ); |
1514 | rotr_32!(rotate_each_word_right25, 25); |
1515 | } |
1516 | impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {} |
1517 | impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage { |
1518 | #[inline (always)] |
1519 | fn from(x: u32x4x2_avx2<NI>) -> Self { |
1520 | Self { avx: x.x } |
1521 | } |
1522 | } |
1523 | |
1524 | macro_rules! impl_assign { |
1525 | ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => { |
1526 | impl<NI> $Assign for $vec<NI> |
1527 | where |
1528 | NI: Copy, |
1529 | { |
1530 | #[inline(always)] |
1531 | fn $assign_fn(&mut self, rhs: Self) { |
1532 | *self = self.$bin_fn(rhs); |
1533 | } |
1534 | } |
1535 | }; |
1536 | } |
1537 | impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor); |
1538 | impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor); |
1539 | impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand); |
1540 | impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add); |
1541 | |
1542 | macro_rules! impl_bitop { |
1543 | ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => { |
1544 | impl<NI> $Op for $vec<NI> { |
1545 | type Output = Self; |
1546 | #[inline(always)] |
1547 | fn $op_fn(self, rhs: Self) -> Self::Output { |
1548 | Self::new(unsafe { $impl_fn(self.x, rhs.x) }) |
1549 | } |
1550 | } |
1551 | }; |
1552 | } |
1553 | impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256); |
1554 | impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256); |
1555 | impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256); |
1556 | impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256); |
1557 | impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32); |
1558 | |
1559 | impl<NI> Not for u32x4x2_avx2<NI> { |
1560 | type Output = Self; |
1561 | #[inline (always)] |
1562 | fn not(self) -> Self::Output { |
1563 | unsafe { |
1564 | let f = _mm256_set1_epi8(-0x7f); |
1565 | Self::new(f) ^ self |
1566 | } |
1567 | } |
1568 | } |
1569 | |
1570 | impl<NI> BSwap for u32x4x2_avx2<NI> { |
1571 | shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203); |
1572 | } |
1573 | |
1574 | impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI> |
1575 | where |
1576 | NI: Copy, |
1577 | { |
1578 | #[inline (always)] |
1579 | fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self { |
1580 | Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) }) |
1581 | } |
1582 | } |
1583 | |
1584 | impl<NI> LaneWords4 for u32x4x2_avx2<NI> { |
1585 | #[inline (always)] |
1586 | fn shuffle_lane_words1230(self) -> Self { |
1587 | Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) }) |
1588 | } |
1589 | #[inline (always)] |
1590 | fn shuffle_lane_words2301(self) -> Self { |
1591 | Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) }) |
1592 | } |
1593 | #[inline (always)] |
1594 | fn shuffle_lane_words3012(self) -> Self { |
1595 | Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) }) |
1596 | } |
1597 | } |
1598 | |
1599 | /////////////////////////////////////////////////////////////////////////////////////////// |
1600 | |
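    // A 512-bit vector modelled as two 256-bit halves; the generic `x2` wrapper from
    // crate::soft supplies the lane-wise operator impls.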
1601 | pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>; |
1602 | impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {} |
1603 | |
1604 | impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> { |
1605 | #[inline (always)] |
1606 | unsafe fn unpack(p: vec512_storage) -> Self { |
1607 | Self::new([ |
1608 | u32x4x2_avx2::unpack(p.avx[0]), |
1609 | u32x4x2_avx2::unpack(p.avx[1]), |
1610 | ]) |
1611 | } |
1612 | } |
1613 | impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> { |
1614 | #[inline (always)] |
1615 | fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] { |
1616 | let [a, b] = self.0[0].to_lanes(); |
1617 | let [c, d] = self.0[1].to_lanes(); |
1618 | [a, b, c, d] |
1619 | } |
1620 | #[inline (always)] |
1621 | fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self { |
1622 | let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]); |
1623 | let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]); |
1624 | Self::new([ab, cd]) |
1625 | } |
1626 | } |
1627 | impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { |
1628 | #[inline (always)] |
1629 | fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { |
1630 | match i { |
1631 | 0 => self.0[0].extract(0), |
1632 | 1 => self.0[0].extract(1), |
1633 | 2 => self.0[1].extract(0), |
1634 | 3 => self.0[1].extract(1), |
1635 | _ => panic!(), |
1636 | } |
1637 | } |
1638 | #[inline (always)] |
1639 | fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { |
1640 | Self::new(match i { |
1641 | 0 | 1 => [self.0[0].insert(w, i), self.0[1]], |
1642 | 2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)], |
1643 | _ => panic!(), |
1644 | }) |
1645 | } |
1646 | } |
1647 | impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { |
1648 | #[inline (always)] |
1649 | fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { |
1650 | /* |
1651 | * a00:a01 a10:a11 |
1652 | * b00:b01 b10:b11 |
1653 | * c00:c01 c10:c11 |
1654 | * d00:d01 d10:d11 |
1655 | * => |
1656 | * a00:b00 c00:d00 |
1657 | * a01:b01 c01:d01 |
1658 | * a10:b10 c10:d10 |
1659 | * a11:b11 c11:d11 |
1660 | */ |
1661 | unsafe { |
1662 | let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20)); |
1663 | let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31)); |
1664 | let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20)); |
1665 | let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31)); |
1666 | let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20)); |
1667 | let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31)); |
1668 | let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20)); |
1669 | let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31)); |
1670 | ( |
1671 | Self::new([ab00, cd00]), |
1672 | Self::new([ab01, cd01]), |
1673 | Self::new([ab10, cd10]), |
1674 | Self::new([ab11, cd11]), |
1675 | ) |
1676 | } |
1677 | } |
1678 | } |
1679 | impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> { |
1680 | #[inline (always)] |
1681 | fn to_scalars(self) -> [u32; 16] { |
1682 | transmute!(self) |
1683 | } |
1684 | } |
1685 | impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage { |
1686 | #[inline (always)] |
1687 | fn from(x: u32x4x4_avx2<NI>) -> Self { |
1688 | Self { |
1689 | avx: [ |
1690 | vec256_storage { avx: x.0[0].x }, |
1691 | vec256_storage { avx: x.0[1].x }, |
1692 | ], |
1693 | } |
1694 | } |
1695 | } |
1696 | impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> { |
1697 | #[inline (always)] |
1698 | fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self { |
1699 | Self::new(unsafe { |
1700 | [ |
1701 | u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)), |
1702 | u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)), |
1703 | ] |
1704 | }) |
1705 | } |
1706 | } |
1707 | } |
1708 | |