//! Streaming SIMD Extensions 4.1 (SSE4.1)

use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

#[cfg(test)]
use stdarch_test::assert_instr;

// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;

/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set, the element of `b` is selected.
/// Otherwise, the element of `a` is selected.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8)
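///
/// # Example
///
/// A small illustrative sketch (lane values chosen arbitrarily):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)]
/// # unsafe fn worker() { unsafe {
/// let a = _mm_set1_epi8(1);
/// let b = _mm_set1_epi8(2);
/// // Every mask byte has its high bit set, so every lane comes from `b`.
/// let mask = _mm_set1_epi8(-1);
/// let r = _mm_blendv_epi8(a, b, mask);
/// let lanes: [i8; 16] = std::mem::transmute(r);
/// assert_eq!(lanes, [2; 16]);
/// # }}
/// # unsafe { worker() }
/// # }
/// # }
/// ```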
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    unsafe {
        let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::ZERO);
        transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
    }
}

/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
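///
/// # Example
///
/// A small illustrative sketch (mask value chosen arbitrarily):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)]
/// # unsafe fn worker() { unsafe {
/// let a = _mm_set1_epi16(0);
/// let b = _mm_set1_epi16(1);
/// // Bit i of the mask selects lane i from `b`; `0b0000_1111` takes the
/// // low four lanes from `b` and the remaining four from `a`.
/// let r = _mm_blend_epi16::<0b0000_1111>(a, b);
/// let lanes: [i16; 8] = std::mem::transmute(r);
/// assert_eq!(lanes, [1, 1, 1, 1, 0, 0, 0, 0]);
/// # }}
/// # unsafe { worker() }
/// # }
/// # }
/// ```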
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        transmute::<i16x8, _>(simd_shuffle!(
            a.as_i16x8(),
            b.as_i16x8(),
            [
                [0, 8][IMM8 as usize & 1],
                [1, 9][(IMM8 >> 1) as usize & 1],
                [2, 10][(IMM8 >> 2) as usize & 1],
                [3, 11][(IMM8 >> 3) as usize & 1],
                [4, 12][(IMM8 >> 4) as usize & 1],
                [5, 13][(IMM8 >> 5) as usize & 1],
                [6, 14][(IMM8 >> 6) as usize & 1],
                [7, 15][(IMM8 >> 7) as usize & 1],
            ]
        ))
    }
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    unsafe {
        let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::ZERO);
        transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
    }
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    unsafe {
        let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::ZERO);
        transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
    }
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `IMM2`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        transmute::<f64x2, _>(simd_shuffle!(
            a.as_f64x2(),
            b.as_f64x2(),
            [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
        ))
    }
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `IMM4`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        transmute::<f32x4, _>(simd_shuffle!(
            a.as_f32x4(),
            b.as_f32x4(),
            [
                [0, 4][IMM4 as usize & 1],
                [1, 5][(IMM4 >> 1) as usize & 1],
                [2, 6][(IMM4 >> 2) as usize & 1],
                [3, 7][(IMM4 >> 3) as usize & 1],
            ]
        ))
    }
}

/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
/// and may be converted back to a floating point number via casting.
///
/// # Example
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)] // FIXME remove after stdarch bump in rustc
/// # unsafe fn worker() { unsafe {
/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
/// let x: i32 = _mm_extract_ps::<2>(simd_floats);
/// float_store.push(f32::from_bits(x as u32));
/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
/// # }}
/// # unsafe { worker() }
/// # }
/// # }
/// ```
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(extractps, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
    static_assert_uimm_bits!(IMM8, 2);
    unsafe { simd_extract!(a, IMM8 as u32, f32).to_bits() as i32 }
}

/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 4);
    unsafe { simd_extract!(a.as_u8x16(), IMM8 as u32, u8) as i32 }
}

/// Extracts a 32-bit integer from `a`, selected with `IMM8`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(extractps, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 2);
    unsafe { simd_extract!(a.as_i32x4(), IMM8 as u32, i32) }
}

/// Selects a single value in `b` to store at some position in `a`, then
/// zeroes elements according to `IMM8`.
///
/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
///   from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
///   element is cleared.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps)
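///
/// # Example
///
/// A worked sketch of the bit fields (values chosen arbitrarily):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)]
/// # unsafe fn worker() { unsafe {
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// // Bits [7:6] = 01: select element 1 of `b` (6.0);
/// // bits [5:4] = 10: copy it to element 2 of the result;
/// // bits [3:0] = 0001: clear element 0 of the result.
/// let r = _mm_insert_ps::<0b01_10_0001>(a, b);
/// let lanes: [f32; 4] = std::mem::transmute(r);
/// assert_eq!(lanes, [0.0, 2.0, 6.0, 4.0]);
/// # }}
/// # unsafe { worker() }
/// # }
/// # }
/// ```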
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { insertps(a, b, IMM8 as u8) }
}

/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 4);
    unsafe { transmute(simd_insert!(a.as_i8x16(), IMM8 as u32, i as i8)) }
}

/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 2);
    unsafe { transmute(simd_insert!(a.as_i32x4(), IMM8 as u32, i)) }
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
/// values in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: i8x16 = a.as_i8x16();
        let b: i8x16 = b.as_i8x16();
        transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: u16x8 = a.as_u16x8();
        let b: u16x8 = b.as_u16x8();
        transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: i32x4 = a.as_i32x4();
        let b: i32x4 = b.as_i32x4();
        transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: u32x4 = a.as_u32x4();
        let b: u32x4 = b.as_u32x4();
        transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
/// values in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: i8x16 = a.as_i8x16();
        let b: i8x16 = b.as_i8x16();
        transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
    }
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: u16x8 = a.as_u16x8();
        let b: u16x8 = b.as_u16x8();
        transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
    }
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: i32x4 = a.as_i32x4();
        let b: i32x4 = b.as_i32x4();
        transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
    }
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: u32x4 = a.as_u32x4();
        let b: u32x4 = b.as_u32x4();
        transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
    }
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(packusdw(a.as_i32x4(), b.as_i32x4())) }
}

/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) }
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    unsafe {
        let a: i8x16 = a.as_i8x16();
        let a: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute(simd_cast::<_, i16x8>(a))
    }
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a: i8x16 = a.as_i8x16();
        let a: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}

/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a: i8x16 = a.as_i8x16();
        let a: i8x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}

/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a: i16x8 = a.as_i16x8();
        let a: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a: i16x8 = a.as_i16x8();
        let a: i16x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a: i32x4 = a.as_i32x4();
        let a: i32x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    unsafe {
        let a: u8x16 = a.as_u8x16();
        let a: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute(simd_cast::<_, i16x8>(a))
    }
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a: u8x16 = a.as_u8x16();
        let a: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a: u8x16 = a.as_u8x16();
        let a: u8x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a: u16x8 = a.as_u16x8();
        let a: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a: u16x8 = a.as_u16x8();
        let a: u16x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}

/// Zero extend packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a: u32x4 = a.as_u32x4();
        let a: u32x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}

/// Returns the dot product of two `__m128d` vectors.
///
/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        dppd(a, b, IMM8 as u8)
    }
}

/// Returns the dot product of two `__m128` vectors.
///
/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps)
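///
/// # Example
///
/// A worked sketch of the two mask nibbles (values chosen arbitrarily):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)]
/// # unsafe fn worker() { unsafe {
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
/// // Condition mask 0b0111: multiply only elements 0..=2;
/// // broadcast mask 0b0001: store the sum in element 0 and zero the rest.
/// let r = _mm_dp_ps::<0b0111_0001>(a, b);
/// let lanes: [f32; 4] = std::mem::transmute(r);
/// // 1*10 + 2*20 + 3*30 = 140
/// assert_eq!(lanes, [140.0, 0.0, 0.0, 0.0]);
/// # }}
/// # unsafe { worker() }
/// # }
/// # }
/// ```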
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { dpps(a, b, IMM8 as u8) }
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_pd(a: __m128d) -> __m128d {
    unsafe { simd_floor(a) }
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_ps(a: __m128) -> __m128 {
    unsafe { simd_floor(a) }
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { roundsd(a, b, _MM_FROUND_FLOOR) }
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { roundss(a, b, _MM_FROUND_FLOOR) }
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ceil_pd(a: __m128d) -> __m128d {
    unsafe { simd_ceil(a) }
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ceil_ps(a: __m128) -> __m128 {
    unsafe { simd_ceil(a) }
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { roundsd(a, b, _MM_FROUND_CEIL) }
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { roundss(a, b, _MM_FROUND_CEIL) }
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundpd(a, ROUNDING) }
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps)
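///
/// # Example
///
/// A small sketch combining two of the rounding constants defined above:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)]
/// # unsafe fn worker() { unsafe {
/// let a = _mm_setr_ps(1.5, 2.5, -1.5, -2.5);
/// // Round to nearest (ties to even) and suppress precision exceptions.
/// let r = _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
/// let lanes: [f32; 4] = std::mem::transmute(r);
/// assert_eq!(lanes, [2.0, 2.0, -2.0, -2.0]);
/// # }}
/// # unsafe { worker() }
/// # }
/// # }
/// ```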
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundps(a, ROUNDING) }
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundsd(a, b, ROUNDING) }
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundss(a, b, ROUNDING) }
}

/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i` vector,
/// returning a vector containing its value in its first position, and its
/// index in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value,
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16)
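///
/// # Example
///
/// A small illustrative sketch (lane values chosen arbitrarily):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)]
/// # unsafe fn worker() { unsafe {
/// let a = _mm_setr_epi16(5, 4, 9, 3, 8, 3, 7, 6);
/// let r = _mm_minpos_epu16(a);
/// let lanes: [u16; 8] = std::mem::transmute(r);
/// // The minimum value 3 first occurs at index 3; all else is zeroed.
/// assert_eq!(lanes, [3, 3, 0, 0, 0, 0, 0, 0]);
/// # }}
/// # unsafe { worker() }
/// # }
/// # }
/// ```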
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    unsafe { transmute(phminposuw(a.as_u16x8())) }
}

/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32)
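///
/// # Example
///
/// A small illustrative sketch (lane values chosen arbitrarily):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)]
/// # unsafe fn worker() { unsafe {
/// let a = _mm_setr_epi32(-2, 777, 3, 888);
/// let b = _mm_setr_epi32(10, 999, 20, 111);
/// // Only the low 32-bit lane of each 64-bit element (indices 0 and 2)
/// // participates; lanes 1 and 3 are ignored.
/// let r = _mm_mul_epi32(a, b);
/// let lanes: [i64; 2] = std::mem::transmute(r);
/// assert_eq!(lanes, [-20, 60]);
/// # }}
/// # unsafe { worker() }
/// # }
/// # }
/// ```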
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: i64x2 = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2()));
        let b: i64x2 = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2()));
        transmute(simd_mul(a, b))
    }
}

/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate
/// 64-bit integers, and returns the lowest 32 bits, whatever they might be,
/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2),
/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping
/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would
/// return a negative number.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32)
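///
/// # Example
///
/// A small sketch of the wrapping behavior described above:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)]
/// # unsafe fn worker() { unsafe {
/// let a = _mm_set1_epi32(i32::MAX);
/// let b = _mm_set1_epi32(2);
/// // The full product 2 * i32::MAX is 0xFFFF_FFFE; keeping only the low
/// // 32 bits and reinterpreting them as signed gives -2.
/// let r = _mm_mullo_epi32(a, b);
/// let lanes: [i32; 4] = std::mem::transmute(r);
/// assert_eq!(lanes, [-2; 4]);
/// # }}
/// # unsafe { worker() }
/// # }
/// # }
/// ```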
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) }
}

/// Subtracts 8-bit unsigned integer values and writes the absolute
/// values of the differences to the corresponding bits in the destination.
/// The sums of those absolute differences are then returned according to the
/// bit fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = IMM8[2] * 4
/// j = IMM8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated:
///     * Bit `[2]` specifies the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8)
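///
/// # Example
///
/// A worked sketch of the algorithm with both offsets zero (byte values
/// chosen arbitrarily):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)]
/// # unsafe fn worker() { unsafe {
/// let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/// let b = _mm_setr_epi8(4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
/// // IMM8 = 0: both offsets are zero, so result word k is
/// // sum(|a[k + n] - b[n]|) for n in 0..4.
/// let r = _mm_mpsadbw_epu8::<0>(a, b);
/// let sums: [u16; 8] = std::mem::transmute(r);
/// assert_eq!(sums, [16, 12, 8, 4, 0, 4, 8, 12]);
/// # }}
/// # unsafe { worker() }
/// # }
/// # }
/// ```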
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 3);
    unsafe { transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8)) }
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128)
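///
/// # Example
///
/// A small illustrative sketch (bit patterns chosen arbitrarily):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)]
/// # unsafe fn worker() { unsafe {
/// let a = _mm_setr_epi32(0b0011, 0, 0, 0);
/// let mask = _mm_setr_epi32(0b1100, 0, 0, 0);
/// // `a & mask` is all zeros, so the test reports 1.
/// assert_eq!(_mm_testz_si128(a, mask), 1);
/// // `a & a` is nonzero, so the test reports 0.
/// assert_eq!(_mm_testz_si128(a, a), 0);
/// # }}
/// # unsafe { worker() }
/// # }
/// # }
/// ```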
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    unsafe { ptestz(a.as_i64x2(), mask.as_i64x2()) }
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    unsafe { ptestc(a.as_i64x2(), mask.as_i64x2()) }
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    unsafe { ptestnzc(a.as_i64x2(), mask.as_i64x2()) }
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_test_all_ones(a: __m128i) -> i32 {
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testnzc_si128(a, mask)
}

/// Loads 128 bits of integer data from memory into `dst`. `mem_addr` must be
/// aligned on a 16-byte boundary or a general-protection exception may be
/// generated. To minimize caching, the data is flagged as non-temporal
/// (unlikely to be used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128)
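///
/// # Example
///
/// A small sketch; the hypothetical `Aligned` wrapper is only here to
/// guarantee the required 16-byte alignment:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)]
/// # unsafe fn worker() { unsafe {
/// // Illustrative helper: a 16-byte-aligned buffer of four i32 values.
/// #[repr(align(16))]
/// struct Aligned([i32; 4]);
///
/// let data = Aligned([1, 2, 3, 4]);
/// let r = _mm_stream_load_si128(data.0.as_ptr() as *const __m128i);
/// let lanes: [i32; 4] = std::mem::transmute(r);
/// assert_eq!(lanes, [1, 2, 3, 4]);
/// # }}
/// # unsafe { worker() }
/// # }
/// # }
/// ```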
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(movntdqa))]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
    let dst: __m128i;
    crate::arch::asm!(
        vpl!("movntdqa {a}"),
        a = out(xmm_reg) dst,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags),
    );
    dst
}

#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.sse41.insertps"]
    unsafe fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.packusdw"]
    unsafe fn packusdw(a: i32x4, b: i32x4) -> u16x8;
    #[link_name = "llvm.x86.sse41.dppd"]
    unsafe fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]
    unsafe fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.round.pd"]
    unsafe fn roundpd(a: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ps"]
    unsafe fn roundps(a: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.round.sd"]
    unsafe fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ss"]
    unsafe fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.phminposuw"]
    unsafe fn phminposuw(a: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.mpsadbw"]
    unsafe fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
    #[link_name = "llvm.x86.sse41.ptestz"]
    unsafe fn ptestz(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestc"]
    unsafe fn ptestc(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    unsafe fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}
| 1175 | |
| 1176 | #[cfg (test)] |
| 1177 | mod tests { |
| 1178 | use crate::core_arch::x86::*; |
| 1179 | use std::mem; |
| 1180 | use stdarch_test::simd_test; |
| 1181 | |
| 1182 | #[simd_test(enable = "sse4.1" )] |
| 1183 | unsafe fn test_mm_blendv_epi8() { |
| 1184 | #[rustfmt::skip] |
| 1185 | let a = _mm_setr_epi8( |
| 1186 | 0, 1, 2, 3, 4, 5, 6, 7, |
| 1187 | 8, 9, 10, 11, 12, 13, 14, 15, |
| 1188 | ); |
| 1189 | #[rustfmt::skip] |
let b = _mm_setr_epi8(
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
);
| 1193 | #[rustfmt::skip] |
| 1194 | let mask = _mm_setr_epi8( |
| 1195 | 0, -1, 0, -1, 0, -1, 0, -1, |
| 1196 | 0, -1, 0, -1, 0, -1, 0, -1, |
| 1197 | ); |
| 1198 | #[rustfmt::skip] |
let e = _mm_setr_epi8(
0, 17, 2, 19, 4, 21, 6, 23,
8, 25, 10, 27, 12, 29, 14, 31,
);
| 1202 | assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e); |
| 1203 | } |
| 1204 | |
| 1205 | #[simd_test(enable = "sse4.1" )] |
| 1206 | unsafe fn test_mm_blendv_pd() { |
| 1207 | let a = _mm_set1_pd(0.0); |
| 1208 | let b = _mm_set1_pd(1.0); |
| 1209 | let mask = transmute(_mm_setr_epi64x(0, -1)); |
| 1210 | let r = _mm_blendv_pd(a, b, mask); |
| 1211 | let e = _mm_setr_pd(0.0, 1.0); |
| 1212 | assert_eq_m128d(r, e); |
| 1213 | } |
| 1214 | |
| 1215 | #[simd_test(enable = "sse4.1" )] |
| 1216 | unsafe fn test_mm_blendv_ps() { |
| 1217 | let a = _mm_set1_ps(0.0); |
| 1218 | let b = _mm_set1_ps(1.0); |
| 1219 | let mask = transmute(_mm_setr_epi32(0, -1, 0, -1)); |
| 1220 | let r = _mm_blendv_ps(a, b, mask); |
| 1221 | let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); |
| 1222 | assert_eq_m128(r, e); |
| 1223 | } |
| 1224 | |
| 1225 | #[simd_test(enable = "sse4.1" )] |
| 1226 | unsafe fn test_mm_blend_pd() { |
| 1227 | let a = _mm_set1_pd(0.0); |
| 1228 | let b = _mm_set1_pd(1.0); |
| 1229 | let r = _mm_blend_pd::<0b10>(a, b); |
| 1230 | let e = _mm_setr_pd(0.0, 1.0); |
| 1231 | assert_eq_m128d(r, e); |
| 1232 | } |
| 1233 | |
| 1234 | #[simd_test(enable = "sse4.1" )] |
| 1235 | unsafe fn test_mm_blend_ps() { |
| 1236 | let a = _mm_set1_ps(0.0); |
| 1237 | let b = _mm_set1_ps(1.0); |
| 1238 | let r = _mm_blend_ps::<0b1010>(a, b); |
| 1239 | let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); |
| 1240 | assert_eq_m128(r, e); |
| 1241 | } |
| 1242 | |
| 1243 | #[simd_test(enable = "sse4.1" )] |
| 1244 | unsafe fn test_mm_blend_epi16() { |
| 1245 | let a = _mm_set1_epi16(0); |
| 1246 | let b = _mm_set1_epi16(1); |
| 1247 | let r = _mm_blend_epi16::<0b1010_1100>(a, b); |
| 1248 | let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1); |
| 1249 | assert_eq_m128i(r, e); |
| 1250 | } |
| 1251 | |
| 1252 | #[simd_test(enable = "sse4.1" )] |
| 1253 | unsafe fn test_mm_extract_ps() { |
| 1254 | let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0); |
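// `_mm_extract_ps` returns the raw bits of the selected lane as an `i32`,
// hence the `f32::from_bits` round-trip.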
| 1255 | let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32); |
| 1256 | assert_eq!(r, 1.0); |
| 1257 | let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32); |
| 1258 | assert_eq!(r, 3.0); |
| 1259 | } |
| 1260 | |
| 1261 | #[simd_test(enable = "sse4.1" )] |
| 1262 | unsafe fn test_mm_extract_epi8() { |
| 1263 | #[rustfmt::skip] |
| 1264 | let a = _mm_setr_epi8( |
| 1265 | -1, 1, 2, 3, 4, 5, 6, 7, |
| 1266 | 8, 9, 10, 11, 12, 13, 14, 15 |
| 1267 | ); |
| 1268 | let r1 = _mm_extract_epi8::<0>(a); |
| 1269 | let r2 = _mm_extract_epi8::<3>(a); |
| 1270 | assert_eq!(r1, 0xFF); |
| 1271 | assert_eq!(r2, 3); |
| 1272 | } |
| 1273 | |
| 1274 | #[simd_test(enable = "sse4.1" )] |
| 1275 | unsafe fn test_mm_extract_epi32() { |
| 1276 | let a = _mm_setr_epi32(0, 1, 2, 3); |
| 1277 | let r = _mm_extract_epi32::<1>(a); |
| 1278 | assert_eq!(r, 1); |
| 1279 | let r = _mm_extract_epi32::<3>(a); |
| 1280 | assert_eq!(r, 3); |
| 1281 | } |
| 1282 | |
| 1283 | #[simd_test(enable = "sse4.1" )] |
| 1284 | unsafe fn test_mm_insert_ps() { |
| 1285 | let a = _mm_set1_ps(1.0); |
| 1286 | let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
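// In the immediate, bits [7:6] pick the source lane of `b`, bits [5:4] the
// destination lane in `a`, and bits [3:0] zero result lanes: here b[3]
// (4.0) replaces lane 0 and lanes 2 and 3 are zeroed.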
| 1287 | let r = _mm_insert_ps::<0b11_00_1100>(a, b); |
| 1288 | let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0); |
| 1289 | assert_eq_m128(r, e); |
| 1290 | |
| 1291 | // Zeroing takes precedence over copied value |
| 1292 | let a = _mm_set1_ps(1.0); |
| 1293 | let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
| 1294 | let r = _mm_insert_ps::<0b11_00_0001>(a, b); |
| 1295 | let e = _mm_setr_ps(0.0, 1.0, 1.0, 1.0); |
| 1296 | assert_eq_m128(r, e); |
| 1297 | } |
| 1298 | |
| 1299 | #[simd_test(enable = "sse4.1" )] |
| 1300 | unsafe fn test_mm_insert_epi8() { |
| 1301 | let a = _mm_set1_epi8(0); |
| 1302 | let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); |
| 1303 | let r = _mm_insert_epi8::<1>(a, 32); |
| 1304 | assert_eq_m128i(r, e); |
| 1305 | let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0); |
| 1306 | let r = _mm_insert_epi8::<14>(a, 32); |
| 1307 | assert_eq_m128i(r, e); |
| 1308 | } |
| 1309 | |
| 1310 | #[simd_test(enable = "sse4.1" )] |
| 1311 | unsafe fn test_mm_insert_epi32() { |
| 1312 | let a = _mm_set1_epi32(0); |
| 1313 | let e = _mm_setr_epi32(0, 32, 0, 0); |
| 1314 | let r = _mm_insert_epi32::<1>(a, 32); |
| 1315 | assert_eq_m128i(r, e); |
| 1316 | let e = _mm_setr_epi32(0, 0, 0, 32); |
| 1317 | let r = _mm_insert_epi32::<3>(a, 32); |
| 1318 | assert_eq_m128i(r, e); |
| 1319 | } |
| 1320 | |
| 1321 | #[simd_test(enable = "sse4.1" )] |
| 1322 | unsafe fn test_mm_max_epi8() { |
| 1323 | #[rustfmt::skip] |
| 1324 | let a = _mm_setr_epi8( |
| 1325 | 1, 4, 5, 8, 9, 12, 13, 16, |
| 1326 | 17, 20, 21, 24, 25, 28, 29, 32, |
| 1327 | ); |
| 1328 | #[rustfmt::skip] |
| 1329 | let b = _mm_setr_epi8( |
| 1330 | 2, 3, 6, 7, 10, 11, 14, 15, |
| 1331 | 18, 19, 22, 23, 26, 27, 30, 31, |
| 1332 | ); |
| 1333 | let r = _mm_max_epi8(a, b); |
| 1334 | #[rustfmt::skip] |
| 1335 | let e = _mm_setr_epi8( |
| 1336 | 2, 4, 6, 8, 10, 12, 14, 16, |
| 1337 | 18, 20, 22, 24, 26, 28, 30, 32, |
| 1338 | ); |
| 1339 | assert_eq_m128i(r, e); |
| 1340 | } |
| 1341 | |
| 1342 | #[simd_test(enable = "sse4.1" )] |
| 1343 | unsafe fn test_mm_max_epu16() { |
| 1344 | let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); |
| 1345 | let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); |
| 1346 | let r = _mm_max_epu16(a, b); |
| 1347 | let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16); |
| 1348 | assert_eq_m128i(r, e); |
| 1349 | } |
| 1350 | |
| 1351 | #[simd_test(enable = "sse4.1" )] |
| 1352 | unsafe fn test_mm_max_epi32() { |
| 1353 | let a = _mm_setr_epi32(1, 4, 5, 8); |
| 1354 | let b = _mm_setr_epi32(2, 3, 6, 7); |
| 1355 | let r = _mm_max_epi32(a, b); |
| 1356 | let e = _mm_setr_epi32(2, 4, 6, 8); |
| 1357 | assert_eq_m128i(r, e); |
| 1358 | } |
| 1359 | |
| 1360 | #[simd_test(enable = "sse4.1" )] |
| 1361 | unsafe fn test_mm_max_epu32() { |
| 1362 | let a = _mm_setr_epi32(1, 4, 5, 8); |
| 1363 | let b = _mm_setr_epi32(2, 3, 6, 7); |
| 1364 | let r = _mm_max_epu32(a, b); |
| 1365 | let e = _mm_setr_epi32(2, 4, 6, 8); |
| 1366 | assert_eq_m128i(r, e); |
| 1367 | } |
| 1368 | |
| 1369 | #[simd_test(enable = "sse4.1" )] |
| 1370 | unsafe fn test_mm_min_epi8() { |
| 1371 | #[rustfmt::skip] |
| 1372 | let a = _mm_setr_epi8( |
| 1373 | 1, 4, 5, 8, 9, 12, 13, 16, |
| 1374 | 17, 20, 21, 24, 25, 28, 29, 32, |
| 1375 | ); |
| 1376 | #[rustfmt::skip] |
| 1377 | let b = _mm_setr_epi8( |
| 1378 | 2, 3, 6, 7, 10, 11, 14, 15, |
| 1379 | 18, 19, 22, 23, 26, 27, 30, 31, |
| 1380 | ); |
| 1381 | let r = _mm_min_epi8(a, b); |
| 1382 | #[rustfmt::skip] |
| 1383 | let e = _mm_setr_epi8( |
| 1384 | 1, 3, 5, 7, 9, 11, 13, 15, |
| 1385 | 17, 19, 21, 23, 25, 27, 29, 31, |
| 1386 | ); |
| 1387 | assert_eq_m128i(r, e); |
| 1388 | |
| 1389 | #[rustfmt::skip] |
| 1390 | let a = _mm_setr_epi8( |
| 1391 | 1, -4, -5, 8, -9, -12, 13, -16, |
| 1392 | 17, 20, 21, 24, 25, 28, 29, 32, |
| 1393 | ); |
| 1394 | #[rustfmt::skip] |
| 1395 | let b = _mm_setr_epi8( |
| 1396 | 2, -3, -6, 7, -10, -11, 14, -15, |
| 1397 | 18, 19, 22, 23, 26, 27, 30, 31, |
| 1398 | ); |
| 1399 | let r = _mm_min_epi8(a, b); |
| 1400 | #[rustfmt::skip] |
| 1401 | let e = _mm_setr_epi8( |
| 1402 | 1, -4, -6, 7, -10, -12, 13, -16, |
| 1403 | 17, 19, 21, 23, 25, 27, 29, 31, |
| 1404 | ); |
| 1405 | assert_eq_m128i(r, e); |
| 1406 | } |
| 1407 | |
| 1408 | #[simd_test(enable = "sse4.1" )] |
| 1409 | unsafe fn test_mm_min_epu16() { |
| 1410 | let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); |
| 1411 | let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); |
| 1412 | let r = _mm_min_epu16(a, b); |
| 1413 | let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15); |
| 1414 | assert_eq_m128i(r, e); |
| 1415 | } |
| 1416 | |
| 1417 | #[simd_test(enable = "sse4.1" )] |
| 1418 | unsafe fn test_mm_min_epi32() { |
| 1419 | let a = _mm_setr_epi32(1, 4, 5, 8); |
| 1420 | let b = _mm_setr_epi32(2, 3, 6, 7); |
| 1421 | let r = _mm_min_epi32(a, b); |
| 1422 | let e = _mm_setr_epi32(1, 3, 5, 7); |
| 1423 | assert_eq_m128i(r, e); |
| 1424 | |
| 1425 | let a = _mm_setr_epi32(-1, 4, 5, -7); |
| 1426 | let b = _mm_setr_epi32(-2, 3, -6, 8); |
| 1427 | let r = _mm_min_epi32(a, b); |
| 1428 | let e = _mm_setr_epi32(-2, 3, -6, -7); |
| 1429 | assert_eq_m128i(r, e); |
| 1430 | } |
| 1431 | |
| 1432 | #[simd_test(enable = "sse4.1" )] |
| 1433 | unsafe fn test_mm_min_epu32() { |
| 1434 | let a = _mm_setr_epi32(1, 4, 5, 8); |
| 1435 | let b = _mm_setr_epi32(2, 3, 6, 7); |
| 1436 | let r = _mm_min_epu32(a, b); |
| 1437 | let e = _mm_setr_epi32(1, 3, 5, 7); |
| 1438 | assert_eq_m128i(r, e); |
| 1439 | } |
| 1440 | |
| 1441 | #[simd_test(enable = "sse4.1" )] |
| 1442 | unsafe fn test_mm_packus_epi32() { |
| 1443 | let a = _mm_setr_epi32(1, 2, 3, 4); |
| 1444 | let b = _mm_setr_epi32(-1, -2, -3, -4); |
| 1445 | let r = _mm_packus_epi32(a, b); |
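// Packing saturates each signed 32-bit value to the unsigned 16-bit range,
// so the negative lanes from `b` clamp to 0.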
| 1446 | let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); |
| 1447 | assert_eq_m128i(r, e); |
| 1448 | } |
| 1449 | |
| 1450 | #[simd_test(enable = "sse4.1" )] |
| 1451 | unsafe fn test_mm_cmpeq_epi64() { |
| 1452 | let a = _mm_setr_epi64x(0, 1); |
| 1453 | let b = _mm_setr_epi64x(0, 0); |
| 1454 | let r = _mm_cmpeq_epi64(a, b); |
| 1455 | let e = _mm_setr_epi64x(-1, 0); |
| 1456 | assert_eq_m128i(r, e); |
| 1457 | } |
| 1458 | |
| 1459 | #[simd_test(enable = "sse4.1" )] |
| 1460 | unsafe fn test_mm_cvtepi8_epi16() { |
| 1461 | let a = _mm_set1_epi8(10); |
| 1462 | let r = _mm_cvtepi8_epi16(a); |
| 1463 | let e = _mm_set1_epi16(10); |
| 1464 | assert_eq_m128i(r, e); |
| 1465 | let a = _mm_set1_epi8(-10); |
| 1466 | let r = _mm_cvtepi8_epi16(a); |
| 1467 | let e = _mm_set1_epi16(-10); |
| 1468 | assert_eq_m128i(r, e); |
| 1469 | } |
| 1470 | |
| 1471 | #[simd_test(enable = "sse4.1" )] |
| 1472 | unsafe fn test_mm_cvtepi8_epi32() { |
| 1473 | let a = _mm_set1_epi8(10); |
| 1474 | let r = _mm_cvtepi8_epi32(a); |
| 1475 | let e = _mm_set1_epi32(10); |
| 1476 | assert_eq_m128i(r, e); |
| 1477 | let a = _mm_set1_epi8(-10); |
| 1478 | let r = _mm_cvtepi8_epi32(a); |
| 1479 | let e = _mm_set1_epi32(-10); |
| 1480 | assert_eq_m128i(r, e); |
| 1481 | } |
| 1482 | |
| 1483 | #[simd_test(enable = "sse4.1" )] |
| 1484 | unsafe fn test_mm_cvtepi8_epi64() { |
| 1485 | let a = _mm_set1_epi8(10); |
| 1486 | let r = _mm_cvtepi8_epi64(a); |
| 1487 | let e = _mm_set1_epi64x(10); |
| 1488 | assert_eq_m128i(r, e); |
| 1489 | let a = _mm_set1_epi8(-10); |
| 1490 | let r = _mm_cvtepi8_epi64(a); |
| 1491 | let e = _mm_set1_epi64x(-10); |
| 1492 | assert_eq_m128i(r, e); |
| 1493 | } |
| 1494 | |
| 1495 | #[simd_test(enable = "sse4.1" )] |
| 1496 | unsafe fn test_mm_cvtepi16_epi32() { |
| 1497 | let a = _mm_set1_epi16(10); |
| 1498 | let r = _mm_cvtepi16_epi32(a); |
| 1499 | let e = _mm_set1_epi32(10); |
| 1500 | assert_eq_m128i(r, e); |
| 1501 | let a = _mm_set1_epi16(-10); |
| 1502 | let r = _mm_cvtepi16_epi32(a); |
| 1503 | let e = _mm_set1_epi32(-10); |
| 1504 | assert_eq_m128i(r, e); |
| 1505 | } |
| 1506 | |
| 1507 | #[simd_test(enable = "sse4.1" )] |
| 1508 | unsafe fn test_mm_cvtepi16_epi64() { |
| 1509 | let a = _mm_set1_epi16(10); |
| 1510 | let r = _mm_cvtepi16_epi64(a); |
| 1511 | let e = _mm_set1_epi64x(10); |
| 1512 | assert_eq_m128i(r, e); |
| 1513 | let a = _mm_set1_epi16(-10); |
| 1514 | let r = _mm_cvtepi16_epi64(a); |
| 1515 | let e = _mm_set1_epi64x(-10); |
| 1516 | assert_eq_m128i(r, e); |
| 1517 | } |
| 1518 | |
| 1519 | #[simd_test(enable = "sse4.1" )] |
| 1520 | unsafe fn test_mm_cvtepi32_epi64() { |
| 1521 | let a = _mm_set1_epi32(10); |
| 1522 | let r = _mm_cvtepi32_epi64(a); |
| 1523 | let e = _mm_set1_epi64x(10); |
| 1524 | assert_eq_m128i(r, e); |
| 1525 | let a = _mm_set1_epi32(-10); |
| 1526 | let r = _mm_cvtepi32_epi64(a); |
| 1527 | let e = _mm_set1_epi64x(-10); |
| 1528 | assert_eq_m128i(r, e); |
| 1529 | } |
| 1530 | |
| 1531 | #[simd_test(enable = "sse4.1" )] |
| 1532 | unsafe fn test_mm_cvtepu8_epi16() { |
| 1533 | let a = _mm_set1_epi8(10); |
| 1534 | let r = _mm_cvtepu8_epi16(a); |
| 1535 | let e = _mm_set1_epi16(10); |
| 1536 | assert_eq_m128i(r, e); |
| 1537 | } |
| 1538 | |
| 1539 | #[simd_test(enable = "sse4.1" )] |
| 1540 | unsafe fn test_mm_cvtepu8_epi32() { |
| 1541 | let a = _mm_set1_epi8(10); |
| 1542 | let r = _mm_cvtepu8_epi32(a); |
| 1543 | let e = _mm_set1_epi32(10); |
| 1544 | assert_eq_m128i(r, e); |
| 1545 | } |
| 1546 | |
| 1547 | #[simd_test(enable = "sse4.1" )] |
| 1548 | unsafe fn test_mm_cvtepu8_epi64() { |
| 1549 | let a = _mm_set1_epi8(10); |
| 1550 | let r = _mm_cvtepu8_epi64(a); |
| 1551 | let e = _mm_set1_epi64x(10); |
| 1552 | assert_eq_m128i(r, e); |
| 1553 | } |
| 1554 | |
| 1555 | #[simd_test(enable = "sse4.1" )] |
| 1556 | unsafe fn test_mm_cvtepu16_epi32() { |
| 1557 | let a = _mm_set1_epi16(10); |
| 1558 | let r = _mm_cvtepu16_epi32(a); |
| 1559 | let e = _mm_set1_epi32(10); |
| 1560 | assert_eq_m128i(r, e); |
| 1561 | } |
| 1562 | |
| 1563 | #[simd_test(enable = "sse4.1" )] |
| 1564 | unsafe fn test_mm_cvtepu16_epi64() { |
| 1565 | let a = _mm_set1_epi16(10); |
| 1566 | let r = _mm_cvtepu16_epi64(a); |
| 1567 | let e = _mm_set1_epi64x(10); |
| 1568 | assert_eq_m128i(r, e); |
| 1569 | } |
| 1570 | |
| 1571 | #[simd_test(enable = "sse4.1" )] |
| 1572 | unsafe fn test_mm_cvtepu32_epi64() { |
| 1573 | let a = _mm_set1_epi32(10); |
| 1574 | let r = _mm_cvtepu32_epi64(a); |
| 1575 | let e = _mm_set1_epi64x(10); |
| 1576 | assert_eq_m128i(r, e); |
| 1577 | } |
| 1578 | |
| 1579 | #[simd_test(enable = "sse4.1" )] |
| 1580 | unsafe fn test_mm_dp_pd() { |
| 1581 | let a = _mm_setr_pd(2.0, 3.0); |
| 1582 | let b = _mm_setr_pd(1.0, 4.0); |
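// Immediate 0b00110001: bits [5:4] include both lanes in the product
// (2.0 * 1.0 + 3.0 * 4.0 = 14.0) and bit 0 stores the sum in lane 0 only.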
| 1583 | let e = _mm_setr_pd(14.0, 0.0); |
| 1584 | assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e); |
| 1585 | } |
| 1586 | |
| 1587 | #[simd_test(enable = "sse4.1" )] |
| 1588 | unsafe fn test_mm_dp_ps() { |
| 1589 | let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0); |
| 1590 | let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0); |
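// Immediate 0b01110101: bits [6:4] include lanes 0..=2 in the dot product
// (2.0 * 1.0 + 3.0 * 4.0 + 1.0 * 0.5 = 14.5); bits 0 and 2 broadcast the
// sum to result lanes 0 and 2, zeroing the rest.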
| 1591 | let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0); |
| 1592 | assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e); |
| 1593 | } |
| 1594 | |
| 1595 | #[simd_test(enable = "sse4.1" )] |
| 1596 | unsafe fn test_mm_floor_pd() { |
| 1597 | let a = _mm_setr_pd(2.5, 4.5); |
| 1598 | let r = _mm_floor_pd(a); |
| 1599 | let e = _mm_setr_pd(2.0, 4.0); |
| 1600 | assert_eq_m128d(r, e); |
| 1601 | } |
| 1602 | |
| 1603 | #[simd_test(enable = "sse4.1" )] |
| 1604 | unsafe fn test_mm_floor_ps() { |
| 1605 | let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); |
| 1606 | let r = _mm_floor_ps(a); |
| 1607 | let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); |
| 1608 | assert_eq_m128(r, e); |
| 1609 | } |
| 1610 | |
| 1611 | #[simd_test(enable = "sse4.1" )] |
| 1612 | unsafe fn test_mm_floor_sd() { |
| 1613 | let a = _mm_setr_pd(2.5, 4.5); |
| 1614 | let b = _mm_setr_pd(-1.5, -3.5); |
| 1615 | let r = _mm_floor_sd(a, b); |
| 1616 | let e = _mm_setr_pd(-2.0, 4.5); |
| 1617 | assert_eq_m128d(r, e); |
| 1618 | } |
| 1619 | |
| 1620 | #[simd_test(enable = "sse4.1" )] |
| 1621 | unsafe fn test_mm_floor_ss() { |
| 1622 | let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); |
| 1623 | let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5); |
| 1624 | let r = _mm_floor_ss(a, b); |
| 1625 | let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5); |
| 1626 | assert_eq_m128(r, e); |
| 1627 | } |
| 1628 | |
| 1629 | #[simd_test(enable = "sse4.1" )] |
| 1630 | unsafe fn test_mm_ceil_pd() { |
| 1631 | let a = _mm_setr_pd(1.5, 3.5); |
| 1632 | let r = _mm_ceil_pd(a); |
| 1633 | let e = _mm_setr_pd(2.0, 4.0); |
| 1634 | assert_eq_m128d(r, e); |
| 1635 | } |
| 1636 | |
| 1637 | #[simd_test(enable = "sse4.1" )] |
| 1638 | unsafe fn test_mm_ceil_ps() { |
| 1639 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
| 1640 | let r = _mm_ceil_ps(a); |
| 1641 | let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); |
| 1642 | assert_eq_m128(r, e); |
| 1643 | } |
| 1644 | |
| 1645 | #[simd_test(enable = "sse4.1" )] |
| 1646 | unsafe fn test_mm_ceil_sd() { |
| 1647 | let a = _mm_setr_pd(1.5, 3.5); |
| 1648 | let b = _mm_setr_pd(-2.5, -4.5); |
| 1649 | let r = _mm_ceil_sd(a, b); |
| 1650 | let e = _mm_setr_pd(-2.0, 3.5); |
| 1651 | assert_eq_m128d(r, e); |
| 1652 | } |
| 1653 | |
| 1654 | #[simd_test(enable = "sse4.1" )] |
| 1655 | unsafe fn test_mm_ceil_ss() { |
| 1656 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
| 1657 | let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5); |
| 1658 | let r = _mm_ceil_ss(a, b); |
| 1659 | let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); |
| 1660 | assert_eq_m128(r, e); |
| 1661 | } |
| 1662 | |
| 1663 | #[simd_test(enable = "sse4.1" )] |
| 1664 | unsafe fn test_mm_round_pd() { |
| 1665 | let a = _mm_setr_pd(1.25, 3.75); |
| 1666 | let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a); |
| 1667 | let e = _mm_setr_pd(1.0, 4.0); |
| 1668 | assert_eq_m128d(r, e); |
| 1669 | } |
| 1670 | |
| 1671 | #[simd_test(enable = "sse4.1" )] |
| 1672 | unsafe fn test_mm_round_ps() { |
| 1673 | let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25); |
| 1674 | let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a); |
| 1675 | let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0); |
| 1676 | assert_eq_m128(r, e); |
| 1677 | } |
| 1678 | |
| 1679 | #[simd_test(enable = "sse4.1" )] |
| 1680 | unsafe fn test_mm_round_sd() { |
| 1681 | let a = _mm_setr_pd(1.5, 3.5); |
| 1682 | let b = _mm_setr_pd(-2.5, -4.5); |
| 1683 | let r = _mm_round_sd::<_MM_FROUND_TO_NEAREST_INT>(a, b); |
| 1684 | let e = _mm_setr_pd(-2.0, 3.5); |
| 1685 | assert_eq_m128d(r, e); |
| 1686 | |
| 1687 | let a = _mm_setr_pd(1.5, 3.5); |
| 1688 | let b = _mm_setr_pd(-2.5, -4.5); |
| 1689 | let r = _mm_round_sd::<_MM_FROUND_TO_NEG_INF>(a, b); |
| 1690 | let e = _mm_setr_pd(-3.0, 3.5); |
| 1691 | assert_eq_m128d(r, e); |
| 1692 | |
| 1693 | let a = _mm_setr_pd(1.5, 3.5); |
| 1694 | let b = _mm_setr_pd(-2.5, -4.5); |
| 1695 | let r = _mm_round_sd::<_MM_FROUND_TO_POS_INF>(a, b); |
| 1696 | let e = _mm_setr_pd(-2.0, 3.5); |
| 1697 | assert_eq_m128d(r, e); |
| 1698 | |
| 1699 | let a = _mm_setr_pd(1.5, 3.5); |
| 1700 | let b = _mm_setr_pd(-2.5, -4.5); |
| 1701 | let r = _mm_round_sd::<_MM_FROUND_TO_ZERO>(a, b); |
| 1702 | let e = _mm_setr_pd(-2.0, 3.5); |
| 1703 | assert_eq_m128d(r, e); |
| 1704 | } |
| 1705 | |
| 1706 | #[simd_test(enable = "sse4.1" )] |
| 1707 | unsafe fn test_mm_round_ss() { |
| 1708 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
| 1709 | let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); |
| 1710 | let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(a, b); |
| 1711 | let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); |
| 1712 | assert_eq_m128(r, e); |
| 1713 | |
| 1714 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
| 1715 | let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); |
| 1716 | let r = _mm_round_ss::<_MM_FROUND_TO_NEG_INF>(a, b); |
| 1717 | let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); |
| 1718 | assert_eq_m128(r, e); |
| 1719 | |
| 1720 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
| 1721 | let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); |
| 1722 | let r = _mm_round_ss::<_MM_FROUND_TO_POS_INF>(a, b); |
| 1723 | let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5); |
| 1724 | assert_eq_m128(r, e); |
| 1725 | |
| 1726 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
| 1727 | let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); |
| 1728 | let r = _mm_round_ss::<_MM_FROUND_TO_ZERO>(a, b); |
| 1729 | let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5); |
| 1730 | assert_eq_m128(r, e); |
| 1731 | } |
| 1732 | |
| 1733 | #[simd_test(enable = "sse4.1" )] |
| 1734 | unsafe fn test_mm_minpos_epu16_1() { |
| 1735 | let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66); |
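// The horizontal minimum (13) lands in lane 0, its index (5) in lane 1,
// and the remaining lanes are zeroed.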
| 1736 | let r = _mm_minpos_epu16(a); |
| 1737 | let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0); |
| 1738 | assert_eq_m128i(r, e); |
| 1739 | } |
| 1740 | |
| 1741 | #[simd_test(enable = "sse4.1" )] |
| 1742 | unsafe fn test_mm_minpos_epu16_2() { |
| 1743 | let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66); |
| 1744 | let r = _mm_minpos_epu16(a); |
| 1745 | let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0); |
| 1746 | assert_eq_m128i(r, e); |
| 1747 | } |
| 1748 | |
| 1749 | #[simd_test(enable = "sse4.1" )] |
| 1750 | unsafe fn test_mm_minpos_epu16_3() { |
| 1751 | // Case where the minimum value is repeated |
| 1752 | let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 13); |
| 1753 | let r = _mm_minpos_epu16(a); |
| 1754 | let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0); |
| 1755 | assert_eq_m128i(r, e); |
| 1756 | } |
| 1757 | |
| 1758 | #[simd_test(enable = "sse4.1" )] |
| 1759 | unsafe fn test_mm_mul_epi32() { |
| 1760 | { |
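// Only the even-indexed 32-bit lanes (0 and 2) participate; each is
// sign-extended and the products fill the two 64-bit result lanes.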
| 1761 | let a = _mm_setr_epi32(1, 1, 1, 1); |
| 1762 | let b = _mm_setr_epi32(1, 2, 3, 4); |
| 1763 | let r = _mm_mul_epi32(a, b); |
| 1764 | let e = _mm_setr_epi64x(1, 3); |
| 1765 | assert_eq_m128i(r, e); |
| 1766 | } |
| 1767 | { |
| 1768 | let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */); |
| 1769 | let b = _mm_setr_epi32( |
| 1770 | -20, -256, /* ignored */ |
| 1771 | 666666, 666666, /* ignored */ |
| 1772 | ); |
| 1773 | let r = _mm_mul_epi32(a, b); |
| 1774 | let e = _mm_setr_epi64x(-300, 823043843622); |
| 1775 | assert_eq_m128i(r, e); |
| 1776 | } |
| 1777 | } |
| 1778 | |
| 1779 | #[simd_test(enable = "sse4.1" )] |
| 1780 | unsafe fn test_mm_mullo_epi32() { |
| 1781 | { |
| 1782 | let a = _mm_setr_epi32(1, 1, 1, 1); |
| 1783 | let b = _mm_setr_epi32(1, 2, 3, 4); |
| 1784 | let r = _mm_mullo_epi32(a, b); |
| 1785 | let e = _mm_setr_epi32(1, 2, 3, 4); |
| 1786 | assert_eq_m128i(r, e); |
| 1787 | } |
| 1788 | { |
| 1789 | let a = _mm_setr_epi32(15, -2, 1234567, 99999); |
| 1790 | let b = _mm_setr_epi32(-20, -256, 666666, -99999); |
| 1791 | let r = _mm_mullo_epi32(a, b); |
// Note that the most significant bit of the truncated 32-bit product is
// interpreted as a sign bit, so r[2] wraps to a negative value:
// 1234567 * 666666 = -1589877210 (mod 2^32)
| 1795 | let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409); |
| 1796 | assert_eq_m128i(r, e); |
| 1797 | } |
| 1798 | } |
| 1799 | |
| 1800 | #[simd_test(enable = "sse4.1" )] |
| 1801 | unsafe fn test_mm_minpos_epu16() { |
| 1802 | let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3); |
| 1803 | let r = _mm_minpos_epu16(a); |
| 1804 | let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0); |
| 1805 | assert_eq_m128i(r, e); |
| 1806 | } |
| 1807 | |
| 1808 | #[simd_test(enable = "sse4.1" )] |
| 1809 | unsafe fn test_mm_mpsadbw_epu8() { |
| 1810 | #[rustfmt::skip] |
| 1811 | let a = _mm_setr_epi8( |
| 1812 | 0, 1, 2, 3, 4, 5, 6, 7, |
| 1813 | 8, 9, 10, 11, 12, 13, 14, 15, |
| 1814 | ); |
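// Bit 2 of the immediate selects whether the sliding 11-byte window in `a`
// starts at byte 0 or byte 4; bits [1:0] select the aligned 4-byte block of
// `b`. Each result lane is the sum of absolute differences for one window
// position.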
| 1815 | |
| 1816 | let r = _mm_mpsadbw_epu8::<0b000>(a, a); |
| 1817 | let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); |
| 1818 | assert_eq_m128i(r, e); |
| 1819 | |
| 1820 | let r = _mm_mpsadbw_epu8::<0b001>(a, a); |
| 1821 | let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12); |
| 1822 | assert_eq_m128i(r, e); |
| 1823 | |
| 1824 | let r = _mm_mpsadbw_epu8::<0b100>(a, a); |
| 1825 | let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44); |
| 1826 | assert_eq_m128i(r, e); |
| 1827 | |
| 1828 | let r = _mm_mpsadbw_epu8::<0b101>(a, a); |
| 1829 | let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); |
| 1830 | assert_eq_m128i(r, e); |
| 1831 | |
| 1832 | let r = _mm_mpsadbw_epu8::<0b111>(a, a); |
| 1833 | let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4); |
| 1834 | assert_eq_m128i(r, e); |
| 1835 | } |
| 1836 | |
| 1837 | #[simd_test(enable = "sse4.1" )] |
| 1838 | unsafe fn test_mm_testz_si128() { |
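// `_mm_testz_si128` returns 1 exactly when `a AND mask` is all zeros (the
// ZF result of `ptest`).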
| 1839 | let a = _mm_set1_epi8(1); |
| 1840 | let mask = _mm_set1_epi8(0); |
| 1841 | let r = _mm_testz_si128(a, mask); |
| 1842 | assert_eq!(r, 1); |
| 1843 | let a = _mm_set1_epi8(0b101); |
| 1844 | let mask = _mm_set1_epi8(0b110); |
| 1845 | let r = _mm_testz_si128(a, mask); |
| 1846 | assert_eq!(r, 0); |
| 1847 | let a = _mm_set1_epi8(0b011); |
| 1848 | let mask = _mm_set1_epi8(0b100); |
| 1849 | let r = _mm_testz_si128(a, mask); |
| 1850 | assert_eq!(r, 1); |
| 1851 | } |
| 1852 | |
| 1853 | #[simd_test(enable = "sse4.1" )] |
| 1854 | unsafe fn test_mm_testc_si128() { |
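// `_mm_testc_si128` returns 1 exactly when `(NOT a) AND mask` is all zeros,
// i.e. every bit set in `mask` is also set in `a` (the CF result of `ptest`).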
| 1855 | let a = _mm_set1_epi8(-1); |
| 1856 | let mask = _mm_set1_epi8(0); |
| 1857 | let r = _mm_testc_si128(a, mask); |
| 1858 | assert_eq!(r, 1); |
| 1859 | let a = _mm_set1_epi8(0b101); |
| 1860 | let mask = _mm_set1_epi8(0b110); |
| 1861 | let r = _mm_testc_si128(a, mask); |
| 1862 | assert_eq!(r, 0); |
| 1863 | let a = _mm_set1_epi8(0b101); |
| 1864 | let mask = _mm_set1_epi8(0b100); |
| 1865 | let r = _mm_testc_si128(a, mask); |
| 1866 | assert_eq!(r, 1); |
| 1867 | } |
| 1868 | |
| 1869 | #[simd_test(enable = "sse4.1" )] |
| 1870 | unsafe fn test_mm_testnzc_si128() { |
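// `_mm_testnzc_si128` returns 1 exactly when `a AND mask` and `(NOT a) AND
// mask` are both non-zero, i.e. the masked bits are a mix of ones and zeros.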
| 1871 | let a = _mm_set1_epi8(0); |
| 1872 | let mask = _mm_set1_epi8(1); |
| 1873 | let r = _mm_testnzc_si128(a, mask); |
| 1874 | assert_eq!(r, 0); |
| 1875 | let a = _mm_set1_epi8(-1); |
| 1876 | let mask = _mm_set1_epi8(0); |
| 1877 | let r = _mm_testnzc_si128(a, mask); |
| 1878 | assert_eq!(r, 0); |
| 1879 | let a = _mm_set1_epi8(0b101); |
| 1880 | let mask = _mm_set1_epi8(0b110); |
| 1881 | let r = _mm_testnzc_si128(a, mask); |
| 1882 | assert_eq!(r, 1); |
| 1883 | let a = _mm_set1_epi8(0b101); |
| 1884 | let mask = _mm_set1_epi8(0b101); |
| 1885 | let r = _mm_testnzc_si128(a, mask); |
| 1886 | assert_eq!(r, 0); |
| 1887 | } |
| 1888 | |
| 1889 | #[simd_test(enable = "sse4.1" )] |
| 1890 | unsafe fn test_mm_test_all_zeros() { |
| 1891 | let a = _mm_set1_epi8(1); |
| 1892 | let mask = _mm_set1_epi8(0); |
| 1893 | let r = _mm_test_all_zeros(a, mask); |
| 1894 | assert_eq!(r, 1); |
| 1895 | let a = _mm_set1_epi8(0b101); |
| 1896 | let mask = _mm_set1_epi8(0b110); |
| 1897 | let r = _mm_test_all_zeros(a, mask); |
| 1898 | assert_eq!(r, 0); |
| 1899 | let a = _mm_set1_epi8(0b011); |
| 1900 | let mask = _mm_set1_epi8(0b100); |
| 1901 | let r = _mm_test_all_zeros(a, mask); |
| 1902 | assert_eq!(r, 1); |
| 1903 | } |
| 1904 | |
| 1905 | #[simd_test(enable = "sse4.1" )] |
| 1906 | unsafe fn test_mm_test_all_ones() { |
| 1907 | let a = _mm_set1_epi8(-1); |
| 1908 | let r = _mm_test_all_ones(a); |
| 1909 | assert_eq!(r, 1); |
| 1910 | let a = _mm_set1_epi8(0b101); |
| 1911 | let r = _mm_test_all_ones(a); |
| 1912 | assert_eq!(r, 0); |
| 1913 | } |
| 1914 | |
| 1915 | #[simd_test(enable = "sse4.1" )] |
| 1916 | unsafe fn test_mm_test_mix_ones_zeros() { |
| 1917 | let a = _mm_set1_epi8(0); |
| 1918 | let mask = _mm_set1_epi8(1); |
| 1919 | let r = _mm_test_mix_ones_zeros(a, mask); |
| 1920 | assert_eq!(r, 0); |
| 1921 | let a = _mm_set1_epi8(-1); |
| 1922 | let mask = _mm_set1_epi8(0); |
| 1923 | let r = _mm_test_mix_ones_zeros(a, mask); |
| 1924 | assert_eq!(r, 0); |
| 1925 | let a = _mm_set1_epi8(0b101); |
| 1926 | let mask = _mm_set1_epi8(0b110); |
| 1927 | let r = _mm_test_mix_ones_zeros(a, mask); |
| 1928 | assert_eq!(r, 1); |
| 1929 | let a = _mm_set1_epi8(0b101); |
| 1930 | let mask = _mm_set1_epi8(0b101); |
| 1931 | let r = _mm_test_mix_ones_zeros(a, mask); |
| 1932 | assert_eq!(r, 0); |
| 1933 | } |
| 1934 | |
| 1935 | #[simd_test(enable = "sse4.1" )] |
| 1936 | unsafe fn test_mm_stream_load_si128() { |
| 1937 | let a = _mm_set_epi64x(5, 6); |
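// `__m128i` is 16-byte aligned, so the address of `a` satisfies the
// alignment requirement of the non-temporal load.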
| 1938 | let r = _mm_stream_load_si128(core::ptr::addr_of!(a) as *const _); |
| 1939 | assert_eq_m128i(a, r); |
| 1940 | } |
| 1941 | } |
| 1942 | |