1//! Streaming SIMD Extensions 4.1 (SSE4.1)
2
3use crate::core_arch::{simd::*, x86::*};
4use crate::intrinsics::simd::*;
5
6#[cfg(test)]
7use stdarch_test::assert_instr;
8
// SSE4 rounding constants.
// Values 0x00-0x03 select an explicit rounding mode, 0x04 defers to the
// MXCSR.RC field; OR-ing in `_MM_FROUND_NO_EXC` (0x08) suppresses precision
// exceptions. These combine to form the `ROUNDING` immediate accepted by the
// `_mm_round_*` family of intrinsics.
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;
50
51/// Blend packed 8-bit integers from `a` and `b` using `mask`
52///
53/// The high bit of each corresponding mask byte determines the selection.
54/// If the high bit is set, the element of `b` is selected.
55/// Otherwise, the element of `a` is selected.
56///
57/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8)
58#[inline]
59#[target_feature(enable = "sse4.1")]
60#[cfg_attr(test, assert_instr(pblendvb))]
61#[stable(feature = "simd_x86", since = "1.27.0")]
62pub fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
63 unsafe {
64 let mask: i8x16 = simd_lt(x:mask.as_i8x16(), y:i8x16::ZERO);
65 transmute(src:simd_select(mask, if_true:b.as_i8x16(), if_false:a.as_i8x16()))
66 }
67}
68
69/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
70///
71/// The mask bits determine the selection. A clear bit selects the
72/// corresponding element of `a`, and a set bit the corresponding
73/// element of `b`.
74///
75/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
76#[inline]
77#[target_feature(enable = "sse4.1")]
78#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
79#[rustc_legacy_const_generics(2)]
80#[stable(feature = "simd_x86", since = "1.27.0")]
81pub fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
82 static_assert_uimm_bits!(IMM8, 8);
83 unsafe {
84 transmute::<i16x8, _>(src:simd_shuffle!(
85 a.as_i16x8(),
86 b.as_i16x8(),
87 [
88 [0, 8][IMM8 as usize & 1],
89 [1, 9][(IMM8 >> 1) as usize & 1],
90 [2, 10][(IMM8 >> 2) as usize & 1],
91 [3, 11][(IMM8 >> 3) as usize & 1],
92 [4, 12][(IMM8 >> 4) as usize & 1],
93 [5, 13][(IMM8 >> 5) as usize & 1],
94 [6, 14][(IMM8 >> 6) as usize & 1],
95 [7, 15][(IMM8 >> 7) as usize & 1],
96 ]
97 ))
98 }
99}
100
101/// Blend packed double-precision (64-bit) floating-point elements from `a`
102/// and `b` using `mask`
103///
104/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd)
105#[inline]
106#[target_feature(enable = "sse4.1")]
107#[cfg_attr(test, assert_instr(blendvpd))]
108#[stable(feature = "simd_x86", since = "1.27.0")]
109pub fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
110 unsafe {
111 let mask: i64x2 = simd_lt(x:transmute::<_, i64x2>(mask), y:i64x2::ZERO);
112 transmute(src:simd_select(mask, if_true:b.as_f64x2(), if_false:a.as_f64x2()))
113 }
114}
115
116/// Blend packed single-precision (32-bit) floating-point elements from `a`
117/// and `b` using `mask`
118///
119/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps)
120#[inline]
121#[target_feature(enable = "sse4.1")]
122#[cfg_attr(test, assert_instr(blendvps))]
123#[stable(feature = "simd_x86", since = "1.27.0")]
124pub fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
125 unsafe {
126 let mask: i32x4 = simd_lt(x:transmute::<_, i32x4>(mask), y:i32x4::ZERO);
127 transmute(src:simd_select(mask, if_true:b.as_f32x4(), if_false:a.as_f32x4()))
128 }
129}
130
131/// Blend packed double-precision (64-bit) floating-point elements from `a`
132/// and `b` using control mask `IMM2`
133///
134/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd)
135#[inline]
136#[target_feature(enable = "sse4.1")]
137// Note: LLVM7 prefers the single-precision floating-point domain when possible
138// see https://bugs.llvm.org/show_bug.cgi?id=38195
139// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
140#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
141#[rustc_legacy_const_generics(2)]
142#[stable(feature = "simd_x86", since = "1.27.0")]
143pub fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
144 static_assert_uimm_bits!(IMM2, 2);
145 unsafe {
146 transmute::<f64x2, _>(src:simd_shuffle!(
147 a.as_f64x2(),
148 b.as_f64x2(),
149 [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
150 ))
151 }
152}
153
154/// Blend packed single-precision (32-bit) floating-point elements from `a`
155/// and `b` using mask `IMM4`
156///
157/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps)
158#[inline]
159#[target_feature(enable = "sse4.1")]
160#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
161#[rustc_legacy_const_generics(2)]
162#[stable(feature = "simd_x86", since = "1.27.0")]
163pub fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
164 static_assert_uimm_bits!(IMM4, 4);
165 unsafe {
166 transmute::<f32x4, _>(src:simd_shuffle!(
167 a.as_f32x4(),
168 b.as_f32x4(),
169 [
170 [0, 4][IMM4 as usize & 1],
171 [1, 5][(IMM4 >> 1) as usize & 1],
172 [2, 6][(IMM4 >> 2) as usize & 1],
173 [3, 7][(IMM4 >> 3) as usize & 1],
174 ]
175 ))
176 }
177}
178
/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
/// and may be converted back to a floating point number via casting.
///
/// # Example
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)] // FIXME remove after stdarch bump in rustc
/// # unsafe fn worker() { unsafe {
/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
/// let x: i32 = _mm_extract_ps::<2>(simd_floats);
/// float_store.push(f32::from_bits(x as u32));
/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
/// # }}
/// # unsafe { worker() }
/// # }
/// # }
/// ```
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(extractps, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
    // Only 4 lanes, so the immediate must fit in 2 bits.
    static_assert_uimm_bits!(IMM8, 2);
    // `to_bits` returns the raw IEEE-754 encoding; the `as i32` cast is a
    // plain reinterpretation, matching the instruction's GPR result.
    unsafe { simd_extract!(a, IMM8 as u32, f32).to_bits() as i32 }
}
214
/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
    // 16 byte lanes, so the immediate must fit in 4 bits.
    static_assert_uimm_bits!(IMM8, 4);
    // Extract as `u8` so the widening cast to i32 zero-extends, per the
    // documented contract above.
    unsafe { simd_extract!(a.as_u8x16(), IMM8 as u32, u8) as i32 }
}
230
/// Extracts a 32-bit integer from `a` selected with `IMM8`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(extractps, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
    // Only 4 lanes, so the immediate must fit in 2 bits.
    static_assert_uimm_bits!(IMM8, 2);
    unsafe { simd_extract!(a.as_i32x4(), IMM8 as u32, i32) }
}
243
/// Select a single value in `b` to store at some position in `a`,
/// Then zero elements according to `IMM8`.
///
/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
///   from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
///   element is cleared.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegates to the `insertps` intrinsic binding (declared elsewhere in
    // this module); the complex immediate semantics are handled by hardware.
    unsafe { insertps(a, b, IMM8 as u8) }
}
277
278/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
279/// location specified by `IMM8`.
280///
281/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8)
282#[inline]
283#[target_feature(enable = "sse4.1")]
284#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
285#[rustc_legacy_const_generics(2)]
286#[stable(feature = "simd_x86", since = "1.27.0")]
287pub fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
288 static_assert_uimm_bits!(IMM8, 4);
289 unsafe { transmute(src:simd_insert!(a.as_i8x16(), IMM8 as u32, i as i8)) }
290}
291
292/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
293/// location specified by `IMM8`.
294///
295/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32)
296#[inline]
297#[target_feature(enable = "sse4.1")]
298#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
299#[rustc_legacy_const_generics(2)]
300#[stable(feature = "simd_x86", since = "1.27.0")]
301pub fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
302 static_assert_uimm_bits!(IMM8, 2);
303 unsafe { transmute(src:simd_insert!(a.as_i32x4(), IMM8 as u32, i)) }
304}
305
306/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
307/// values in dst.
308///
309/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8)
310#[inline]
311#[target_feature(enable = "sse4.1")]
312#[cfg_attr(test, assert_instr(pmaxsb))]
313#[stable(feature = "simd_x86", since = "1.27.0")]
314pub fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
315 unsafe {
316 let a: i8x16 = a.as_i8x16();
317 let b: i8x16 = b.as_i8x16();
318 transmute(src:simd_select::<i8x16, _>(mask:simd_gt(a, b), if_true:a, if_false:b))
319 }
320}
321
322/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
323/// maximum.
324///
325/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16)
326#[inline]
327#[target_feature(enable = "sse4.1")]
328#[cfg_attr(test, assert_instr(pmaxuw))]
329#[stable(feature = "simd_x86", since = "1.27.0")]
330pub fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
331 unsafe {
332 let a: u16x8 = a.as_u16x8();
333 let b: u16x8 = b.as_u16x8();
334 transmute(src:simd_select::<i16x8, _>(mask:simd_gt(a, b), if_true:a, if_false:b))
335 }
336}
337
338/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
339/// values.
340///
341/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32)
342#[inline]
343#[target_feature(enable = "sse4.1")]
344#[cfg_attr(test, assert_instr(pmaxsd))]
345#[stable(feature = "simd_x86", since = "1.27.0")]
346pub fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
347 unsafe {
348 let a: i32x4 = a.as_i32x4();
349 let b: i32x4 = b.as_i32x4();
350 transmute(src:simd_select::<i32x4, _>(mask:simd_gt(a, b), if_true:a, if_false:b))
351 }
352}
353
354/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
355/// maximum values.
356///
357/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32)
358#[inline]
359#[target_feature(enable = "sse4.1")]
360#[cfg_attr(test, assert_instr(pmaxud))]
361#[stable(feature = "simd_x86", since = "1.27.0")]
362pub fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
363 unsafe {
364 let a: u32x4 = a.as_u32x4();
365 let b: u32x4 = b.as_u32x4();
366 transmute(src:simd_select::<i32x4, _>(mask:simd_gt(a, b), if_true:a, if_false:b))
367 }
368}
369
370/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
371/// values in dst.
372///
373/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8)
374#[inline]
375#[target_feature(enable = "sse4.1")]
376#[cfg_attr(test, assert_instr(pminsb))]
377#[stable(feature = "simd_x86", since = "1.27.0")]
378pub fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
379 unsafe {
380 let a: i8x16 = a.as_i8x16();
381 let b: i8x16 = b.as_i8x16();
382 transmute(src:simd_select::<i8x16, _>(mask:simd_lt(a, b), if_true:a, if_false:b))
383 }
384}
385
386/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
387/// minimum.
388///
389/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16)
390#[inline]
391#[target_feature(enable = "sse4.1")]
392#[cfg_attr(test, assert_instr(pminuw))]
393#[stable(feature = "simd_x86", since = "1.27.0")]
394pub fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
395 unsafe {
396 let a: u16x8 = a.as_u16x8();
397 let b: u16x8 = b.as_u16x8();
398 transmute(src:simd_select::<i16x8, _>(mask:simd_lt(a, b), if_true:a, if_false:b))
399 }
400}
401
402/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
403/// values.
404///
405/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32)
406#[inline]
407#[target_feature(enable = "sse4.1")]
408#[cfg_attr(test, assert_instr(pminsd))]
409#[stable(feature = "simd_x86", since = "1.27.0")]
410pub fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
411 unsafe {
412 let a: i32x4 = a.as_i32x4();
413 let b: i32x4 = b.as_i32x4();
414 transmute(src:simd_select::<i32x4, _>(mask:simd_lt(a, b), if_true:a, if_false:b))
415 }
416}
417
418/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
419/// minimum values.
420///
421/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32)
422#[inline]
423#[target_feature(enable = "sse4.1")]
424#[cfg_attr(test, assert_instr(pminud))]
425#[stable(feature = "simd_x86", since = "1.27.0")]
426pub fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
427 unsafe {
428 let a: u32x4 = a.as_u32x4();
429 let b: u32x4 = b.as_u32x4();
430 transmute(src:simd_select::<i32x4, _>(mask:simd_lt(a, b), if_true:a, if_false:b))
431 }
432}
433
434/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
435/// using unsigned saturation
436///
437/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32)
438#[inline]
439#[target_feature(enable = "sse4.1")]
440#[cfg_attr(test, assert_instr(packusdw))]
441#[stable(feature = "simd_x86", since = "1.27.0")]
442pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
443 unsafe { transmute(src:packusdw(a.as_i32x4(), b.as_i32x4())) }
444}
445
446/// Compares packed 64-bit integers in `a` and `b` for equality
447///
448/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64)
449#[inline]
450#[target_feature(enable = "sse4.1")]
451#[cfg_attr(test, assert_instr(pcmpeqq))]
452#[stable(feature = "simd_x86", since = "1.27.0")]
453pub fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
454 unsafe { transmute(src:simd_eq::<_, i64x2>(x:a.as_i64x2(), y:b.as_i64x2())) }
455}
456
457/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
458///
459/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16)
460#[inline]
461#[target_feature(enable = "sse4.1")]
462#[cfg_attr(test, assert_instr(pmovsxbw))]
463#[stable(feature = "simd_x86", since = "1.27.0")]
464pub fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
465 unsafe {
466 let a: i8x16 = a.as_i8x16();
467 let a: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
468 transmute(src:simd_cast::<_, i16x8>(a))
469 }
470}
471
472/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
473///
474/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32)
475#[inline]
476#[target_feature(enable = "sse4.1")]
477#[cfg_attr(test, assert_instr(pmovsxbd))]
478#[stable(feature = "simd_x86", since = "1.27.0")]
479pub fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
480 unsafe {
481 let a: i8x16 = a.as_i8x16();
482 let a: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
483 transmute(src:simd_cast::<_, i32x4>(a))
484 }
485}
486
487/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
488/// 64-bit integers
489///
490/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64)
491#[inline]
492#[target_feature(enable = "sse4.1")]
493#[cfg_attr(test, assert_instr(pmovsxbq))]
494#[stable(feature = "simd_x86", since = "1.27.0")]
495pub fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
496 unsafe {
497 let a: i8x16 = a.as_i8x16();
498 let a: i8x2 = simd_shuffle!(a, a, [0, 1]);
499 transmute(src:simd_cast::<_, i64x2>(a))
500 }
501}
502
503/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
504///
505/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32)
506#[inline]
507#[target_feature(enable = "sse4.1")]
508#[cfg_attr(test, assert_instr(pmovsxwd))]
509#[stable(feature = "simd_x86", since = "1.27.0")]
510pub fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
511 unsafe {
512 let a: i16x8 = a.as_i16x8();
513 let a: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
514 transmute(src:simd_cast::<_, i32x4>(a))
515 }
516}
517
518/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
519///
520/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64)
521#[inline]
522#[target_feature(enable = "sse4.1")]
523#[cfg_attr(test, assert_instr(pmovsxwq))]
524#[stable(feature = "simd_x86", since = "1.27.0")]
525pub fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
526 unsafe {
527 let a: i16x8 = a.as_i16x8();
528 let a: i16x2 = simd_shuffle!(a, a, [0, 1]);
529 transmute(src:simd_cast::<_, i64x2>(a))
530 }
531}
532
533/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
534///
535/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64)
536#[inline]
537#[target_feature(enable = "sse4.1")]
538#[cfg_attr(test, assert_instr(pmovsxdq))]
539#[stable(feature = "simd_x86", since = "1.27.0")]
540pub fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
541 unsafe {
542 let a: i32x4 = a.as_i32x4();
543 let a: i32x2 = simd_shuffle!(a, a, [0, 1]);
544 transmute(src:simd_cast::<_, i64x2>(a))
545 }
546}
547
548/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
549///
550/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16)
551#[inline]
552#[target_feature(enable = "sse4.1")]
553#[cfg_attr(test, assert_instr(pmovzxbw))]
554#[stable(feature = "simd_x86", since = "1.27.0")]
555pub fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
556 unsafe {
557 let a: u8x16 = a.as_u8x16();
558 let a: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
559 transmute(src:simd_cast::<_, i16x8>(a))
560 }
561}
562
563/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
564///
565/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32)
566#[inline]
567#[target_feature(enable = "sse4.1")]
568#[cfg_attr(test, assert_instr(pmovzxbd))]
569#[stable(feature = "simd_x86", since = "1.27.0")]
570pub fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
571 unsafe {
572 let a: u8x16 = a.as_u8x16();
573 let a: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
574 transmute(src:simd_cast::<_, i32x4>(a))
575 }
576}
577
578/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
579///
580/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64)
581#[inline]
582#[target_feature(enable = "sse4.1")]
583#[cfg_attr(test, assert_instr(pmovzxbq))]
584#[stable(feature = "simd_x86", since = "1.27.0")]
585pub fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
586 unsafe {
587 let a: u8x16 = a.as_u8x16();
588 let a: u8x2 = simd_shuffle!(a, a, [0, 1]);
589 transmute(src:simd_cast::<_, i64x2>(a))
590 }
591}
592
593/// Zeroes extend packed unsigned 16-bit integers in `a`
594/// to packed 32-bit integers
595///
596/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32)
597#[inline]
598#[target_feature(enable = "sse4.1")]
599#[cfg_attr(test, assert_instr(pmovzxwd))]
600#[stable(feature = "simd_x86", since = "1.27.0")]
601pub fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
602 unsafe {
603 let a: u16x8 = a.as_u16x8();
604 let a: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
605 transmute(src:simd_cast::<_, i32x4>(a))
606 }
607}
608
609/// Zeroes extend packed unsigned 16-bit integers in `a`
610/// to packed 64-bit integers
611///
612/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64)
613#[inline]
614#[target_feature(enable = "sse4.1")]
615#[cfg_attr(test, assert_instr(pmovzxwq))]
616#[stable(feature = "simd_x86", since = "1.27.0")]
617pub fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
618 unsafe {
619 let a: u16x8 = a.as_u16x8();
620 let a: u16x2 = simd_shuffle!(a, a, [0, 1]);
621 transmute(src:simd_cast::<_, i64x2>(a))
622 }
623}
624
625/// Zeroes extend packed unsigned 32-bit integers in `a`
626/// to packed 64-bit integers
627///
628/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64)
629#[inline]
630#[target_feature(enable = "sse4.1")]
631#[cfg_attr(test, assert_instr(pmovzxdq))]
632#[stable(feature = "simd_x86", since = "1.27.0")]
633pub fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
634 unsafe {
635 let a: u32x4 = a.as_u32x4();
636 let a: u32x2 = simd_shuffle!(a, a, [0, 1]);
637 transmute(src:simd_cast::<_, i64x2>(a))
638 }
639}
640
641/// Returns the dot product of two __m128d vectors.
642///
643/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
644/// If a condition mask bit is zero, the corresponding multiplication is
645/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
646/// the dot product will be stored in the return value component. Otherwise if
647/// the broadcast mask bit is zero then the return component will be zero.
648///
649/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd)
650#[inline]
651#[target_feature(enable = "sse4.1")]
652#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
653#[rustc_legacy_const_generics(2)]
654#[stable(feature = "simd_x86", since = "1.27.0")]
655pub fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
656 unsafe {
657 static_assert_uimm_bits!(IMM8, 8);
658 dppd(a, b, IMM8 as u8)
659 }
660}
661
/// Returns the dot product of two __m128 vectors.
///
/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegates to the DPPS intrinsic binding declared elsewhere in this
    // module; mask semantics are handled by hardware.
    unsafe { dpps(a, b, IMM8 as u8) }
}
680
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_pd(a: __m128d) -> __m128d {
    // The generic `simd_floor` lowers to ROUNDPD here (checked by the
    // `assert_instr` attribute above).
    unsafe { simd_floor(a) }
}
693
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_ps(a: __m128) -> __m128 {
    // The generic `simd_floor` lowers to ROUNDPS here (checked by the
    // `assert_instr` attribute above).
    unsafe { simd_floor(a) }
}
706
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    // ROUNDSD with the FLOOR rounding immediate (round toward -inf,
    // exceptions not suppressed).
    unsafe { roundsd(a, b, _MM_FROUND_FLOOR) }
}
721
/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    // ROUNDSS with the FLOOR rounding immediate (round toward -inf,
    // exceptions not suppressed).
    unsafe { roundss(a, b, _MM_FROUND_FLOOR) }
}
736
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ceil_pd(a: __m128d) -> __m128d {
    // The generic `simd_ceil` lowers to ROUNDPD here (checked by the
    // `assert_instr` attribute above).
    unsafe { simd_ceil(a) }
}
749
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ceil_ps(a: __m128) -> __m128 {
    // The generic `simd_ceil` lowers to ROUNDPS here (checked by the
    // `assert_instr` attribute above).
    unsafe { simd_ceil(a) }
}
762
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    // ROUNDSD with the CEIL rounding immediate (round toward +inf,
    // exceptions not suppressed).
    unsafe { roundsd(a, b, _MM_FROUND_CEIL) }
}
777
778/// Round the lower single-precision (32-bit) floating-point element in `b`
779/// up to an integer value, store the result as a single-precision
780/// floating-point element in the lower element of the intrinsic result,
781/// and copies the upper 3 packed elements from `a` to the upper elements
782/// of the intrinsic result.
783///
784/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss)
785#[inline]
786#[target_feature(enable = "sse4.1")]
787#[cfg_attr(test, assert_instr(roundss))]
788#[stable(feature = "simd_x86", since = "1.27.0")]
789pub fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
790 unsafe { roundss(a, b, _MM_FROUND_CEIL) }
791}
792
793/// Round the packed double-precision (64-bit) floating-point elements in `a`
794/// using the `ROUNDING` parameter, and stores the results as packed
795/// double-precision floating-point elements.
796/// Rounding is done according to the rounding parameter, which can be one of:
797///
798/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
799/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
800/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
801/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
802/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
803///
804/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd)
805#[inline]
806#[target_feature(enable = "sse4.1")]
807#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
808#[rustc_legacy_const_generics(1)]
809#[stable(feature = "simd_x86", since = "1.27.0")]
810pub fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
811 static_assert_uimm_bits!(ROUNDING, 4);
812 unsafe { roundpd(a, ROUNDING) }
813}
814
815/// Round the packed single-precision (32-bit) floating-point elements in `a`
816/// using the `ROUNDING` parameter, and stores the results as packed
817/// single-precision floating-point elements.
818/// Rounding is done according to the rounding parameter, which can be one of:
819///
820/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
821/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
822/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
823/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
824/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
825///
826/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps)
827#[inline]
828#[target_feature(enable = "sse4.1")]
829#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
830#[rustc_legacy_const_generics(1)]
831#[stable(feature = "simd_x86", since = "1.27.0")]
832pub fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
833 static_assert_uimm_bits!(ROUNDING, 4);
834 unsafe { roundps(a, ROUNDING) }
835}
836
837/// Round the lower double-precision (64-bit) floating-point element in `b`
838/// using the `ROUNDING` parameter, store the result as a double-precision
839/// floating-point element in the lower element of the intrinsic result,
840/// and copies the upper element from `a` to the upper element of the intrinsic
841/// result.
842/// Rounding is done according to the rounding parameter, which can be one of:
843///
844/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
845/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
846/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
847/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
848/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
849///
850/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd)
851#[inline]
852#[target_feature(enable = "sse4.1")]
853#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
854#[rustc_legacy_const_generics(2)]
855#[stable(feature = "simd_x86", since = "1.27.0")]
856pub fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
857 static_assert_uimm_bits!(ROUNDING, 4);
858 unsafe { roundsd(a, b, ROUNDING) }
859}
860
861/// Round the lower single-precision (32-bit) floating-point element in `b`
862/// using the `ROUNDING` parameter, store the result as a single-precision
863/// floating-point element in the lower element of the intrinsic result,
864/// and copies the upper 3 packed elements from `a` to the upper elements
865/// of the intrinsic result.
866/// Rounding is done according to the rounding parameter, which can be one of:
867///
868/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
869/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
870/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
871/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
872/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
873///
874/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss)
875#[inline]
876#[target_feature(enable = "sse4.1")]
877#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
878#[rustc_legacy_const_generics(2)]
879#[stable(feature = "simd_x86", since = "1.27.0")]
880pub fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
881 static_assert_uimm_bits!(ROUNDING, 4);
882 unsafe { roundss(a, b, ROUNDING) }
883}
884
885/// Finds the minimum unsigned 16-bit element in the 128-bit __m128i vector,
886/// returning a vector containing its value in its first position, and its
887/// index
888/// in its second position; all other elements are set to zero.
889///
890/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
891/// instruction.
892///
893/// Arguments:
894///
895/// * `a` - A 128-bit vector of type `__m128i`.
896///
897/// Returns:
898///
899/// A 128-bit value where:
900///
901/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
902/// * bits `[18:16]` - contain the index of the minimum value
903/// * remaining bits are set to `0`.
904///
905/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16)
906#[inline]
907#[target_feature(enable = "sse4.1")]
908#[cfg_attr(test, assert_instr(phminposuw))]
909#[stable(feature = "simd_x86", since = "1.27.0")]
910pub fn _mm_minpos_epu16(a: __m128i) -> __m128i {
911 unsafe { transmute(src:phminposuw(a.as_u16x8())) }
912}
913
914/// Multiplies the low 32-bit integers from each packed 64-bit
915/// element in `a` and `b`, and returns the signed 64-bit result.
916///
917/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32)
918#[inline]
919#[target_feature(enable = "sse4.1")]
920#[cfg_attr(test, assert_instr(pmuldq))]
921#[stable(feature = "simd_x86", since = "1.27.0")]
922pub fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
923 unsafe {
924 let a: i64x2 = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2()));
925 let b: i64x2 = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2()));
926 transmute(src:simd_mul(x:a, y:b))
927 }
928}
929
930/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate
931/// 64-bit integers, and returns the lowest 32-bit, whatever they might be,
932/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2),
933/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping
934/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would
935/// return a negative number.
936///
937/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32)
938#[inline]
939#[target_feature(enable = "sse4.1")]
940#[cfg_attr(test, assert_instr(pmulld))]
941#[stable(feature = "simd_x86", since = "1.27.0")]
942pub fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
943 unsafe { transmute(src:simd_mul(x:a.as_i32x4(), y:b.as_i32x4())) }
944}
945
946/// Subtracts 8-bit unsigned integer values and computes the absolute
947/// values of the differences to the corresponding bits in the destination.
948/// Then sums of the absolute differences are returned according to the bit
949/// fields in the immediate operand.
950///
951/// The following algorithm is performed:
952///
953/// ```ignore
954/// i = IMM8[2] * 4
955/// j = IMM8[1:0] * 4
956/// for k := 0 to 7
957/// d0 = abs(a[i + k + 0] - b[j + 0])
958/// d1 = abs(a[i + k + 1] - b[j + 1])
959/// d2 = abs(a[i + k + 2] - b[j + 2])
960/// d3 = abs(a[i + k + 3] - b[j + 3])
961/// r[k] = d0 + d1 + d2 + d3
962/// ```
963///
964/// Arguments:
965///
966/// * `a` - A 128-bit vector of type `__m128i`.
967/// * `b` - A 128-bit vector of type `__m128i`.
968/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
969/// differences are to be calculated
970/// * Bit `[2]` specify the offset for operand `a`
971/// * Bits `[1:0]` specify the offset for operand `b`
972///
973/// Returns:
974///
975/// * A `__m128i` vector containing the sums of the sets of absolute
976/// differences between both operands.
977///
978/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8)
979#[inline]
980#[target_feature(enable = "sse4.1")]
981#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
982#[rustc_legacy_const_generics(2)]
983#[stable(feature = "simd_x86", since = "1.27.0")]
984pub fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
985 static_assert_uimm_bits!(IMM8, 3);
986 unsafe { transmute(src:mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8)) }
987}
988
989/// Tests whether the specified bits in a 128-bit integer vector are all
990/// zeros.
991///
992/// Arguments:
993///
994/// * `a` - A 128-bit integer vector containing the bits to be tested.
995/// * `mask` - A 128-bit integer vector selecting which bits to test in
996/// operand `a`.
997///
998/// Returns:
999///
1000/// * `1` - if the specified bits are all zeros,
1001/// * `0` - otherwise.
1002///
1003/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128)
1004#[inline]
1005#[target_feature(enable = "sse4.1")]
1006#[cfg_attr(test, assert_instr(ptest))]
1007#[stable(feature = "simd_x86", since = "1.27.0")]
1008pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
1009 unsafe {
1010 let r: i64 = simd_reduce_or(simd_and(x:a.as_i64x2(), y:mask.as_i64x2()));
1011 (0i64 == r) as i32
1012 }
1013}
1014
1015/// Tests whether the specified bits in a 128-bit integer vector are all
1016/// ones.
1017///
1018/// Arguments:
1019///
1020/// * `a` - A 128-bit integer vector containing the bits to be tested.
1021/// * `mask` - A 128-bit integer vector selecting which bits to test in
1022/// operand `a`.
1023///
1024/// Returns:
1025///
1026/// * `1` - if the specified bits are all ones,
1027/// * `0` - otherwise.
1028///
1029/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128)
1030#[inline]
1031#[target_feature(enable = "sse4.1")]
1032#[cfg_attr(test, assert_instr(ptest))]
1033#[stable(feature = "simd_x86", since = "1.27.0")]
1034pub fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
1035 unsafe {
1036 let r: i64 = simd_reduce_or(simd_and(
1037 x:simd_xor(a.as_i64x2(), i64x2::splat(!0)),
1038 y:mask.as_i64x2(),
1039 ));
1040 (0i64 == r) as i32
1041 }
1042}
1043
1044/// Tests whether the specified bits in a 128-bit integer vector are
1045/// neither all zeros nor all ones.
1046///
1047/// Arguments:
1048///
1049/// * `a` - A 128-bit integer vector containing the bits to be tested.
1050/// * `mask` - A 128-bit integer vector selecting which bits to test in
1051/// operand `a`.
1052///
1053/// Returns:
1054///
1055/// * `1` - if the specified bits are neither all zeros nor all ones,
1056/// * `0` - otherwise.
1057///
1058/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128)
1059#[inline]
1060#[target_feature(enable = "sse4.1")]
1061#[cfg_attr(test, assert_instr(ptest))]
1062#[stable(feature = "simd_x86", since = "1.27.0")]
1063pub fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
1064 unsafe { ptestnzc(a.as_i64x2(), mask.as_i64x2()) }
1065}
1066
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Identical semantics to `_mm_testz_si128`; this alias exists for
    // parity with Intel's intrinsics API.
    _mm_testz_si128(a, mask)
}
1089
1090/// Tests whether the specified bits in `a` 128-bit integer vector are all
1091/// ones.
1092///
1093/// Argument:
1094///
1095/// * `a` - A 128-bit integer vector containing the bits to be tested.
1096///
1097/// Returns:
1098///
1099/// * `1` - if the bits specified in the operand are all set to 1,
1100/// * `0` - otherwise.
1101///
1102/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones)
1103#[inline]
1104#[target_feature(enable = "sse4.1")]
1105#[cfg_attr(test, assert_instr(pcmpeqd))]
1106#[cfg_attr(test, assert_instr(ptest))]
1107#[stable(feature = "simd_x86", since = "1.27.0")]
1108pub fn _mm_test_all_ones(a: __m128i) -> i32 {
1109 _mm_testc_si128(a, mask:_mm_cmpeq_epi32(a, b:a))
1110}
1111
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Identical semantics to `_mm_testnzc_si128`; this alias exists for
    // parity with Intel's intrinsics API.
    _mm_testnzc_si128(a, mask)
}
1134
/// Load 128-bits of integer data from memory into dst. mem_addr must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated. To minimize caching, the data
/// is flagged as non-temporal (unlikely to be used again soon)
///
/// # Safety
///
/// `mem_addr` must be valid for a 16-byte read and aligned to a 16-byte
/// boundary (see the alignment requirement above).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(movntdqa))]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
    let dst: __m128i;
    // Issued via inline asm (through the `vpl!` helper macro) rather than an
    // LLVM intrinsic so the non-temporal load is not reordered or folded.
    crate::arch::asm!(
        vpl!("movntdqa {a}"),
        a = out(xmm_reg) dst,
        p = in(reg) mem_addr,
        // pure + readonly: the asm only reads memory and has no other
        // side effects, so the compiler may CSE/hoist identical loads.
        options(pure, readonly, nostack, preserves_flags),
    );
    dst
}
1154
1155#[allow(improper_ctypes)]
1156unsafe extern "C" {
1157 #[link_name = "llvm.x86.sse41.insertps"]
1158 unsafefn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
1159 #[link_name = "llvm.x86.sse41.packusdw"]
1160 unsafefn packusdw(a: i32x4, b: i32x4) -> u16x8;
1161 #[link_name = "llvm.x86.sse41.dppd"]
1162 unsafefn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
1163 #[link_name = "llvm.x86.sse41.dpps"]
1164 unsafefn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
1165 #[link_name = "llvm.x86.sse41.round.pd"]
1166 unsafefn roundpd(a: __m128d, rounding: i32) -> __m128d;
1167 #[link_name = "llvm.x86.sse41.round.ps"]
1168 unsafefn roundps(a: __m128, rounding: i32) -> __m128;
1169 #[link_name = "llvm.x86.sse41.round.sd"]
1170 unsafefn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
1171 #[link_name = "llvm.x86.sse41.round.ss"]
1172 unsafefn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
1173 #[link_name = "llvm.x86.sse41.phminposuw"]
1174 unsafefn phminposuw(a: u16x8) -> u16x8;
1175 #[link_name = "llvm.x86.sse41.mpsadbw"]
1176 unsafefn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
1177 #[link_name = "llvm.x86.sse41.ptestnzc"]
1178 unsafefn ptestnzc(a: i64x2, mask: i64x2) -> i32;
1179}
1180
1181#[cfg(test)]
1182mod tests {
1183 use crate::core_arch::x86::*;
1184 use std::mem;
1185 use stdarch_test::simd_test;
1186
1187 #[simd_test(enable = "sse4.1")]
1188 unsafe fn test_mm_blendv_epi8() {
1189 #[rustfmt::skip]
1190 let a = _mm_setr_epi8(
1191 0, 1, 2, 3, 4, 5, 6, 7,
1192 8, 9, 10, 11, 12, 13, 14, 15,
1193 );
1194 #[rustfmt::skip]
1195 let b = _mm_setr_epi8(
1196 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1197 );
1198 #[rustfmt::skip]
1199 let mask = _mm_setr_epi8(
1200 0, -1, 0, -1, 0, -1, 0, -1,
1201 0, -1, 0, -1, 0, -1, 0, -1,
1202 );
1203 #[rustfmt::skip]
1204 let e = _mm_setr_epi8(
1205 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
1206 );
1207 assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
1208 }
1209
1210 #[simd_test(enable = "sse4.1")]
1211 unsafe fn test_mm_blendv_pd() {
1212 let a = _mm_set1_pd(0.0);
1213 let b = _mm_set1_pd(1.0);
1214 let mask = transmute(_mm_setr_epi64x(0, -1));
1215 let r = _mm_blendv_pd(a, b, mask);
1216 let e = _mm_setr_pd(0.0, 1.0);
1217 assert_eq_m128d(r, e);
1218 }
1219
1220 #[simd_test(enable = "sse4.1")]
1221 unsafe fn test_mm_blendv_ps() {
1222 let a = _mm_set1_ps(0.0);
1223 let b = _mm_set1_ps(1.0);
1224 let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
1225 let r = _mm_blendv_ps(a, b, mask);
1226 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1227 assert_eq_m128(r, e);
1228 }
1229
1230 #[simd_test(enable = "sse4.1")]
1231 unsafe fn test_mm_blend_pd() {
1232 let a = _mm_set1_pd(0.0);
1233 let b = _mm_set1_pd(1.0);
1234 let r = _mm_blend_pd::<0b10>(a, b);
1235 let e = _mm_setr_pd(0.0, 1.0);
1236 assert_eq_m128d(r, e);
1237 }
1238
1239 #[simd_test(enable = "sse4.1")]
1240 unsafe fn test_mm_blend_ps() {
1241 let a = _mm_set1_ps(0.0);
1242 let b = _mm_set1_ps(1.0);
1243 let r = _mm_blend_ps::<0b1010>(a, b);
1244 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1245 assert_eq_m128(r, e);
1246 }
1247
1248 #[simd_test(enable = "sse4.1")]
1249 unsafe fn test_mm_blend_epi16() {
1250 let a = _mm_set1_epi16(0);
1251 let b = _mm_set1_epi16(1);
1252 let r = _mm_blend_epi16::<0b1010_1100>(a, b);
1253 let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
1254 assert_eq_m128i(r, e);
1255 }
1256
1257 #[simd_test(enable = "sse4.1")]
1258 unsafe fn test_mm_extract_ps() {
1259 let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
1260 let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32);
1261 assert_eq!(r, 1.0);
1262 let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32);
1263 assert_eq!(r, 3.0);
1264 }
1265
1266 #[simd_test(enable = "sse4.1")]
1267 unsafe fn test_mm_extract_epi8() {
1268 #[rustfmt::skip]
1269 let a = _mm_setr_epi8(
1270 -1, 1, 2, 3, 4, 5, 6, 7,
1271 8, 9, 10, 11, 12, 13, 14, 15
1272 );
1273 let r1 = _mm_extract_epi8::<0>(a);
1274 let r2 = _mm_extract_epi8::<3>(a);
1275 assert_eq!(r1, 0xFF);
1276 assert_eq!(r2, 3);
1277 }
1278
1279 #[simd_test(enable = "sse4.1")]
1280 unsafe fn test_mm_extract_epi32() {
1281 let a = _mm_setr_epi32(0, 1, 2, 3);
1282 let r = _mm_extract_epi32::<1>(a);
1283 assert_eq!(r, 1);
1284 let r = _mm_extract_epi32::<3>(a);
1285 assert_eq!(r, 3);
1286 }
1287
1288 #[simd_test(enable = "sse4.1")]
1289 unsafe fn test_mm_insert_ps() {
1290 let a = _mm_set1_ps(1.0);
1291 let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1292 let r = _mm_insert_ps::<0b11_00_1100>(a, b);
1293 let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
1294 assert_eq_m128(r, e);
1295
1296 // Zeroing takes precedence over copied value
1297 let a = _mm_set1_ps(1.0);
1298 let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1299 let r = _mm_insert_ps::<0b11_00_0001>(a, b);
1300 let e = _mm_setr_ps(0.0, 1.0, 1.0, 1.0);
1301 assert_eq_m128(r, e);
1302 }
1303
1304 #[simd_test(enable = "sse4.1")]
1305 unsafe fn test_mm_insert_epi8() {
1306 let a = _mm_set1_epi8(0);
1307 let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1308 let r = _mm_insert_epi8::<1>(a, 32);
1309 assert_eq_m128i(r, e);
1310 let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0);
1311 let r = _mm_insert_epi8::<14>(a, 32);
1312 assert_eq_m128i(r, e);
1313 }
1314
1315 #[simd_test(enable = "sse4.1")]
1316 unsafe fn test_mm_insert_epi32() {
1317 let a = _mm_set1_epi32(0);
1318 let e = _mm_setr_epi32(0, 32, 0, 0);
1319 let r = _mm_insert_epi32::<1>(a, 32);
1320 assert_eq_m128i(r, e);
1321 let e = _mm_setr_epi32(0, 0, 0, 32);
1322 let r = _mm_insert_epi32::<3>(a, 32);
1323 assert_eq_m128i(r, e);
1324 }
1325
1326 #[simd_test(enable = "sse4.1")]
1327 unsafe fn test_mm_max_epi8() {
1328 #[rustfmt::skip]
1329 let a = _mm_setr_epi8(
1330 1, 4, 5, 8, 9, 12, 13, 16,
1331 17, 20, 21, 24, 25, 28, 29, 32,
1332 );
1333 #[rustfmt::skip]
1334 let b = _mm_setr_epi8(
1335 2, 3, 6, 7, 10, 11, 14, 15,
1336 18, 19, 22, 23, 26, 27, 30, 31,
1337 );
1338 let r = _mm_max_epi8(a, b);
1339 #[rustfmt::skip]
1340 let e = _mm_setr_epi8(
1341 2, 4, 6, 8, 10, 12, 14, 16,
1342 18, 20, 22, 24, 26, 28, 30, 32,
1343 );
1344 assert_eq_m128i(r, e);
1345 }
1346
1347 #[simd_test(enable = "sse4.1")]
1348 unsafe fn test_mm_max_epu16() {
1349 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1350 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1351 let r = _mm_max_epu16(a, b);
1352 let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
1353 assert_eq_m128i(r, e);
1354 }
1355
1356 #[simd_test(enable = "sse4.1")]
1357 unsafe fn test_mm_max_epi32() {
1358 let a = _mm_setr_epi32(1, 4, 5, 8);
1359 let b = _mm_setr_epi32(2, 3, 6, 7);
1360 let r = _mm_max_epi32(a, b);
1361 let e = _mm_setr_epi32(2, 4, 6, 8);
1362 assert_eq_m128i(r, e);
1363 }
1364
1365 #[simd_test(enable = "sse4.1")]
1366 unsafe fn test_mm_max_epu32() {
1367 let a = _mm_setr_epi32(1, 4, 5, 8);
1368 let b = _mm_setr_epi32(2, 3, 6, 7);
1369 let r = _mm_max_epu32(a, b);
1370 let e = _mm_setr_epi32(2, 4, 6, 8);
1371 assert_eq_m128i(r, e);
1372 }
1373
1374 #[simd_test(enable = "sse4.1")]
1375 unsafe fn test_mm_min_epi8() {
1376 #[rustfmt::skip]
1377 let a = _mm_setr_epi8(
1378 1, 4, 5, 8, 9, 12, 13, 16,
1379 17, 20, 21, 24, 25, 28, 29, 32,
1380 );
1381 #[rustfmt::skip]
1382 let b = _mm_setr_epi8(
1383 2, 3, 6, 7, 10, 11, 14, 15,
1384 18, 19, 22, 23, 26, 27, 30, 31,
1385 );
1386 let r = _mm_min_epi8(a, b);
1387 #[rustfmt::skip]
1388 let e = _mm_setr_epi8(
1389 1, 3, 5, 7, 9, 11, 13, 15,
1390 17, 19, 21, 23, 25, 27, 29, 31,
1391 );
1392 assert_eq_m128i(r, e);
1393
1394 #[rustfmt::skip]
1395 let a = _mm_setr_epi8(
1396 1, -4, -5, 8, -9, -12, 13, -16,
1397 17, 20, 21, 24, 25, 28, 29, 32,
1398 );
1399 #[rustfmt::skip]
1400 let b = _mm_setr_epi8(
1401 2, -3, -6, 7, -10, -11, 14, -15,
1402 18, 19, 22, 23, 26, 27, 30, 31,
1403 );
1404 let r = _mm_min_epi8(a, b);
1405 #[rustfmt::skip]
1406 let e = _mm_setr_epi8(
1407 1, -4, -6, 7, -10, -12, 13, -16,
1408 17, 19, 21, 23, 25, 27, 29, 31,
1409 );
1410 assert_eq_m128i(r, e);
1411 }
1412
1413 #[simd_test(enable = "sse4.1")]
1414 unsafe fn test_mm_min_epu16() {
1415 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1416 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1417 let r = _mm_min_epu16(a, b);
1418 let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
1419 assert_eq_m128i(r, e);
1420 }
1421
1422 #[simd_test(enable = "sse4.1")]
1423 unsafe fn test_mm_min_epi32() {
1424 let a = _mm_setr_epi32(1, 4, 5, 8);
1425 let b = _mm_setr_epi32(2, 3, 6, 7);
1426 let r = _mm_min_epi32(a, b);
1427 let e = _mm_setr_epi32(1, 3, 5, 7);
1428 assert_eq_m128i(r, e);
1429
1430 let a = _mm_setr_epi32(-1, 4, 5, -7);
1431 let b = _mm_setr_epi32(-2, 3, -6, 8);
1432 let r = _mm_min_epi32(a, b);
1433 let e = _mm_setr_epi32(-2, 3, -6, -7);
1434 assert_eq_m128i(r, e);
1435 }
1436
1437 #[simd_test(enable = "sse4.1")]
1438 unsafe fn test_mm_min_epu32() {
1439 let a = _mm_setr_epi32(1, 4, 5, 8);
1440 let b = _mm_setr_epi32(2, 3, 6, 7);
1441 let r = _mm_min_epu32(a, b);
1442 let e = _mm_setr_epi32(1, 3, 5, 7);
1443 assert_eq_m128i(r, e);
1444 }
1445
1446 #[simd_test(enable = "sse4.1")]
1447 unsafe fn test_mm_packus_epi32() {
1448 let a = _mm_setr_epi32(1, 2, 3, 4);
1449 let b = _mm_setr_epi32(-1, -2, -3, -4);
1450 let r = _mm_packus_epi32(a, b);
1451 let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
1452 assert_eq_m128i(r, e);
1453 }
1454
1455 #[simd_test(enable = "sse4.1")]
1456 unsafe fn test_mm_cmpeq_epi64() {
1457 let a = _mm_setr_epi64x(0, 1);
1458 let b = _mm_setr_epi64x(0, 0);
1459 let r = _mm_cmpeq_epi64(a, b);
1460 let e = _mm_setr_epi64x(-1, 0);
1461 assert_eq_m128i(r, e);
1462 }
1463
1464 #[simd_test(enable = "sse4.1")]
1465 unsafe fn test_mm_cvtepi8_epi16() {
1466 let a = _mm_set1_epi8(10);
1467 let r = _mm_cvtepi8_epi16(a);
1468 let e = _mm_set1_epi16(10);
1469 assert_eq_m128i(r, e);
1470 let a = _mm_set1_epi8(-10);
1471 let r = _mm_cvtepi8_epi16(a);
1472 let e = _mm_set1_epi16(-10);
1473 assert_eq_m128i(r, e);
1474 }
1475
1476 #[simd_test(enable = "sse4.1")]
1477 unsafe fn test_mm_cvtepi8_epi32() {
1478 let a = _mm_set1_epi8(10);
1479 let r = _mm_cvtepi8_epi32(a);
1480 let e = _mm_set1_epi32(10);
1481 assert_eq_m128i(r, e);
1482 let a = _mm_set1_epi8(-10);
1483 let r = _mm_cvtepi8_epi32(a);
1484 let e = _mm_set1_epi32(-10);
1485 assert_eq_m128i(r, e);
1486 }
1487
1488 #[simd_test(enable = "sse4.1")]
1489 unsafe fn test_mm_cvtepi8_epi64() {
1490 let a = _mm_set1_epi8(10);
1491 let r = _mm_cvtepi8_epi64(a);
1492 let e = _mm_set1_epi64x(10);
1493 assert_eq_m128i(r, e);
1494 let a = _mm_set1_epi8(-10);
1495 let r = _mm_cvtepi8_epi64(a);
1496 let e = _mm_set1_epi64x(-10);
1497 assert_eq_m128i(r, e);
1498 }
1499
1500 #[simd_test(enable = "sse4.1")]
1501 unsafe fn test_mm_cvtepi16_epi32() {
1502 let a = _mm_set1_epi16(10);
1503 let r = _mm_cvtepi16_epi32(a);
1504 let e = _mm_set1_epi32(10);
1505 assert_eq_m128i(r, e);
1506 let a = _mm_set1_epi16(-10);
1507 let r = _mm_cvtepi16_epi32(a);
1508 let e = _mm_set1_epi32(-10);
1509 assert_eq_m128i(r, e);
1510 }
1511
1512 #[simd_test(enable = "sse4.1")]
1513 unsafe fn test_mm_cvtepi16_epi64() {
1514 let a = _mm_set1_epi16(10);
1515 let r = _mm_cvtepi16_epi64(a);
1516 let e = _mm_set1_epi64x(10);
1517 assert_eq_m128i(r, e);
1518 let a = _mm_set1_epi16(-10);
1519 let r = _mm_cvtepi16_epi64(a);
1520 let e = _mm_set1_epi64x(-10);
1521 assert_eq_m128i(r, e);
1522 }
1523
1524 #[simd_test(enable = "sse4.1")]
1525 unsafe fn test_mm_cvtepi32_epi64() {
1526 let a = _mm_set1_epi32(10);
1527 let r = _mm_cvtepi32_epi64(a);
1528 let e = _mm_set1_epi64x(10);
1529 assert_eq_m128i(r, e);
1530 let a = _mm_set1_epi32(-10);
1531 let r = _mm_cvtepi32_epi64(a);
1532 let e = _mm_set1_epi64x(-10);
1533 assert_eq_m128i(r, e);
1534 }
1535
1536 #[simd_test(enable = "sse4.1")]
1537 unsafe fn test_mm_cvtepu8_epi16() {
1538 let a = _mm_set1_epi8(10);
1539 let r = _mm_cvtepu8_epi16(a);
1540 let e = _mm_set1_epi16(10);
1541 assert_eq_m128i(r, e);
1542 }
1543
1544 #[simd_test(enable = "sse4.1")]
1545 unsafe fn test_mm_cvtepu8_epi32() {
1546 let a = _mm_set1_epi8(10);
1547 let r = _mm_cvtepu8_epi32(a);
1548 let e = _mm_set1_epi32(10);
1549 assert_eq_m128i(r, e);
1550 }
1551
1552 #[simd_test(enable = "sse4.1")]
1553 unsafe fn test_mm_cvtepu8_epi64() {
1554 let a = _mm_set1_epi8(10);
1555 let r = _mm_cvtepu8_epi64(a);
1556 let e = _mm_set1_epi64x(10);
1557 assert_eq_m128i(r, e);
1558 }
1559
1560 #[simd_test(enable = "sse4.1")]
1561 unsafe fn test_mm_cvtepu16_epi32() {
1562 let a = _mm_set1_epi16(10);
1563 let r = _mm_cvtepu16_epi32(a);
1564 let e = _mm_set1_epi32(10);
1565 assert_eq_m128i(r, e);
1566 }
1567
1568 #[simd_test(enable = "sse4.1")]
1569 unsafe fn test_mm_cvtepu16_epi64() {
1570 let a = _mm_set1_epi16(10);
1571 let r = _mm_cvtepu16_epi64(a);
1572 let e = _mm_set1_epi64x(10);
1573 assert_eq_m128i(r, e);
1574 }
1575
1576 #[simd_test(enable = "sse4.1")]
1577 unsafe fn test_mm_cvtepu32_epi64() {
1578 let a = _mm_set1_epi32(10);
1579 let r = _mm_cvtepu32_epi64(a);
1580 let e = _mm_set1_epi64x(10);
1581 assert_eq_m128i(r, e);
1582 }
1583
1584 #[simd_test(enable = "sse4.1")]
1585 unsafe fn test_mm_dp_pd() {
1586 let a = _mm_setr_pd(2.0, 3.0);
1587 let b = _mm_setr_pd(1.0, 4.0);
1588 let e = _mm_setr_pd(14.0, 0.0);
1589 assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e);
1590 }
1591
1592 #[simd_test(enable = "sse4.1")]
1593 unsafe fn test_mm_dp_ps() {
1594 let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
1595 let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
1596 let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
1597 assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e);
1598 }
1599
1600 #[simd_test(enable = "sse4.1")]
1601 unsafe fn test_mm_floor_pd() {
1602 let a = _mm_setr_pd(2.5, 4.5);
1603 let r = _mm_floor_pd(a);
1604 let e = _mm_setr_pd(2.0, 4.0);
1605 assert_eq_m128d(r, e);
1606 }
1607
1608 #[simd_test(enable = "sse4.1")]
1609 unsafe fn test_mm_floor_ps() {
1610 let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
1611 let r = _mm_floor_ps(a);
1612 let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
1613 assert_eq_m128(r, e);
1614 }
1615
1616 #[simd_test(enable = "sse4.1")]
1617 unsafe fn test_mm_floor_sd() {
1618 let a = _mm_setr_pd(2.5, 4.5);
1619 let b = _mm_setr_pd(-1.5, -3.5);
1620 let r = _mm_floor_sd(a, b);
1621 let e = _mm_setr_pd(-2.0, 4.5);
1622 assert_eq_m128d(r, e);
1623 }
1624
1625 #[simd_test(enable = "sse4.1")]
1626 unsafe fn test_mm_floor_ss() {
1627 let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
1628 let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
1629 let r = _mm_floor_ss(a, b);
1630 let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
1631 assert_eq_m128(r, e);
1632 }
1633
1634 #[simd_test(enable = "sse4.1")]
1635 unsafe fn test_mm_ceil_pd() {
1636 let a = _mm_setr_pd(1.5, 3.5);
1637 let r = _mm_ceil_pd(a);
1638 let e = _mm_setr_pd(2.0, 4.0);
1639 assert_eq_m128d(r, e);
1640 }
1641
1642 #[simd_test(enable = "sse4.1")]
1643 unsafe fn test_mm_ceil_ps() {
1644 let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1645 let r = _mm_ceil_ps(a);
1646 let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
1647 assert_eq_m128(r, e);
1648 }
1649
1650 #[simd_test(enable = "sse4.1")]
1651 unsafe fn test_mm_ceil_sd() {
1652 let a = _mm_setr_pd(1.5, 3.5);
1653 let b = _mm_setr_pd(-2.5, -4.5);
1654 let r = _mm_ceil_sd(a, b);
1655 let e = _mm_setr_pd(-2.0, 3.5);
1656 assert_eq_m128d(r, e);
1657 }
1658
1659 #[simd_test(enable = "sse4.1")]
1660 unsafe fn test_mm_ceil_ss() {
1661 let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1662 let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
1663 let r = _mm_ceil_ss(a, b);
1664 let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
1665 assert_eq_m128(r, e);
1666 }
1667
1668 #[simd_test(enable = "sse4.1")]
1669 unsafe fn test_mm_round_pd() {
1670 let a = _mm_setr_pd(1.25, 3.75);
1671 let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a);
1672 let e = _mm_setr_pd(1.0, 4.0);
1673 assert_eq_m128d(r, e);
1674 }
1675
1676 #[simd_test(enable = "sse4.1")]
1677 unsafe fn test_mm_round_ps() {
1678 let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
1679 let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a);
1680 let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
1681 assert_eq_m128(r, e);
1682 }
1683
1684 #[simd_test(enable = "sse4.1")]
1685 unsafe fn test_mm_round_sd() {
1686 let a = _mm_setr_pd(1.5, 3.5);
1687 let b = _mm_setr_pd(-2.5, -4.5);
1688 let r = _mm_round_sd::<_MM_FROUND_TO_NEAREST_INT>(a, b);
1689 let e = _mm_setr_pd(-2.0, 3.5);
1690 assert_eq_m128d(r, e);
1691
1692 let a = _mm_setr_pd(1.5, 3.5);
1693 let b = _mm_setr_pd(-2.5, -4.5);
1694 let r = _mm_round_sd::<_MM_FROUND_TO_NEG_INF>(a, b);
1695 let e = _mm_setr_pd(-3.0, 3.5);
1696 assert_eq_m128d(r, e);
1697
1698 let a = _mm_setr_pd(1.5, 3.5);
1699 let b = _mm_setr_pd(-2.5, -4.5);
1700 let r = _mm_round_sd::<_MM_FROUND_TO_POS_INF>(a, b);
1701 let e = _mm_setr_pd(-2.0, 3.5);
1702 assert_eq_m128d(r, e);
1703
1704 let a = _mm_setr_pd(1.5, 3.5);
1705 let b = _mm_setr_pd(-2.5, -4.5);
1706 let r = _mm_round_sd::<_MM_FROUND_TO_ZERO>(a, b);
1707 let e = _mm_setr_pd(-2.0, 3.5);
1708 assert_eq_m128d(r, e);
1709 }
1710
1711 #[simd_test(enable = "sse4.1")]
1712 unsafe fn test_mm_round_ss() {
1713 let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1714 let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
1715 let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(a, b);
1716 let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
1717 assert_eq_m128(r, e);
1718
1719 let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1720 let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
1721 let r = _mm_round_ss::<_MM_FROUND_TO_NEG_INF>(a, b);
1722 let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
1723 assert_eq_m128(r, e);
1724
1725 let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1726 let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
1727 let r = _mm_round_ss::<_MM_FROUND_TO_POS_INF>(a, b);
1728 let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5);
1729 assert_eq_m128(r, e);
1730
1731 let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1732 let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
1733 let r = _mm_round_ss::<_MM_FROUND_TO_ZERO>(a, b);
1734 let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5);
1735 assert_eq_m128(r, e);
1736 }
1737
1738 #[simd_test(enable = "sse4.1")]
1739 unsafe fn test_mm_minpos_epu16_1() {
1740 let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
1741 let r = _mm_minpos_epu16(a);
1742 let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
1743 assert_eq_m128i(r, e);
1744 }
1745
1746 #[simd_test(enable = "sse4.1")]
1747 unsafe fn test_mm_minpos_epu16_2() {
1748 let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
1749 let r = _mm_minpos_epu16(a);
1750 let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
1751 assert_eq_m128i(r, e);
1752 }
1753
1754 #[simd_test(enable = "sse4.1")]
1755 unsafe fn test_mm_minpos_epu16_3() {
1756 // Case where the minimum value is repeated
1757 let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 13);
1758 let r = _mm_minpos_epu16(a);
1759 let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
1760 assert_eq_m128i(r, e);
1761 }
1762
1763 #[simd_test(enable = "sse4.1")]
1764 unsafe fn test_mm_mul_epi32() {
1765 {
1766 let a = _mm_setr_epi32(1, 1, 1, 1);
1767 let b = _mm_setr_epi32(1, 2, 3, 4);
1768 let r = _mm_mul_epi32(a, b);
1769 let e = _mm_setr_epi64x(1, 3);
1770 assert_eq_m128i(r, e);
1771 }
1772 {
1773 let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
1774 let b = _mm_setr_epi32(
1775 -20, -256, /* ignored */
1776 666666, 666666, /* ignored */
1777 );
1778 let r = _mm_mul_epi32(a, b);
1779 let e = _mm_setr_epi64x(-300, 823043843622);
1780 assert_eq_m128i(r, e);
1781 }
1782 }
1783
1784 #[simd_test(enable = "sse4.1")]
1785 unsafe fn test_mm_mullo_epi32() {
1786 {
1787 let a = _mm_setr_epi32(1, 1, 1, 1);
1788 let b = _mm_setr_epi32(1, 2, 3, 4);
1789 let r = _mm_mullo_epi32(a, b);
1790 let e = _mm_setr_epi32(1, 2, 3, 4);
1791 assert_eq_m128i(r, e);
1792 }
1793 {
1794 let a = _mm_setr_epi32(15, -2, 1234567, 99999);
1795 let b = _mm_setr_epi32(-20, -256, 666666, -99999);
1796 let r = _mm_mullo_epi32(a, b);
1797 // Attention, most significant bit in r[2] is treated
1798 // as a sign bit:
1799 // 1234567 * 666666 = -1589877210
1800 let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
1801 assert_eq_m128i(r, e);
1802 }
1803 }
1804
1805 #[simd_test(enable = "sse4.1")]
1806 unsafe fn test_mm_minpos_epu16() {
1807 let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
1808 let r = _mm_minpos_epu16(a);
1809 let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
1810 assert_eq_m128i(r, e);
1811 }
1812
1813 #[simd_test(enable = "sse4.1")]
1814 unsafe fn test_mm_mpsadbw_epu8() {
1815 #[rustfmt::skip]
1816 let a = _mm_setr_epi8(
1817 0, 1, 2, 3, 4, 5, 6, 7,
1818 8, 9, 10, 11, 12, 13, 14, 15,
1819 );
1820
1821 let r = _mm_mpsadbw_epu8::<0b000>(a, a);
1822 let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
1823 assert_eq_m128i(r, e);
1824
1825 let r = _mm_mpsadbw_epu8::<0b001>(a, a);
1826 let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
1827 assert_eq_m128i(r, e);
1828
1829 let r = _mm_mpsadbw_epu8::<0b100>(a, a);
1830 let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
1831 assert_eq_m128i(r, e);
1832
1833 let r = _mm_mpsadbw_epu8::<0b101>(a, a);
1834 let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
1835 assert_eq_m128i(r, e);
1836
1837 let r = _mm_mpsadbw_epu8::<0b111>(a, a);
1838 let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
1839 assert_eq_m128i(r, e);
1840 }
1841
1842 #[simd_test(enable = "sse4.1")]
1843 unsafe fn test_mm_testz_si128() {
1844 let a = _mm_set1_epi8(1);
1845 let mask = _mm_set1_epi8(0);
1846 let r = _mm_testz_si128(a, mask);
1847 assert_eq!(r, 1);
1848 let a = _mm_set1_epi8(0b101);
1849 let mask = _mm_set1_epi8(0b110);
1850 let r = _mm_testz_si128(a, mask);
1851 assert_eq!(r, 0);
1852 let a = _mm_set1_epi8(0b011);
1853 let mask = _mm_set1_epi8(0b100);
1854 let r = _mm_testz_si128(a, mask);
1855 assert_eq!(r, 1);
1856 }
1857
1858 #[simd_test(enable = "sse4.1")]
1859 unsafe fn test_mm_testc_si128() {
1860 let a = _mm_set1_epi8(-1);
1861 let mask = _mm_set1_epi8(0);
1862 let r = _mm_testc_si128(a, mask);
1863 assert_eq!(r, 1);
1864 let a = _mm_set1_epi8(0b101);
1865 let mask = _mm_set1_epi8(0b110);
1866 let r = _mm_testc_si128(a, mask);
1867 assert_eq!(r, 0);
1868 let a = _mm_set1_epi8(0b101);
1869 let mask = _mm_set1_epi8(0b100);
1870 let r = _mm_testc_si128(a, mask);
1871 assert_eq!(r, 1);
1872 }
1873
1874 #[simd_test(enable = "sse4.1")]
1875 unsafe fn test_mm_testnzc_si128() {
1876 let a = _mm_set1_epi8(0);
1877 let mask = _mm_set1_epi8(1);
1878 let r = _mm_testnzc_si128(a, mask);
1879 assert_eq!(r, 0);
1880 let a = _mm_set1_epi8(-1);
1881 let mask = _mm_set1_epi8(0);
1882 let r = _mm_testnzc_si128(a, mask);
1883 assert_eq!(r, 0);
1884 let a = _mm_set1_epi8(0b101);
1885 let mask = _mm_set1_epi8(0b110);
1886 let r = _mm_testnzc_si128(a, mask);
1887 assert_eq!(r, 1);
1888 let a = _mm_set1_epi8(0b101);
1889 let mask = _mm_set1_epi8(0b101);
1890 let r = _mm_testnzc_si128(a, mask);
1891 assert_eq!(r, 0);
1892 }
1893
1894 #[simd_test(enable = "sse4.1")]
1895 unsafe fn test_mm_test_all_zeros() {
1896 let a = _mm_set1_epi8(1);
1897 let mask = _mm_set1_epi8(0);
1898 let r = _mm_test_all_zeros(a, mask);
1899 assert_eq!(r, 1);
1900 let a = _mm_set1_epi8(0b101);
1901 let mask = _mm_set1_epi8(0b110);
1902 let r = _mm_test_all_zeros(a, mask);
1903 assert_eq!(r, 0);
1904 let a = _mm_set1_epi8(0b011);
1905 let mask = _mm_set1_epi8(0b100);
1906 let r = _mm_test_all_zeros(a, mask);
1907 assert_eq!(r, 1);
1908 }
1909
1910 #[simd_test(enable = "sse4.1")]
1911 unsafe fn test_mm_test_all_ones() {
1912 let a = _mm_set1_epi8(-1);
1913 let r = _mm_test_all_ones(a);
1914 assert_eq!(r, 1);
1915 let a = _mm_set1_epi8(0b101);
1916 let r = _mm_test_all_ones(a);
1917 assert_eq!(r, 0);
1918 }
1919
1920 #[simd_test(enable = "sse4.1")]
1921 unsafe fn test_mm_test_mix_ones_zeros() {
1922 let a = _mm_set1_epi8(0);
1923 let mask = _mm_set1_epi8(1);
1924 let r = _mm_test_mix_ones_zeros(a, mask);
1925 assert_eq!(r, 0);
1926 let a = _mm_set1_epi8(-1);
1927 let mask = _mm_set1_epi8(0);
1928 let r = _mm_test_mix_ones_zeros(a, mask);
1929 assert_eq!(r, 0);
1930 let a = _mm_set1_epi8(0b101);
1931 let mask = _mm_set1_epi8(0b110);
1932 let r = _mm_test_mix_ones_zeros(a, mask);
1933 assert_eq!(r, 1);
1934 let a = _mm_set1_epi8(0b101);
1935 let mask = _mm_set1_epi8(0b101);
1936 let r = _mm_test_mix_ones_zeros(a, mask);
1937 assert_eq!(r, 0);
1938 }
1939
1940 #[simd_test(enable = "sse4.1")]
1941 unsafe fn test_mm_stream_load_si128() {
1942 let a = _mm_set_epi64x(5, 6);
1943 let r = _mm_stream_load_si128(core::ptr::addr_of!(a) as *const _);
1944 assert_eq_m128i(a, r);
1945 }
1946}
1947