//! Streaming SIMD Extensions 4.1 (SSE4.1)

use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

#[cfg(test)]
use stdarch_test::assert_instr;

// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;

/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set the element of `b` is selected. The element
/// of `a` is selected otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8)
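///
/// # Example
///
/// A minimal usage sketch; it assumes SSE4.1 support has already been
/// detected at runtime (the comparison helpers are plain SSE2 intrinsics):
///
/// ```
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_set1_epi8(1);
/// let b = _mm_set1_epi8(2);
/// // Only lanes whose mask byte has its high bit set take the value from `b`.
/// let mask = _mm_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
/// let r = _mm_blendv_epi8(a, b, mask);
/// let e = _mm_setr_epi8(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi8(r, e)), 0xFFFF);
/// # }
/// # }
/// # }
/// ```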
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::splat(0));
    transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
}

/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
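///
/// # Example
///
/// A minimal usage sketch; it assumes SSE4.1 support has already been
/// detected at runtime:
///
/// ```
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_set1_epi16(10);
/// let b = _mm_set1_epi16(20);
/// // Bit i of the immediate picks lane i from `b`; bits 0 and 3 are set here.
/// let r = _mm_blend_epi16::<0b0000_1001>(a, b);
/// let e = _mm_setr_epi16(20, 10, 10, 20, 10, 10, 10, 10);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, e)), 0xFFFF);
/// # }
/// # }
/// # }
/// ```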
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    transmute::<i16x8, _>(simd_shuffle!(
        a.as_i16x8(),
        b.as_i16x8(),
        [
            [0, 8][IMM8 as usize & 1],
            [1, 9][(IMM8 >> 1) as usize & 1],
            [2, 10][(IMM8 >> 2) as usize & 1],
            [3, 11][(IMM8 >> 3) as usize & 1],
            [4, 12][(IMM8 >> 4) as usize & 1],
            [5, 13][(IMM8 >> 5) as usize & 1],
            [6, 14][(IMM8 >> 6) as usize & 1],
            [7, 15][(IMM8 >> 7) as usize & 1],
        ]
    ))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::splat(0));
    transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::splat(0));
    transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `IMM2`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    transmute::<f64x2, _>(simd_shuffle!(
        a.as_f64x2(),
        b.as_f64x2(),
        [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
    ))
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `IMM4`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM4, 4);
    transmute::<f32x4, _>(simd_shuffle!(
        a.as_f32x4(),
        b.as_f32x4(),
        [
            [0, 4][IMM4 as usize & 1],
            [1, 5][(IMM4 >> 1) as usize & 1],
            [2, 6][(IMM4 >> 2) as usize & 1],
            [3, 7][(IMM4 >> 3) as usize & 1],
        ]
    ))
}

/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
/// and may be converted back to a floating-point number via `f32::from_bits`.
///
/// # Example
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # unsafe fn worker() {
/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
/// let x: i32 = _mm_extract_ps::<2>(simd_floats);
/// float_store.push(f32::from_bits(x as u32));
/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
/// # }
/// # unsafe { worker() }
/// # }
/// # }
/// ```
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, IMM8 = 0)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
    static_assert_uimm_bits!(IMM8, 2);
    simd_extract!(a, IMM8 as u32, f32).to_bits() as i32
}

/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 4);
    simd_extract!(a.as_u8x16(), IMM8 as u32, u8) as i32
}

/// Extracts a 32-bit integer from `a` selected with `IMM8`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, IMM8 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 2);
    simd_extract!(a.as_i32x4(), IMM8 as u32, i32)
}

/// Selects a single value in `b` to store at some position in `a`, then
/// zeroes elements according to `IMM8`.
///
/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
///   from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
///   element is cleared.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps)
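///
/// # Example
///
/// A minimal sketch of the three `IMM8` fields described above; it assumes
/// SSE4.1 support has already been detected at runtime:
///
/// ```
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_set1_ps(1.0);
/// let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
/// // 0b01_10_0001: take element 1 of `b` (20.0), copy it to element 2 of
/// // the result, and clear element 0.
/// let r = _mm_insert_ps::<0b01_10_0001>(a, b);
/// let e = _mm_setr_ps(0.0, 1.0, 20.0, 1.0);
/// assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(r, e)), 0b1111);
/// # }
/// # }
/// # }
/// ```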
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    insertps(a, b, IMM8 as u8)
}

/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 4);
    transmute(simd_insert!(a.as_i8x16(), IMM8 as u32, i as i8))
}

/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 2);
    transmute(simd_insert!(a.as_i32x4(), IMM8 as u32, i))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
/// values in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    let a: i8x16 = a.as_i8x16();
    let b: i8x16 = b.as_i8x16();
    transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    let a: u16x8 = a.as_u16x8();
    let b: u16x8 = b.as_u16x8();
    transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a: i32x4 = a.as_i32x4();
    let b: i32x4 = b.as_i32x4();
    transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    let a: u32x4 = a.as_u32x4();
    let b: u32x4 = b.as_u32x4();
    transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
/// values in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    let a: i8x16 = a.as_i8x16();
    let b: i8x16 = b.as_i8x16();
    transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    let a: u16x8 = a.as_u16x8();
    let b: u16x8 = b.as_u16x8();
    transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a: i32x4 = a.as_i32x4();
    let b: i32x4 = b.as_i32x4();
    transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    let a: u32x4 = a.as_u32x4();
    let b: u32x4 = b.as_u32x4();
    transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32)
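///
/// # Example
///
/// A minimal usage sketch showing the saturation behavior; it assumes SSE4.1
/// support has already been detected at runtime:
///
/// ```
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// // Values below 0 saturate to 0; values above 65535 saturate to 65535
/// // (which is `-1` when the result lane is viewed as an `i16`).
/// let a = _mm_setr_epi32(-1, 0, 70000, 65535);
/// let b = _mm_setr_epi32(1, 2, 3, 4);
/// let r = _mm_packus_epi32(a, b);
/// let e = _mm_setr_epi16(0, 0, -1, -1, 1, 2, 3, 4);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, e)), 0xFFFF);
/// # }
/// # }
/// # }
/// ```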
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16)
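///
/// # Example
///
/// A minimal usage sketch; it assumes SSE4.1 support has already been
/// detected at runtime:
///
/// ```
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// // Sign extension preserves negative values in the widened lanes.
/// let a = _mm_setr_epi8(-1, 2, -3, 4, -5, 6, -7, 8, 0, 0, 0, 0, 0, 0, 0, 0);
/// let r = _mm_cvtepi8_epi16(a);
/// let e = _mm_setr_epi16(-1, 2, -3, 4, -5, 6, -7, 8);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, e)), 0xFFFF);
/// # }
/// # }
/// # }
/// ```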
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a: i8x16 = a.as_i8x16();
    let a: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a: i8x16 = a.as_i8x16();
    let a: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a: i8x16 = a.as_i8x16();
    let a: i8x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a: i16x8 = a.as_i16x8();
    let a: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a: i16x8 = a.as_i16x8();
    let a: i16x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a: i32x4 = a.as_i32x4();
    let a: i32x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16)
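///
/// # Example
///
/// A minimal usage sketch; it assumes SSE4.1 support has already been
/// detected at runtime:
///
/// ```
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// // Zero extension: the byte 0xFF widens to 255, not -1.
/// let a = _mm_set1_epi8(-1);
/// let r = _mm_cvtepu8_epi16(a);
/// let e = _mm_set1_epi16(255);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, e)), 0xFFFF);
/// # }
/// # }
/// # }
/// ```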
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    let a: u8x16 = a.as_u8x16();
    let a: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    let a: u8x16 = a.as_u8x16();
    let a: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    let a: u8x16 = a.as_u8x16();
    let a: u8x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    let a: u16x8 = a.as_u16x8();
    let a: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    let a: u16x8 = a.as_u16x8();
    let a: u16x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    let a: u32x4 = a.as_u32x4();
    let a: u32x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Returns the dot product of two `__m128d` vectors.
///
/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM8, 8);
    dppd(a, b, IMM8 as u8)
}

/// Returns the dot product of two `__m128` vectors.
///
/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps)
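///
/// # Example
///
/// A minimal sketch of the two masks in action; it assumes SSE4.1 support
/// has already been detected at runtime:
///
/// ```
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
/// // Condition mask 0b0111: only lanes 0..=2 are multiplied
/// // (1*10 + 2*20 + 3*30 = 140). Broadcast mask 0b0001: the sum is
/// // stored in lane 0 and the remaining lanes are zeroed.
/// let r = _mm_dp_ps::<0b0111_0001>(a, b);
/// let e = _mm_setr_ps(140.0, 0.0, 0.0, 0.0);
/// assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(r, e)), 0b1111);
/// # }
/// # }
/// # }
/// ```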
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    dpps(a, b, IMM8 as u8)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and store the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    simd_floor(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and store the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    simd_floor(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copy the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_FLOOR)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copy the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_FLOOR)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and store the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    simd_ceil(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and store the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    simd_ceil(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copy the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_CEIL)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copy the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_CEIL)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and store the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundpd(a, ROUNDING)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and store the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundps(a, ROUNDING)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copy the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundsd(a, b, ROUNDING)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copy the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundss(a, b, ROUNDING)
}

/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i` vector,
/// returning a vector containing its value in its first position, and its
/// index in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16)
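///
/// # Example
///
/// A minimal usage sketch; it assumes SSE4.1 support has already been
/// detected at runtime:
///
/// ```
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
/// let r = _mm_minpos_epu16(a);
/// // The minimum value 13 sits at index 5; every other lane is zeroed.
/// let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, e)), 0xFFFF);
/// # }
/// # }
/// # }
/// ```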
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    transmute(phminposuw(a.as_u16x8()))
}

/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a: i64x2 = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2()));
    let b: i64x2 = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2()));
    transmute(simd_mul(a, b))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of each
/// intermediate integer, reinterpreted as a signed integer. While
/// `pmulld __m128i::splat(2), __m128i::splat(2)` returns the obvious
/// `__m128i::splat(4)`, due to wrapping arithmetic `pmulld
/// __m128i::splat(i32::MAX), __m128i::splat(2)` would return a negative number.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32)
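///
/// # Example
///
/// A minimal sketch of the wrapping behavior described above; it assumes
/// SSE4.1 support has already been detected at runtime:
///
/// ```
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// // The full product i32::MAX * 2 does not fit in 32 bits; only the
/// // low 32 bits are kept, which wrap around to -2.
/// let a = _mm_set1_epi32(i32::MAX);
/// let b = _mm_set1_epi32(2);
/// let r = _mm_mullo_epi32(a, b);
/// let e = _mm_set1_epi32(-2);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi32(r, e)), 0xFFFF);
/// # }
/// # }
/// # }
/// ```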
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
}

/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences; sums of those absolute differences are then
/// returned according to the bit fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = IMM8[2] * 4
/// j = IMM8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated
///     * Bit `[2]` specifies the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8)
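///
/// # Example
///
/// A minimal sketch of the algorithm above with both offsets at zero; it
/// assumes SSE4.1 support has already been detected at runtime:
///
/// ```
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/// let b = _mm_set1_epi8(2);
/// // With IMM8 = 0 both offsets are 0, so for k in 0..8:
/// // r[k] = |a[k] - 2| + |a[k+1] - 2| + |a[k+2] - 2| + |a[k+3] - 2|
/// let r = _mm_mpsadbw_epu8::<0>(a, b);
/// let e = _mm_setr_epi16(4, 4, 6, 10, 14, 18, 22, 26);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, e)), 0xFFFF);
/// # }
/// # }
/// # }
/// ```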
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 3);
    transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8))
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128)
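///
/// # Example
///
/// A minimal usage sketch; it assumes SSE4.1 support has already been
/// detected at runtime:
///
/// ```
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_epi32(0b0011, 0, 0, 0);
/// // The tested bit is not set in `a`, so `a AND mask` is all zeros.
/// let mask = _mm_setr_epi32(0b0100, 0, 0, 0);
/// assert_eq!(_mm_testz_si128(a, mask), 1);
/// // Now the mask overlaps a set bit of `a`, so the AND is non-zero.
/// let mask = _mm_setr_epi32(0b0110, 0, 0, 0);
/// assert_eq!(_mm_testz_si128(a, mask), 0);
/// # }
/// # }
/// # }
/// ```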
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestz(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testnzc_si128(a, mask)
}

#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse41.insertps"]
    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.packusdw"]
    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
    #[link_name = "llvm.x86.sse41.dppd"]
    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]
    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.round.pd"]
    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ps"]
    fn roundps(a: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.round.sd"]
    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ss"]
    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.phminposuw"]
    fn phminposuw(a: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.mpsadbw"]
    fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
    #[link_name = "llvm.x86.sse41.ptestz"]
    fn ptestz(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestc"]
    fn ptestc(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}
1180
1181#[cfg(test)]
1182mod tests {
1183 use crate::core_arch::x86::*;
1184 use std::mem;
1185 use stdarch_test::simd_test;
1186
1187 #[simd_test(enable = "sse4.1")]
1188 unsafe fn test_mm_blendv_epi8() {
1189 #[rustfmt::skip]
1190 let a = _mm_setr_epi8(
1191 0, 1, 2, 3, 4, 5, 6, 7,
1192 8, 9, 10, 11, 12, 13, 14, 15,
1193 );
1194 #[rustfmt::skip]
1195 let b = _mm_setr_epi8(
1196 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1197 );
1198 #[rustfmt::skip]
1199 let mask = _mm_setr_epi8(
1200 0, -1, 0, -1, 0, -1, 0, -1,
1201 0, -1, 0, -1, 0, -1, 0, -1,
1202 );
1203 #[rustfmt::skip]
1204 let e = _mm_setr_epi8(
1205 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
1206 );
1207 assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
1208 }
1209
1210 #[simd_test(enable = "sse4.1")]
1211 unsafe fn test_mm_blendv_pd() {
1212 let a = _mm_set1_pd(0.0);
1213 let b = _mm_set1_pd(1.0);
1214 let mask = transmute(_mm_setr_epi64x(0, -1));
1215 let r = _mm_blendv_pd(a, b, mask);
1216 let e = _mm_setr_pd(0.0, 1.0);
1217 assert_eq_m128d(r, e);
1218 }
1219
1220 #[simd_test(enable = "sse4.1")]
1221 unsafe fn test_mm_blendv_ps() {
1222 let a = _mm_set1_ps(0.0);
1223 let b = _mm_set1_ps(1.0);
1224 let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
1225 let r = _mm_blendv_ps(a, b, mask);
1226 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1227 assert_eq_m128(r, e);
1228 }
1229
1230 #[simd_test(enable = "sse4.1")]
1231 unsafe fn test_mm_blend_pd() {
1232 let a = _mm_set1_pd(0.0);
1233 let b = _mm_set1_pd(1.0);
1234 let r = _mm_blend_pd::<0b10>(a, b);
1235 let e = _mm_setr_pd(0.0, 1.0);
1236 assert_eq_m128d(r, e);
1237 }
1238
1239 #[simd_test(enable = "sse4.1")]
1240 unsafe fn test_mm_blend_ps() {
1241 let a = _mm_set1_ps(0.0);
1242 let b = _mm_set1_ps(1.0);
1243 let r = _mm_blend_ps::<0b1010>(a, b);
1244 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1245 assert_eq_m128(r, e);
1246 }
1247
1248 #[simd_test(enable = "sse4.1")]
1249 unsafe fn test_mm_blend_epi16() {
1250 let a = _mm_set1_epi16(0);
1251 let b = _mm_set1_epi16(1);
1252 let r = _mm_blend_epi16::<0b1010_1100>(a, b);
1253 let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
1254 assert_eq_m128i(r, e);
1255 }
1256
1257 #[simd_test(enable = "sse4.1")]
1258 unsafe fn test_mm_extract_ps() {
1259 let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
1260 let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32);
1261 assert_eq!(r, 1.0);
1262 let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32);
1263 assert_eq!(r, 3.0);
1264 }
1265
1266 #[simd_test(enable = "sse4.1")]
1267 unsafe fn test_mm_extract_epi8() {
1268 #[rustfmt::skip]
1269 let a = _mm_setr_epi8(
1270 -1, 1, 2, 3, 4, 5, 6, 7,
1271 8, 9, 10, 11, 12, 13, 14, 15
1272 );
1273 let r1 = _mm_extract_epi8::<0>(a);
1274 let r2 = _mm_extract_epi8::<3>(a);
1275 assert_eq!(r1, 0xFF);
1276 assert_eq!(r2, 3);
1277 }
1278
1279 #[simd_test(enable = "sse4.1")]
1280 unsafe fn test_mm_extract_epi32() {
1281 let a = _mm_setr_epi32(0, 1, 2, 3);
1282 let r = _mm_extract_epi32::<1>(a);
1283 assert_eq!(r, 1);
1284 let r = _mm_extract_epi32::<3>(a);
1285 assert_eq!(r, 3);
1286 }
1287
1288 #[simd_test(enable = "sse4.1")]
1289 unsafe fn test_mm_insert_ps() {
1290 let a = _mm_set1_ps(1.0);
1291 let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1292 let r = _mm_insert_ps::<0b11_00_1100>(a, b);
1293 let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
1294 assert_eq_m128(r, e);
1295
1296 // Zeroing takes precedence over copied value
1297 let a = _mm_set1_ps(1.0);
1298 let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1299 let r = _mm_insert_ps::<0b11_00_0001>(a, b);
1300 let e = _mm_setr_ps(0.0, 1.0, 1.0, 1.0);
1301 assert_eq_m128(r, e);
1302 }
1303
1304 #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_insert_epi8() {
        let a = _mm_set1_epi8(0);
        let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_insert_epi8::<1>(a, 32);
        assert_eq_m128i(r, e);
        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0);
        let r = _mm_insert_epi8::<14>(a, 32);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_insert_epi32() {
        let a = _mm_set1_epi32(0);
        let e = _mm_setr_epi32(0, 32, 0, 0);
        let r = _mm_insert_epi32::<1>(a, 32);
        assert_eq_m128i(r, e);
        let e = _mm_setr_epi32(0, 0, 0, 32);
        let r = _mm_insert_epi32::<3>(a, 32);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_max_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            2, 4, 6, 8, 10, 12, 14, 16,
            18, 20, 22, 24, 26, 28, 30, 32,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_max_epu16(a, b);
        let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epi32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epi32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epu32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi8_1() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, 3, 5, 7, 9, 11, 13, 15,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi8_2() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, -4, -5, 8, -9, -12, 13, -16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, -3, -6, 7, -10, -11, 14, -15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, -4, -6, 7, -10, -12, 13, -16,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_min_epu16(a, b);
        let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi32_1() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi32_2() {
        let a = _mm_setr_epi32(-1, 4, 5, -7);
        let b = _mm_setr_epi32(-2, 3, -6, 8);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(-2, 3, -6, -7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epu32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(-1, -2, -3, -4);
        let r = _mm_packus_epi32(a, b);
        let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
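
    // _mm_packus_epi32 packs with unsigned saturation, clamping each 32-bit
    // value into [0, 65535]. A supplementary check of both clamp directions
    // (an illustrative sketch added here, not part of the original suite):
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32_saturate() {
        let a = _mm_setr_epi32(-1, 0, 65535, 65536);
        let b = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_packus_epi32(a, b);
        // 65535 is the all-ones 16-bit pattern, written as -1 for `i16`.
        let e = _mm_setr_epi16(0, 0, -1, -1, 1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }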

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cmpeq_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(0, 0);
        let r = _mm_cmpeq_epi64(a, b);
        let e = _mm_setr_epi64x(-1, 0);
        assert_eq_m128i(r, e);
    }

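    // The _mm_cvtepi*_* tests below exercise sign extension (-10 stays -10
    // at every wider width); the _mm_cvtepu*_* tests that follow them use
    // zero extension, which is identity for the small positive inputs used.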
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi32(-10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepu32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

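    // For _mm_dp_pd::<MASK>, bits 4 and 5 choose which lanes enter the
    // products and bits 0 and 1 choose which output lanes receive the sum
    // (the others are zeroed). With 0b00110001 both products are taken,
    // 2.0 * 1.0 + 3.0 * 4.0 = 14.0, and only lane 0 is written.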
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_pd() {
        let a = _mm_setr_pd(2.0, 3.0);
        let b = _mm_setr_pd(1.0, 4.0);
        let e = _mm_setr_pd(14.0, 0.0);
        assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e);
    }

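    // _mm_dp_ps::<MASK> works the same way with four lanes: bits 4-7 form
    // the product mask and bits 0-3 the broadcast mask. 0b01110101 sums
    // lanes 0-2 (2.0 * 1.0 + 3.0 * 4.0 + 1.0 * 0.5 = 14.5) and stores the
    // result in lanes 0 and 2.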
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps() {
        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
        assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_pd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let r = _mm_floor_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ps() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let r = _mm_floor_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

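    // The `_sd`/`_ss` rounding variants operate only on the lowest lane of
    // `b` and copy the upper lane(s) unchanged from `a`, which is why the
    // expected vectors below mix values from both arguments.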
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_sd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let b = _mm_setr_pd(-1.5, -3.5);
        let r = _mm_floor_sd(a, b);
        let e = _mm_setr_pd(-2.0, 4.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ss() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
        let r = _mm_floor_ss(a, b);
        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_pd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let r = _mm_ceil_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ps() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let r = _mm_ceil_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_ceil_sd(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
        let r = _mm_ceil_ss(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_pd() {
        let a = _mm_setr_pd(1.25, 3.75);
        let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a);
        let e = _mm_setr_pd(1.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ps() {
        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
        let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a);
        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
        assert_eq_m128(r, e);
    }

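    // _MM_FROUND_TO_NEAREST_INT rounds halfway cases to even, so the -2.5
    // below rounds to -2.0 rather than -3.0.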
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_NEAREST_INT>(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);

        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_NEG_INF>(a, b);
        let e = _mm_setr_pd(-3.0, 3.5);
        assert_eq_m128d(r, e);

        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_POS_INF>(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);

        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_ZERO>(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);

        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_NEG_INF>(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);

        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_POS_INF>(a, b);
        let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);

        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_ZERO>(a, b);
        let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

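    // _mm_minpos_epu16 returns the smallest unsigned 16-bit lane in lane 0,
    // its index in lane 1, and zeros elsewhere; when the minimum is repeated
    // the lowest index wins (third case below).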
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_1() {
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_2() {
        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_3() {
        // Case where the minimum value is repeated
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 13);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

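    // _mm_mul_epi32 sign-extends lanes 0 and 2 of each operand to 64 bits
    // and multiplies those; lanes 1 and 3 do not participate.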
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mul_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(1, 3);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
            let b = _mm_setr_epi32(-20, -256 /* ignored */, 666666, 666666 /* ignored */);
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(-300, 823043843622);
            assert_eq_m128i(r, e);
        }
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mullo_epi32(a, b);
            let e = _mm_setr_epi32(1, 2, 3, 4);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, -2, 1234567, 99999);
            let b = _mm_setr_epi32(-20, -256, 666666, -99999);
            let r = _mm_mullo_epi32(a, b);
            // The full product 1234567 * 666666 = 823043843622 overflows
            // 32 bits; its low 32 bits, reinterpreted as a signed integer,
            // give -1589877210 in r[2].
            let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
            assert_eq_m128i(r, e);
        }
    }
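
    // The low-half product is exactly scalar wrapping multiplication on each
    // lane; a minimal cross-check against i32::wrapping_mul (an illustrative
    // sketch added here, not part of the original suite):
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32_wrapping() {
        let a = _mm_setr_epi32(1234567, -2, 0, i32::MAX);
        let b = _mm_setr_epi32(666666, -256, 7, 2);
        let r = _mm_mullo_epi32(a, b);
        let e = _mm_setr_epi32(
            1234567i32.wrapping_mul(666666),
            (-2i32).wrapping_mul(-256),
            0,
            i32::MAX.wrapping_mul(2),
        );
        assert_eq_m128i(r, e);
    }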

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16() {
        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

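    // For _mm_mpsadbw_epu8::<IMM>, bit 2 selects a 0- or 4-byte starting
    // offset in `a` and bits 0-1 select one 4-byte-aligned quadruplet of `b`;
    // each of the eight 16-bit results is the sum of absolute differences
    // between a sliding four-byte window of `a` and that quadruplet.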
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );

        let r = _mm_mpsadbw_epu8::<0b000>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b001>(a, a);
        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b100>(a, a);
        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b101>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b111>(a, a);
        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
        assert_eq_m128i(r, e);
    }

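    // The PTEST-based intrinsics compute ZF = ((a & mask) == 0) and
    // CF = ((!a & mask) == 0): _mm_testz_si128 returns ZF, _mm_testc_si128
    // returns CF, and _mm_testnzc_si128 returns 1 only when both are clear.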
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testz_si128() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testc_si128() {
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testnzc_si128() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_zeros() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_ones() {
        let a = _mm_set1_epi8(-1);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_mix_ones_zeros() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
    }
}