//! Streaming SIMD Extensions 4.1 (SSE4.1)

use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    mem::transmute,
};

#[cfg(test)]
use stdarch_test::assert_instr;

// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;
/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set the element of `b` is selected. The element
/// of `a` is selected otherwise.
///
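/// # Example
///
/// A minimal sketch of the selection rule (the values here are illustrative
/// only):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_set1_epi8(1);
/// let b = _mm_set1_epi8(2);
/// // Only mask byte 0 has its high bit set, so only element 0 is taken
/// // from `b`; every other element comes from `a`.
/// let mask = _mm_setr_epi8(-128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
/// let r = _mm_blendv_epi8(a, b, mask);
/// assert_eq!(_mm_extract_epi8::<0>(r), 2);
/// assert_eq!(_mm_extract_epi8::<1>(r), 1);
/// # }
/// # }
/// # }
/// ```
///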
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::splat(0));
    transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
}

/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
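/// # Example
///
/// A short sketch of the bit-to-lane mapping (values chosen for
/// illustration):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_set1_epi16(0);
/// let b = _mm_set1_epi16(1);
/// // Bits 0 and 2 of the mask are set, so lanes 0 and 2 are taken from
/// // `b` and all other lanes from `a`.
/// let r = _mm_blend_epi16::<0b0000_0101>(a, b);
/// assert_eq!(_mm_extract_epi16::<0>(r), 1);
/// assert_eq!(_mm_extract_epi16::<1>(r), 0);
/// assert_eq!(_mm_extract_epi16::<2>(r), 1);
/// # }
/// # }
/// # }
/// ```
///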
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    transmute::<i16x8, _>(simd_shuffle!(
        a.as_i16x8(),
        b.as_i16x8(),
        [
            [0, 8][IMM8 as usize & 1],
            [1, 9][(IMM8 >> 1) as usize & 1],
            [2, 10][(IMM8 >> 2) as usize & 1],
            [3, 11][(IMM8 >> 3) as usize & 1],
            [4, 12][(IMM8 >> 4) as usize & 1],
            [5, 13][(IMM8 >> 5) as usize & 1],
            [6, 14][(IMM8 >> 6) as usize & 1],
            [7, 15][(IMM8 >> 7) as usize & 1],
        ]
    ))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::splat(0));
    transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::splat(0));
    transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `IMM2`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    transmute::<f64x2, _>(simd_shuffle!(
        a.as_f64x2(),
        b.as_f64x2(),
        [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
    ))
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `IMM4`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM4, 4);
    transmute::<f32x4, _>(simd_shuffle!(
        a.as_f32x4(),
        b.as_f32x4(),
        [
            [0, 4][IMM4 as usize & 1],
            [1, 5][(IMM4 >> 1) as usize & 1],
            [2, 6][(IMM4 >> 2) as usize & 1],
            [3, 7][(IMM4 >> 3) as usize & 1],
        ]
    ))
}

/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
/// and may be converted back to a floating point number via `f32::from_bits`.
///
/// # Example
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # unsafe fn worker() {
/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
/// let x: i32 = _mm_extract_ps::<2>(simd_floats);
/// float_store.push(f32::from_bits(x as u32));
/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
/// # }
/// # unsafe { worker() }
/// # }
/// # }
/// ```
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, IMM8 = 0)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
    static_assert_uimm_bits!(IMM8, 2);
    simd_extract::<_, f32>(a, IMM8 as u32).to_bits() as i32
}

/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
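/// # Example
///
/// A small sketch of the zero extension; note that the byte `-1` comes back
/// as `255`, not `-1`:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_set1_epi8(-1);
/// // The byte 0xFF is zero-extended into the returned `i32`.
/// assert_eq!(_mm_extract_epi8::<0>(a), 255);
/// # }
/// # }
/// # }
/// ```
///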
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 4);
    simd_extract::<_, u8>(a.as_u8x16(), IMM8 as u32) as i32
}

/// Extracts a 32-bit integer from `a`, selected with `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, IMM8 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 2);
    simd_extract::<_, i32>(a.as_i32x4(), IMM8 as u32)
}

/// Select a single value in `b` to store at some position in `a`,
/// then zero elements according to `IMM8`.
///
/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made (a worked example follows the
/// list):
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
///   from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
///   element is cleared.
///
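/// # Example
///
/// A worked sketch of the bit fields (values are illustrative; equality is
/// checked through `_mm_cmpeq_ps` since `__m128` cannot be compared
/// directly):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// // IMM8 = 0b01_10_0001: select element 1 of `b` (6.0), copy it to
/// // element 2 of the result, and zero element 0.
/// let r = _mm_insert_ps::<0b01_10_0001>(a, b);
/// let e = _mm_setr_ps(0.0, 2.0, 6.0, 4.0);
/// assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(r, e)), 0b1111);
/// # }
/// # }
/// # }
/// ```
///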
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    insertps(a, b, IMM8 as u8)
}

/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 4);
    transmute(simd_insert(a.as_i8x16(), IMM8 as u32, i as i8))
}

/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 2);
    transmute(simd_insert(a.as_i32x4(), IMM8 as u32, i))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
/// values in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
    transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let b = b.as_u16x8();
    transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let b = b.as_i32x4();
    transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_u32x4();
    let b = b.as_u32x4();
    transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
/// values in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
    transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let b = b.as_u16x8();
    transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let b = b.as_i32x4();
    transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_u32x4();
    let b = b.as_u32x4();
    transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
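/// # Example
///
/// A brief sketch of the saturation behaviour (illustrative values):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_epi32(-1, 0, 70000, 128);
/// let r = _mm_packus_epi32(a, a);
/// // -1 saturates to 0 and 70000 saturates to 65535 (`u16::MAX`).
/// assert_eq!(_mm_extract_epi16::<0>(r), 0);
/// assert_eq!(_mm_extract_epi16::<2>(r), 0xFFFF);
/// # }
/// # }
/// # }
/// ```
///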
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a: i8x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a: i16x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let a: i32x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a: u8x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a: u16x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    let a = a.as_u32x4();
    let a: u32x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Returns the dot product of two __m128d vectors.
///
/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
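/// # Example
///
/// A worked sketch of the two masks (values chosen for illustration):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_pd(1.0, 2.0);
/// let b = _mm_setr_pd(3.0, 4.0);
/// // Condition mask `0b11` multiplies both lanes (1*3 + 2*4 = 11.0);
/// // broadcast mask `0b01` stores the sum in lane 0 and zeroes lane 1.
/// let r = _mm_dp_pd::<0b0011_0001>(a, b);
/// assert_eq!(_mm_cvtsd_f64(r), 11.0);
/// # }
/// # }
/// # }
/// ```
///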
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM8, 8);
    dppd(a, b, IMM8 as u8)
}

/// Returns the dot product of two __m128 vectors.
///
/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
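/// # Example
///
/// A worked sketch of the two masks (values chosen for illustration):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// // Condition mask `0b0011` multiplies only elements 0 and 1
/// // (1*5 + 2*6 = 17.0); broadcast mask `0b0001` stores the sum in
/// // element 0 and zeroes the rest.
/// let r = _mm_dp_ps::<0b0011_0001>(a, b);
/// assert_eq!(_mm_cvtss_f32(r), 17.0);
/// # }
/// # }
/// # }
/// ```
///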
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    dpps(a, b, IMM8 as u8)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    simd_floor(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    simd_floor(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_FLOOR)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_FLOOR)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    simd_ceil(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    simd_ceil(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_CEIL)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_CEIL)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundpd(a, ROUNDING)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundps(a, ROUNDING)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundsd(a, b, ROUNDING)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundss(a, b, ROUNDING)
}

/// Finds the minimum unsigned 16-bit element in the 128-bit __m128i vector,
/// returning a vector containing its value in its first position, and its
/// index in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
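/// # Example
///
/// A short sketch (illustrative values; the minimum, 13, sits at index 5):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
/// let r = _mm_minpos_epu16(a);
/// // Element 0 holds the minimum value, element 1 holds its index.
/// assert_eq!(_mm_extract_epi16::<0>(r), 13);
/// assert_eq!(_mm_extract_epi16::<1>(r), 5);
/// # }
/// # }
/// # }
/// ```
///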
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    transmute(phminposuw(a.as_u16x8()))
}

/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2()));
    let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2()));
    transmute(simd_mul(a, b))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of each product,
/// whatever they might be, reinterpreted as a signed integer. While
/// `pmulld __m128i::splat(2), __m128i::splat(2)` returns the obvious
/// `__m128i::splat(4)`, due to wrapping arithmetic
/// `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would return a
/// negative number.
///
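/// # Example
///
/// A minimal sketch of the wrapping behaviour described above:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_set1_epi32(i32::MAX);
/// let b = _mm_set1_epi32(2);
/// let r = _mm_mullo_epi32(a, b);
/// // The full product 0xFFFF_FFFE is truncated to its low 32 bits,
/// // which reinterpret as -2.
/// assert_eq!(_mm_extract_epi32::<0>(r), -2);
/// # }
/// # }
/// # }
/// ```
///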
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
}

/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences. Sums of four absolute differences at a time
/// are then returned according to the bit fields in the immediate operand
/// (see the worked example below).
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = IMM8[2] * 4
/// j = IMM8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated:
///     * Bit `[2]` specifies the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
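/// # Example
///
/// A worked sketch of the algorithm above with both offsets zero
/// (illustrative values):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_epi8(1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
/// let b = _mm_setr_epi8(1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
/// let r = _mm_mpsadbw_epu8::<0>(a, b);
/// // r[0] = |1-1| + |2-1| + |3-1| + |4-1| = 6; the window over `a` then
/// // slides by one byte, so r[1] = |2-1| + |3-1| + |4-1| + |0-1| = 7.
/// assert_eq!(_mm_extract_epi16::<0>(r), 6);
/// assert_eq!(_mm_extract_epi16::<1>(r), 7);
/// # }
/// # }
/// # }
/// ```
///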
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 3);
    transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8))
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
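/// # Example
///
/// A small sketch (illustrative bit patterns):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_epi32(0b0101, 0, 0, 0);
/// // No bit selected by the mask is set in `a`, so the result is 1.
/// assert_eq!(_mm_testz_si128(a, _mm_setr_epi32(0b1010, 0, 0, 0)), 1);
/// // Bit 2 is both selected and set, so the result is 0.
/// assert_eq!(_mm_testz_si128(a, _mm_setr_epi32(0b0110, 0, 0, 0)), 0);
/// # }
/// # }
/// # }
/// ```
///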
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestz(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
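/// # Example
///
/// A minimal sketch (illustrative values):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// assert_eq!(_mm_test_all_ones(_mm_set1_epi32(-1)), 1);
/// assert_eq!(_mm_test_all_ones(_mm_setr_epi32(-1, -1, 0, -1)), 0);
/// # }
/// # }
/// # }
/// ```
///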
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testnzc_si128(a, mask)
}

#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse41.insertps"]
    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.packusdw"]
    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
    #[link_name = "llvm.x86.sse41.dppd"]
    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]
    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.round.pd"]
    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ps"]
    fn roundps(a: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.round.sd"]
    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ss"]
    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.phminposuw"]
    fn phminposuw(a: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.mpsadbw"]
    fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
    #[link_name = "llvm.x86.sse41.ptestz"]
    fn ptestz(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestc"]
    fn ptestc(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}

#[cfg(test)]
mod tests {
    use crate::core_arch::x86::*;
    use std::mem::transmute;
    use stdarch_test::simd_test;

1189 #[simd_test(enable = "sse4.1")]
1190 unsafe fn test_mm_blendv_epi8() {
1191 #[rustfmt::skip]
1192 let a = _mm_setr_epi8(
1193 0, 1, 2, 3, 4, 5, 6, 7,
1194 8, 9, 10, 11, 12, 13, 14, 15,
1195 );
1196 #[rustfmt::skip]
1197 let b = _mm_setr_epi8(
1198 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1199 );
1200 #[rustfmt::skip]
1201 let mask = _mm_setr_epi8(
1202 0, -1, 0, -1, 0, -1, 0, -1,
1203 0, -1, 0, -1, 0, -1, 0, -1,
1204 );
1205 #[rustfmt::skip]
1206 let e = _mm_setr_epi8(
1207 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
1208 );
1209 assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
1210 }
1211
1212 #[simd_test(enable = "sse4.1")]
1213 unsafe fn test_mm_blendv_pd() {
1214 let a = _mm_set1_pd(0.0);
1215 let b = _mm_set1_pd(1.0);
1216 let mask = transmute(_mm_setr_epi64x(0, -1));
1217 let r = _mm_blendv_pd(a, b, mask);
1218 let e = _mm_setr_pd(0.0, 1.0);
1219 assert_eq_m128d(r, e);
1220 }
1221
1222 #[simd_test(enable = "sse4.1")]
1223 unsafe fn test_mm_blendv_ps() {
1224 let a = _mm_set1_ps(0.0);
1225 let b = _mm_set1_ps(1.0);
1226 let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
1227 let r = _mm_blendv_ps(a, b, mask);
1228 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1229 assert_eq_m128(r, e);
1230 }
1231
1232 #[simd_test(enable = "sse4.1")]
1233 unsafe fn test_mm_blend_pd() {
1234 let a = _mm_set1_pd(0.0);
1235 let b = _mm_set1_pd(1.0);
1236 let r = _mm_blend_pd::<0b10>(a, b);
1237 let e = _mm_setr_pd(0.0, 1.0);
1238 assert_eq_m128d(r, e);
1239 }
1240
1241 #[simd_test(enable = "sse4.1")]
1242 unsafe fn test_mm_blend_ps() {
1243 let a = _mm_set1_ps(0.0);
1244 let b = _mm_set1_ps(1.0);
1245 let r = _mm_blend_ps::<0b1010>(a, b);
1246 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1247 assert_eq_m128(r, e);
1248 }
1249
1250 #[simd_test(enable = "sse4.1")]
1251 unsafe fn test_mm_blend_epi16() {
1252 let a = _mm_set1_epi16(0);
1253 let b = _mm_set1_epi16(1);
1254 let r = _mm_blend_epi16::<0b1010_1100>(a, b);
1255 let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
1256 assert_eq_m128i(r, e);
1257 }
1258
1259 #[simd_test(enable = "sse4.1")]
1260 unsafe fn test_mm_extract_ps() {
1261 let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
1262 let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32);
1263 assert_eq!(r, 1.0);
1264 let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32);
1265 assert_eq!(r, 3.0);
1266 }
1267
1268 #[simd_test(enable = "sse4.1")]
1269 unsafe fn test_mm_extract_epi8() {
1270 #[rustfmt::skip]
1271 let a = _mm_setr_epi8(
1272 -1, 1, 2, 3, 4, 5, 6, 7,
1273 8, 9, 10, 11, 12, 13, 14, 15
1274 );
1275 let r1 = _mm_extract_epi8::<0>(a);
1276 let r2 = _mm_extract_epi8::<3>(a);
1277 assert_eq!(r1, 0xFF);
1278 assert_eq!(r2, 3);
1279 }
1280
1281 #[simd_test(enable = "sse4.1")]
1282 unsafe fn test_mm_extract_epi32() {
1283 let a = _mm_setr_epi32(0, 1, 2, 3);
1284 let r = _mm_extract_epi32::<1>(a);
1285 assert_eq!(r, 1);
1286 let r = _mm_extract_epi32::<3>(a);
1287 assert_eq!(r, 3);
1288 }
1289
1290 #[simd_test(enable = "sse4.1")]
1291 unsafe fn test_mm_insert_ps() {
1292 let a = _mm_set1_ps(1.0);
1293 let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1294 let r = _mm_insert_ps::<0b11_00_1100>(a, b);
1295 let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
1296 assert_eq_m128(r, e);
1297 }
1298
1299 #[simd_test(enable = "sse4.1")]
1300 unsafe fn test_mm_insert_epi8() {
1301 let a = _mm_set1_epi8(0);
1302 let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1303 let r = _mm_insert_epi8::<1>(a, 32);
1304 assert_eq_m128i(r, e);
1305 let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0);
1306 let r = _mm_insert_epi8::<14>(a, 32);
1307 assert_eq_m128i(r, e);
1308 }
1309
1310 #[simd_test(enable = "sse4.1")]
1311 unsafe fn test_mm_insert_epi32() {
1312 let a = _mm_set1_epi32(0);
1313 let e = _mm_setr_epi32(0, 32, 0, 0);
1314 let r = _mm_insert_epi32::<1>(a, 32);
1315 assert_eq_m128i(r, e);
1316 let e = _mm_setr_epi32(0, 0, 0, 32);
1317 let r = _mm_insert_epi32::<3>(a, 32);
1318 assert_eq_m128i(r, e);
1319 }
1320
1321 #[simd_test(enable = "sse4.1")]
1322 unsafe fn test_mm_max_epi8() {
1323 #[rustfmt::skip]
1324 let a = _mm_setr_epi8(
1325 1, 4, 5, 8, 9, 12, 13, 16,
1326 17, 20, 21, 24, 25, 28, 29, 32,
1327 );
1328 #[rustfmt::skip]
1329 let b = _mm_setr_epi8(
1330 2, 3, 6, 7, 10, 11, 14, 15,
1331 18, 19, 22, 23, 26, 27, 30, 31,
1332 );
1333 let r = _mm_max_epi8(a, b);
1334 #[rustfmt::skip]
1335 let e = _mm_setr_epi8(
1336 2, 4, 6, 8, 10, 12, 14, 16,
1337 18, 20, 22, 24, 26, 28, 30, 32,
1338 );
1339 assert_eq_m128i(r, e);
1340 }
1341
1342 #[simd_test(enable = "sse4.1")]
1343 unsafe fn test_mm_max_epu16() {
1344 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1345 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1346 let r = _mm_max_epu16(a, b);
1347 let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
1348 assert_eq_m128i(r, e);
1349 }
1350
1351 #[simd_test(enable = "sse4.1")]
1352 unsafe fn test_mm_max_epi32() {
1353 let a = _mm_setr_epi32(1, 4, 5, 8);
1354 let b = _mm_setr_epi32(2, 3, 6, 7);
1355 let r = _mm_max_epi32(a, b);
1356 let e = _mm_setr_epi32(2, 4, 6, 8);
1357 assert_eq_m128i(r, e);
1358 }
1359
1360 #[simd_test(enable = "sse4.1")]
1361 unsafe fn test_mm_max_epu32() {
1362 let a = _mm_setr_epi32(1, 4, 5, 8);
1363 let b = _mm_setr_epi32(2, 3, 6, 7);
1364 let r = _mm_max_epu32(a, b);
1365 let e = _mm_setr_epi32(2, 4, 6, 8);
1366 assert_eq_m128i(r, e);
1367 }
1368
1369 #[simd_test(enable = "sse4.1")]
1370 unsafe fn test_mm_min_epi8_1() {
1371 #[rustfmt::skip]
1372 let a = _mm_setr_epi8(
1373 1, 4, 5, 8, 9, 12, 13, 16,
1374 17, 20, 21, 24, 25, 28, 29, 32,
1375 );
1376 #[rustfmt::skip]
1377 let b = _mm_setr_epi8(
1378 2, 3, 6, 7, 10, 11, 14, 15,
1379 18, 19, 22, 23, 26, 27, 30, 31,
1380 );
1381 let r = _mm_min_epi8(a, b);
1382 #[rustfmt::skip]
1383 let e = _mm_setr_epi8(
1384 1, 3, 5, 7, 9, 11, 13, 15,
1385 17, 19, 21, 23, 25, 27, 29, 31,
1386 );
1387 assert_eq_m128i(r, e);
1388 }
1389
1390 #[simd_test(enable = "sse4.1")]
1391 unsafe fn test_mm_min_epi8_2() {
1392 #[rustfmt::skip]
1393 let a = _mm_setr_epi8(
1394 1, -4, -5, 8, -9, -12, 13, -16,
1395 17, 20, 21, 24, 25, 28, 29, 32,
1396 );
1397 #[rustfmt::skip]
1398 let b = _mm_setr_epi8(
1399 2, -3, -6, 7, -10, -11, 14, -15,
1400 18, 19, 22, 23, 26, 27, 30, 31,
1401 );
1402 let r = _mm_min_epi8(a, b);
1403 #[rustfmt::skip]
1404 let e = _mm_setr_epi8(
1405 1, -4, -6, 7, -10, -12, 13, -16,
1406 17, 19, 21, 23, 25, 27, 29, 31,
1407 );
1408 assert_eq_m128i(r, e);
1409 }
1410
1411 #[simd_test(enable = "sse4.1")]
1412 unsafe fn test_mm_min_epu16() {
1413 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1414 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1415 let r = _mm_min_epu16(a, b);
1416 let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
1417 assert_eq_m128i(r, e);
1418 }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi32_1() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi32_2() {
        let a = _mm_setr_epi32(-1, 4, 5, -7);
        let b = _mm_setr_epi32(-2, 3, -6, 8);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(-2, 3, -6, -7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epu32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(-1, -2, -3, -4);
        let r = _mm_packus_epi32(a, b);
        // Unsigned saturation clamps the negative values from `b` to 0.
        let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
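
    // Added sketch, not in the upstream suite: `packusdw` also saturates
    // from above, clamping anything greater than 0xFFFF to 0xFFFF (which
    // reads back as -1 through the signed i16 lanes).
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32_saturate_high() {
        let a = _mm_setr_epi32(70_000, 2, 3, 4); // 70_000 > u16::MAX
        let b = _mm_setr_epi32(0, 0, 0, 0);
        let r = _mm_packus_epi32(a, b);
        let e = _mm_setr_epi16(-1, 2, 3, 4, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }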

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cmpeq_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(0, 0);
        let r = _mm_cmpeq_epi64(a, b);
        // Equal lanes yield all ones (-1); unequal lanes yield all zeros.
        let e = _mm_setr_epi64x(-1, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi32(-10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
    }
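
    // Added sketch, not in the upstream suite: unlike the epi variants
    // above, which sign-extend, the epu variants zero-extend, so a negative
    // byte pattern reinterprets as its unsigned value: -10 as u8 is 246.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi16_zero_extend() {
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepu8_epi16(a);
        let e = _mm_set1_epi16(246);
        assert_eq_m128i(r, e);
    }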

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepu32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_pd() {
        let a = _mm_setr_pd(2.0, 3.0);
        let b = _mm_setr_pd(1.0, 4.0);
        // High nibble 0b0011 sums both products (2*1 + 3*4 = 14);
        // low nibble 0b0001 stores the result only in lane 0.
        let e = _mm_setr_pd(14.0, 0.0);
        assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps() {
        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
        // High nibble 0b0111 sums the first three products (14.5);
        // low nibble 0b0101 broadcasts the sum to lanes 0 and 2.
        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
        assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e);
    }
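
    // Added sketch, not in the upstream suite: with the full mask
    // 0b1111_1111, every lane product enters the sum and the result is
    // broadcast to all four output lanes.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps_full_mask() {
        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
        // 2*1 + 3*4 + 1*0.5 + 10*10 = 114.5
        let e = _mm_set1_ps(114.5);
        assert_eq_m128(_mm_dp_ps::<0b11111111>(a, b), e);
    }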

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_pd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let r = _mm_floor_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ps() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let r = _mm_floor_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_sd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let b = _mm_setr_pd(-1.5, -3.5);
        let r = _mm_floor_sd(a, b);
        // The low lane is floor(b[0]); the high lane is copied from `a`.
        let e = _mm_setr_pd(-2.0, 4.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ss() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
        let r = _mm_floor_ss(a, b);
        // Lane 0 is floor(b[0]); the remaining lanes come from `a`.
        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_pd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let r = _mm_ceil_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ps() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let r = _mm_ceil_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_ceil_sd(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
        let r = _mm_ceil_ss(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_pd() {
        let a = _mm_setr_pd(1.25, 3.75);
        let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a);
        let e = _mm_setr_pd(1.0, 4.0);
        assert_eq_m128d(r, e);
    }
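
    // Added sketch, not in the upstream suite: _MM_FROUND_TO_NEAREST_INT
    // rounds half-way cases to the nearest even integer, so exact .5 ties
    // do not always round up.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_pd_ties_to_even() {
        let a = _mm_setr_pd(2.5, 3.5);
        let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }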

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ps() {
        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
        let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a);
        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
        assert_eq_m128(r, e);
    }

    #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
        let r = _mm_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
        _MM_SET_ROUNDING_MODE(old_mode);
        // With MXCSR set to truncate, -2.5 rounds toward zero to -2.0.
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
        let r = _mm_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
        _MM_SET_ROUNDING_MODE(old_mode);
        // Round-to-nearest takes -1.75 to -2.0.
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_1() {
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        // Lane 0 receives the minimum (13); lane 1 its index (5).
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_2() {
        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        // A zero element is the minimum; it sits at index 0.
        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
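
    // Added sketch, not in the upstream suite: per Intel's documentation,
    // when the minimum occurs more than once, phminposuw reports the
    // lowest index.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_ties() {
        let a = _mm_setr_epi16(5, 7, 5, 7, 5, 7, 5, 7);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(5, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }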

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mul_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(1, 3);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
            let b = _mm_setr_epi32(
                -20, -256, /* ignored */
                666666, 666666, /* ignored */
            );
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(-300, 823043843622);
            assert_eq_m128i(r, e);
        }
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mullo_epi32(a, b);
            let e = _mm_setr_epi32(1, 2, 3, 4);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, -2, 1234567, 99999);
            let b = _mm_setr_epi32(-20, -256, 666666, -99999);
            let r = _mm_mullo_epi32(a, b);
            // Note: only the low 32 bits of each product are kept, and the
            // most significant of those bits acts as the sign bit, so
            // 1234567 * 666666 wraps to -1589877210 in r[2].
            let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
            assert_eq_m128i(r, e);
        }
    }
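
    // Added sketch, not in the upstream suite: keeping the low 32 bits of
    // each product is exactly lane-wise i32::wrapping_mul.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32_wrapping() {
        let a = _mm_setr_epi32(1234567, 0, 0, 0);
        let b = _mm_setr_epi32(666666, 0, 0, 0);
        let r = _mm_mullo_epi32(a, b);
        let e = _mm_setr_epi32(1234567i32.wrapping_mul(666666), 0, 0, 0);
        assert_eq_m128i(r, e);
    }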

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16() {
        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
        let r = _mm_minpos_epu16(a);
        // The minimum 1 sits at index 5.
        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );

        // imm8 bit 2 offsets `a` by 4 bytes; bits 1:0 select the 4-byte
        // block of `b` used as the reference.
        let r = _mm_mpsadbw_epu8::<0b000>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b001>(a, a);
        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b100>(a, a);
        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b101>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b111>(a, a);
        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testz_si128() {
        // testz returns 1 iff `a & mask` is all zeros.
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testc_si128() {
        // testc returns 1 iff `!a & mask` is all zeros, i.e. `a` covers
        // every bit set in `mask`.
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testnzc_si128() {
        // testnzc returns 1 iff `a & mask != 0` and `!a & mask != 0`.
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
    }
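
    // Added sketch, not in the upstream suite: testnzc is 1 exactly when
    // both testz and testc would be 0 for the same operands, mirroring the
    // ZF/CF flags that ptest sets.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testnzc_si128_consistency() {
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let z = _mm_testz_si128(a, mask);
        let c = _mm_testc_si128(a, mask);
        let nzc = _mm_testnzc_si128(a, mask);
        assert_eq!(nzc, ((z == 0) && (c == 0)) as i32);
    }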

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_zeros() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_ones() {
        let a = _mm_set1_epi8(-1);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 0);
    }
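
    // Added sketch, not in the upstream suite: _mm_test_all_ones(a) is
    // equivalent to testc of `a` against an all-ones vector.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_ones_equiv() {
        let a = _mm_set1_epi8(0b101);
        let ones = _mm_set1_epi8(-1);
        assert_eq!(_mm_test_all_ones(a), _mm_testc_si128(a, ones));
    }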

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_mix_ones_zeros() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
    }
}