1 | //! Streaming SIMD Extensions 4.1 (SSE4.1) |
2 | |
3 | use crate::core_arch::{simd::*, x86::*}; |
4 | use crate::intrinsics::simd::*; |
5 | |
#[cfg(test)]
7 | use stdarch_test::assert_instr; |
8 | |
// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;
50 | |
51 | /// Blend packed 8-bit integers from `a` and `b` using `mask` |
52 | /// |
53 | /// The high bit of each corresponding mask byte determines the selection. |
54 | /// If the high bit is set, the element of `b` is selected. |
55 | /// Otherwise, the element of `a` is selected. |
56 | /// |
57 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    unsafe {
        let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::ZERO);
        transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
    }
}
68 | |
69 | /// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`. |
70 | /// |
71 | /// The mask bits determine the selection. A clear bit selects the |
72 | /// corresponding element of `a`, and a set bit the corresponding |
73 | /// element of `b`. |
74 | /// |
75 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        transmute::<i16x8, _>(simd_shuffle!(
85 | a.as_i16x8(), |
86 | b.as_i16x8(), |
87 | [ |
88 | [0, 8][IMM8 as usize & 1], |
89 | [1, 9][(IMM8 >> 1) as usize & 1], |
90 | [2, 10][(IMM8 >> 2) as usize & 1], |
91 | [3, 11][(IMM8 >> 3) as usize & 1], |
92 | [4, 12][(IMM8 >> 4) as usize & 1], |
93 | [5, 13][(IMM8 >> 5) as usize & 1], |
94 | [6, 14][(IMM8 >> 6) as usize & 1], |
95 | [7, 15][(IMM8 >> 7) as usize & 1], |
96 | ] |
97 | )) |
98 | } |
99 | } |
100 | |
101 | /// Blend packed double-precision (64-bit) floating-point elements from `a` |
102 | /// and `b` using `mask` |
103 | /// |
104 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    unsafe {
        let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::ZERO);
        transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
    }
}
115 | |
116 | /// Blend packed single-precision (32-bit) floating-point elements from `a` |
117 | /// and `b` using `mask` |
118 | /// |
119 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    unsafe {
        let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::ZERO);
        transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
    }
}
130 | |
131 | /// Blend packed double-precision (64-bit) floating-point elements from `a` |
132 | /// and `b` using control mask `IMM2` |
133 | /// |
134 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd) |
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        transmute::<f64x2, _>(simd_shuffle!(
147 | a.as_f64x2(), |
148 | b.as_f64x2(), |
149 | [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]] |
150 | )) |
151 | } |
152 | } |
153 | |
154 | /// Blend packed single-precision (32-bit) floating-point elements from `a` |
155 | /// and `b` using mask `IMM4` |
156 | /// |
157 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        transmute::<f32x4, _>(simd_shuffle!(
167 | a.as_f32x4(), |
168 | b.as_f32x4(), |
169 | [ |
170 | [0, 4][IMM4 as usize & 1], |
171 | [1, 5][(IMM4 >> 1) as usize & 1], |
172 | [2, 6][(IMM4 >> 2) as usize & 1], |
173 | [3, 7][(IMM4 >> 3) as usize & 1], |
174 | ] |
175 | )) |
176 | } |
177 | } |
178 | |
179 | /// Extracts a single-precision (32-bit) floating-point element from `a`, |
180 | /// selected with `IMM8`. The returned `i32` stores the float's bit-pattern, |
181 | /// and may be converted back to a floating point number via casting. |
182 | /// |
183 | /// # Example |
184 | /// ```rust |
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # #[allow(unused_unsafe)] // FIXME remove after stdarch bump in rustc
/// # unsafe fn worker() { unsafe {
194 | /// let mut float_store = vec![1.0, 1.0, 2.0, 3.0]; |
195 | /// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0); |
196 | /// let x: i32 = _mm_extract_ps::<2>(simd_floats); |
197 | /// float_store.push(f32::from_bits(x as u32)); |
198 | /// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]); |
199 | /// # }} |
200 | /// # unsafe { worker() } |
201 | /// # } |
202 | /// # } |
203 | /// ``` |
204 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(extractps, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
210 | pub fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 { |
211 | static_assert_uimm_bits!(IMM8, 2); |
212 | unsafe { simd_extract!(a, IMM8 as u32, f32).to_bits() as i32 } |
213 | } |
214 | |
215 | /// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit |
216 | /// integer containing the zero-extended integer data. |
217 | /// |
218 | /// See [LLVM commit D20468](https://reviews.llvm.org/D20468). |
219 | /// |
220 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
226 | pub fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 { |
227 | static_assert_uimm_bits!(IMM8, 4); |
228 | unsafe { simd_extract!(a.as_u8x16(), IMM8 as u32, u8) as i32 } |
229 | } |
230 | |
/// Extracts a 32-bit integer from `a`, selected with `IMM8`
232 | /// |
233 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(extractps, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
239 | pub fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 { |
240 | static_assert_uimm_bits!(IMM8, 2); |
241 | unsafe { simd_extract!(a.as_i32x4(), IMM8 as u32, i32) } |
242 | } |
243 | |
/// Select a single value in `b` to store at some position in `a`, then
/// zero elements according to `IMM8`; the example after the field list
/// below shows one such copy-and-zero combination.
246 | /// |
247 | /// `IMM8` specifies which bits from operand `b` will be copied, which bits in |
248 | /// the result they will be copied to, and which bits in the result will be |
249 | /// cleared. The following assignments are made: |
250 | /// |
251 | /// * Bits `[7:6]` specify the bits to copy from operand `b`: |
252 | /// - `00`: Selects bits `[31:0]` from operand `b`. |
253 | /// - `01`: Selects bits `[63:32]` from operand `b`. |
254 | /// - `10`: Selects bits `[95:64]` from operand `b`. |
255 | /// - `11`: Selects bits `[127:96]` from operand `b`. |
256 | /// |
257 | /// * Bits `[5:4]` specify the bits in the result to which the selected bits |
258 | /// from operand `b` are copied: |
259 | /// - `00`: Copies the selected bits from `b` to result bits `[31:0]`. |
260 | /// - `01`: Copies the selected bits from `b` to result bits `[63:32]`. |
261 | /// - `10`: Copies the selected bits from `b` to result bits `[95:64]`. |
262 | /// - `11`: Copies the selected bits from `b` to result bits `[127:96]`. |
263 | /// |
264 | /// * Bits `[3:0]`: If any of these bits are set, the corresponding result |
265 | /// element is cleared. |
266 | /// |
267 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
273 | pub fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 { |
274 | static_assert_uimm_bits!(IMM8, 8); |
275 | unsafe { insertps(a, b, IMM8 as u8) } |
276 | } |
277 | |
278 | /// Returns a copy of `a` with the 8-bit integer from `i` inserted at a |
279 | /// location specified by `IMM8`. |
280 | /// |
281 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 4);
    unsafe { transmute(simd_insert!(a.as_i8x16(), IMM8 as u32, i as i8)) }
}
291 | |
292 | /// Returns a copy of `a` with the 32-bit integer from `i` inserted at a |
293 | /// location specified by `IMM8`. |
294 | /// |
295 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 2);
    unsafe { transmute(simd_insert!(a.as_i32x4(), IMM8 as u32, i)) }
}
305 | |
306 | /// Compares packed 8-bit integers in `a` and `b` and returns packed maximum |
307 | /// values in dst. |
308 | /// |
309 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: i8x16 = a.as_i8x16();
        let b: i8x16 = b.as_i8x16();
        transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
    }
}
321 | |
322 | /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed |
323 | /// maximum. |
324 | /// |
325 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: u16x8 = a.as_u16x8();
        let b: u16x8 = b.as_u16x8();
        transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
    }
}
337 | |
338 | /// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum |
339 | /// values. |
340 | /// |
341 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: i32x4 = a.as_i32x4();
        let b: i32x4 = b.as_i32x4();
        transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
    }
}
353 | |
354 | /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed |
355 | /// maximum values. |
356 | /// |
357 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: u32x4 = a.as_u32x4();
        let b: u32x4 = b.as_u32x4();
        transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
    }
}
369 | |
370 | /// Compares packed 8-bit integers in `a` and `b` and returns packed minimum |
371 | /// values in dst. |
372 | /// |
373 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: i8x16 = a.as_i8x16();
        let b: i8x16 = b.as_i8x16();
        transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
    }
}
385 | |
386 | /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed |
387 | /// minimum. |
388 | /// |
389 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: u16x8 = a.as_u16x8();
        let b: u16x8 = b.as_u16x8();
        transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
    }
}
401 | |
402 | /// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum |
403 | /// values. |
404 | /// |
405 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: i32x4 = a.as_i32x4();
        let b: i32x4 = b.as_i32x4();
        transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
    }
}
417 | |
418 | /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed |
419 | /// minimum values. |
420 | /// |
421 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: u32x4 = a.as_u32x4();
        let b: u32x4 = b.as_u32x4();
        transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
    }
}
433 | |
434 | /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers |
435 | /// using unsigned saturation |
436 | /// |
437 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(packusdw(a.as_i32x4(), b.as_i32x4())) }
}
445 | |
446 | /// Compares packed 64-bit integers in `a` and `b` for equality |
447 | /// |
448 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) }
}
456 | |
457 | /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers |
458 | /// |
459 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    unsafe {
        let a: i8x16 = a.as_i8x16();
        let a: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute(simd_cast::<_, i16x8>(a))
    }
}
471 | |
472 | /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers |
473 | /// |
474 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a: i8x16 = a.as_i8x16();
        let a: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}
486 | |
487 | /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed |
488 | /// 64-bit integers |
489 | /// |
490 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a: i8x16 = a.as_i8x16();
        let a: i8x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}
502 | |
503 | /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers |
504 | /// |
505 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a: i16x8 = a.as_i16x8();
        let a: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}
517 | |
518 | /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers |
519 | /// |
520 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a: i16x8 = a.as_i16x8();
        let a: i16x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}
532 | |
533 | /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers |
534 | /// |
535 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a: i32x4 = a.as_i32x4();
        let a: i32x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}
547 | |
/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
549 | /// |
550 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    unsafe {
        let a: u8x16 = a.as_u8x16();
        let a: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute(simd_cast::<_, i16x8>(a))
    }
}
562 | |
/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
564 | /// |
565 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a: u8x16 = a.as_u8x16();
        let a: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}
577 | |
/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
579 | /// |
580 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a: u8x16 = a.as_u8x16();
        let a: u8x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}
592 | |
/// Zero extend packed unsigned 16-bit integers in `a`
594 | /// to packed 32-bit integers |
595 | /// |
596 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a: u16x8 = a.as_u16x8();
        let a: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}
608 | |
/// Zero extend packed unsigned 16-bit integers in `a`
610 | /// to packed 64-bit integers |
611 | /// |
612 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a: u16x8 = a.as_u16x8();
        let a: u16x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}
624 | |
/// Zero extend packed unsigned 32-bit integers in `a`
626 | /// to packed 64-bit integers |
627 | /// |
628 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a: u32x4 = a.as_u32x4();
        let a: u32x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}
640 | |
641 | /// Returns the dot product of two __m128d vectors. |
642 | /// |
643 | /// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask. |
644 | /// If a condition mask bit is zero, the corresponding multiplication is |
645 | /// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of |
646 | /// the dot product will be stored in the return value component. Otherwise if |
647 | /// the broadcast mask bit is zero then the return component will be zero. |
648 | /// |
649 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
655 | pub fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d { |
656 | unsafe { |
657 | static_assert_uimm_bits!(IMM8, 8); |
658 | dppd(a, b, IMM8 as u8) |
659 | } |
660 | } |
661 | |
662 | /// Returns the dot product of two __m128 vectors. |
663 | /// |
664 | /// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask. |
665 | /// If a condition mask bit is zero, the corresponding multiplication is |
666 | /// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of |
667 | /// the dot product will be stored in the return value component. Otherwise if |
668 | /// the broadcast mask bit is zero then the return component will be zero. |
669 | /// |
670 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
676 | pub fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 { |
677 | static_assert_uimm_bits!(IMM8, 8); |
678 | unsafe { dpps(a, b, IMM8 as u8) } |
679 | } |
680 | |
681 | /// Round the packed double-precision (64-bit) floating-point elements in `a` |
682 | /// down to an integer value, and stores the results as packed double-precision |
683 | /// floating-point elements. |
684 | /// |
685 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
690 | pub fn _mm_floor_pd(a: __m128d) -> __m128d { |
691 | unsafe { simd_floor(a) } |
692 | } |
693 | |
694 | /// Round the packed single-precision (32-bit) floating-point elements in `a` |
695 | /// down to an integer value, and stores the results as packed single-precision |
696 | /// floating-point elements. |
697 | /// |
698 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
703 | pub fn _mm_floor_ps(a: __m128) -> __m128 { |
704 | unsafe { simd_floor(a) } |
705 | } |
706 | |
707 | /// Round the lower double-precision (64-bit) floating-point element in `b` |
708 | /// down to an integer value, store the result as a double-precision |
709 | /// floating-point element in the lower element of the intrinsic result, |
710 | /// and copies the upper element from `a` to the upper element of the intrinsic |
711 | /// result. |
712 | /// |
713 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
718 | pub fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d { |
719 | unsafe { roundsd(a, b, _MM_FROUND_FLOOR) } |
720 | } |
721 | |
722 | /// Round the lower single-precision (32-bit) floating-point element in `b` |
723 | /// down to an integer value, store the result as a single-precision |
724 | /// floating-point element in the lower element of the intrinsic result, |
725 | /// and copies the upper 3 packed elements from `a` to the upper elements |
726 | /// of the intrinsic result. |
727 | /// |
728 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
733 | pub fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 { |
734 | unsafe { roundss(a, b, _MM_FROUND_FLOOR) } |
735 | } |
736 | |
737 | /// Round the packed double-precision (64-bit) floating-point elements in `a` |
738 | /// up to an integer value, and stores the results as packed double-precision |
739 | /// floating-point elements. |
740 | /// |
741 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
746 | pub fn _mm_ceil_pd(a: __m128d) -> __m128d { |
747 | unsafe { simd_ceil(a) } |
748 | } |
749 | |
750 | /// Round the packed single-precision (32-bit) floating-point elements in `a` |
751 | /// up to an integer value, and stores the results as packed single-precision |
752 | /// floating-point elements. |
753 | /// |
754 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
759 | pub fn _mm_ceil_ps(a: __m128) -> __m128 { |
760 | unsafe { simd_ceil(a) } |
761 | } |
762 | |
763 | /// Round the lower double-precision (64-bit) floating-point element in `b` |
764 | /// up to an integer value, store the result as a double-precision |
765 | /// floating-point element in the lower element of the intrinsic result, |
766 | /// and copies the upper element from `a` to the upper element |
767 | /// of the intrinsic result. |
768 | /// |
769 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
774 | pub fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d { |
775 | unsafe { roundsd(a, b, _MM_FROUND_CEIL) } |
776 | } |
777 | |
778 | /// Round the lower single-precision (32-bit) floating-point element in `b` |
779 | /// up to an integer value, store the result as a single-precision |
780 | /// floating-point element in the lower element of the intrinsic result, |
781 | /// and copies the upper 3 packed elements from `a` to the upper elements |
782 | /// of the intrinsic result. |
783 | /// |
784 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
789 | pub fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { |
790 | unsafe { roundss(a, b, _MM_FROUND_CEIL) } |
791 | } |
792 | |
793 | /// Round the packed double-precision (64-bit) floating-point elements in `a` |
794 | /// using the `ROUNDING` parameter, and stores the results as packed |
795 | /// double-precision floating-point elements. |
796 | /// Rounding is done according to the rounding parameter, which can be one of: |
797 | /// |
798 | /// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions |
799 | /// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions |
800 | /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions |
801 | /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions |
802 | /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] |
803 | /// |
804 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
810 | pub fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d { |
811 | static_assert_uimm_bits!(ROUNDING, 4); |
812 | unsafe { roundpd(a, ROUNDING) } |
813 | } |
814 | |
815 | /// Round the packed single-precision (32-bit) floating-point elements in `a` |
816 | /// using the `ROUNDING` parameter, and stores the results as packed |
817 | /// single-precision floating-point elements. |
818 | /// Rounding is done according to the rounding parameter, which can be one of: |
819 | /// |
820 | /// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions |
821 | /// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions |
822 | /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions |
823 | /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions |
824 | /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] |
825 | /// |
826 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
832 | pub fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 { |
833 | static_assert_uimm_bits!(ROUNDING, 4); |
834 | unsafe { roundps(a, ROUNDING) } |
835 | } |
836 | |
837 | /// Round the lower double-precision (64-bit) floating-point element in `b` |
838 | /// using the `ROUNDING` parameter, store the result as a double-precision |
839 | /// floating-point element in the lower element of the intrinsic result, |
840 | /// and copies the upper element from `a` to the upper element of the intrinsic |
841 | /// result. |
842 | /// Rounding is done according to the rounding parameter, which can be one of: |
843 | /// |
844 | /// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions |
845 | /// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions |
846 | /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions |
847 | /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions |
848 | /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] |
849 | /// |
850 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
856 | pub fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d { |
857 | static_assert_uimm_bits!(ROUNDING, 4); |
858 | unsafe { roundsd(a, b, ROUNDING) } |
859 | } |
860 | |
861 | /// Round the lower single-precision (32-bit) floating-point element in `b` |
862 | /// using the `ROUNDING` parameter, store the result as a single-precision |
863 | /// floating-point element in the lower element of the intrinsic result, |
864 | /// and copies the upper 3 packed elements from `a` to the upper elements |
865 | /// of the intrinsic result. |
866 | /// Rounding is done according to the rounding parameter, which can be one of: |
867 | /// |
868 | /// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions |
869 | /// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions |
870 | /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions |
871 | /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions |
872 | /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] |
873 | /// |
874 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
880 | pub fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 { |
881 | static_assert_uimm_bits!(ROUNDING, 4); |
882 | unsafe { roundss(a, b, ROUNDING) } |
883 | } |
884 | |
/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i`
/// vector, returning a vector containing its value in its first position,
/// and its index in its second position; all other elements are set to
/// zero.
889 | /// |
890 | /// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW` |
891 | /// instruction. |
892 | /// |
893 | /// Arguments: |
894 | /// |
895 | /// * `a` - A 128-bit vector of type `__m128i`. |
896 | /// |
897 | /// Returns: |
898 | /// |
899 | /// A 128-bit value where: |
900 | /// |
901 | /// * bits `[15:0]` - contain the minimum value found in parameter `a`, |
902 | /// * bits `[18:16]` - contain the index of the minimum value |
903 | /// * remaining bits are set to `0`. |
904 | /// |
905 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    unsafe { transmute(phminposuw(a.as_u16x8())) }
}
913 | |
914 | /// Multiplies the low 32-bit integers from each packed 64-bit |
915 | /// element in `a` and `b`, and returns the signed 64-bit result. |
916 | /// |
917 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a: i64x2 = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2()));
        let b: i64x2 = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2()));
        transmute(simd_mul(a, b))
    }
}
929 | |
/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of each
/// product, reinterpreted as a signed integer. While `pmulld
/// __m128i::splat(2), __m128i::splat(2)` returns the obvious
/// `__m128i::splat(4)`, due to wrapping arithmetic `pmulld
/// __m128i::splat(i32::MAX), __m128i::splat(2)` would return a negative
/// number.
936 | /// |
937 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) }
}
945 | |
/// Subtracts 8-bit unsigned integer values in `b` from those in `a` and
/// computes the absolute values of the differences. Sums of the absolute
/// differences are then returned according to the bit fields in the
/// immediate operand.
950 | /// |
951 | /// The following algorithm is performed: |
952 | /// |
953 | /// ```ignore |
954 | /// i = IMM8[2] * 4 |
955 | /// j = IMM8[1:0] * 4 |
956 | /// for k := 0 to 7 |
957 | /// d0 = abs(a[i + k + 0] - b[j + 0]) |
958 | /// d1 = abs(a[i + k + 1] - b[j + 1]) |
959 | /// d2 = abs(a[i + k + 2] - b[j + 2]) |
960 | /// d3 = abs(a[i + k + 3] - b[j + 3]) |
961 | /// r[k] = d0 + d1 + d2 + d3 |
962 | /// ``` |
963 | /// |
964 | /// Arguments: |
965 | /// |
966 | /// * `a` - A 128-bit vector of type `__m128i`. |
967 | /// * `b` - A 128-bit vector of type `__m128i`. |
968 | /// * `IMM8` - An 8-bit immediate operand specifying how the absolute |
969 | /// differences are to be calculated |
970 | /// * Bit `[2]` specify the offset for operand `a` |
971 | /// * Bits `[1:0]` specify the offset for operand `b` |
972 | /// |
973 | /// Returns: |
974 | /// |
975 | /// * A `__m128i` vector containing the sums of the sets of absolute |
976 | /// differences between both operands. |
977 | /// |
978 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 3);
    unsafe { transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8)) }
}
988 | |
989 | /// Tests whether the specified bits in a 128-bit integer vector are all |
990 | /// zeros. |
991 | /// |
992 | /// Arguments: |
993 | /// |
994 | /// * `a` - A 128-bit integer vector containing the bits to be tested. |
995 | /// * `mask` - A 128-bit integer vector selecting which bits to test in |
996 | /// operand `a`. |
997 | /// |
998 | /// Returns: |
999 | /// |
1000 | /// * `1` - if the specified bits are all zeros, |
1001 | /// * `0` - otherwise. |
1002 | /// |
1003 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
1008 | pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { |
1009 | unsafe { ptestz(a.as_i64x2(), mask.as_i64x2()) } |
1010 | } |
1011 | |
1012 | /// Tests whether the specified bits in a 128-bit integer vector are all |
1013 | /// ones. |
1014 | /// |
1015 | /// Arguments: |
1016 | /// |
1017 | /// * `a` - A 128-bit integer vector containing the bits to be tested. |
1018 | /// * `mask` - A 128-bit integer vector selecting which bits to test in |
1019 | /// operand `a`. |
1020 | /// |
1021 | /// Returns: |
1022 | /// |
1023 | /// * `1` - if the specified bits are all ones, |
1024 | /// * `0` - otherwise. |
1025 | /// |
1026 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
1031 | pub fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { |
1032 | unsafe { ptestc(a.as_i64x2(), mask.as_i64x2()) } |
1033 | } |
1034 | |
1035 | /// Tests whether the specified bits in a 128-bit integer vector are |
1036 | /// neither all zeros nor all ones. |
1037 | /// |
1038 | /// Arguments: |
1039 | /// |
1040 | /// * `a` - A 128-bit integer vector containing the bits to be tested. |
1041 | /// * `mask` - A 128-bit integer vector selecting which bits to test in |
1042 | /// operand `a`. |
1043 | /// |
1044 | /// Returns: |
1045 | /// |
1046 | /// * `1` - if the specified bits are neither all zeros nor all ones, |
1047 | /// * `0` - otherwise. |
1048 | /// |
1049 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
1054 | pub fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { |
1055 | unsafe { ptestnzc(a.as_i64x2(), mask.as_i64x2()) } |
1056 | } |
1057 | |
1058 | /// Tests whether the specified bits in a 128-bit integer vector are all |
1059 | /// zeros. |
1060 | /// |
1061 | /// Arguments: |
1062 | /// |
1063 | /// * `a` - A 128-bit integer vector containing the bits to be tested. |
1064 | /// * `mask` - A 128-bit integer vector selecting which bits to test in |
1065 | /// operand `a`. |
1066 | /// |
1067 | /// Returns: |
1068 | /// |
1069 | /// * `1` - if the specified bits are all zeros, |
1070 | /// * `0` - otherwise. |
1071 | /// |
1072 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
1077 | pub fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 { |
1078 | _mm_testz_si128(a, mask) |
1079 | } |
1080 | |
/// Tests whether the specified bits in a 128-bit integer vector are all
1082 | /// ones. |
1083 | /// |
1084 | /// Argument: |
1085 | /// |
1086 | /// * `a` - A 128-bit integer vector containing the bits to be tested. |
1087 | /// |
1088 | /// Returns: |
1089 | /// |
1090 | /// * `1` - if the bits specified in the operand are all set to 1, |
1091 | /// * `0` - otherwise. |
1092 | /// |
1093 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_test_all_ones(a: __m128i) -> i32 {
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}
1102 | |
1103 | /// Tests whether the specified bits in a 128-bit integer vector are |
1104 | /// neither all zeros nor all ones. |
1105 | /// |
1106 | /// Arguments: |
1107 | /// |
1108 | /// * `a` - A 128-bit integer vector containing the bits to be tested. |
1109 | /// * `mask` - A 128-bit integer vector selecting which bits to test in |
1110 | /// operand `a`. |
1111 | /// |
1112 | /// Returns: |
1113 | /// |
1114 | /// * `1` - if the specified bits are neither all zeros nor all ones, |
1115 | /// * `0` - otherwise. |
1116 | /// |
1117 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
1122 | pub fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { |
1123 | _mm_testnzc_si128(a, mask) |
1124 | } |
1125 | |
/// Load 128 bits of integer data from memory into dst. `mem_addr` must be
/// aligned on a 16-byte boundary or a general-protection exception may be
/// generated. To minimize caching, the data is flagged as non-temporal
/// (unlikely to be used again soon).
1129 | /// |
1130 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128) |
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(movntdqa))]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
1135 | pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i { |
1136 | let dst: __m128i; |
1137 | crate::arch::asm!( |
vpl!("movntdqa {a}"),
1139 | a = out(xmm_reg) dst, |
1140 | p = in(reg) mem_addr, |
1141 | options(pure, readonly, nostack, preserves_flags), |
1142 | ); |
1143 | dst |
1144 | } |
1145 | |
#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.sse41.insertps"]
    unsafe fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.packusdw"]
    unsafe fn packusdw(a: i32x4, b: i32x4) -> u16x8;
    #[link_name = "llvm.x86.sse41.dppd"]
    unsafe fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]
    unsafe fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.round.pd"]
    unsafe fn roundpd(a: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ps"]
    unsafe fn roundps(a: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.round.sd"]
    unsafe fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ss"]
    unsafe fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.phminposuw"]
    unsafe fn phminposuw(a: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.mpsadbw"]
    unsafe fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
    #[link_name = "llvm.x86.sse41.ptestz"]
    unsafe fn ptestz(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestc"]
    unsafe fn ptestc(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    unsafe fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}
1175 | |
#[cfg(test)]
1177 | mod tests { |
1178 | use crate::core_arch::x86::*; |
1179 | use std::mem; |
1180 | use stdarch_test::simd_test; |
1181 | |
#[simd_test(enable = "sse4.1")]
1183 | unsafe fn test_mm_blendv_epi8() { |
1184 | #[rustfmt::skip] |
1185 | let a = _mm_setr_epi8( |
1186 | 0, 1, 2, 3, 4, 5, 6, 7, |
1187 | 8, 9, 10, 11, 12, 13, 14, 15, |
1188 | ); |
1189 | #[rustfmt::skip] |
1190 | let b = _mm_setr_epi8( |
1191 | 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, |
1192 | ); |
1193 | #[rustfmt::skip] |
1194 | let mask = _mm_setr_epi8( |
1195 | 0, -1, 0, -1, 0, -1, 0, -1, |
1196 | 0, -1, 0, -1, 0, -1, 0, -1, |
1197 | ); |
1198 | #[rustfmt::skip] |
1199 | let e = _mm_setr_epi8( |
1200 | 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, |
1201 | ); |
1202 | assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e); |
1203 | } |
1204 | |
#[simd_test(enable = "sse4.1")]
1206 | unsafe fn test_mm_blendv_pd() { |
1207 | let a = _mm_set1_pd(0.0); |
1208 | let b = _mm_set1_pd(1.0); |
1209 | let mask = transmute(_mm_setr_epi64x(0, -1)); |
1210 | let r = _mm_blendv_pd(a, b, mask); |
1211 | let e = _mm_setr_pd(0.0, 1.0); |
1212 | assert_eq_m128d(r, e); |
1213 | } |
1214 | |
1215 | #[simd_test(enable = "sse4.1" )] |
1216 | unsafe fn test_mm_blendv_ps() { |
1217 | let a = _mm_set1_ps(0.0); |
1218 | let b = _mm_set1_ps(1.0); |
1219 | let mask = transmute(_mm_setr_epi32(0, -1, 0, -1)); |
1220 | let r = _mm_blendv_ps(a, b, mask); |
1221 | let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); |
1222 | assert_eq_m128(r, e); |
1223 | } |
1224 | |
1225 | #[simd_test(enable = "sse4.1" )] |
1226 | unsafe fn test_mm_blend_pd() { |
1227 | let a = _mm_set1_pd(0.0); |
1228 | let b = _mm_set1_pd(1.0); |
1229 | let r = _mm_blend_pd::<0b10>(a, b); |
1230 | let e = _mm_setr_pd(0.0, 1.0); |
1231 | assert_eq_m128d(r, e); |
1232 | } |
1233 | |
1234 | #[simd_test(enable = "sse4.1" )] |
1235 | unsafe fn test_mm_blend_ps() { |
1236 | let a = _mm_set1_ps(0.0); |
1237 | let b = _mm_set1_ps(1.0); |
1238 | let r = _mm_blend_ps::<0b1010>(a, b); |
1239 | let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); |
1240 | assert_eq_m128(r, e); |
1241 | } |
1242 | |
1243 | #[simd_test(enable = "sse4.1" )] |
1244 | unsafe fn test_mm_blend_epi16() { |
1245 | let a = _mm_set1_epi16(0); |
1246 | let b = _mm_set1_epi16(1); |
1247 | let r = _mm_blend_epi16::<0b1010_1100>(a, b); |
1248 | let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1); |
1249 | assert_eq_m128i(r, e); |
1250 | } |
1251 | |
1252 | #[simd_test(enable = "sse4.1" )] |
1253 | unsafe fn test_mm_extract_ps() { |
1254 | let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0); |
1255 | let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32); |
1256 | assert_eq!(r, 1.0); |
1257 | let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32); |
1258 | assert_eq!(r, 3.0); |
1259 | } |
1260 | |
1261 | #[simd_test(enable = "sse4.1" )] |
1262 | unsafe fn test_mm_extract_epi8() { |
1263 | #[rustfmt::skip] |
1264 | let a = _mm_setr_epi8( |
1265 | -1, 1, 2, 3, 4, 5, 6, 7, |
1266 | 8, 9, 10, 11, 12, 13, 14, 15 |
1267 | ); |
1268 | let r1 = _mm_extract_epi8::<0>(a); |
1269 | let r2 = _mm_extract_epi8::<3>(a); |
1270 | assert_eq!(r1, 0xFF); |
1271 | assert_eq!(r2, 3); |
1272 | } |
1273 | |
1274 | #[simd_test(enable = "sse4.1" )] |
1275 | unsafe fn test_mm_extract_epi32() { |
1276 | let a = _mm_setr_epi32(0, 1, 2, 3); |
1277 | let r = _mm_extract_epi32::<1>(a); |
1278 | assert_eq!(r, 1); |
1279 | let r = _mm_extract_epi32::<3>(a); |
1280 | assert_eq!(r, 3); |
1281 | } |
1282 | |
1283 | #[simd_test(enable = "sse4.1" )] |
1284 | unsafe fn test_mm_insert_ps() { |
1285 | let a = _mm_set1_ps(1.0); |
1286 | let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
1287 | let r = _mm_insert_ps::<0b11_00_1100>(a, b); |
1288 | let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0); |
1289 | assert_eq_m128(r, e); |
1290 | |
1291 | // Zeroing takes precedence over copied value |
1292 | let a = _mm_set1_ps(1.0); |
1293 | let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
1294 | let r = _mm_insert_ps::<0b11_00_0001>(a, b); |
1295 | let e = _mm_setr_ps(0.0, 1.0, 1.0, 1.0); |
1296 | assert_eq_m128(r, e); |
1297 | } |
1298 | |
1299 | #[simd_test(enable = "sse4.1" )] |
1300 | unsafe fn test_mm_insert_epi8() { |
1301 | let a = _mm_set1_epi8(0); |
1302 | let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); |
1303 | let r = _mm_insert_epi8::<1>(a, 32); |
1304 | assert_eq_m128i(r, e); |
1305 | let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0); |
1306 | let r = _mm_insert_epi8::<14>(a, 32); |
1307 | assert_eq_m128i(r, e); |
1308 | } |
1309 | |
1310 | #[simd_test(enable = "sse4.1" )] |
1311 | unsafe fn test_mm_insert_epi32() { |
1312 | let a = _mm_set1_epi32(0); |
1313 | let e = _mm_setr_epi32(0, 32, 0, 0); |
1314 | let r = _mm_insert_epi32::<1>(a, 32); |
1315 | assert_eq_m128i(r, e); |
1316 | let e = _mm_setr_epi32(0, 0, 0, 32); |
1317 | let r = _mm_insert_epi32::<3>(a, 32); |
1318 | assert_eq_m128i(r, e); |
1319 | } |
1320 | |
1321 | #[simd_test(enable = "sse4.1" )] |
1322 | unsafe fn test_mm_max_epi8() { |
1323 | #[rustfmt::skip] |
1324 | let a = _mm_setr_epi8( |
1325 | 1, 4, 5, 8, 9, 12, 13, 16, |
1326 | 17, 20, 21, 24, 25, 28, 29, 32, |
1327 | ); |
1328 | #[rustfmt::skip] |
1329 | let b = _mm_setr_epi8( |
1330 | 2, 3, 6, 7, 10, 11, 14, 15, |
1331 | 18, 19, 22, 23, 26, 27, 30, 31, |
1332 | ); |
1333 | let r = _mm_max_epi8(a, b); |
1334 | #[rustfmt::skip] |
1335 | let e = _mm_setr_epi8( |
1336 | 2, 4, 6, 8, 10, 12, 14, 16, |
1337 | 18, 20, 22, 24, 26, 28, 30, 32, |
1338 | ); |
1339 | assert_eq_m128i(r, e); |
1340 | } |
1341 | |
1342 | #[simd_test(enable = "sse4.1" )] |
1343 | unsafe fn test_mm_max_epu16() { |
1344 | let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); |
1345 | let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); |
1346 | let r = _mm_max_epu16(a, b); |
1347 | let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16); |
1348 | assert_eq_m128i(r, e); |
1349 | } |
1350 | |
1351 | #[simd_test(enable = "sse4.1" )] |
1352 | unsafe fn test_mm_max_epi32() { |
1353 | let a = _mm_setr_epi32(1, 4, 5, 8); |
1354 | let b = _mm_setr_epi32(2, 3, 6, 7); |
1355 | let r = _mm_max_epi32(a, b); |
1356 | let e = _mm_setr_epi32(2, 4, 6, 8); |
1357 | assert_eq_m128i(r, e); |
1358 | } |
1359 | |
1360 | #[simd_test(enable = "sse4.1" )] |
1361 | unsafe fn test_mm_max_epu32() { |
1362 | let a = _mm_setr_epi32(1, 4, 5, 8); |
1363 | let b = _mm_setr_epi32(2, 3, 6, 7); |
1364 | let r = _mm_max_epu32(a, b); |
1365 | let e = _mm_setr_epi32(2, 4, 6, 8); |
1366 | assert_eq_m128i(r, e); |
1367 | } |
1368 | |
1369 | #[simd_test(enable = "sse4.1" )] |
1370 | unsafe fn test_mm_min_epi8() { |
1371 | #[rustfmt::skip] |
1372 | let a = _mm_setr_epi8( |
1373 | 1, 4, 5, 8, 9, 12, 13, 16, |
1374 | 17, 20, 21, 24, 25, 28, 29, 32, |
1375 | ); |
1376 | #[rustfmt::skip] |
1377 | let b = _mm_setr_epi8( |
1378 | 2, 3, 6, 7, 10, 11, 14, 15, |
1379 | 18, 19, 22, 23, 26, 27, 30, 31, |
1380 | ); |
1381 | let r = _mm_min_epi8(a, b); |
1382 | #[rustfmt::skip] |
1383 | let e = _mm_setr_epi8( |
1384 | 1, 3, 5, 7, 9, 11, 13, 15, |
1385 | 17, 19, 21, 23, 25, 27, 29, 31, |
1386 | ); |
1387 | assert_eq_m128i(r, e); |
1388 | |
1389 | #[rustfmt::skip] |
1390 | let a = _mm_setr_epi8( |
1391 | 1, -4, -5, 8, -9, -12, 13, -16, |
1392 | 17, 20, 21, 24, 25, 28, 29, 32, |
1393 | ); |
1394 | #[rustfmt::skip] |
1395 | let b = _mm_setr_epi8( |
1396 | 2, -3, -6, 7, -10, -11, 14, -15, |
1397 | 18, 19, 22, 23, 26, 27, 30, 31, |
1398 | ); |
1399 | let r = _mm_min_epi8(a, b); |
1400 | #[rustfmt::skip] |
1401 | let e = _mm_setr_epi8( |
1402 | 1, -4, -6, 7, -10, -12, 13, -16, |
1403 | 17, 19, 21, 23, 25, 27, 29, 31, |
1404 | ); |
1405 | assert_eq_m128i(r, e); |
1406 | } |
1407 | |
1408 | #[simd_test(enable = "sse4.1" )] |
1409 | unsafe fn test_mm_min_epu16() { |
1410 | let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); |
1411 | let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); |
1412 | let r = _mm_min_epu16(a, b); |
1413 | let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15); |
1414 | assert_eq_m128i(r, e); |
1415 | } |
1416 | |
1417 | #[simd_test(enable = "sse4.1" )] |
1418 | unsafe fn test_mm_min_epi32() { |
1419 | let a = _mm_setr_epi32(1, 4, 5, 8); |
1420 | let b = _mm_setr_epi32(2, 3, 6, 7); |
1421 | let r = _mm_min_epi32(a, b); |
1422 | let e = _mm_setr_epi32(1, 3, 5, 7); |
1423 | assert_eq_m128i(r, e); |
1424 | |
1425 | let a = _mm_setr_epi32(-1, 4, 5, -7); |
1426 | let b = _mm_setr_epi32(-2, 3, -6, 8); |
1427 | let r = _mm_min_epi32(a, b); |
1428 | let e = _mm_setr_epi32(-2, 3, -6, -7); |
1429 | assert_eq_m128i(r, e); |
1430 | } |
1431 | |
1432 | #[simd_test(enable = "sse4.1" )] |
1433 | unsafe fn test_mm_min_epu32() { |
1434 | let a = _mm_setr_epi32(1, 4, 5, 8); |
1435 | let b = _mm_setr_epi32(2, 3, 6, 7); |
1436 | let r = _mm_min_epu32(a, b); |
1437 | let e = _mm_setr_epi32(1, 3, 5, 7); |
1438 | assert_eq_m128i(r, e); |
1439 | } |
1440 | |
1441 | #[simd_test(enable = "sse4.1" )] |
1442 | unsafe fn test_mm_packus_epi32() { |
1443 | let a = _mm_setr_epi32(1, 2, 3, 4); |
1444 | let b = _mm_setr_epi32(-1, -2, -3, -4); |
1445 | let r = _mm_packus_epi32(a, b); |
1446 | let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); |
1447 | assert_eq_m128i(r, e); |
1448 | } |
1449 | |
1450 | #[simd_test(enable = "sse4.1" )] |
1451 | unsafe fn test_mm_cmpeq_epi64() { |
1452 | let a = _mm_setr_epi64x(0, 1); |
1453 | let b = _mm_setr_epi64x(0, 0); |
1454 | let r = _mm_cmpeq_epi64(a, b); |
1455 | let e = _mm_setr_epi64x(-1, 0); |
1456 | assert_eq_m128i(r, e); |
1457 | } |
1458 | |
1459 | #[simd_test(enable = "sse4.1" )] |
1460 | unsafe fn test_mm_cvtepi8_epi16() { |
1461 | let a = _mm_set1_epi8(10); |
1462 | let r = _mm_cvtepi8_epi16(a); |
1463 | let e = _mm_set1_epi16(10); |
1464 | assert_eq_m128i(r, e); |
1465 | let a = _mm_set1_epi8(-10); |
1466 | let r = _mm_cvtepi8_epi16(a); |
1467 | let e = _mm_set1_epi16(-10); |
1468 | assert_eq_m128i(r, e); |
1469 | } |
1470 | |
1471 | #[simd_test(enable = "sse4.1" )] |
1472 | unsafe fn test_mm_cvtepi8_epi32() { |
1473 | let a = _mm_set1_epi8(10); |
1474 | let r = _mm_cvtepi8_epi32(a); |
1475 | let e = _mm_set1_epi32(10); |
1476 | assert_eq_m128i(r, e); |
1477 | let a = _mm_set1_epi8(-10); |
1478 | let r = _mm_cvtepi8_epi32(a); |
1479 | let e = _mm_set1_epi32(-10); |
1480 | assert_eq_m128i(r, e); |
1481 | } |
1482 | |
1483 | #[simd_test(enable = "sse4.1" )] |
1484 | unsafe fn test_mm_cvtepi8_epi64() { |
1485 | let a = _mm_set1_epi8(10); |
1486 | let r = _mm_cvtepi8_epi64(a); |
1487 | let e = _mm_set1_epi64x(10); |
1488 | assert_eq_m128i(r, e); |
1489 | let a = _mm_set1_epi8(-10); |
1490 | let r = _mm_cvtepi8_epi64(a); |
1491 | let e = _mm_set1_epi64x(-10); |
1492 | assert_eq_m128i(r, e); |
1493 | } |
1494 | |
1495 | #[simd_test(enable = "sse4.1" )] |
1496 | unsafe fn test_mm_cvtepi16_epi32() { |
1497 | let a = _mm_set1_epi16(10); |
1498 | let r = _mm_cvtepi16_epi32(a); |
1499 | let e = _mm_set1_epi32(10); |
1500 | assert_eq_m128i(r, e); |
1501 | let a = _mm_set1_epi16(-10); |
1502 | let r = _mm_cvtepi16_epi32(a); |
1503 | let e = _mm_set1_epi32(-10); |
1504 | assert_eq_m128i(r, e); |
1505 | } |
1506 | |
1507 | #[simd_test(enable = "sse4.1" )] |
1508 | unsafe fn test_mm_cvtepi16_epi64() { |
1509 | let a = _mm_set1_epi16(10); |
1510 | let r = _mm_cvtepi16_epi64(a); |
1511 | let e = _mm_set1_epi64x(10); |
1512 | assert_eq_m128i(r, e); |
1513 | let a = _mm_set1_epi16(-10); |
1514 | let r = _mm_cvtepi16_epi64(a); |
1515 | let e = _mm_set1_epi64x(-10); |
1516 | assert_eq_m128i(r, e); |
1517 | } |
1518 | |
1519 | #[simd_test(enable = "sse4.1" )] |
1520 | unsafe fn test_mm_cvtepi32_epi64() { |
1521 | let a = _mm_set1_epi32(10); |
1522 | let r = _mm_cvtepi32_epi64(a); |
1523 | let e = _mm_set1_epi64x(10); |
1524 | assert_eq_m128i(r, e); |
1525 | let a = _mm_set1_epi32(-10); |
1526 | let r = _mm_cvtepi32_epi64(a); |
1527 | let e = _mm_set1_epi64x(-10); |
1528 | assert_eq_m128i(r, e); |
1529 | } |
1530 | |
1531 | #[simd_test(enable = "sse4.1" )] |
1532 | unsafe fn test_mm_cvtepu8_epi16() { |
1533 | let a = _mm_set1_epi8(10); |
1534 | let r = _mm_cvtepu8_epi16(a); |
1535 | let e = _mm_set1_epi16(10); |
1536 | assert_eq_m128i(r, e); |
1537 | } |
1538 | |
1539 | #[simd_test(enable = "sse4.1" )] |
1540 | unsafe fn test_mm_cvtepu8_epi32() { |
1541 | let a = _mm_set1_epi8(10); |
1542 | let r = _mm_cvtepu8_epi32(a); |
1543 | let e = _mm_set1_epi32(10); |
1544 | assert_eq_m128i(r, e); |
1545 | } |
1546 | |
1547 | #[simd_test(enable = "sse4.1" )] |
1548 | unsafe fn test_mm_cvtepu8_epi64() { |
1549 | let a = _mm_set1_epi8(10); |
1550 | let r = _mm_cvtepu8_epi64(a); |
1551 | let e = _mm_set1_epi64x(10); |
1552 | assert_eq_m128i(r, e); |
1553 | } |
1554 | |
1555 | #[simd_test(enable = "sse4.1" )] |
1556 | unsafe fn test_mm_cvtepu16_epi32() { |
1557 | let a = _mm_set1_epi16(10); |
1558 | let r = _mm_cvtepu16_epi32(a); |
1559 | let e = _mm_set1_epi32(10); |
1560 | assert_eq_m128i(r, e); |
1561 | } |
1562 | |
1563 | #[simd_test(enable = "sse4.1" )] |
1564 | unsafe fn test_mm_cvtepu16_epi64() { |
1565 | let a = _mm_set1_epi16(10); |
1566 | let r = _mm_cvtepu16_epi64(a); |
1567 | let e = _mm_set1_epi64x(10); |
1568 | assert_eq_m128i(r, e); |
1569 | } |
1570 | |
1571 | #[simd_test(enable = "sse4.1" )] |
1572 | unsafe fn test_mm_cvtepu32_epi64() { |
1573 | let a = _mm_set1_epi32(10); |
1574 | let r = _mm_cvtepu32_epi64(a); |
1575 | let e = _mm_set1_epi64x(10); |
1576 | assert_eq_m128i(r, e); |
1577 | } |
1578 | |
1579 | #[simd_test(enable = "sse4.1" )] |
1580 | unsafe fn test_mm_dp_pd() { |
1581 | let a = _mm_setr_pd(2.0, 3.0); |
1582 | let b = _mm_setr_pd(1.0, 4.0); |
1583 | let e = _mm_setr_pd(14.0, 0.0); |
1584 | assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e); |
1585 | } |
1586 | |
1587 | #[simd_test(enable = "sse4.1" )] |
1588 | unsafe fn test_mm_dp_ps() { |
1589 | let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0); |
1590 | let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0); |
1591 | let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0); |
1592 | assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e); |
1593 | } |
1594 | |
1595 | #[simd_test(enable = "sse4.1" )] |
1596 | unsafe fn test_mm_floor_pd() { |
1597 | let a = _mm_setr_pd(2.5, 4.5); |
1598 | let r = _mm_floor_pd(a); |
1599 | let e = _mm_setr_pd(2.0, 4.0); |
1600 | assert_eq_m128d(r, e); |
1601 | } |
1602 | |
1603 | #[simd_test(enable = "sse4.1" )] |
1604 | unsafe fn test_mm_floor_ps() { |
1605 | let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); |
1606 | let r = _mm_floor_ps(a); |
1607 | let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); |
1608 | assert_eq_m128(r, e); |
1609 | } |
1610 | |
1611 | #[simd_test(enable = "sse4.1" )] |
1612 | unsafe fn test_mm_floor_sd() { |
1613 | let a = _mm_setr_pd(2.5, 4.5); |
1614 | let b = _mm_setr_pd(-1.5, -3.5); |
1615 | let r = _mm_floor_sd(a, b); |
1616 | let e = _mm_setr_pd(-2.0, 4.5); |
1617 | assert_eq_m128d(r, e); |
1618 | } |
1619 | |
1620 | #[simd_test(enable = "sse4.1" )] |
1621 | unsafe fn test_mm_floor_ss() { |
1622 | let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); |
1623 | let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5); |
1624 | let r = _mm_floor_ss(a, b); |
1625 | let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5); |
1626 | assert_eq_m128(r, e); |
1627 | } |
1628 | |
1629 | #[simd_test(enable = "sse4.1" )] |
1630 | unsafe fn test_mm_ceil_pd() { |
1631 | let a = _mm_setr_pd(1.5, 3.5); |
1632 | let r = _mm_ceil_pd(a); |
1633 | let e = _mm_setr_pd(2.0, 4.0); |
1634 | assert_eq_m128d(r, e); |
1635 | } |
1636 | |
1637 | #[simd_test(enable = "sse4.1" )] |
1638 | unsafe fn test_mm_ceil_ps() { |
1639 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
1640 | let r = _mm_ceil_ps(a); |
1641 | let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); |
1642 | assert_eq_m128(r, e); |
1643 | } |
1644 | |
1645 | #[simd_test(enable = "sse4.1" )] |
1646 | unsafe fn test_mm_ceil_sd() { |
1647 | let a = _mm_setr_pd(1.5, 3.5); |
1648 | let b = _mm_setr_pd(-2.5, -4.5); |
1649 | let r = _mm_ceil_sd(a, b); |
1650 | let e = _mm_setr_pd(-2.0, 3.5); |
1651 | assert_eq_m128d(r, e); |
1652 | } |
1653 | |
1654 | #[simd_test(enable = "sse4.1" )] |
1655 | unsafe fn test_mm_ceil_ss() { |
1656 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
1657 | let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5); |
1658 | let r = _mm_ceil_ss(a, b); |
1659 | let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); |
1660 | assert_eq_m128(r, e); |
1661 | } |
1662 | |
1663 | #[simd_test(enable = "sse4.1" )] |
1664 | unsafe fn test_mm_round_pd() { |
1665 | let a = _mm_setr_pd(1.25, 3.75); |
1666 | let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a); |
1667 | let e = _mm_setr_pd(1.0, 4.0); |
1668 | assert_eq_m128d(r, e); |
1669 | } |
1670 | |
1671 | #[simd_test(enable = "sse4.1" )] |
1672 | unsafe fn test_mm_round_ps() { |
1673 | let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25); |
1674 | let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a); |
1675 | let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0); |
1676 | assert_eq_m128(r, e); |
1677 | } |
1678 | |
1679 | #[simd_test(enable = "sse4.1" )] |
1680 | unsafe fn test_mm_round_sd() { |
1681 | let a = _mm_setr_pd(1.5, 3.5); |
1682 | let b = _mm_setr_pd(-2.5, -4.5); |
1683 | let r = _mm_round_sd::<_MM_FROUND_TO_NEAREST_INT>(a, b); |
1684 | let e = _mm_setr_pd(-2.0, 3.5); |
1685 | assert_eq_m128d(r, e); |
1686 | |
1687 | let a = _mm_setr_pd(1.5, 3.5); |
1688 | let b = _mm_setr_pd(-2.5, -4.5); |
1689 | let r = _mm_round_sd::<_MM_FROUND_TO_NEG_INF>(a, b); |
1690 | let e = _mm_setr_pd(-3.0, 3.5); |
1691 | assert_eq_m128d(r, e); |
1692 | |
1693 | let a = _mm_setr_pd(1.5, 3.5); |
1694 | let b = _mm_setr_pd(-2.5, -4.5); |
1695 | let r = _mm_round_sd::<_MM_FROUND_TO_POS_INF>(a, b); |
1696 | let e = _mm_setr_pd(-2.0, 3.5); |
1697 | assert_eq_m128d(r, e); |
1698 | |
1699 | let a = _mm_setr_pd(1.5, 3.5); |
1700 | let b = _mm_setr_pd(-2.5, -4.5); |
1701 | let r = _mm_round_sd::<_MM_FROUND_TO_ZERO>(a, b); |
1702 | let e = _mm_setr_pd(-2.0, 3.5); |
1703 | assert_eq_m128d(r, e); |
1704 | } |
1705 | |
1706 | #[simd_test(enable = "sse4.1" )] |
1707 | unsafe fn test_mm_round_ss() { |
1708 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
1709 | let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); |
1710 | let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(a, b); |
1711 | let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); |
1712 | assert_eq_m128(r, e); |
1713 | |
1714 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
1715 | let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); |
1716 | let r = _mm_round_ss::<_MM_FROUND_TO_NEG_INF>(a, b); |
1717 | let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); |
1718 | assert_eq_m128(r, e); |
1719 | |
1720 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
1721 | let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); |
1722 | let r = _mm_round_ss::<_MM_FROUND_TO_POS_INF>(a, b); |
1723 | let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5); |
1724 | assert_eq_m128(r, e); |
1725 | |
1726 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
1727 | let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); |
1728 | let r = _mm_round_ss::<_MM_FROUND_TO_ZERO>(a, b); |
1729 | let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5); |
1730 | assert_eq_m128(r, e); |
1731 | } |
1732 | |
1733 | #[simd_test(enable = "sse4.1" )] |
1734 | unsafe fn test_mm_minpos_epu16_1() { |
1735 | let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66); |
1736 | let r = _mm_minpos_epu16(a); |
1737 | let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0); |
1738 | assert_eq_m128i(r, e); |
1739 | } |
1740 | |
1741 | #[simd_test(enable = "sse4.1" )] |
1742 | unsafe fn test_mm_minpos_epu16_2() { |
1743 | let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66); |
1744 | let r = _mm_minpos_epu16(a); |
1745 | let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0); |
1746 | assert_eq_m128i(r, e); |
1747 | } |
1748 | |
1749 | #[simd_test(enable = "sse4.1" )] |
1750 | unsafe fn test_mm_minpos_epu16_3() { |
1751 | // Case where the minimum value is repeated |
1752 | let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 13); |
1753 | let r = _mm_minpos_epu16(a); |
1754 | let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0); |
1755 | assert_eq_m128i(r, e); |
1756 | } |
1757 | |
1758 | #[simd_test(enable = "sse4.1" )] |
1759 | unsafe fn test_mm_mul_epi32() { |
1760 | { |
1761 | let a = _mm_setr_epi32(1, 1, 1, 1); |
1762 | let b = _mm_setr_epi32(1, 2, 3, 4); |
1763 | let r = _mm_mul_epi32(a, b); |
1764 | let e = _mm_setr_epi64x(1, 3); |
1765 | assert_eq_m128i(r, e); |
1766 | } |
1767 | { |
1768 | let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */); |
1769 | let b = _mm_setr_epi32( |
1770 | -20, -256, /* ignored */ |
1771 | 666666, 666666, /* ignored */ |
1772 | ); |
1773 | let r = _mm_mul_epi32(a, b); |
1774 | let e = _mm_setr_epi64x(-300, 823043843622); |
1775 | assert_eq_m128i(r, e); |
1776 | } |
1777 | } |
1778 | |
1779 | #[simd_test(enable = "sse4.1" )] |
1780 | unsafe fn test_mm_mullo_epi32() { |
1781 | { |
1782 | let a = _mm_setr_epi32(1, 1, 1, 1); |
1783 | let b = _mm_setr_epi32(1, 2, 3, 4); |
1784 | let r = _mm_mullo_epi32(a, b); |
1785 | let e = _mm_setr_epi32(1, 2, 3, 4); |
1786 | assert_eq_m128i(r, e); |
1787 | } |
1788 | { |
1789 | let a = _mm_setr_epi32(15, -2, 1234567, 99999); |
1790 | let b = _mm_setr_epi32(-20, -256, 666666, -99999); |
1791 | let r = _mm_mullo_epi32(a, b); |
1792 | // Attention, most significant bit in r[2] is treated |
1793 | // as a sign bit: |
1794 | // 1234567 * 666666 = -1589877210 |
1795 | let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409); |
1796 | assert_eq_m128i(r, e); |
1797 | } |
1798 | } |
1799 | |
1800 | #[simd_test(enable = "sse4.1" )] |
1801 | unsafe fn test_mm_minpos_epu16() { |
1802 | let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3); |
1803 | let r = _mm_minpos_epu16(a); |
1804 | let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0); |
1805 | assert_eq_m128i(r, e); |
1806 | } |
1807 | |
1808 | #[simd_test(enable = "sse4.1" )] |
1809 | unsafe fn test_mm_mpsadbw_epu8() { |
1810 | #[rustfmt::skip] |
1811 | let a = _mm_setr_epi8( |
1812 | 0, 1, 2, 3, 4, 5, 6, 7, |
1813 | 8, 9, 10, 11, 12, 13, 14, 15, |
1814 | ); |
1815 | |
1816 | let r = _mm_mpsadbw_epu8::<0b000>(a, a); |
1817 | let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); |
1818 | assert_eq_m128i(r, e); |
1819 | |
1820 | let r = _mm_mpsadbw_epu8::<0b001>(a, a); |
1821 | let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12); |
1822 | assert_eq_m128i(r, e); |
1823 | |
1824 | let r = _mm_mpsadbw_epu8::<0b100>(a, a); |
1825 | let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44); |
1826 | assert_eq_m128i(r, e); |
1827 | |
1828 | let r = _mm_mpsadbw_epu8::<0b101>(a, a); |
1829 | let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); |
1830 | assert_eq_m128i(r, e); |
1831 | |
1832 | let r = _mm_mpsadbw_epu8::<0b111>(a, a); |
1833 | let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4); |
1834 | assert_eq_m128i(r, e); |
1835 | } |
1836 | |
1837 | #[simd_test(enable = "sse4.1" )] |
1838 | unsafe fn test_mm_testz_si128() { |
1839 | let a = _mm_set1_epi8(1); |
1840 | let mask = _mm_set1_epi8(0); |
1841 | let r = _mm_testz_si128(a, mask); |
1842 | assert_eq!(r, 1); |
1843 | let a = _mm_set1_epi8(0b101); |
1844 | let mask = _mm_set1_epi8(0b110); |
1845 | let r = _mm_testz_si128(a, mask); |
1846 | assert_eq!(r, 0); |
1847 | let a = _mm_set1_epi8(0b011); |
1848 | let mask = _mm_set1_epi8(0b100); |
1849 | let r = _mm_testz_si128(a, mask); |
1850 | assert_eq!(r, 1); |
1851 | } |
1852 | |
1853 | #[simd_test(enable = "sse4.1" )] |
1854 | unsafe fn test_mm_testc_si128() { |
1855 | let a = _mm_set1_epi8(-1); |
1856 | let mask = _mm_set1_epi8(0); |
1857 | let r = _mm_testc_si128(a, mask); |
1858 | assert_eq!(r, 1); |
1859 | let a = _mm_set1_epi8(0b101); |
1860 | let mask = _mm_set1_epi8(0b110); |
1861 | let r = _mm_testc_si128(a, mask); |
1862 | assert_eq!(r, 0); |
1863 | let a = _mm_set1_epi8(0b101); |
1864 | let mask = _mm_set1_epi8(0b100); |
1865 | let r = _mm_testc_si128(a, mask); |
1866 | assert_eq!(r, 1); |
1867 | } |
1868 | |
1869 | #[simd_test(enable = "sse4.1" )] |
1870 | unsafe fn test_mm_testnzc_si128() { |
1871 | let a = _mm_set1_epi8(0); |
1872 | let mask = _mm_set1_epi8(1); |
1873 | let r = _mm_testnzc_si128(a, mask); |
1874 | assert_eq!(r, 0); |
1875 | let a = _mm_set1_epi8(-1); |
1876 | let mask = _mm_set1_epi8(0); |
1877 | let r = _mm_testnzc_si128(a, mask); |
1878 | assert_eq!(r, 0); |
1879 | let a = _mm_set1_epi8(0b101); |
1880 | let mask = _mm_set1_epi8(0b110); |
1881 | let r = _mm_testnzc_si128(a, mask); |
1882 | assert_eq!(r, 1); |
1883 | let a = _mm_set1_epi8(0b101); |
1884 | let mask = _mm_set1_epi8(0b101); |
1885 | let r = _mm_testnzc_si128(a, mask); |
1886 | assert_eq!(r, 0); |
1887 | } |
1888 | |
1889 | #[simd_test(enable = "sse4.1" )] |
1890 | unsafe fn test_mm_test_all_zeros() { |
1891 | let a = _mm_set1_epi8(1); |
1892 | let mask = _mm_set1_epi8(0); |
1893 | let r = _mm_test_all_zeros(a, mask); |
1894 | assert_eq!(r, 1); |
1895 | let a = _mm_set1_epi8(0b101); |
1896 | let mask = _mm_set1_epi8(0b110); |
1897 | let r = _mm_test_all_zeros(a, mask); |
1898 | assert_eq!(r, 0); |
1899 | let a = _mm_set1_epi8(0b011); |
1900 | let mask = _mm_set1_epi8(0b100); |
1901 | let r = _mm_test_all_zeros(a, mask); |
1902 | assert_eq!(r, 1); |
1903 | } |
1904 | |
1905 | #[simd_test(enable = "sse4.1" )] |
1906 | unsafe fn test_mm_test_all_ones() { |
1907 | let a = _mm_set1_epi8(-1); |
1908 | let r = _mm_test_all_ones(a); |
1909 | assert_eq!(r, 1); |
1910 | let a = _mm_set1_epi8(0b101); |
1911 | let r = _mm_test_all_ones(a); |
1912 | assert_eq!(r, 0); |
1913 | } |
1914 | |
1915 | #[simd_test(enable = "sse4.1" )] |
1916 | unsafe fn test_mm_test_mix_ones_zeros() { |
1917 | let a = _mm_set1_epi8(0); |
1918 | let mask = _mm_set1_epi8(1); |
1919 | let r = _mm_test_mix_ones_zeros(a, mask); |
1920 | assert_eq!(r, 0); |
1921 | let a = _mm_set1_epi8(-1); |
1922 | let mask = _mm_set1_epi8(0); |
1923 | let r = _mm_test_mix_ones_zeros(a, mask); |
1924 | assert_eq!(r, 0); |
1925 | let a = _mm_set1_epi8(0b101); |
1926 | let mask = _mm_set1_epi8(0b110); |
1927 | let r = _mm_test_mix_ones_zeros(a, mask); |
1928 | assert_eq!(r, 1); |
1929 | let a = _mm_set1_epi8(0b101); |
1930 | let mask = _mm_set1_epi8(0b101); |
1931 | let r = _mm_test_mix_ones_zeros(a, mask); |
1932 | assert_eq!(r, 0); |
1933 | } |
1934 | |
1935 | #[simd_test(enable = "sse4.1" )] |
1936 | unsafe fn test_mm_stream_load_si128() { |
1937 | let a = _mm_set_epi64x(5, 6); |
1938 | let r = _mm_stream_load_si128(core::ptr::addr_of!(a) as *const _); |
1939 | assert_eq_m128i(a, r); |
1940 | } |
1941 | } |
1942 | |