//! Streaming SIMD Extensions 4.1 (SSE4.1)

use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    mem::transmute,
};

#[cfg(test)]
use stdarch_test::assert_instr;

// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;

/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set the element of `b` is selected. The element
/// of `a` is selected otherwise.
///
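/// # Example
///
/// A short usage sketch with illustrative values, guarded by runtime feature
/// detection; note that only the high bit of each mask byte matters:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_set1_epi8(1);
/// let b = _mm_set1_epi8(2);
/// // -1 has its high bit set, so the odd lanes come from `b`.
/// let mask = _mm_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
/// let r = _mm_blendv_epi8(a, b, mask);
/// assert_eq!(_mm_extract_epi8::<0>(r), 1);
/// assert_eq!(_mm_extract_epi8::<1>(r), 2);
/// # }
/// # }
/// # }
/// ```
///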
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::splat(0));
    transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
}

/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
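/// # Example
///
/// A quick sketch with illustrative values: `IMM8 = 0b0000_0011` takes lanes
/// 0 and 1 from `b` and the remaining lanes from `a`:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_set1_epi16(0);
/// let b = _mm_set1_epi16(1);
/// let r = _mm_blend_epi16::<0b0000_0011>(a, b);
/// assert_eq!(_mm_extract_epi16::<0>(r), 1);
/// assert_eq!(_mm_extract_epi16::<2>(r), 0);
/// # }
/// # }
/// # }
/// ```
///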
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    transmute::<i16x8, _>(simd_shuffle!(
        a.as_i16x8(),
        b.as_i16x8(),
        [
            [0, 8][IMM8 as usize & 1],
            [1, 9][(IMM8 >> 1) as usize & 1],
            [2, 10][(IMM8 >> 2) as usize & 1],
            [3, 11][(IMM8 >> 3) as usize & 1],
            [4, 12][(IMM8 >> 4) as usize & 1],
            [5, 13][(IMM8 >> 5) as usize & 1],
            [6, 14][(IMM8 >> 6) as usize & 1],
            [7, 15][(IMM8 >> 7) as usize & 1],
        ]
    ))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::splat(0));
    transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::splat(0));
    transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `IMM2`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    transmute::<f64x2, _>(simd_shuffle!(
        a.as_f64x2(),
        b.as_f64x2(),
        [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
    ))
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `IMM4`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM4, 4);
    transmute::<f32x4, _>(simd_shuffle!(
        a.as_f32x4(),
        b.as_f32x4(),
        [
            [0, 4][IMM4 as usize & 1],
            [1, 5][(IMM4 >> 1) as usize & 1],
            [2, 6][(IMM4 >> 2) as usize & 1],
            [3, 7][(IMM4 >> 3) as usize & 1],
        ]
    ))
}

/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
/// and may be converted back to a floating point number via `f32::from_bits`.
///
/// # Example
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # #[target_feature(enable = "sse4.1")]
/// # unsafe fn worker() {
/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
/// let x: i32 = _mm_extract_ps::<2>(simd_floats);
/// float_store.push(f32::from_bits(x as u32));
/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
/// # }
/// # unsafe { worker() }
/// # }
/// # }
/// ```
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, IMM8 = 0)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
    static_assert_uimm_bits!(IMM8, 2);
    simd_extract::<_, f32>(a, IMM8 as u32).to_bits() as i32
}

/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 4);
    simd_extract::<_, u8>(a.as_u8x16(), IMM8 as u32) as i32
}

/// Extracts a 32-bit integer from `a` selected with `IMM8`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, IMM8 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 2);
    simd_extract::<_, i32>(a.as_i32x4(), IMM8 as u32)
}

/// Select a single value in `b` to store at some position in `a`, then zero
/// elements according to `IMM8` (a worked example follows the list below).
///
/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
///   from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
///   element is cleared.
///
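/// # Example
///
/// A worked sketch of the bit fields, with illustrative values:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// // Bits [7:6] = 01 select `b[1]` (6.0), bits [5:4] = 11 copy it to result
/// // element 3, and bits [3:0] = 0001 clear result element 0.
/// let r = _mm_insert_ps::<0b01_11_0001>(a, b);
/// assert_eq!(_mm_extract_ps::<0>(r), 0.0_f32.to_bits() as i32);
/// assert_eq!(_mm_extract_ps::<3>(r), 6.0_f32.to_bits() as i32);
/// # }
/// # }
/// # }
/// ```
///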
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    insertps(a, b, IMM8 as u8)
}

/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 4);
    transmute(simd_insert(a.as_i8x16(), IMM8 as u32, i as i8))
}

/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 2);
    transmute(simd_insert(a.as_i32x4(), IMM8 as u32, i))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
/// values in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    let a: i8x16 = a.as_i8x16();
    let b: i8x16 = b.as_i8x16();
    transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    let a: u16x8 = a.as_u16x8();
    let b: u16x8 = b.as_u16x8();
    transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a: i32x4 = a.as_i32x4();
    let b: i32x4 = b.as_i32x4();
    transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    let a: u32x4 = a.as_u32x4();
    let b: u32x4 = b.as_u32x4();
    transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
/// values in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    let a: i8x16 = a.as_i8x16();
    let b: i8x16 = b.as_i8x16();
    transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    let a: u16x8 = a.as_u16x8();
    let b: u16x8 = b.as_u16x8();
    transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a: i32x4 = a.as_i32x4();
    let b: i32x4 = b.as_i32x4();
    transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    let a: u32x4 = a.as_u32x4();
    let b: u32x4 = b.as_u32x4();
    transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
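/// # Example
///
/// A small sketch of the saturation behavior:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_epi32(-1, 0, 70_000, 65_535);
/// let b = _mm_set1_epi32(1);
/// let r = _mm_packus_epi32(a, b);
/// assert_eq!(_mm_extract_epi16::<0>(r), 0); // -1 saturates to 0
/// assert_eq!(_mm_extract_epi16::<2>(r), 65_535); // 70_000 saturates to 65535
/// # }
/// # }
/// # }
/// ```
///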
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
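/// # Example
///
/// A brief sketch: negative bytes keep their value after widening (the result
/// is checked lane-wise with an SSE2 comparison):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_epi8(-1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
/// let r = _mm_cvtepi8_epi16(a);
/// let e = _mm_setr_epi16(-1, 2, 0, 0, 0, 0, 0, 0);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, e)), 0xFFFF);
/// # }
/// # }
/// # }
/// ```
///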
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a: i8x16 = a.as_i8x16();
    let a: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a: i8x16 = a.as_i8x16();
    let a: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 8-bit integers in the low 2 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a: i8x16 = a.as_i8x16();
    let a: i8x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a: i16x8 = a.as_i16x8();
    let a: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a: i16x8 = a.as_i16x8();
    let a: i16x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a: i32x4 = a.as_i32x4();
    let a: i32x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    let a: u8x16 = a.as_u8x16();
    let a: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    let a: u8x16 = a.as_u8x16();
    let a: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    let a: u8x16 = a.as_u8x16();
    let a: u8x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    let a: u16x8 = a.as_u16x8();
    let a: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    let a: u16x8 = a.as_u16x8();
    let a: u16x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    let a: u32x4 = a.as_u32x4();
    let a: u32x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Returns the dot product of two `__m128d` vectors.
///
/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM8, 8);
    dppd(a, b, IMM8 as u8)
}

/// Returns the dot product of two `__m128` vectors.
///
/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
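/// # Example
///
/// A short sketch of the two mask nibbles, with illustrative values:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
/// // Condition mask 0b0111 multiplies lanes 0..=2 (10 + 40 + 90 = 140);
/// // broadcast mask 0b0001 stores the sum in lane 0 and zeroes the rest.
/// let r = _mm_dp_ps::<0b0111_0001>(a, b);
/// assert_eq!(_mm_cvtss_f32(r), 140.0);
/// # }
/// # }
/// # }
/// ```
///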
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    dpps(a, b, IMM8 as u8)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    simd_floor(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    simd_floor(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_FLOOR)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_FLOOR)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    simd_ceil(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    simd_ceil(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_CEIL)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_CEIL)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
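/// A quick usage sketch (under `_MM_FROUND_TO_NEAREST_INT`, ties round to
/// even):
///
/// ```
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_pd(0.5, 2.5);
/// let r = _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
/// assert_eq!(_mm_cvtsd_f64(r), 0.0); // 0.5 rounds to 0.0
/// assert_eq!(_mm_cvtsd_f64(_mm_unpackhi_pd(r, r)), 2.0); // 2.5 rounds to 2.0
/// # }
/// # }
/// # }
/// ```
///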
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundpd(a, ROUNDING)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundps(a, ROUNDING)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundsd(a, b, ROUNDING)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundss(a, b, ROUNDING)
}

/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i` vector,
/// returning a vector containing its value in its first position, and its
/// index in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value,
/// * remaining bits are set to `0`.
///
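/// # Example
///
/// A small sketch (on a tie, the lowest index is reported):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_epi16(9, 4, 22, 7, 4, 13, 5, 8);
/// let r = _mm_minpos_epu16(a);
/// assert_eq!(_mm_extract_epi16::<0>(r), 4); // the minimum value
/// assert_eq!(_mm_extract_epi16::<1>(r), 1); // the index of its first occurrence
/// # }
/// # }
/// # }
/// ```
///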
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    transmute(phminposuw(a.as_u16x8()))
}

/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
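/// # Example
///
/// A brief sketch: only lanes 0 and 2 of each operand participate, and each
/// product is a full 64-bit result:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_epi32(-1, 7, 3, 7); // lanes 1 and 3 are ignored
/// let b = _mm_setr_epi32(2, 7, 4, 7);
/// let r = _mm_mul_epi32(a, b);
/// let e = _mm_setr_epi64x(-2, 12);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi64(r, e)), 0xFFFF);
/// # }
/// # }
/// # }
/// ```
///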
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a: i64x2 = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2()));
    let b: i64x2 = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2()));
    transmute(simd_mul(a, b))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of each
/// intermediate, reinterpreted as a signed integer. While `pmulld
/// __m128i::splat(2), __m128i::splat(2)` returns the obvious
/// `__m128i::splat(4)`, due to wrapping arithmetic `pmulld
/// __m128i::splat(i32::MAX), __m128i::splat(2)` would return a negative
/// number.
///
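/// # Example
///
/// A quick sketch of the wrapping case described above:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let r = _mm_mullo_epi32(_mm_set1_epi32(i32::MAX), _mm_set1_epi32(2));
/// // The 64-bit product 0x0000_0000_FFFF_FFFE truncates to -2 as an i32.
/// assert_eq!(_mm_extract_epi32::<0>(r), -2);
/// # }
/// # }
/// # }
/// ```
///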
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
}

/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences. The sums of those absolute differences are
/// then returned according to the bit fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = IMM8[2] * 4
/// j = IMM8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated:
///     * Bit `[2]` specifies the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
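/// # Example
///
/// A compact sketch with `IMM8 = 0`: both offsets are zero, so with `b` all
/// zeroes each `r[k]` is just `a[k] + a[k+1] + a[k+2] + a[k+3]`:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/// let b = _mm_set1_epi8(0);
/// let r = _mm_mpsadbw_epu8::<0>(a, b);
/// assert_eq!(_mm_extract_epi16::<0>(r), 0 + 1 + 2 + 3);
/// assert_eq!(_mm_extract_epi16::<1>(r), 1 + 2 + 3 + 4);
/// # }
/// # }
/// # }
/// ```
///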
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 3);
    transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8))
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
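/// # Example
///
/// A minimal sketch: the result is `1` exactly when `a & mask` is zero:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// # if is_x86_feature_detected!("sse4.1") {
/// # unsafe {
/// let a = _mm_setr_epi32(0b0101, 0, 0, 0);
/// // None of the tested bits are set in `a`:
/// assert_eq!(_mm_testz_si128(a, _mm_setr_epi32(0b1010, 0, 0, 0)), 1);
/// // Bit 0 is set in both `a` and the mask:
/// assert_eq!(_mm_testz_si128(a, _mm_setr_epi32(0b0001, 0, 0, 0)), 0);
/// # }
/// # }
/// # }
/// ```
///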
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestz(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testnzc_si128(a, mask)
}

#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse41.insertps"]
    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.packusdw"]
    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
    #[link_name = "llvm.x86.sse41.dppd"]
    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]
    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.round.pd"]
    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ps"]
    fn roundps(a: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.round.sd"]
    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ss"]
    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.phminposuw"]
    fn phminposuw(a: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.mpsadbw"]
    fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
    #[link_name = "llvm.x86.sse41.ptestz"]
    fn ptestz(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestc"]
    fn ptestc(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}
1182 | |
1183 | #[cfg (test)] |
1184 | mod tests { |
1185 | use crate::core_arch::x86::*; |
1186 | use std::mem; |
1187 | use stdarch_test::simd_test; |
1188 | |
1189 | #[simd_test(enable = "sse4.1" )] |
1190 | unsafe fn test_mm_blendv_epi8() { |
1191 | #[rustfmt::skip] |
1192 | let a = _mm_setr_epi8( |
1193 | 0, 1, 2, 3, 4, 5, 6, 7, |
1194 | 8, 9, 10, 11, 12, 13, 14, 15, |
1195 | ); |
1196 | #[rustfmt::skip] |
1197 | let b = _mm_setr_epi8( |
1198 | 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, |
1199 | ); |
1200 | #[rustfmt::skip] |
1201 | let mask = _mm_setr_epi8( |
1202 | 0, -1, 0, -1, 0, -1, 0, -1, |
1203 | 0, -1, 0, -1, 0, -1, 0, -1, |
1204 | ); |
1205 | #[rustfmt::skip] |
1206 | let e = _mm_setr_epi8( |
1207 | 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, |
1208 | ); |
1209 | assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e); |
1210 | } |
1211 | |
1212 | #[simd_test(enable = "sse4.1" )] |
1213 | unsafe fn test_mm_blendv_pd() { |
1214 | let a = _mm_set1_pd(0.0); |
1215 | let b = _mm_set1_pd(1.0); |
1216 | let mask = transmute(_mm_setr_epi64x(0, -1)); |
1217 | let r = _mm_blendv_pd(a, b, mask); |
1218 | let e = _mm_setr_pd(0.0, 1.0); |
1219 | assert_eq_m128d(r, e); |
1220 | } |
1221 | |
#[simd_test(enable = "sse4.1")]
1223 | unsafe fn test_mm_blendv_ps() { |
1224 | let a = _mm_set1_ps(0.0); |
1225 | let b = _mm_set1_ps(1.0); |
1226 | let mask = transmute(_mm_setr_epi32(0, -1, 0, -1)); |
1227 | let r = _mm_blendv_ps(a, b, mask); |
1228 | let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); |
1229 | assert_eq_m128(r, e); |
1230 | } |
1231 | |
#[simd_test(enable = "sse4.1")]
1233 | unsafe fn test_mm_blend_pd() { |
1234 | let a = _mm_set1_pd(0.0); |
1235 | let b = _mm_set1_pd(1.0); |
1236 | let r = _mm_blend_pd::<0b10>(a, b); |
1237 | let e = _mm_setr_pd(0.0, 1.0); |
1238 | assert_eq_m128d(r, e); |
1239 | } |
1240 | |
#[simd_test(enable = "sse4.1")]
1242 | unsafe fn test_mm_blend_ps() { |
1243 | let a = _mm_set1_ps(0.0); |
1244 | let b = _mm_set1_ps(1.0); |
1245 | let r = _mm_blend_ps::<0b1010>(a, b); |
1246 | let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); |
1247 | assert_eq_m128(r, e); |
1248 | } |
1249 | |
#[simd_test(enable = "sse4.1")]
1251 | unsafe fn test_mm_blend_epi16() { |
1252 | let a = _mm_set1_epi16(0); |
1253 | let b = _mm_set1_epi16(1); |
1254 | let r = _mm_blend_epi16::<0b1010_1100>(a, b); |
1255 | let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1); |
1256 | assert_eq_m128i(r, e); |
1257 | } |
1258 | |
#[simd_test(enable = "sse4.1")]
1260 | unsafe fn test_mm_extract_ps() { |
1261 | let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0); |
1262 | let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32); |
1263 | assert_eq!(r, 1.0); |
1264 | let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32); |
1265 | assert_eq!(r, 3.0); |
1266 | } |
1267 | |
#[simd_test(enable = "sse4.1")]
1269 | unsafe fn test_mm_extract_epi8() { |
1270 | #[rustfmt::skip] |
1271 | let a = _mm_setr_epi8( |
1272 | -1, 1, 2, 3, 4, 5, 6, 7, |
1273 | 8, 9, 10, 11, 12, 13, 14, 15 |
1274 | ); |
1275 | let r1 = _mm_extract_epi8::<0>(a); |
1276 | let r2 = _mm_extract_epi8::<3>(a); |
1277 | assert_eq!(r1, 0xFF); |
1278 | assert_eq!(r2, 3); |
1279 | } |
1280 | |
#[simd_test(enable = "sse4.1")]
1282 | unsafe fn test_mm_extract_epi32() { |
1283 | let a = _mm_setr_epi32(0, 1, 2, 3); |
1284 | let r = _mm_extract_epi32::<1>(a); |
1285 | assert_eq!(r, 1); |
1286 | let r = _mm_extract_epi32::<3>(a); |
1287 | assert_eq!(r, 3); |
1288 | } |
1289 | |
#[simd_test(enable = "sse4.1")]
1291 | unsafe fn test_mm_insert_ps() { |
1292 | let a = _mm_set1_ps(1.0); |
1293 | let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
1294 | let r = _mm_insert_ps::<0b11_00_1100>(a, b); |
1295 | let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0); |
1296 | assert_eq_m128(r, e); |
1297 | } |
1298 | |
#[simd_test(enable = "sse4.1")]
1300 | unsafe fn test_mm_insert_epi8() { |
1301 | let a = _mm_set1_epi8(0); |
1302 | let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); |
1303 | let r = _mm_insert_epi8::<1>(a, 32); |
1304 | assert_eq_m128i(r, e); |
1305 | let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0); |
1306 | let r = _mm_insert_epi8::<14>(a, 32); |
1307 | assert_eq_m128i(r, e); |
1308 | } |
1309 | |
#[simd_test(enable = "sse4.1")]
1311 | unsafe fn test_mm_insert_epi32() { |
1312 | let a = _mm_set1_epi32(0); |
1313 | let e = _mm_setr_epi32(0, 32, 0, 0); |
1314 | let r = _mm_insert_epi32::<1>(a, 32); |
1315 | assert_eq_m128i(r, e); |
1316 | let e = _mm_setr_epi32(0, 0, 0, 32); |
1317 | let r = _mm_insert_epi32::<3>(a, 32); |
1318 | assert_eq_m128i(r, e); |
1319 | } |
1320 | |
#[simd_test(enable = "sse4.1")]
1322 | unsafe fn test_mm_max_epi8() { |
1323 | #[rustfmt::skip] |
1324 | let a = _mm_setr_epi8( |
1325 | 1, 4, 5, 8, 9, 12, 13, 16, |
1326 | 17, 20, 21, 24, 25, 28, 29, 32, |
1327 | ); |
1328 | #[rustfmt::skip] |
1329 | let b = _mm_setr_epi8( |
1330 | 2, 3, 6, 7, 10, 11, 14, 15, |
1331 | 18, 19, 22, 23, 26, 27, 30, 31, |
1332 | ); |
1333 | let r = _mm_max_epi8(a, b); |
1334 | #[rustfmt::skip] |
1335 | let e = _mm_setr_epi8( |
1336 | 2, 4, 6, 8, 10, 12, 14, 16, |
1337 | 18, 20, 22, 24, 26, 28, 30, 32, |
1338 | ); |
1339 | assert_eq_m128i(r, e); |
1340 | } |
1341 | |
#[simd_test(enable = "sse4.1")]
1343 | unsafe fn test_mm_max_epu16() { |
1344 | let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); |
1345 | let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); |
1346 | let r = _mm_max_epu16(a, b); |
1347 | let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16); |
1348 | assert_eq_m128i(r, e); |
1349 | } |
1350 | |
#[simd_test(enable = "sse4.1")]
1352 | unsafe fn test_mm_max_epi32() { |
1353 | let a = _mm_setr_epi32(1, 4, 5, 8); |
1354 | let b = _mm_setr_epi32(2, 3, 6, 7); |
1355 | let r = _mm_max_epi32(a, b); |
1356 | let e = _mm_setr_epi32(2, 4, 6, 8); |
1357 | assert_eq_m128i(r, e); |
1358 | } |
1359 | |
#[simd_test(enable = "sse4.1")]
1361 | unsafe fn test_mm_max_epu32() { |
1362 | let a = _mm_setr_epi32(1, 4, 5, 8); |
1363 | let b = _mm_setr_epi32(2, 3, 6, 7); |
1364 | let r = _mm_max_epu32(a, b); |
1365 | let e = _mm_setr_epi32(2, 4, 6, 8); |
1366 | assert_eq_m128i(r, e); |
1367 | } |
1368 | |
#[simd_test(enable = "sse4.1")]
1370 | unsafe fn test_mm_min_epi8_1() { |
1371 | #[rustfmt::skip] |
1372 | let a = _mm_setr_epi8( |
1373 | 1, 4, 5, 8, 9, 12, 13, 16, |
1374 | 17, 20, 21, 24, 25, 28, 29, 32, |
1375 | ); |
1376 | #[rustfmt::skip] |
1377 | let b = _mm_setr_epi8( |
1378 | 2, 3, 6, 7, 10, 11, 14, 15, |
1379 | 18, 19, 22, 23, 26, 27, 30, 31, |
1380 | ); |
1381 | let r = _mm_min_epi8(a, b); |
1382 | #[rustfmt::skip] |
1383 | let e = _mm_setr_epi8( |
1384 | 1, 3, 5, 7, 9, 11, 13, 15, |
1385 | 17, 19, 21, 23, 25, 27, 29, 31, |
1386 | ); |
1387 | assert_eq_m128i(r, e); |
1388 | } |
1389 | |
#[simd_test(enable = "sse4.1")]
1391 | unsafe fn test_mm_min_epi8_2() { |
1392 | #[rustfmt::skip] |
1393 | let a = _mm_setr_epi8( |
1394 | 1, -4, -5, 8, -9, -12, 13, -16, |
1395 | 17, 20, 21, 24, 25, 28, 29, 32, |
1396 | ); |
1397 | #[rustfmt::skip] |
1398 | let b = _mm_setr_epi8( |
1399 | 2, -3, -6, 7, -10, -11, 14, -15, |
1400 | 18, 19, 22, 23, 26, 27, 30, 31, |
1401 | ); |
1402 | let r = _mm_min_epi8(a, b); |
1403 | #[rustfmt::skip] |
1404 | let e = _mm_setr_epi8( |
1405 | 1, -4, -6, 7, -10, -12, 13, -16, |
1406 | 17, 19, 21, 23, 25, 27, 29, 31, |
1407 | ); |
1408 | assert_eq_m128i(r, e); |
1409 | } |
1410 | |
#[simd_test(enable = "sse4.1")]
1412 | unsafe fn test_mm_min_epu16() { |
1413 | let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); |
1414 | let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); |
1415 | let r = _mm_min_epu16(a, b); |
1416 | let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15); |
1417 | assert_eq_m128i(r, e); |
1418 | } |
1419 | |
#[simd_test(enable = "sse4.1")]
1421 | unsafe fn test_mm_min_epi32_1() { |
1422 | let a = _mm_setr_epi32(1, 4, 5, 8); |
1423 | let b = _mm_setr_epi32(2, 3, 6, 7); |
1424 | let r = _mm_min_epi32(a, b); |
1425 | let e = _mm_setr_epi32(1, 3, 5, 7); |
1426 | assert_eq_m128i(r, e); |
1427 | } |
1428 | |
#[simd_test(enable = "sse4.1")]
1430 | unsafe fn test_mm_min_epi32_2() { |
1431 | let a = _mm_setr_epi32(-1, 4, 5, -7); |
1432 | let b = _mm_setr_epi32(-2, 3, -6, 8); |
1433 | let r = _mm_min_epi32(a, b); |
1434 | let e = _mm_setr_epi32(-2, 3, -6, -7); |
1435 | assert_eq_m128i(r, e); |
1436 | } |
1437 | |
#[simd_test(enable = "sse4.1")]
1439 | unsafe fn test_mm_min_epu32() { |
1440 | let a = _mm_setr_epi32(1, 4, 5, 8); |
1441 | let b = _mm_setr_epi32(2, 3, 6, 7); |
1442 | let r = _mm_min_epu32(a, b); |
1443 | let e = _mm_setr_epi32(1, 3, 5, 7); |
1444 | assert_eq_m128i(r, e); |
1445 | } |
1446 | |
#[simd_test(enable = "sse4.1")]
1448 | unsafe fn test_mm_packus_epi32() { |
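// Packing with unsigned saturation clamps the negative lanes of `b` to 0.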
1449 | let a = _mm_setr_epi32(1, 2, 3, 4); |
1450 | let b = _mm_setr_epi32(-1, -2, -3, -4); |
1451 | let r = _mm_packus_epi32(a, b); |
1452 | let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); |
1453 | assert_eq_m128i(r, e); |
1454 | } |
1455 | |
#[simd_test(enable = "sse4.1")]
1457 | unsafe fn test_mm_cmpeq_epi64() { |
1458 | let a = _mm_setr_epi64x(0, 1); |
1459 | let b = _mm_setr_epi64x(0, 0); |
1460 | let r = _mm_cmpeq_epi64(a, b); |
1461 | let e = _mm_setr_epi64x(-1, 0); |
1462 | assert_eq_m128i(r, e); |
1463 | } |
1464 | |
#[simd_test(enable = "sse4.1")]
1466 | unsafe fn test_mm_cvtepi8_epi16() { |
1467 | let a = _mm_set1_epi8(10); |
1468 | let r = _mm_cvtepi8_epi16(a); |
1469 | let e = _mm_set1_epi16(10); |
1470 | assert_eq_m128i(r, e); |
1471 | let a = _mm_set1_epi8(-10); |
1472 | let r = _mm_cvtepi8_epi16(a); |
1473 | let e = _mm_set1_epi16(-10); |
1474 | assert_eq_m128i(r, e); |
1475 | } |
1476 | |
#[simd_test(enable = "sse4.1")]
1478 | unsafe fn test_mm_cvtepi8_epi32() { |
1479 | let a = _mm_set1_epi8(10); |
1480 | let r = _mm_cvtepi8_epi32(a); |
1481 | let e = _mm_set1_epi32(10); |
1482 | assert_eq_m128i(r, e); |
1483 | let a = _mm_set1_epi8(-10); |
1484 | let r = _mm_cvtepi8_epi32(a); |
1485 | let e = _mm_set1_epi32(-10); |
1486 | assert_eq_m128i(r, e); |
1487 | } |
1488 | |
#[simd_test(enable = "sse4.1")]
1490 | unsafe fn test_mm_cvtepi8_epi64() { |
1491 | let a = _mm_set1_epi8(10); |
1492 | let r = _mm_cvtepi8_epi64(a); |
1493 | let e = _mm_set1_epi64x(10); |
1494 | assert_eq_m128i(r, e); |
1495 | let a = _mm_set1_epi8(-10); |
1496 | let r = _mm_cvtepi8_epi64(a); |
1497 | let e = _mm_set1_epi64x(-10); |
1498 | assert_eq_m128i(r, e); |
1499 | } |
1500 | |
#[simd_test(enable = "sse4.1")]
1502 | unsafe fn test_mm_cvtepi16_epi32() { |
1503 | let a = _mm_set1_epi16(10); |
1504 | let r = _mm_cvtepi16_epi32(a); |
1505 | let e = _mm_set1_epi32(10); |
1506 | assert_eq_m128i(r, e); |
1507 | let a = _mm_set1_epi16(-10); |
1508 | let r = _mm_cvtepi16_epi32(a); |
1509 | let e = _mm_set1_epi32(-10); |
1510 | assert_eq_m128i(r, e); |
1511 | } |
1512 | |
#[simd_test(enable = "sse4.1")]
1514 | unsafe fn test_mm_cvtepi16_epi64() { |
1515 | let a = _mm_set1_epi16(10); |
1516 | let r = _mm_cvtepi16_epi64(a); |
1517 | let e = _mm_set1_epi64x(10); |
1518 | assert_eq_m128i(r, e); |
1519 | let a = _mm_set1_epi16(-10); |
1520 | let r = _mm_cvtepi16_epi64(a); |
1521 | let e = _mm_set1_epi64x(-10); |
1522 | assert_eq_m128i(r, e); |
1523 | } |
1524 | |
#[simd_test(enable = "sse4.1")]
1526 | unsafe fn test_mm_cvtepi32_epi64() { |
1527 | let a = _mm_set1_epi32(10); |
1528 | let r = _mm_cvtepi32_epi64(a); |
1529 | let e = _mm_set1_epi64x(10); |
1530 | assert_eq_m128i(r, e); |
1531 | let a = _mm_set1_epi32(-10); |
1532 | let r = _mm_cvtepi32_epi64(a); |
1533 | let e = _mm_set1_epi64x(-10); |
1534 | assert_eq_m128i(r, e); |
1535 | } |
1536 | |
#[simd_test(enable = "sse4.1")]
1538 | unsafe fn test_mm_cvtepu8_epi16() { |
1539 | let a = _mm_set1_epi8(10); |
1540 | let r = _mm_cvtepu8_epi16(a); |
1541 | let e = _mm_set1_epi16(10); |
1542 | assert_eq_m128i(r, e); |
1543 | } |
1544 | |
#[simd_test(enable = "sse4.1")]
1546 | unsafe fn test_mm_cvtepu8_epi32() { |
1547 | let a = _mm_set1_epi8(10); |
1548 | let r = _mm_cvtepu8_epi32(a); |
1549 | let e = _mm_set1_epi32(10); |
1550 | assert_eq_m128i(r, e); |
1551 | } |
1552 | |
#[simd_test(enable = "sse4.1")]
1554 | unsafe fn test_mm_cvtepu8_epi64() { |
1555 | let a = _mm_set1_epi8(10); |
1556 | let r = _mm_cvtepu8_epi64(a); |
1557 | let e = _mm_set1_epi64x(10); |
1558 | assert_eq_m128i(r, e); |
1559 | } |
1560 | |
#[simd_test(enable = "sse4.1")]
1562 | unsafe fn test_mm_cvtepu16_epi32() { |
1563 | let a = _mm_set1_epi16(10); |
1564 | let r = _mm_cvtepu16_epi32(a); |
1565 | let e = _mm_set1_epi32(10); |
1566 | assert_eq_m128i(r, e); |
1567 | } |
1568 | |
#[simd_test(enable = "sse4.1")]
1570 | unsafe fn test_mm_cvtepu16_epi64() { |
1571 | let a = _mm_set1_epi16(10); |
1572 | let r = _mm_cvtepu16_epi64(a); |
1573 | let e = _mm_set1_epi64x(10); |
1574 | assert_eq_m128i(r, e); |
1575 | } |
1576 | |
#[simd_test(enable = "sse4.1")]
1578 | unsafe fn test_mm_cvtepu32_epi64() { |
1579 | let a = _mm_set1_epi32(10); |
1580 | let r = _mm_cvtepu32_epi64(a); |
1581 | let e = _mm_set1_epi64x(10); |
1582 | assert_eq_m128i(r, e); |
1583 | } |
1584 | |
#[simd_test(enable = "sse4.1")]
1586 | unsafe fn test_mm_dp_pd() { |
1587 | let a = _mm_setr_pd(2.0, 3.0); |
1588 | let b = _mm_setr_pd(1.0, 4.0); |
1589 | let e = _mm_setr_pd(14.0, 0.0); |
1590 | assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e); |
1591 | } |
1592 | |
#[simd_test(enable = "sse4.1")]
1594 | unsafe fn test_mm_dp_ps() { |
1595 | let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0); |
1596 | let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0); |
1597 | let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0); |
1598 | assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e); |
1599 | } |
1600 | |
#[simd_test(enable = "sse4.1")]
1602 | unsafe fn test_mm_floor_pd() { |
1603 | let a = _mm_setr_pd(2.5, 4.5); |
1604 | let r = _mm_floor_pd(a); |
1605 | let e = _mm_setr_pd(2.0, 4.0); |
1606 | assert_eq_m128d(r, e); |
1607 | } |
1608 | |
#[simd_test(enable = "sse4.1")]
1610 | unsafe fn test_mm_floor_ps() { |
1611 | let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); |
1612 | let r = _mm_floor_ps(a); |
1613 | let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); |
1614 | assert_eq_m128(r, e); |
1615 | } |
1616 | |
#[simd_test(enable = "sse4.1")]
1618 | unsafe fn test_mm_floor_sd() { |
1619 | let a = _mm_setr_pd(2.5, 4.5); |
1620 | let b = _mm_setr_pd(-1.5, -3.5); |
1621 | let r = _mm_floor_sd(a, b); |
1622 | let e = _mm_setr_pd(-2.0, 4.5); |
1623 | assert_eq_m128d(r, e); |
1624 | } |
1625 | |
#[simd_test(enable = "sse4.1")]
1627 | unsafe fn test_mm_floor_ss() { |
1628 | let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); |
1629 | let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5); |
1630 | let r = _mm_floor_ss(a, b); |
1631 | let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5); |
1632 | assert_eq_m128(r, e); |
1633 | } |
1634 | |
#[simd_test(enable = "sse4.1")]
1636 | unsafe fn test_mm_ceil_pd() { |
1637 | let a = _mm_setr_pd(1.5, 3.5); |
1638 | let r = _mm_ceil_pd(a); |
1639 | let e = _mm_setr_pd(2.0, 4.0); |
1640 | assert_eq_m128d(r, e); |
1641 | } |
1642 | |
#[simd_test(enable = "sse4.1")]
1644 | unsafe fn test_mm_ceil_ps() { |
1645 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
1646 | let r = _mm_ceil_ps(a); |
1647 | let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); |
1648 | assert_eq_m128(r, e); |
1649 | } |
1650 | |
#[simd_test(enable = "sse4.1")]
1652 | unsafe fn test_mm_ceil_sd() { |
1653 | let a = _mm_setr_pd(1.5, 3.5); |
1654 | let b = _mm_setr_pd(-2.5, -4.5); |
1655 | let r = _mm_ceil_sd(a, b); |
1656 | let e = _mm_setr_pd(-2.0, 3.5); |
1657 | assert_eq_m128d(r, e); |
1658 | } |
1659 | |
#[simd_test(enable = "sse4.1")]
1661 | unsafe fn test_mm_ceil_ss() { |
1662 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
1663 | let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5); |
1664 | let r = _mm_ceil_ss(a, b); |
1665 | let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); |
1666 | assert_eq_m128(r, e); |
1667 | } |
1668 | |
#[simd_test(enable = "sse4.1")]
1670 | unsafe fn test_mm_round_pd() { |
1671 | let a = _mm_setr_pd(1.25, 3.75); |
1672 | let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a); |
1673 | let e = _mm_setr_pd(1.0, 4.0); |
1674 | assert_eq_m128d(r, e); |
1675 | } |
1676 | |
#[simd_test(enable = "sse4.1")]
1678 | unsafe fn test_mm_round_ps() { |
1679 | let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25); |
1680 | let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a); |
1681 | let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0); |
1682 | assert_eq_m128(r, e); |
1683 | } |
1684 | |
#[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
#[simd_test(enable = "sse4.1")]
1687 | unsafe fn test_mm_round_sd() { |
1688 | let a = _mm_setr_pd(1.5, 3.5); |
1689 | let b = _mm_setr_pd(-2.5, -4.5); |
1690 | let old_mode = _MM_GET_ROUNDING_MODE(); |
1691 | _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); |
1692 | let r = _mm_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); |
1693 | _MM_SET_ROUNDING_MODE(old_mode); |
1694 | let e = _mm_setr_pd(-2.0, 3.5); |
1695 | assert_eq_m128d(r, e); |
1696 | } |
1697 | |
#[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
#[simd_test(enable = "sse4.1")]
1700 | unsafe fn test_mm_round_ss() { |
1701 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); |
1702 | let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); |
1703 | let old_mode = _MM_GET_ROUNDING_MODE(); |
1704 | _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); |
1705 | let r = _mm_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); |
1706 | _MM_SET_ROUNDING_MODE(old_mode); |
1707 | let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); |
1708 | assert_eq_m128(r, e); |
1709 | } |
1710 | |
#[simd_test(enable = "sse4.1")]
1712 | unsafe fn test_mm_minpos_epu16_1() { |
1713 | let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66); |
1714 | let r = _mm_minpos_epu16(a); |
1715 | let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0); |
1716 | assert_eq_m128i(r, e); |
1717 | } |
1718 | |
#[simd_test(enable = "sse4.1")]
1720 | unsafe fn test_mm_minpos_epu16_2() { |
1721 | let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66); |
1722 | let r = _mm_minpos_epu16(a); |
1723 | let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0); |
1724 | assert_eq_m128i(r, e); |
1725 | } |
1726 | |
#[simd_test(enable = "sse4.1")]
1728 | unsafe fn test_mm_mul_epi32() { |
1729 | { |
1730 | let a = _mm_setr_epi32(1, 1, 1, 1); |
1731 | let b = _mm_setr_epi32(1, 2, 3, 4); |
1732 | let r = _mm_mul_epi32(a, b); |
1733 | let e = _mm_setr_epi64x(1, 3); |
1734 | assert_eq_m128i(r, e); |
1735 | } |
1736 | { |
let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
let b = _mm_setr_epi32(-20, -256 /* ignored */, 666666, 666666 /* ignored */);
1742 | let r = _mm_mul_epi32(a, b); |
1743 | let e = _mm_setr_epi64x(-300, 823043843622); |
1744 | assert_eq_m128i(r, e); |
1745 | } |
1746 | } |
1747 | |
#[simd_test(enable = "sse4.1")]
1749 | unsafe fn test_mm_mullo_epi32() { |
1750 | { |
1751 | let a = _mm_setr_epi32(1, 1, 1, 1); |
1752 | let b = _mm_setr_epi32(1, 2, 3, 4); |
1753 | let r = _mm_mullo_epi32(a, b); |
1754 | let e = _mm_setr_epi32(1, 2, 3, 4); |
1755 | assert_eq_m128i(r, e); |
1756 | } |
1757 | { |
1758 | let a = _mm_setr_epi32(15, -2, 1234567, 99999); |
1759 | let b = _mm_setr_epi32(-20, -256, 666666, -99999); |
1760 | let r = _mm_mullo_epi32(a, b); |
// Note that the most significant bit of r[2] is treated as a
// sign bit:
// 1234567 * 666666 = -1589877210
1764 | let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409); |
1765 | assert_eq_m128i(r, e); |
1766 | } |
1767 | } |
1768 | |
#[simd_test(enable = "sse4.1")]
1770 | unsafe fn test_mm_minpos_epu16() { |
1771 | let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3); |
1772 | let r = _mm_minpos_epu16(a); |
1773 | let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0); |
1774 | assert_eq_m128i(r, e); |
1775 | } |
1776 | |
#[simd_test(enable = "sse4.1")]
1778 | unsafe fn test_mm_mpsadbw_epu8() { |
1779 | #[rustfmt::skip] |
1780 | let a = _mm_setr_epi8( |
1781 | 0, 1, 2, 3, 4, 5, 6, 7, |
1782 | 8, 9, 10, 11, 12, 13, 14, 15, |
1783 | ); |
1784 | |
1785 | let r = _mm_mpsadbw_epu8::<0b000>(a, a); |
1786 | let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); |
1787 | assert_eq_m128i(r, e); |
1788 | |
1789 | let r = _mm_mpsadbw_epu8::<0b001>(a, a); |
1790 | let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12); |
1791 | assert_eq_m128i(r, e); |
1792 | |
1793 | let r = _mm_mpsadbw_epu8::<0b100>(a, a); |
1794 | let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44); |
1795 | assert_eq_m128i(r, e); |
1796 | |
1797 | let r = _mm_mpsadbw_epu8::<0b101>(a, a); |
1798 | let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); |
1799 | assert_eq_m128i(r, e); |
1800 | |
1801 | let r = _mm_mpsadbw_epu8::<0b111>(a, a); |
1802 | let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4); |
1803 | assert_eq_m128i(r, e); |
1804 | } |
1805 | |
#[simd_test(enable = "sse4.1")]
1807 | unsafe fn test_mm_testz_si128() { |
1808 | let a = _mm_set1_epi8(1); |
1809 | let mask = _mm_set1_epi8(0); |
1810 | let r = _mm_testz_si128(a, mask); |
1811 | assert_eq!(r, 1); |
1812 | let a = _mm_set1_epi8(0b101); |
1813 | let mask = _mm_set1_epi8(0b110); |
1814 | let r = _mm_testz_si128(a, mask); |
1815 | assert_eq!(r, 0); |
1816 | let a = _mm_set1_epi8(0b011); |
1817 | let mask = _mm_set1_epi8(0b100); |
1818 | let r = _mm_testz_si128(a, mask); |
1819 | assert_eq!(r, 1); |
1820 | } |
1821 | |
#[simd_test(enable = "sse4.1")]
1823 | unsafe fn test_mm_testc_si128() { |
1824 | let a = _mm_set1_epi8(-1); |
1825 | let mask = _mm_set1_epi8(0); |
1826 | let r = _mm_testc_si128(a, mask); |
1827 | assert_eq!(r, 1); |
1828 | let a = _mm_set1_epi8(0b101); |
1829 | let mask = _mm_set1_epi8(0b110); |
1830 | let r = _mm_testc_si128(a, mask); |
1831 | assert_eq!(r, 0); |
1832 | let a = _mm_set1_epi8(0b101); |
1833 | let mask = _mm_set1_epi8(0b100); |
1834 | let r = _mm_testc_si128(a, mask); |
1835 | assert_eq!(r, 1); |
1836 | } |
1837 | |
#[simd_test(enable = "sse4.1")]
1839 | unsafe fn test_mm_testnzc_si128() { |
1840 | let a = _mm_set1_epi8(0); |
1841 | let mask = _mm_set1_epi8(1); |
1842 | let r = _mm_testnzc_si128(a, mask); |
1843 | assert_eq!(r, 0); |
1844 | let a = _mm_set1_epi8(-1); |
1845 | let mask = _mm_set1_epi8(0); |
1846 | let r = _mm_testnzc_si128(a, mask); |
1847 | assert_eq!(r, 0); |
1848 | let a = _mm_set1_epi8(0b101); |
1849 | let mask = _mm_set1_epi8(0b110); |
1850 | let r = _mm_testnzc_si128(a, mask); |
1851 | assert_eq!(r, 1); |
1852 | let a = _mm_set1_epi8(0b101); |
1853 | let mask = _mm_set1_epi8(0b101); |
1854 | let r = _mm_testnzc_si128(a, mask); |
1855 | assert_eq!(r, 0); |
1856 | } |
1857 | |
#[simd_test(enable = "sse4.1")]
1859 | unsafe fn test_mm_test_all_zeros() { |
1860 | let a = _mm_set1_epi8(1); |
1861 | let mask = _mm_set1_epi8(0); |
1862 | let r = _mm_test_all_zeros(a, mask); |
1863 | assert_eq!(r, 1); |
1864 | let a = _mm_set1_epi8(0b101); |
1865 | let mask = _mm_set1_epi8(0b110); |
1866 | let r = _mm_test_all_zeros(a, mask); |
1867 | assert_eq!(r, 0); |
1868 | let a = _mm_set1_epi8(0b011); |
1869 | let mask = _mm_set1_epi8(0b100); |
1870 | let r = _mm_test_all_zeros(a, mask); |
1871 | assert_eq!(r, 1); |
1872 | } |
1873 | |
#[simd_test(enable = "sse4.1")]
1875 | unsafe fn test_mm_test_all_ones() { |
1876 | let a = _mm_set1_epi8(-1); |
1877 | let r = _mm_test_all_ones(a); |
1878 | assert_eq!(r, 1); |
1879 | let a = _mm_set1_epi8(0b101); |
1880 | let r = _mm_test_all_ones(a); |
1881 | assert_eq!(r, 0); |
1882 | } |
1883 | |
#[simd_test(enable = "sse4.1")]
1885 | unsafe fn test_mm_test_mix_ones_zeros() { |
1886 | let a = _mm_set1_epi8(0); |
1887 | let mask = _mm_set1_epi8(1); |
1888 | let r = _mm_test_mix_ones_zeros(a, mask); |
1889 | assert_eq!(r, 0); |
1890 | let a = _mm_set1_epi8(-1); |
1891 | let mask = _mm_set1_epi8(0); |
1892 | let r = _mm_test_mix_ones_zeros(a, mask); |
1893 | assert_eq!(r, 0); |
1894 | let a = _mm_set1_epi8(0b101); |
1895 | let mask = _mm_set1_epi8(0b110); |
1896 | let r = _mm_test_mix_ones_zeros(a, mask); |
1897 | assert_eq!(r, 1); |
1898 | let a = _mm_set1_epi8(0b101); |
1899 | let mask = _mm_set1_epi8(0b101); |
1900 | let r = _mm_test_mix_ones_zeros(a, mask); |
1901 | assert_eq!(r, 0); |
1902 | } |
1903 | } |
1904 | |