//! Streaming SIMD Extensions (SSE)

use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    intrinsics, mem, ptr,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Adds the first component of `a` and `b`; the other components are copied
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss)
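///
/// # Examples
///
/// A minimal usage sketch (added for illustration; SSE is part of the
/// `x86_64` baseline, so no runtime feature detection is needed there):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///         let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
///         // Only the lowest lane is added; the rest is copied from `a`.
///         let r = _mm_add_ss(a, b);
///         assert_eq!(_mm_cvtss_f32(r), 11.0);
///     }
/// }
/// ```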
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
    addss(a, b)
}

/// Adds `__m128` vectors.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
    simd_add(a, b)
}

/// Subtracts the first component of `b` from `a`; the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
    subss(a, b)
}

/// Subtracts `__m128` vectors.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
    simd_sub(a, b)
}

/// Multiplies the first component of `a` and `b`; the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
    mulss(a, b)
}

/// Multiplies `__m128` vectors.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
    simd_mul(a, b)
}

/// Divides the first component of `a` by the first component of `b`; the
/// other components are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss)
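///
/// # Examples
///
/// A short sketch pinning down the operand order (`a[0] / b[0]` in the low
/// lane; illustrative only):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let a = _mm_setr_ps(8.0, 2.0, 3.0, 4.0);
///         let b = _mm_set_ss(2.0);
///         // Low lane is a[0] / b[0] = 4.0; the rest is copied from `a`.
///         assert_eq!(_mm_cvtss_f32(_mm_div_ss(a, b)), 4.0);
///     }
/// }
/// ```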
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
    divss(a, b)
}

/// Divides `__m128` vectors.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
    simd_div(a, b)
}

/// Returns the square root of the first single-precision (32-bit)
/// floating-point element in `a`; the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 {
    sqrtss(a)
}

/// Returns the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 {
    sqrtps(a)
}

/// Returns the approximate reciprocal of the first single-precision
/// (32-bit) floating-point element in `a`; the other elements are unchanged.
/// The maximum relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 {
    rcpss(a)
}

/// Returns the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`. The maximum relative error for this
/// approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps)
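///
/// # Examples
///
/// A sketch of one Newton-Raphson refinement step, a common way to sharpen
/// the roughly 12-bit approximation (illustrative; not part of the intrinsic
/// itself):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let a = _mm_set1_ps(3.0);
///         let x0 = _mm_rcp_ps(a); // ~1/3, accurate to about 12 bits
///         // x1 = x0 * (2 - a * x0) roughly doubles the precision.
///         let two = _mm_set1_ps(2.0);
///         let x1 = _mm_mul_ps(x0, _mm_sub_ps(two, _mm_mul_ps(a, x0)));
///         assert!((_mm_cvtss_f32(x1) - 1.0 / 3.0).abs() < 1e-6);
///     }
/// }
/// ```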
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 {
    rcpps(a)
}

/// Returns the approximate reciprocal square root of the first single-precision
/// (32-bit) floating-point element in `a`; the other elements are unchanged.
/// The maximum relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 {
    rsqrtss(a)
}

/// Returns the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`. The maximum relative error for
/// this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 {
    rsqrtps(a)
}

/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and returns the minimum value in the first element of the return
/// value; the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
    minss(a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and returns the corresponding minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps)
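///
/// # Examples
///
/// `minps` is not symmetric when NaN is involved: if either operand is NaN,
/// the second operand (`b`) is returned. A sketch of that asymmetry:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let nan = _mm_set1_ps(f32::NAN);
///         let one = _mm_set1_ps(1.0);
///         // min(NaN, 1.0) yields `b` (1.0), but min(1.0, NaN) yields NaN.
///         assert_eq!(_mm_cvtss_f32(_mm_min_ps(nan, one)), 1.0);
///         assert!(_mm_cvtss_f32(_mm_min_ps(one, nan)).is_nan());
///     }
/// }
/// ```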
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test for why this can't be implemented using `simd_fmin`.
    minps(a, b)
}

/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and returns the maximum value in the first element of the return
/// value; the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
    maxss(a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and returns the corresponding maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test for why this can't be implemented using `simd_fmax`.
    maxps(a, b)
}

/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `and` instructions, so ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_and(a, b))
}

/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
/// elements.
///
/// Computes `!a & b` for each bit in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps)
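///
/// # Examples
///
/// Together with `_mm_and_ps` and `_mm_or_ps`, this forms the classic
/// pre-SSE4.1 select idiom `(mask & x) | (!mask & y)`. A sketch:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let x = _mm_set1_ps(1.0);
///         let y = _mm_set1_ps(2.0);
///         let mask = _mm_cmplt_ps(x, y); // all ones, since 1.0 < 2.0
///         // Take lanes of `x` where the mask is set, lanes of `y` elsewhere.
///         let r = _mm_or_ps(_mm_and_ps(mask, x), _mm_andnot_ps(mask, y));
///         assert_eq!(_mm_cvtss_f32(r), 1.0);
///     }
/// }
/// ```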
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `not` and `and` instructions, so ignore
// it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andnps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    let mask: __m128i = mem::transmute(i32x4::splat(-1));
    mem::transmute(simd_and(simd_xor(mask, a), b))
}

/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `or` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(orps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_or(a, b))
}

/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
/// elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `xor` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(xorps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_xor(a, b))
}

/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
/// the result will be `0xffffffff` if the two inputs are equal, or `0`
/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 0)
}

/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
/// of the result will be `0xffffffff` if `a.extract(0)` is less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 1)
}

/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 2)
}

/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
    simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3])
}

/// Compares the lowest `f32` of both inputs for greater than or equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
    simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3])
}

/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 4)
}

/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 5)
}

/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 6)
}

/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
/// the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
    simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3])
}

/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
/// bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
    simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3])
}

/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
/// the result will be `0xffffffff` if neither `a.extract(0)` nor
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 7)
}

/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
/// of the result will be `0xffffffff` if either of `a.extract(0)` or
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 3)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input elements
/// were equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps)
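///
/// # Examples
///
/// Each lane of the result is an all-ones or all-zeros bit pattern, which is
/// typically consumed with `_mm_movemask_ps` or bitwise blends. A sketch:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///         let b = _mm_setr_ps(1.0, 0.0, 3.0, 0.0);
///         // Lanes 0 and 2 compare equal, so bits 0 and 2 of the mask are set.
///         assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(a, b)), 0b0101);
///     }
/// }
/// ```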
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 0)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 1)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is less than or equal to the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 2)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(b, a, 1)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(b, a, 2)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input elements
/// are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 4)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** less than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 5)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** less than or equal to the corresponding element in `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 6)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** greater than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(b, a, 5)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** greater than or equal to the corresponding element in `b`,
/// or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(b, a, 6)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// Returns four floats that have one of two possible bit patterns. The element
/// in the output vector will be `0xffffffff` if the input elements in `a` and
/// `b` are ordered (i.e., neither of them is a NaN), or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(b, a, 7)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// Returns four floats that have one of two possible bit patterns. The element
/// in the output vector will be `0xffffffff` if the input elements in `a` and
/// `b` are unordered (i.e., at least one of them is a NaN), or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(b, a, 3)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
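///
/// # Examples
///
/// Unlike the `_mm_cmp*_ss` family, the `comi`/`ucomi` intrinsics return a
/// plain `i32` flag instead of a lane mask. A sketch:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let a = _mm_set_ss(1.0);
///         assert_eq!(_mm_comieq_ss(a, _mm_set_ss(1.0)), 1);
///         assert_eq!(_mm_comieq_ss(a, _mm_set_ss(2.0)), 0);
///     }
/// }
/// ```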
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
    comieq_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
    comilt_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
    comile_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
    comigt_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
    comige_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
    comineq_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise. This instruction will not signal
/// an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
    ucomieq_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
/// This instruction will not signal an exception if either argument is a quiet
/// NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
    ucomilt_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
    ucomile_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
    ucomigt_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise. This instruction will not signal an exception if either
/// argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
    ucomige_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise. This instruction will not
/// signal an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
    ucomineq_ss(a, b)
}

/// Converts the lowest 32-bit float in the input vector to a 32-bit integer.
///
/// The result is rounded according to the current rounding mode. If the result
/// cannot be represented as a 32-bit integer the result will be `0x8000_0000`
/// (`i32::MIN`).
///
/// This corresponds to the `CVTSS2SI` instruction (with 32-bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32)
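///
/// # Examples
///
/// A sketch assuming the default MXCSR rounding mode (round to nearest, ties
/// to even):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         assert_eq!(_mm_cvtss_si32(_mm_set_ss(2.6)), 3);
///         // Ties round to the even integer under the default mode.
///         assert_eq!(_mm_cvtss_si32(_mm_set_ss(2.5)), 2);
///     }
/// }
/// ```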
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 {
    cvtss2si(a)
}

/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 {
    _mm_cvtss_si32(a)
}

/// Converts the lowest 32-bit float in the input vector to a 32-bit integer
/// with truncation.
///
/// The result is always rounded using truncation (round towards zero). If the
/// result cannot be represented as a 32-bit integer the result will be
/// `0x8000_0000` (`i32::MIN`).
///
/// This corresponds to the `CVTTSS2SI` instruction (with 32-bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32)
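///
/// # Examples
///
/// A sketch of the truncation behavior (towards zero, regardless of the MXCSR
/// rounding mode):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         assert_eq!(_mm_cvttss_si32(_mm_set_ss(2.9)), 2);
///         assert_eq!(_mm_cvttss_si32(_mm_set_ss(-2.9)), -2);
///     }
/// }
/// ```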
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 {
    cvttss2si(a)
}

/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 {
    _mm_cvttss_si32(a)
}

/// Extracts the lowest 32-bit float from the input vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32)
#[inline]
#[target_feature(enable = "sse")]
// No point in using assert_instrs. In the Unix x86_64 calling convention this
// is a no-op, and on Windows it's just a `mov`.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 {
    simd_extract(a, 0)
}

/// Converts a 32-bit integer to a 32-bit float. The result vector is the input
/// vector `a` with the lowest 32-bit float replaced by the converted integer.
///
/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32-bit
/// input).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
    cvtsi2ss(a, b)
}

/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
    _mm_cvtsi32_ss(a, b)
}

/// Constructs a `__m128` with the lowest element set to `a` and the rest set
/// to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ss(a: f32) -> __m128 {
    __m128(a, 0.0, 0.0, 0.0)
}

/// Constructs a `__m128` with all elements set to `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_ps(a: f32) -> __m128 {
    __m128(a, a, a, a)
}

/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ps1(a: f32) -> __m128 {
    _mm_set1_ps(a)
}

/// Constructs a `__m128` from four floating point values, highest to lowest.
///
/// Note that `a` will be the highest 32 bits of the result, and `d` the
/// lowest. This matches the standard way of writing bit patterns on x86:
///
/// ```text
/// bit    127 .. 96  95 .. 64  63 .. 32  31 .. 0
///      +---------+---------+---------+---------+
///      |    a    |    b    |    c    |    d    |   result
///      +---------+---------+---------+---------+
/// ```
///
/// Alternatively:
///
/// ```text
/// let v = _mm_set_ps(d, c, b, a);
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps)
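///
/// # Examples
///
/// A sketch of the argument order (`_mm_cvtss_f32` reads the lowest lane):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let v = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
///         // `d` (here 1.0) lands in the lowest lane.
///         assert_eq!(_mm_cvtss_f32(v), 1.0);
///     }
/// }
/// ```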
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    __m128(d, c, b, a)
}

/// Constructs a `__m128` from four floating point values, lowest to highest.
///
/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
/// bits of the result, and `d` the highest.
///
/// ```text
/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, any(target_os = "windows", target_arch = "x86_64")),
    assert_instr(unpcklps)
)]
// On non-Windows 32-bit targets it just copies the operands from the stack.
#[cfg_attr(
    all(test, all(not(target_os = "windows"), target_arch = "x86")),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    __m128(a, b, c, d)
}

/// Constructs a `__m128` with all elements initialized to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setzero_ps() -> __m128 {
    __m128(0.0, 0.0, 0.0, 0.0)
}

/// A utility function for creating masks to use with Intel shuffle and
/// permute intrinsics.
#[inline]
#[allow(non_snake_case)]
#[unstable(feature = "stdarch", issue = "27731")]
pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
    ((z << 6) | (y << 4) | (x << 2) | w) as i32
}

/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
/// `b` using `MASK`.
///
/// The lower half of the result takes values from `a` and the higher half from
/// `b`. The mask is split into four 2-bit fields, each of which indexes one
/// element from the inputs.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps)
///
/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
/// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_).
/// Performing an implicit type conversion between an unsigned integer and a
/// signed integer does not cause a problem in C; however, Rust's commitment to
/// strong typing does not allow this.
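///
/// # Examples
///
/// A sketch of the mask encoding, written with a binary literal (equal to
/// `_MM_SHUFFLE(2, 0, 3, 0)`): the two low result lanes pick from `a`, the
/// two high lanes from `b`.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
///         // Result lanes: [a[0], a[3], b[0], b[2]] = [1.0, 4.0, 5.0, 7.0]
///         let r = _mm_shuffle_ps::<0b10_00_11_00>(a, b);
///         assert_eq!(_mm_cvtss_f32(r), 1.0);
///     }
/// }
/// ```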
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(MASK, 8);
    simd_shuffle!(
        a,
        b,
        [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11) + 4,
            ((MASK as u32 >> 6) & 0b11) + 4,
        ],
    )
}

/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the higher half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
    simd_shuffle!(a, b, [2, 6, 3, 7])
}

/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the lower half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
    simd_shuffle!(a, b, [0, 4, 1, 5])
}

/// Combines the higher halves of `a` and `b`. The higher half of `b` occupies
/// the lower half of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO: figure out why this is a different instruction on Windows.
    simd_shuffle!(a, b, [6, 7, 2, 3])
}

/// Combines the lower halves of `a` and `b`. The lower half of `b` occupies
/// the higher half of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
    simd_shuffle!(a, b, [0, 1, 4, 5])
}

/// Returns a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 4 least significant bits of the return value.
/// All other bits are set to `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
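///
/// # Examples
///
/// A sketch: the sign bit of each lane becomes one bit of the result.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let a = _mm_setr_ps(-1.0, 2.0, -3.0, 4.0);
///         // Lanes 0 and 2 are negative, so bits 0 and 2 are set.
///         assert_eq!(_mm_movemask_ps(a), 0b0101);
///     }
/// }
/// ```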
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    let mask: i32x4 = simd_lt(transmute(a), i32x4::splat(0));
    simd_bitmask::<i32x4, u8>(mask).into()
}

/// Constructs a `__m128` with the lowest element read from `p` and the other
/// elements set to zero.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
    __m128(*p, 0.0, 0.0, 0.0)
}

/// Constructs a `__m128` by duplicating the value read from `p` into all
/// elements.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
    let a = *p;
    __m128(a, a, a, a)
}

/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
    _mm_load1_ps(p)
}

/// Loads four `f32` values from *aligned* memory into a `__m128`. If the
/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
    *(p as *const __m128)
}

/// Loads four `f32` values from memory into a `__m128`. There are no
/// restrictions on memory alignment. For aligned memory
/// [`_mm_load_ps`](fn._mm_load_ps.html) may be faster.
///
/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps)
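///
/// # Examples
///
/// A sketch loading from a slice with no alignment guarantee:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let data = [1.0f32, 2.0, 3.0, 4.0, 5.0];
///         // Starting at index 1, the pointer is generally not 16-byte aligned.
///         let v = _mm_loadu_ps(data.as_ptr().add(1));
///         assert_eq!(_mm_cvtss_f32(v), 2.0);
///     }
/// }
/// ```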
1159 | #[inline ] |
1160 | #[target_feature (enable = "sse" )] |
1161 | #[cfg_attr (test, assert_instr(movups))] |
1162 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1163 | pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { |
1164 | // Note: Using `*p` would require `f32` alignment, but `movups` has no |
1165 | // alignment restrictions. |
1166 | let mut dst: __m128 = _mm_undefined_ps(); |
1167 | ptr::copy_nonoverlapping( |
1168 | src:p as *const u8, |
1169 | &mut dst as *mut __m128 as *mut u8, |
1170 | count:mem::size_of::<__m128>(), |
1171 | ); |
1172 | dst |
1173 | } |
1174 | |
1175 | /// Loads four `f32` values from aligned memory into a `__m128` in reverse |
1176 | /// order. |
1177 | /// |
1178 | /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1179 | /// protection fault will be triggered (fatal program crash). |
1180 | /// |
1181 | /// Functionally equivalent to the following code sequence (assuming `p` |
1182 | /// satisfies the alignment restrictions): |
1183 | /// |
1184 | /// ```text |
1185 | /// let a0 = *p; |
1186 | /// let a1 = *p.add(1); |
1187 | /// let a2 = *p.add(2); |
1188 | /// let a3 = *p.add(3); |
1189 | /// __m128::new(a3, a2, a1, a0) |
1190 | /// ``` |
1191 | /// |
1192 | /// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some |
1193 | /// shuffling. |
1194 | /// |
1195 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps) |
1196 | #[inline ] |
1197 | #[target_feature (enable = "sse" )] |
1198 | #[cfg_attr (test, assert_instr(movaps))] |
1199 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1200 | pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { |
1201 | let a: __m128 = _mm_load_ps(p); |
1202 | simd_shuffle!(a, a, [3, 2, 1, 0]) |
1203 | } |
1204 | |
/// Loads unaligned 64 bits of integer data from memory into a new vector.
1206 | /// |
1207 | /// `mem_addr` does not need to be aligned on any particular boundary. |
1208 | /// |
1209 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64) |
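///
/// A minimal usage sketch; the low 64 bits come from memory and the high
/// 64 bits of the result are zeroed:
///
/// ```rust,ignore
/// let bytes = 0x0807_0605_0403_0201_u64.to_le_bytes();
/// let v = unsafe { _mm_loadu_si64(bytes.as_ptr()) };
/// // The low 64 bits of `v` now hold 0x0807060504030201.
/// ```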
1210 | #[inline ] |
1211 | #[target_feature (enable = "sse" )] |
1212 | #[stable (feature = "simd_x86_mm_loadu_si64" , since = "1.46.0" )] |
1213 | pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i { |
    transmute(i64x2(ptr::read_unaligned(mem_addr as *const i64), 0))
1215 | } |
1216 | |
1217 | /// Stores the lowest 32 bit float of `a` into memory. |
1218 | /// |
1219 | /// This intrinsic corresponds to the `MOVSS` instruction. |
1220 | /// |
1221 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss) |
1222 | #[inline ] |
1223 | #[target_feature (enable = "sse" )] |
1224 | #[cfg_attr (test, assert_instr(movss))] |
1225 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1226 | pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { |
    *p = simd_extract(a, 0);
1228 | } |
1229 | |
1230 | /// Stores the lowest 32 bit float of `a` repeated four times into *aligned* |
1231 | /// memory. |
1232 | /// |
1233 | /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1234 | /// protection fault will be triggered (fatal program crash). |
1235 | /// |
1236 | /// Functionally equivalent to the following code sequence (assuming `p` |
1237 | /// satisfies the alignment restrictions): |
1238 | /// |
1239 | /// ```text |
1240 | /// let x = a.extract(0); |
1241 | /// *p = x; |
1242 | /// *p.add(1) = x; |
1243 | /// *p.add(2) = x; |
1244 | /// *p.add(3) = x; |
1245 | /// ``` |
1246 | /// |
1247 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps) |
1248 | #[inline ] |
1249 | #[target_feature (enable = "sse" )] |
1250 | #[cfg_attr (test, assert_instr(movaps))] |
1251 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1252 | #[allow (clippy::cast_ptr_alignment)] |
1253 | pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { |
1254 | let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]); |
1255 | *(p as *mut __m128) = b; |
1256 | } |
1257 | |
1258 | /// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html) |
1259 | /// |
1260 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1) |
1261 | #[inline ] |
1262 | #[target_feature (enable = "sse" )] |
1263 | #[cfg_attr (test, assert_instr(movaps))] |
1264 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1265 | pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { |
1266 | _mm_store1_ps(p, a); |
1267 | } |
1268 | |
1269 | /// Stores four 32-bit floats into *aligned* memory. |
1270 | /// |
1271 | /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1272 | /// protection fault will be triggered (fatal program crash). |
1273 | /// |
1274 | /// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned |
1275 | /// memory. |
1276 | /// |
1277 | /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. |
1278 | /// |
1279 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps) |
1280 | #[inline ] |
1281 | #[target_feature (enable = "sse" )] |
1282 | #[cfg_attr (test, assert_instr(movaps))] |
1283 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1284 | #[allow (clippy::cast_ptr_alignment)] |
1285 | pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { |
1286 | *(p as *mut __m128) = a; |
1287 | } |
1288 | |
1289 | /// Stores four 32-bit floats into memory. There are no restrictions on memory |
1290 | /// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be |
1291 | /// faster. |
1292 | /// |
1293 | /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. |
1294 | /// |
1295 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps) |
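///
/// A minimal usage sketch, storing into an ordinary (possibly unaligned)
/// buffer:
///
/// ```rust,ignore
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let mut out = [0.0_f32; 4];
/// unsafe { _mm_storeu_ps(out.as_mut_ptr(), a) };
/// assert_eq!(out, [1.0, 2.0, 3.0, 4.0]);
/// ```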
1296 | #[inline ] |
1297 | #[target_feature (enable = "sse" )] |
1298 | #[cfg_attr (test, assert_instr(movups))] |
1299 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1300 | pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { |
    ptr::copy_nonoverlapping(
        &a as *const __m128 as *const u8,
        p as *mut u8,
        mem::size_of::<__m128>(),
    );
1306 | } |
1307 | |
1308 | /// Stores four 32-bit floats into *aligned* memory in reverse order. |
1309 | /// |
1310 | /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1311 | /// protection fault will be triggered (fatal program crash). |
1312 | /// |
1313 | /// Functionally equivalent to the following code sequence (assuming `p` |
1314 | /// satisfies the alignment restrictions): |
1315 | /// |
1316 | /// ```text |
1317 | /// *p = a.extract(3); |
1318 | /// *p.add(1) = a.extract(2); |
1319 | /// *p.add(2) = a.extract(1); |
1320 | /// *p.add(3) = a.extract(0); |
1321 | /// ``` |
1322 | /// |
1323 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps) |
1324 | #[inline ] |
1325 | #[target_feature (enable = "sse" )] |
1326 | #[cfg_attr (test, assert_instr(movaps))] |
1327 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1328 | #[allow (clippy::cast_ptr_alignment)] |
1329 | pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { |
1330 | let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]); |
1331 | *(p as *mut __m128) = b; |
1332 | } |
1333 | |
1334 | /// Returns a `__m128` with the first component from `b` and the remaining |
1335 | /// components from `a`. |
1336 | /// |
/// In other words, for any `a` and `b`:
1338 | /// ```text |
1339 | /// _mm_move_ss(a, b) == a.replace(0, b.extract(0)) |
1340 | /// ``` |
1341 | /// |
1342 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss) |
1343 | #[inline ] |
1344 | #[target_feature (enable = "sse" )] |
1345 | #[cfg_attr (test, assert_instr(movss))] |
1346 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1347 | pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { |
1348 | simd_shuffle!(a, b, [4, 1, 2, 3]) |
1349 | } |
1350 | |
1351 | /// Performs a serializing operation on all store-to-memory instructions that |
1352 | /// were issued prior to this instruction. |
1353 | /// |
/// Guarantees that every store instruction that precedes the fence in program
/// order is globally visible before any store instruction that follows it in
/// program order.
1357 | /// |
1358 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence) |
1359 | #[inline ] |
1360 | #[target_feature (enable = "sse" )] |
1361 | #[cfg_attr (test, assert_instr(sfence))] |
1362 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1363 | pub unsafe fn _mm_sfence() { |
1364 | sfence() |
1365 | } |
1366 | |
1367 | /// Gets the unsigned 32-bit value of the MXCSR control and status register. |
1368 | /// |
1369 | /// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust |
1370 | /// floating-point operations may or may not result in this register getting updated with exception |
1371 | /// state, and the register can change between two invocations of this function even when no |
1372 | /// floating-point operations appear in the source code (since floating-point operations appearing |
1373 | /// earlier or later can be reordered). |
1374 | /// |
1375 | /// If you need to perform some floating-point operations and check whether they raised an |
1376 | /// exception, use an inline assembly block for the entire sequence of operations. |
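///
/// For instance, a minimal sketch of reading MXCSR via inline assembly (the
/// suggested replacement for this intrinsic):
///
/// ```rust,ignore
/// let mut mxcsr: u32 = 0;
/// unsafe {
///     // `stmxcsr` stores the 32-bit MXCSR register to memory.
///     core::arch::asm!("stmxcsr [{}]", in(reg) &mut mxcsr, options(nostack));
/// }
/// ```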
1377 | /// |
1378 | /// For more info see [`_mm_setcsr`](fn._mm_setcsr.html) |
1379 | /// |
1380 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr) |
1381 | #[inline ] |
1382 | #[target_feature (enable = "sse" )] |
1383 | #[cfg_attr (test, assert_instr(stmxcsr))] |
1384 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1385 | #[deprecated ( |
1386 | since = "1.75.0" , |
1387 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1388 | )] |
1389 | pub unsafe fn _mm_getcsr() -> u32 { |
    let mut result = 0_i32;
1391 | stmxcsr(&mut result as *mut _ as *mut i8); |
1392 | result as u32 |
1393 | } |
1394 | |
1395 | /// Sets the MXCSR register with the 32-bit unsigned integer value. |
1396 | /// |
1397 | /// This register controls how SIMD instructions handle floating point |
1398 | /// operations. Modifying this register only affects the current thread. |
1399 | /// |
1400 | /// It contains several groups of flags: |
1401 | /// |
/// * *Exception flags* report which exceptions occurred since they were last
/// reset.
1404 | /// |
/// * *Masking flags* can be used to mask (ignore) certain exceptions. By
/// default these flags are all set to 1, so all exceptions are masked. When an
/// exception is masked, the processor simply sets the exception flag and
/// continues the operation. If the exception is unmasked, the flag is also set
/// but additionally an exception handler is invoked.
1411 | /// |
1412 | /// * *Rounding mode flags* control the rounding mode of floating point |
1413 | /// instructions. |
1414 | /// |
1415 | /// * The *denormals-are-zero mode flag* turns all numbers which would be |
1416 | /// denormalized (exponent bits are all zeros) into zeros. |
1417 | /// |
1418 | /// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to |
1419 | /// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and |
1420 | /// will optimize accordingly. This even applies when the register is altered and later reset to its |
1421 | /// original value without any floating-point operations appearing in the source code between those |
1422 | /// operations (since floating-point operations appearing earlier or later can be reordered). |
1423 | /// |
/// If you need to perform some floating-point operations under different masking flags, rounding
1425 | /// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the |
1426 | /// original MXCSR register state before the end of the block. |
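///
/// For instance, a minimal sketch of a save/modify/restore sequence with
/// inline assembly (enabling flush-to-zero is just an example value):
///
/// ```rust,ignore
/// let mut saved: u32 = 0;
/// unsafe {
///     core::arch::asm!("stmxcsr [{}]", in(reg) &mut saved, options(nostack));
///     let modified = saved | _MM_FLUSH_ZERO_ON;
///     core::arch::asm!("ldmxcsr [{}]", in(reg) &modified, options(nostack));
///     // ... floating-point work under the modified mode ...
///     core::arch::asm!("ldmxcsr [{}]", in(reg) &saved, options(nostack));
/// }
/// ```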
1427 | /// |
1428 | /// ## Exception Flags |
1429 | /// |
1430 | /// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing |
1431 | /// Infinity by Infinity). |
1432 | /// |
1433 | /// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized |
1434 | /// number. Mainly this can cause loss of precision. |
1435 | /// |
1436 | /// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred. |
1437 | /// |
/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
/// result was too large to be represented (e.g., an `f32` with absolute value
/// greater than `2^128`).
1442 | /// |
/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
/// result was too small to be represented in a normalized way (e.g., an `f32`
/// with absolute value smaller than `2^-126`).
1447 | /// |
1448 | /// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a. |
1449 | /// precision exception). This means some precision was lost due to rounding. |
1450 | /// For example, the fraction `1/3` cannot be represented accurately in a |
1451 | /// 32 or 64 bit float and computing it would cause this exception to be |
1452 | /// raised. Precision exceptions are very common, so they are usually masked. |
1453 | /// |
1454 | /// Exception flags can be read and set using the convenience functions |
1455 | /// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to |
1456 | /// check if an operation caused some overflow: |
1457 | /// |
1458 | /// ```rust,ignore |
1459 | /// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags |
1460 | /// // perform calculations |
1461 | /// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 { |
1462 | /// // handle overflow |
1463 | /// } |
1464 | /// ``` |
1465 | /// |
1466 | /// ## Masking Flags |
1467 | /// |
1468 | /// There is one masking flag for each exception flag: `_MM_MASK_INVALID`, |
1469 | /// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`, |
1470 | /// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`. |
1471 | /// |
1472 | /// A single masking bit can be set via |
1473 | /// |
1474 | /// ```rust,ignore |
1475 | /// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW); |
1476 | /// ``` |
1477 | /// |
1478 | /// However, since mask bits are by default all set to 1, it is more common to |
1479 | /// want to *disable* certain bits. For example, to unmask the underflow |
1480 | /// exception, use: |
1481 | /// |
1482 | /// ```rust,ignore |
/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception
1485 | /// ``` |
1486 | /// |
/// Warning: an unmasked exception will cause an exception handler to be
/// called. The standard handler will simply terminate the process, so in this
/// case any underflow exception would terminate the current process with
/// something like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1492 | /// |
1493 | /// ## Rounding Mode |
1494 | /// |
/// The rounding mode is described using two bits. It can be read and set using
1496 | /// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and |
1497 | /// `_MM_SET_ROUNDING_MODE(mode)`. |
1498 | /// |
1499 | /// The rounding modes are: |
1500 | /// |
/// * `_MM_ROUND_NEAREST`: (default) Round to the value closest to the
/// infinite-precision result. If two values are equally close, round to even
/// (i.e., the least significant bit will be zero).
1504 | /// |
1505 | /// * `_MM_ROUND_DOWN`: Round toward negative Infinity. |
1506 | /// |
1507 | /// * `_MM_ROUND_UP`: Round toward positive Infinity. |
1508 | /// |
1509 | /// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate). |
1510 | /// |
1511 | /// Example: |
1512 | /// |
1513 | /// ```rust,ignore |
1514 | /// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN) |
1515 | /// ``` |
1516 | /// |
1517 | /// ## Denormals-are-zero/Flush-to-zero Mode |
1518 | /// |
1519 | /// If this bit is set, values that would be denormalized will be set to zero |
1520 | /// instead. This is turned off by default. |
1521 | /// |
1522 | /// You can read and enable/disable this mode via the helper functions |
1523 | /// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`: |
1524 | /// |
1525 | /// ```rust,ignore |
1526 | /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default) |
1527 | /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on |
1528 | /// ``` |
1529 | /// |
1531 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr) |
1532 | #[inline ] |
1533 | #[target_feature (enable = "sse" )] |
1534 | #[cfg_attr (test, assert_instr(ldmxcsr))] |
1535 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1536 | #[deprecated ( |
1537 | since = "1.75.0" , |
1538 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1539 | )] |
1540 | pub unsafe fn _mm_setcsr(val: u32) { |
1541 | ldmxcsr(&val as *const _ as *const i8); |
1542 | } |
1543 | |
1544 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1545 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1546 | pub const _MM_EXCEPT_INVALID: u32 = 0x0001; |
1547 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1548 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1549 | pub const _MM_EXCEPT_DENORM: u32 = 0x0002; |
1550 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1551 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1552 | pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004; |
1553 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1554 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1555 | pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008; |
1556 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1557 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1558 | pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010; |
1559 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1560 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1561 | pub const _MM_EXCEPT_INEXACT: u32 = 0x0020; |
1562 | /// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html) |
1563 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1564 | pub const _MM_EXCEPT_MASK: u32 = 0x003f; |
1565 | |
1566 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1567 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1568 | pub const _MM_MASK_INVALID: u32 = 0x0080; |
1569 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1570 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1571 | pub const _MM_MASK_DENORM: u32 = 0x0100; |
1572 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1573 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1574 | pub const _MM_MASK_DIV_ZERO: u32 = 0x0200; |
1575 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1576 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1577 | pub const _MM_MASK_OVERFLOW: u32 = 0x0400; |
1578 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1579 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1580 | pub const _MM_MASK_UNDERFLOW: u32 = 0x0800; |
1581 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1582 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1583 | pub const _MM_MASK_INEXACT: u32 = 0x1000; |
1584 | /// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html) |
1585 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1586 | pub const _MM_MASK_MASK: u32 = 0x1f80; |
1587 | |
1588 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1589 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1590 | pub const _MM_ROUND_NEAREST: u32 = 0x0000; |
1591 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1592 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1593 | pub const _MM_ROUND_DOWN: u32 = 0x2000; |
1594 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1595 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1596 | pub const _MM_ROUND_UP: u32 = 0x4000; |
1597 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1598 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1599 | pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000; |
1600 | |
1601 | /// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html) |
1602 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1603 | pub const _MM_ROUND_MASK: u32 = 0x6000; |
1604 | |
1605 | /// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html) |
1606 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1607 | pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000; |
1608 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1609 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1610 | pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000; |
1611 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1612 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1613 | pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; |
1614 | |
1615 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1616 | /// |
1617 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK) |
1618 | #[inline ] |
1619 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1620 | #[allow (non_snake_case)] |
1621 | #[target_feature (enable = "sse" )] |
1622 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1623 | #[deprecated ( |
1624 | since = "1.75.0" , |
1625 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1626 | )] |
1627 | pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { |
1628 | _mm_getcsr() & _MM_MASK_MASK |
1629 | } |
1630 | |
1631 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1632 | /// |
1633 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE) |
1634 | #[inline ] |
1635 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1636 | #[allow (non_snake_case)] |
1637 | #[target_feature (enable = "sse" )] |
1638 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1639 | #[deprecated ( |
1640 | since = "1.75.0" , |
1641 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1642 | )] |
1643 | pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { |
1644 | _mm_getcsr() & _MM_EXCEPT_MASK |
1645 | } |
1646 | |
1647 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1648 | /// |
1649 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE) |
1650 | #[inline ] |
1651 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1652 | #[allow (non_snake_case)] |
1653 | #[target_feature (enable = "sse" )] |
1654 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1655 | #[deprecated ( |
1656 | since = "1.75.0" , |
1657 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1658 | )] |
1659 | pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { |
1660 | _mm_getcsr() & _MM_FLUSH_ZERO_MASK |
1661 | } |
1662 | |
1663 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1664 | /// |
1665 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE) |
1666 | #[inline ] |
1667 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1668 | #[allow (non_snake_case)] |
1669 | #[target_feature (enable = "sse" )] |
1670 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1671 | #[deprecated ( |
1672 | since = "1.75.0" , |
1673 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1674 | )] |
1675 | pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { |
1676 | _mm_getcsr() & _MM_ROUND_MASK |
1677 | } |
1678 | |
1679 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1680 | /// |
1681 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK) |
1682 | #[inline ] |
1683 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1684 | #[allow (non_snake_case)] |
1685 | #[target_feature (enable = "sse" )] |
1686 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1687 | #[deprecated ( |
1688 | since = "1.75.0" , |
1689 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1690 | )] |
1691 | pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { |
1692 | _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x) |
1693 | } |
1694 | |
1695 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1696 | /// |
1697 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE) |
1698 | #[inline ] |
1699 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1700 | #[allow (non_snake_case)] |
1701 | #[target_feature (enable = "sse" )] |
1702 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1703 | #[deprecated ( |
1704 | since = "1.75.0" , |
1705 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1706 | )] |
1707 | pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { |
1708 | _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x) |
1709 | } |
1710 | |
1711 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1712 | /// |
1713 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE) |
1714 | #[inline ] |
1715 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1716 | #[allow (non_snake_case)] |
1717 | #[target_feature (enable = "sse" )] |
1718 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1719 | #[deprecated ( |
1720 | since = "1.75.0" , |
1721 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1722 | )] |
1723 | pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { |
1724 | let val: u32 = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x; |
1726 | _mm_setcsr(val) |
1727 | } |
1728 | |
1729 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1730 | /// |
1731 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE) |
1732 | #[inline ] |
1733 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1734 | #[allow (non_snake_case)] |
1735 | #[target_feature (enable = "sse" )] |
1736 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1737 | #[deprecated ( |
1738 | since = "1.75.0" , |
1739 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1740 | )] |
1741 | pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) { |
1742 | _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x) |
1743 | } |
1744 | |
1745 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1746 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1747 | pub const _MM_HINT_T0: i32 = 3; |
1748 | |
1749 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1750 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1751 | pub const _MM_HINT_T1: i32 = 2; |
1752 | |
1753 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1754 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1755 | pub const _MM_HINT_T2: i32 = 1; |
1756 | |
1757 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1758 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1759 | pub const _MM_HINT_NTA: i32 = 0; |
1760 | |
1761 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1762 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1763 | pub const _MM_HINT_ET0: i32 = 7; |
1764 | |
1765 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1766 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1767 | pub const _MM_HINT_ET1: i32 = 6; |
1768 | |
/// Fetches the cache line that contains address `p` using the given `STRATEGY`.
1770 | /// |
1771 | /// The `STRATEGY` must be one of: |
1772 | /// |
1773 | /// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the |
1774 | /// cache hierarchy. |
1775 | /// |
1776 | /// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher. |
1777 | /// |
1778 | /// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or |
1779 | /// an implementation-specific choice (e.g., L2 if there is no L3). |
1780 | /// |
/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
/// non-temporal access (NTA) hint. The data may be placed somewhere closer
/// than main memory but outside of the cache hierarchy. This is used to
/// reduce access latency without polluting the cache.
1785 | /// |
1786 | /// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and |
1787 | /// [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0` |
1788 | /// and `_MM_HINT_T1` but indicate an anticipation to write to the address. |
1789 | /// |
1790 | /// The actual implementation depends on the particular CPU. This instruction |
1791 | /// is considered a hint, so the CPU is also free to simply ignore the request. |
1792 | /// |
1793 | /// The amount of prefetched data depends on the cache line size of the |
1794 | /// specific CPU, but it will be at least 32 bytes. |
1795 | /// |
1796 | /// Common caveats: |
1797 | /// |
1798 | /// * Most modern CPUs already automatically prefetch data based on predicted |
1799 | /// access patterns. |
1800 | /// |
1801 | /// * Data is usually not fetched if this would cause a TLB miss or a page |
1802 | /// fault. |
1803 | /// |
1804 | /// * Too much prefetching can cause unnecessary cache evictions. |
1805 | /// |
1806 | /// * Prefetching may also fail if there are not enough memory-subsystem |
1807 | /// resources (e.g., request buffers). |
1808 | /// |
1810 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch) |
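///
/// A minimal usage sketch; the look-ahead distance of 512 bytes is an
/// arbitrary choice for the example (`wrapping_add` avoids out-of-bounds
/// pointer arithmetic near the end of the buffer, and prefetching an invalid
/// address is harmless):
///
/// ```rust,ignore
/// let data = vec![0u8; 4096];
/// for chunk in data.chunks(64) {
///     // Hint that data a few cache lines ahead will be needed soon.
///     unsafe {
///         _mm_prefetch::<_MM_HINT_T0>(chunk.as_ptr().wrapping_add(512) as *const i8);
///     }
///     // ... process `chunk` ...
/// }
/// ```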
1811 | #[inline ] |
1812 | #[target_feature (enable = "sse" )] |
1813 | #[cfg_attr (test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))] |
1814 | #[cfg_attr (test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))] |
1815 | #[cfg_attr (test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))] |
1816 | #[cfg_attr (test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))] |
1817 | #[rustc_legacy_const_generics (1)] |
1818 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1819 | pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) { |
1820 | // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache). |
1821 | // `locality` and `rw` are based on our `STRATEGY`. |
    prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
1823 | } |
1824 | |
/// Returns a vector of type __m128 with indeterminate elements.
1826 | /// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`]. |
1827 | /// In practice, this is equivalent to [`mem::zeroed`]. |
1828 | /// |
1829 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps) |
1830 | #[inline ] |
1831 | #[target_feature (enable = "sse" )] |
1832 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1833 | pub unsafe fn _mm_undefined_ps() -> __m128 { |
1834 | _mm_set1_ps(0.0) |
1835 | } |
1836 | |
1837 | /// Transpose the 4x4 matrix formed by 4 rows of __m128 in place. |
1838 | /// |
1839 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS) |
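///
/// A minimal usage sketch:
///
/// ```rust,ignore
/// let mut row0 = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let mut row1 = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// let mut row2 = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
/// let mut row3 = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
/// unsafe { _MM_TRANSPOSE4_PS(&mut row0, &mut row1, &mut row2, &mut row3) };
/// // row0 now equals _mm_setr_ps(1.0, 5.0, 9.0, 13.0), and so on.
/// ```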
1840 | #[inline ] |
1841 | #[allow (non_snake_case)] |
1842 | #[target_feature (enable = "sse" )] |
1843 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1844 | pub unsafe fn _MM_TRANSPOSE4_PS( |
1845 | row0: &mut __m128, |
1846 | row1: &mut __m128, |
1847 | row2: &mut __m128, |
1848 | row3: &mut __m128, |
1849 | ) { |
    let tmp0 = _mm_unpacklo_ps(*row0, *row1);
    let tmp2 = _mm_unpacklo_ps(*row2, *row3);
    let tmp1 = _mm_unpackhi_ps(*row0, *row1);
    let tmp3 = _mm_unpackhi_ps(*row2, *row3);

    *row0 = _mm_movelh_ps(tmp0, tmp2);
    *row1 = _mm_movehl_ps(tmp2, tmp0);
    *row2 = _mm_movelh_ps(tmp1, tmp3);
    *row3 = _mm_movehl_ps(tmp3, tmp1);
1859 | } |
1860 | |
1861 | #[allow (improper_ctypes)] |
1862 | extern "C" { |
1863 | #[link_name = "llvm.x86.sse.add.ss" ] |
1864 | fn addss(a: __m128, b: __m128) -> __m128; |
1865 | #[link_name = "llvm.x86.sse.sub.ss" ] |
1866 | fn subss(a: __m128, b: __m128) -> __m128; |
1867 | #[link_name = "llvm.x86.sse.mul.ss" ] |
1868 | fn mulss(a: __m128, b: __m128) -> __m128; |
1869 | #[link_name = "llvm.x86.sse.div.ss" ] |
1870 | fn divss(a: __m128, b: __m128) -> __m128; |
1871 | #[link_name = "llvm.x86.sse.sqrt.ss" ] |
1872 | fn sqrtss(a: __m128) -> __m128; |
1873 | #[link_name = "llvm.x86.sse.sqrt.ps" ] |
1874 | fn sqrtps(a: __m128) -> __m128; |
1875 | #[link_name = "llvm.x86.sse.rcp.ss" ] |
1876 | fn rcpss(a: __m128) -> __m128; |
1877 | #[link_name = "llvm.x86.sse.rcp.ps" ] |
1878 | fn rcpps(a: __m128) -> __m128; |
1879 | #[link_name = "llvm.x86.sse.rsqrt.ss" ] |
1880 | fn rsqrtss(a: __m128) -> __m128; |
1881 | #[link_name = "llvm.x86.sse.rsqrt.ps" ] |
1882 | fn rsqrtps(a: __m128) -> __m128; |
1883 | #[link_name = "llvm.x86.sse.min.ss" ] |
1884 | fn minss(a: __m128, b: __m128) -> __m128; |
1885 | #[link_name = "llvm.x86.sse.min.ps" ] |
1886 | fn minps(a: __m128, b: __m128) -> __m128; |
1887 | #[link_name = "llvm.x86.sse.max.ss" ] |
1888 | fn maxss(a: __m128, b: __m128) -> __m128; |
1889 | #[link_name = "llvm.x86.sse.max.ps" ] |
1890 | fn maxps(a: __m128, b: __m128) -> __m128; |
1891 | #[link_name = "llvm.x86.sse.cmp.ps" ] |
1892 | fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128; |
1893 | #[link_name = "llvm.x86.sse.comieq.ss" ] |
1894 | fn comieq_ss(a: __m128, b: __m128) -> i32; |
1895 | #[link_name = "llvm.x86.sse.comilt.ss" ] |
1896 | fn comilt_ss(a: __m128, b: __m128) -> i32; |
1897 | #[link_name = "llvm.x86.sse.comile.ss" ] |
1898 | fn comile_ss(a: __m128, b: __m128) -> i32; |
1899 | #[link_name = "llvm.x86.sse.comigt.ss" ] |
1900 | fn comigt_ss(a: __m128, b: __m128) -> i32; |
1901 | #[link_name = "llvm.x86.sse.comige.ss" ] |
1902 | fn comige_ss(a: __m128, b: __m128) -> i32; |
1903 | #[link_name = "llvm.x86.sse.comineq.ss" ] |
1904 | fn comineq_ss(a: __m128, b: __m128) -> i32; |
1905 | #[link_name = "llvm.x86.sse.ucomieq.ss" ] |
1906 | fn ucomieq_ss(a: __m128, b: __m128) -> i32; |
1907 | #[link_name = "llvm.x86.sse.ucomilt.ss" ] |
1908 | fn ucomilt_ss(a: __m128, b: __m128) -> i32; |
1909 | #[link_name = "llvm.x86.sse.ucomile.ss" ] |
1910 | fn ucomile_ss(a: __m128, b: __m128) -> i32; |
1911 | #[link_name = "llvm.x86.sse.ucomigt.ss" ] |
1912 | fn ucomigt_ss(a: __m128, b: __m128) -> i32; |
1913 | #[link_name = "llvm.x86.sse.ucomige.ss" ] |
1914 | fn ucomige_ss(a: __m128, b: __m128) -> i32; |
1915 | #[link_name = "llvm.x86.sse.ucomineq.ss" ] |
1916 | fn ucomineq_ss(a: __m128, b: __m128) -> i32; |
1917 | #[link_name = "llvm.x86.sse.cvtss2si" ] |
1918 | fn cvtss2si(a: __m128) -> i32; |
1919 | #[link_name = "llvm.x86.sse.cvttss2si" ] |
1920 | fn cvttss2si(a: __m128) -> i32; |
1921 | #[link_name = "llvm.x86.sse.cvtsi2ss" ] |
1922 | fn cvtsi2ss(a: __m128, b: i32) -> __m128; |
1923 | #[link_name = "llvm.x86.sse.sfence" ] |
1924 | fn sfence(); |
1925 | #[link_name = "llvm.x86.sse.stmxcsr" ] |
1926 | fn stmxcsr(p: *mut i8); |
1927 | #[link_name = "llvm.x86.sse.ldmxcsr" ] |
1928 | fn ldmxcsr(p: *const i8); |
1929 | #[link_name = "llvm.prefetch" ] |
1930 | fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32); |
1931 | #[link_name = "llvm.x86.sse.cmp.ss" ] |
1932 | fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128; |
1933 | } |
1934 | |
1935 | /// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint. |
1936 | /// |
1937 | /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection |
1938 | /// exception _may_ be generated. |
1939 | /// |
1940 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps) |
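///
/// Non-temporal stores are weakly ordered, so a typical pattern (a minimal
/// sketch; the `#[repr(align(16))]` wrapper is just one way to satisfy the
/// alignment requirement) pairs them with
/// [`_mm_sfence`](fn._mm_sfence.html):
///
/// ```rust,ignore
/// #[repr(align(16))]
/// struct Aligned([f32; 4]);
///
/// let mut out = Aligned([0.0; 4]);
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// unsafe {
///     _mm_stream_ps(out.0.as_mut_ptr(), a); // store, bypassing the cache
///     _mm_sfence(); // make the store globally visible to other threads
/// }
/// ```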
1941 | #[inline ] |
1942 | #[target_feature (enable = "sse" )] |
1943 | #[cfg_attr (test, assert_instr(movntps))] |
1944 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1945 | #[allow (clippy::cast_ptr_alignment)] |
1946 | pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { |
    intrinsics::nontemporal_store(mem_addr as *mut __m128, a);
1948 | } |
1949 | |
1950 | #[cfg (test)] |
1951 | mod tests { |
1952 | use crate::{hint::black_box, mem::transmute}; |
1953 | use std::{boxed, f32::NAN}; |
1954 | use stdarch_test::simd_test; |
1955 | |
1956 | use crate::core_arch::{simd::*, x86::*}; |
1957 | |
1958 | #[simd_test(enable = "sse" )] |
1959 | unsafe fn test_mm_add_ps() { |
1960 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
1961 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
1962 | let r = _mm_add_ps(a, b); |
1963 | assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0)); |
1964 | } |
1965 | |
1966 | #[simd_test(enable = "sse" )] |
1967 | unsafe fn test_mm_add_ss() { |
1968 | let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0); |
1969 | let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0); |
1970 | let r = _mm_add_ss(a, b); |
1971 | assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0)); |
1972 | } |
1973 | |
1974 | #[simd_test(enable = "sse" )] |
1975 | unsafe fn test_mm_sub_ps() { |
1976 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
1977 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
1978 | let r = _mm_sub_ps(a, b); |
1979 | assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0)); |
1980 | } |
1981 | |
1982 | #[simd_test(enable = "sse" )] |
1983 | unsafe fn test_mm_sub_ss() { |
1984 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
1985 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
1986 | let r = _mm_sub_ss(a, b); |
1987 | assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0)); |
1988 | } |
1989 | |
1990 | #[simd_test(enable = "sse" )] |
1991 | unsafe fn test_mm_mul_ps() { |
1992 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
1993 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
1994 | let r = _mm_mul_ps(a, b); |
1995 | assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0)); |
1996 | } |
1997 | |
1998 | #[simd_test(enable = "sse" )] |
1999 | unsafe fn test_mm_mul_ss() { |
2000 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2001 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2002 | let r = _mm_mul_ss(a, b); |
2003 | assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0)); |
2004 | } |
2005 | |
2006 | #[simd_test(enable = "sse" )] |
2007 | unsafe fn test_mm_div_ps() { |
2008 | let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0); |
2009 | let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0); |
2010 | let r = _mm_div_ps(a, b); |
2011 | assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0)); |
2012 | } |
2013 | |
2014 | #[simd_test(enable = "sse" )] |
2015 | unsafe fn test_mm_div_ss() { |
2016 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2017 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2018 | let r = _mm_div_ss(a, b); |
2019 | assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0)); |
2020 | } |
2021 | |
2022 | #[simd_test(enable = "sse" )] |
2023 | unsafe fn test_mm_sqrt_ss() { |
2024 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2025 | let r = _mm_sqrt_ss(a); |
2026 | let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0); |
2027 | assert_eq_m128(r, e); |
2028 | } |
2029 | |
2030 | #[simd_test(enable = "sse" )] |
2031 | unsafe fn test_mm_sqrt_ps() { |
2032 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2033 | let r = _mm_sqrt_ps(a); |
2034 | let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0); |
2035 | assert_eq_m128(r, e); |
2036 | } |
2037 | |
2038 | #[simd_test(enable = "sse" )] |
2039 | unsafe fn test_mm_rcp_ss() { |
2040 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2041 | let r = _mm_rcp_ss(a); |
2042 | let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0); |
2043 | let rel_err = 0.00048828125; |
2044 | assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err); |
2045 | for i in 1..4 { |
2046 | assert_eq!(get_m128(r, i), get_m128(e, i)); |
2047 | } |
2048 | } |
2049 | |
2050 | #[simd_test(enable = "sse" )] |
2051 | unsafe fn test_mm_rcp_ps() { |
2052 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2053 | let r = _mm_rcp_ps(a); |
2054 | let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215); |
2055 | let rel_err = 0.00048828125; |
2056 | for i in 0..4 { |
2057 | assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); |
2058 | } |
2059 | } |
2060 | |
2061 | #[simd_test(enable = "sse" )] |
2062 | unsafe fn test_mm_rsqrt_ss() { |
2063 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2064 | let r = _mm_rsqrt_ss(a); |
2065 | let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0); |
2066 | let rel_err = 0.00048828125; |
2067 | for i in 0..4 { |
2068 | assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); |
2069 | } |
2070 | } |
2071 | |
2072 | #[simd_test(enable = "sse" )] |
2073 | unsafe fn test_mm_rsqrt_ps() { |
2074 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2075 | let r = _mm_rsqrt_ps(a); |
2076 | let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845); |
2077 | let rel_err = 0.00048828125; |
2078 | for i in 0..4 { |
2079 | assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); |
2080 | } |
2081 | } |
2082 | |
2083 | #[simd_test(enable = "sse" )] |
2084 | unsafe fn test_mm_min_ss() { |
2085 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2086 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2087 | let r = _mm_min_ss(a, b); |
2088 | assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); |
2089 | } |
2090 | |
2091 | #[simd_test(enable = "sse" )] |
2092 | unsafe fn test_mm_min_ps() { |
2093 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2094 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2095 | let r = _mm_min_ps(a, b); |
2096 | assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); |
2097 | |
        // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
        // is lowered by the llvm codegen backend to the `llvm.minnum.v*` llvm intrinsic. This
        // intrinsic doesn't specify how -0.0 is handled. Unfortunately it happens to behave
        // differently from the `minps` x86 instruction, which always returns its second operand
        // when comparing -0.0 with 0.0; hence the assertions `r1 == b` and `r2 == a` below.
2103 | let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); |
2104 | let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); |
2105 | let r1: [u8; 16] = transmute(_mm_min_ps(a, b)); |
2106 | let r2: [u8; 16] = transmute(_mm_min_ps(b, a)); |
2107 | let a: [u8; 16] = transmute(a); |
2108 | let b: [u8; 16] = transmute(b); |
2109 | assert_eq!(r1, b); |
2110 | assert_eq!(r2, a); |
2111 | assert_ne!(a, b); // sanity check that -0.0 is actually present |
2112 | } |
2113 | |
2114 | #[simd_test(enable = "sse" )] |
2115 | unsafe fn test_mm_max_ss() { |
2116 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2117 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2118 | let r = _mm_max_ss(a, b); |
2119 | assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0)); |
2120 | } |
2121 | |
2122 | #[simd_test(enable = "sse" )] |
2123 | unsafe fn test_mm_max_ps() { |
2124 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2125 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2126 | let r = _mm_max_ps(a, b); |
2127 | assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0)); |
2128 | |
2129 | // Check SSE-specific semantics for -0.0 handling. |
2130 | let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); |
2131 | let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); |
2132 | let r1: [u8; 16] = transmute(_mm_max_ps(a, b)); |
2133 | let r2: [u8; 16] = transmute(_mm_max_ps(b, a)); |
2134 | let a: [u8; 16] = transmute(a); |
2135 | let b: [u8; 16] = transmute(b); |
2136 | assert_eq!(r1, b); |
2137 | assert_eq!(r2, a); |
2138 | assert_ne!(a, b); // sanity check that -0.0 is actually present |
2139 | } |
2140 | |
2141 | #[simd_test(enable = "sse" )] |
2142 | unsafe fn test_mm_and_ps() { |
2143 | let a = transmute(u32x4::splat(0b0011)); |
2144 | let b = transmute(u32x4::splat(0b0101)); |
2145 | let r = _mm_and_ps(*black_box(&a), *black_box(&b)); |
2146 | let e = transmute(u32x4::splat(0b0001)); |
2147 | assert_eq_m128(r, e); |
2148 | } |
2149 | |
2150 | #[simd_test(enable = "sse" )] |
2151 | unsafe fn test_mm_andnot_ps() { |
2152 | let a = transmute(u32x4::splat(0b0011)); |
2153 | let b = transmute(u32x4::splat(0b0101)); |
2154 | let r = _mm_andnot_ps(*black_box(&a), *black_box(&b)); |
2155 | let e = transmute(u32x4::splat(0b0100)); |
2156 | assert_eq_m128(r, e); |
2157 | } |
2158 | |
2159 | #[simd_test(enable = "sse" )] |
2160 | unsafe fn test_mm_or_ps() { |
2161 | let a = transmute(u32x4::splat(0b0011)); |
2162 | let b = transmute(u32x4::splat(0b0101)); |
2163 | let r = _mm_or_ps(*black_box(&a), *black_box(&b)); |
2164 | let e = transmute(u32x4::splat(0b0111)); |
2165 | assert_eq_m128(r, e); |
2166 | } |
2167 | |
2168 | #[simd_test(enable = "sse" )] |
2169 | unsafe fn test_mm_xor_ps() { |
2170 | let a = transmute(u32x4::splat(0b0011)); |
2171 | let b = transmute(u32x4::splat(0b0101)); |
2172 | let r = _mm_xor_ps(*black_box(&a), *black_box(&b)); |
2173 | let e = transmute(u32x4::splat(0b0110)); |
2174 | assert_eq_m128(r, e); |
2175 | } |
2176 | |
2177 | #[simd_test(enable = "sse" )] |
2178 | unsafe fn test_mm_cmpeq_ss() { |
2179 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2180 | let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0); |
2181 | let r: u32x4 = transmute(_mm_cmpeq_ss(a, b)); |
2182 | let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0)); |
2183 | assert_eq!(r, e); |
2184 | |
2185 | let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2186 | let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2)); |
2187 | let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0)); |
2188 | assert_eq!(r2, e2); |
2189 | } |
2190 | |
2191 | #[simd_test(enable = "sse" )] |
2192 | unsafe fn test_mm_cmplt_ss() { |
2193 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2194 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2195 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2196 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2197 | |
2198 | let b1 = 0u32; // a.extract(0) < b.extract(0) |
2199 | let c1 = 0u32; // a.extract(0) < c.extract(0) |
2200 | let d1 = !0u32; // a.extract(0) < d.extract(0) |
2201 | |
2202 | let rb: u32x4 = transmute(_mm_cmplt_ss(a, b)); |
2203 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2204 | assert_eq!(rb, eb); |
2205 | |
2206 | let rc: u32x4 = transmute(_mm_cmplt_ss(a, c)); |
2207 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2208 | assert_eq!(rc, ec); |
2209 | |
2210 | let rd: u32x4 = transmute(_mm_cmplt_ss(a, d)); |
2211 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2212 | assert_eq!(rd, ed); |
2213 | } |
2214 | |
2215 | #[simd_test(enable = "sse" )] |
2216 | unsafe fn test_mm_cmple_ss() { |
2217 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2218 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2219 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2220 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2221 | |
2222 | let b1 = 0u32; // a.extract(0) <= b.extract(0) |
2223 | let c1 = !0u32; // a.extract(0) <= c.extract(0) |
2224 | let d1 = !0u32; // a.extract(0) <= d.extract(0) |
2225 | |
2226 | let rb: u32x4 = transmute(_mm_cmple_ss(a, b)); |
2227 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2228 | assert_eq!(rb, eb); |
2229 | |
2230 | let rc: u32x4 = transmute(_mm_cmple_ss(a, c)); |
2231 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2232 | assert_eq!(rc, ec); |
2233 | |
2234 | let rd: u32x4 = transmute(_mm_cmple_ss(a, d)); |
2235 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2236 | assert_eq!(rd, ed); |
2237 | } |
2238 | |
2239 | #[simd_test(enable = "sse" )] |
2240 | unsafe fn test_mm_cmpgt_ss() { |
2241 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2242 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2243 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2244 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2245 | |
2246 | let b1 = !0u32; // a.extract(0) > b.extract(0) |
2247 | let c1 = 0u32; // a.extract(0) > c.extract(0) |
2248 | let d1 = 0u32; // a.extract(0) > d.extract(0) |
2249 | |
2250 | let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b)); |
2251 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2252 | assert_eq!(rb, eb); |
2253 | |
2254 | let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c)); |
2255 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2256 | assert_eq!(rc, ec); |
2257 | |
2258 | let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d)); |
2259 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2260 | assert_eq!(rd, ed); |
2261 | } |
2262 | |
2263 | #[simd_test(enable = "sse" )] |
2264 | unsafe fn test_mm_cmpge_ss() { |
2265 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2266 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2267 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2268 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2269 | |
2270 | let b1 = !0u32; // a.extract(0) >= b.extract(0) |
2271 | let c1 = !0u32; // a.extract(0) >= c.extract(0) |
2272 | let d1 = 0u32; // a.extract(0) >= d.extract(0) |
2273 | |
2274 | let rb: u32x4 = transmute(_mm_cmpge_ss(a, b)); |
2275 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2276 | assert_eq!(rb, eb); |
2277 | |
2278 | let rc: u32x4 = transmute(_mm_cmpge_ss(a, c)); |
2279 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2280 | assert_eq!(rc, ec); |
2281 | |
2282 | let rd: u32x4 = transmute(_mm_cmpge_ss(a, d)); |
2283 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2284 | assert_eq!(rd, ed); |
2285 | } |
2286 | |
2287 | #[simd_test(enable = "sse" )] |
2288 | unsafe fn test_mm_cmpneq_ss() { |
2289 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2290 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2291 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2292 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2293 | |
2294 | let b1 = !0u32; // a.extract(0) != b.extract(0) |
2295 | let c1 = 0u32; // a.extract(0) != c.extract(0) |
2296 | let d1 = !0u32; // a.extract(0) != d.extract(0) |
2297 | |
2298 | let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b)); |
2299 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2300 | assert_eq!(rb, eb); |
2301 | |
2302 | let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c)); |
2303 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2304 | assert_eq!(rc, ec); |
2305 | |
2306 | let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d)); |
2307 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2308 | assert_eq!(rd, ed); |
2309 | } |
2310 | |
2311 | #[simd_test(enable = "sse" )] |
2312 | unsafe fn test_mm_cmpnlt_ss() { |
2313 | // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there |
2314 | // must be a difference. It may have to do with behavior in the |
2315 | // presence of NaNs (signaling or quiet). If so, we should add tests |
2316 | // for those. |
2317 | |
2318 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2319 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2320 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2321 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2322 | |
2323 | let b1 = !0u32; // a.extract(0) >= b.extract(0) |
2324 | let c1 = !0u32; // a.extract(0) >= c.extract(0) |
2325 | let d1 = 0u32; // a.extract(0) >= d.extract(0) |
2326 | |
2327 | let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b)); |
2328 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2329 | assert_eq!(rb, eb); |
2330 | |
2331 | let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c)); |
2332 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2333 | assert_eq!(rc, ec); |
2334 | |
2335 | let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d)); |
2336 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2337 | assert_eq!(rd, ed); |
2338 | } |
2339 | |
2340 | #[simd_test(enable = "sse" )] |
2341 | unsafe fn test_mm_cmpnle_ss() { |
        // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
        // must be a difference. It may have to do with behavior in the
        // presence of NaNs (signaling or quiet). If so, we should add tests
        // for those.
2346 | |
2347 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2348 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2349 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2350 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2351 | |
2352 | let b1 = !0u32; // a.extract(0) > b.extract(0) |
2353 | let c1 = 0u32; // a.extract(0) > c.extract(0) |
2354 | let d1 = 0u32; // a.extract(0) > d.extract(0) |
2355 | |
2356 | let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b)); |
2357 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2358 | assert_eq!(rb, eb); |
2359 | |
2360 | let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c)); |
2361 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2362 | assert_eq!(rc, ec); |
2363 | |
2364 | let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d)); |
2365 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2366 | assert_eq!(rd, ed); |
2367 | } |
2368 | |
2369 | #[simd_test(enable = "sse" )] |
2370 | unsafe fn test_mm_cmpngt_ss() { |
2371 | // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there |
2372 | // must be a difference. It may have to do with behavior in the |
2373 | // presence of NaNs (signaling or quiet). If so, we should add tests |
2374 | // for those. |
2375 | |
2376 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2377 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2378 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2379 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2380 | |
2381 | let b1 = 0u32; // a.extract(0) <= b.extract(0) |
2382 | let c1 = !0u32; // a.extract(0) <= c.extract(0) |
2383 | let d1 = !0u32; // a.extract(0) <= d.extract(0) |
2384 | |
2385 | let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b)); |
2386 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2387 | assert_eq!(rb, eb); |
2388 | |
2389 | let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c)); |
2390 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2391 | assert_eq!(rc, ec); |
2392 | |
2393 | let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d)); |
2394 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2395 | assert_eq!(rd, ed); |
2396 | } |
2397 | |
2398 | #[simd_test(enable = "sse" )] |
2399 | unsafe fn test_mm_cmpnge_ss() { |
2400 | // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there |
2401 | // must be a difference. It may have to do with behavior in the |
2402 | // presence of NaNs (signaling or quiet). If so, we should add tests |
2403 | // for those. |
2404 | |
2405 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2406 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2407 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2408 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2409 | |
2410 | let b1 = 0u32; // a.extract(0) < b.extract(0) |
2411 | let c1 = 0u32; // a.extract(0) < c.extract(0) |
2412 | let d1 = !0u32; // a.extract(0) < d.extract(0) |
2413 | |
2414 | let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b)); |
2415 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2416 | assert_eq!(rb, eb); |
2417 | |
2418 | let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c)); |
2419 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2420 | assert_eq!(rc, ec); |
2421 | |
2422 | let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d)); |
2423 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2424 | assert_eq!(rd, ed); |
2425 | } |
2426 | |
2427 | #[simd_test(enable = "sse" )] |
2428 | unsafe fn test_mm_cmpord_ss() { |
2429 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2430 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2431 | let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); |
2432 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2433 | |
2434 | let b1 = !0u32; // a.extract(0) ord b.extract(0) |
2435 | let c1 = 0u32; // a.extract(0) ord c.extract(0) |
2436 | let d1 = !0u32; // a.extract(0) ord d.extract(0) |
2437 | |
2438 | let rb: u32x4 = transmute(_mm_cmpord_ss(a, b)); |
2439 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2440 | assert_eq!(rb, eb); |
2441 | |
2442 | let rc: u32x4 = transmute(_mm_cmpord_ss(a, c)); |
2443 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2444 | assert_eq!(rc, ec); |
2445 | |
2446 | let rd: u32x4 = transmute(_mm_cmpord_ss(a, d)); |
2447 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2448 | assert_eq!(rd, ed); |
2449 | } |
2450 | |
2451 | #[simd_test(enable = "sse" )] |
2452 | unsafe fn test_mm_cmpunord_ss() { |
2453 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2454 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2455 | let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); |
2456 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2457 | |
2458 | let b1 = 0u32; // a.extract(0) unord b.extract(0) |
2459 | let c1 = !0u32; // a.extract(0) unord c.extract(0) |
2460 | let d1 = 0u32; // a.extract(0) unord d.extract(0) |
2461 | |
2462 | let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b)); |
2463 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2464 | assert_eq!(rb, eb); |
2465 | |
2466 | let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c)); |
2467 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2468 | assert_eq!(rc, ec); |
2469 | |
2470 | let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d)); |
2471 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2472 | assert_eq!(rd, ed); |
2473 | } |
2474 | |
#[simd_test(enable = "sse")]
2476 | unsafe fn test_mm_cmpeq_ps() { |
2477 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2478 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); |
2479 | let tru = !0u32; |
2480 | let fls = 0u32; |
2481 | |
2482 | let e = u32x4::new(fls, fls, tru, fls); |
2483 | let r: u32x4 = transmute(_mm_cmpeq_ps(a, b)); |
2484 | assert_eq!(r, e); |
2485 | } |
2486 | |
#[simd_test(enable = "sse")]
2488 | unsafe fn test_mm_cmplt_ps() { |
2489 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2490 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); |
2491 | let tru = !0u32; |
2492 | let fls = 0u32; |
2493 | |
2494 | let e = u32x4::new(tru, fls, fls, fls); |
2495 | let r: u32x4 = transmute(_mm_cmplt_ps(a, b)); |
2496 | assert_eq!(r, e); |
2497 | } |
2498 | |
#[simd_test(enable = "sse")]
2500 | unsafe fn test_mm_cmple_ps() { |
2501 | let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0); |
2502 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); |
2503 | let tru = !0u32; |
2504 | let fls = 0u32; |
2505 | |
2506 | let e = u32x4::new(tru, fls, tru, fls); |
2507 | let r: u32x4 = transmute(_mm_cmple_ps(a, b)); |
2508 | assert_eq!(r, e); |
2509 | } |
2510 | |
#[simd_test(enable = "sse")]
2512 | unsafe fn test_mm_cmpgt_ps() { |
2513 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2514 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0); |
2515 | let tru = !0u32; |
2516 | let fls = 0u32; |
2517 | |
2518 | let e = u32x4::new(fls, tru, fls, fls); |
2519 | let r: u32x4 = transmute(_mm_cmpgt_ps(a, b)); |
2520 | assert_eq!(r, e); |
2521 | } |
2522 | |
#[simd_test(enable = "sse")]
2524 | unsafe fn test_mm_cmpge_ps() { |
2525 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2526 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0); |
2527 | let tru = !0u32; |
2528 | let fls = 0u32; |
2529 | |
2530 | let e = u32x4::new(fls, tru, tru, fls); |
2531 | let r: u32x4 = transmute(_mm_cmpge_ps(a, b)); |
2532 | assert_eq!(r, e); |
2533 | } |
2534 | |
#[simd_test(enable = "sse")]
2536 | unsafe fn test_mm_cmpneq_ps() { |
2537 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2538 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); |
2539 | let tru = !0u32; |
2540 | let fls = 0u32; |
2541 | |
2542 | let e = u32x4::new(tru, tru, fls, tru); |
2543 | let r: u32x4 = transmute(_mm_cmpneq_ps(a, b)); |
2544 | assert_eq!(r, e); |
2545 | } |
2546 | |
#[simd_test(enable = "sse")]
2548 | unsafe fn test_mm_cmpnlt_ps() { |
2549 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2550 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); |
2551 | let tru = !0u32; |
2552 | let fls = 0u32; |
2553 | |
2554 | let e = u32x4::new(fls, tru, tru, tru); |
2555 | let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b)); |
2556 | assert_eq!(r, e); |
2557 | } |
2558 | |
#[simd_test(enable = "sse")]
2560 | unsafe fn test_mm_cmpnle_ps() { |
2561 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2562 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); |
2563 | let tru = !0u32; |
2564 | let fls = 0u32; |
2565 | |
2566 | let e = u32x4::new(fls, tru, fls, tru); |
2567 | let r: u32x4 = transmute(_mm_cmpnle_ps(a, b)); |
2568 | assert_eq!(r, e); |
2569 | } |
2570 | |
#[simd_test(enable = "sse")]
2572 | unsafe fn test_mm_cmpngt_ps() { |
2573 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2574 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); |
2575 | let tru = !0u32; |
2576 | let fls = 0u32; |
2577 | |
2578 | let e = u32x4::new(tru, fls, tru, tru); |
2579 | let r: u32x4 = transmute(_mm_cmpngt_ps(a, b)); |
2580 | assert_eq!(r, e); |
2581 | } |
2582 | |
#[simd_test(enable = "sse")]
2584 | unsafe fn test_mm_cmpnge_ps() { |
2585 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2586 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); |
2587 | let tru = !0u32; |
2588 | let fls = 0u32; |
2589 | |
2590 | let e = u32x4::new(tru, fls, fls, tru); |
2591 | let r: u32x4 = transmute(_mm_cmpnge_ps(a, b)); |
2592 | assert_eq!(r, e); |
2593 | } |
2594 | |
#[simd_test(enable = "sse")]
2596 | unsafe fn test_mm_cmpord_ps() { |
2597 | let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); |
2598 | let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); |
2599 | let tru = !0u32; |
2600 | let fls = 0u32; |
2601 | |
2602 | let e = u32x4::new(tru, fls, fls, fls); |
2603 | let r: u32x4 = transmute(_mm_cmpord_ps(a, b)); |
2604 | assert_eq!(r, e); |
2605 | } |
2606 | |
#[simd_test(enable = "sse")]
2608 | unsafe fn test_mm_cmpunord_ps() { |
2609 | let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); |
2610 | let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); |
2611 | let tru = !0u32; |
2612 | let fls = 0u32; |
2613 | |
2614 | let e = u32x4::new(fls, tru, tru, tru); |
2615 | let r: u32x4 = transmute(_mm_cmpunord_ps(a, b)); |
2616 | assert_eq!(r, e); |
2617 | } |
2618 | |
#[simd_test(enable = "sse")]
2620 | unsafe fn test_mm_comieq_ss() { |
2621 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2622 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2623 | |
2624 | let ee = &[1i32, 0, 0, 0]; |
2625 | |
2626 | for i in 0..4 { |
2627 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2628 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2629 | |
2630 | let r = _mm_comieq_ss(a, b); |
2631 | |
2632 | assert_eq!( |
2633 | ee[i], r, |
2634 | "_mm_comieq_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2635 | a, b, r, ee[i], i |
2636 | ); |
2637 | } |
2638 | } |
2639 | |
#[simd_test(enable = "sse")]
2641 | unsafe fn test_mm_comilt_ss() { |
2642 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2643 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2644 | |
2645 | let ee = &[0i32, 1, 0, 0]; |
2646 | |
2647 | for i in 0..4 { |
2648 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2649 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2650 | |
2651 | let r = _mm_comilt_ss(a, b); |
2652 | |
2653 | assert_eq!( |
2654 | ee[i], r, |
2655 | "_mm_comilt_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2656 | a, b, r, ee[i], i |
2657 | ); |
2658 | } |
2659 | } |
2660 | |
#[simd_test(enable = "sse")]
2662 | unsafe fn test_mm_comile_ss() { |
2663 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2664 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2665 | |
2666 | let ee = &[1i32, 1, 0, 0]; |
2667 | |
2668 | for i in 0..4 { |
2669 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2670 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2671 | |
2672 | let r = _mm_comile_ss(a, b); |
2673 | |
2674 | assert_eq!( |
2675 | ee[i], r, |
2676 | "_mm_comile_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2677 | a, b, r, ee[i], i |
2678 | ); |
2679 | } |
2680 | } |
2681 | |
#[simd_test(enable = "sse")]
unsafe fn test_mm_comigt_ss() {
let aa = &[3.0f32, 12.0, 23.0, NAN];
let bb = &[3.0f32, 47.5, 1.5, NAN];

let ee = &[0i32, 0, 1, 0];

for i in 0..4 {
let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

let r = _mm_comigt_ss(a, b);

assert_eq!(
ee[i], r,
"_mm_comigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
a, b, r, ee[i], i
);
}
}
2702 | |
#[simd_test(enable = "sse")]
2704 | unsafe fn test_mm_comineq_ss() { |
2705 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2706 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2707 | |
2708 | let ee = &[0i32, 1, 1, 1]; |
2709 | |
2710 | for i in 0..4 { |
2711 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2712 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2713 | |
2714 | let r = _mm_comineq_ss(a, b); |
2715 | |
2716 | assert_eq!( |
2717 | ee[i], r, |
2718 | "_mm_comineq_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2719 | a, b, r, ee[i], i |
2720 | ); |
2721 | } |
2722 | } |
2723 | |
#[simd_test(enable = "sse")]
2725 | unsafe fn test_mm_ucomieq_ss() { |
2726 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2727 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2728 | |
2729 | let ee = &[1i32, 0, 0, 0]; |
2730 | |
2731 | for i in 0..4 { |
2732 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2733 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2734 | |
2735 | let r = _mm_ucomieq_ss(a, b); |
2736 | |
2737 | assert_eq!( |
2738 | ee[i], r, |
2739 | "_mm_ucomieq_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2740 | a, b, r, ee[i], i |
2741 | ); |
2742 | } |
2743 | } |
2744 | |
#[simd_test(enable = "sse")]
2746 | unsafe fn test_mm_ucomilt_ss() { |
2747 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2748 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2749 | |
2750 | let ee = &[0i32, 1, 0, 0]; |
2751 | |
2752 | for i in 0..4 { |
2753 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2754 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2755 | |
2756 | let r = _mm_ucomilt_ss(a, b); |
2757 | |
2758 | assert_eq!( |
2759 | ee[i], r, |
2760 | "_mm_ucomilt_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2761 | a, b, r, ee[i], i |
2762 | ); |
2763 | } |
2764 | } |
2765 | |
#[simd_test(enable = "sse")]
2767 | unsafe fn test_mm_ucomile_ss() { |
2768 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2769 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2770 | |
2771 | let ee = &[1i32, 1, 0, 0]; |
2772 | |
2773 | for i in 0..4 { |
2774 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2775 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2776 | |
2777 | let r = _mm_ucomile_ss(a, b); |
2778 | |
2779 | assert_eq!( |
2780 | ee[i], r, |
2781 | "_mm_ucomile_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2782 | a, b, r, ee[i], i |
2783 | ); |
2784 | } |
2785 | } |
2786 | |
#[simd_test(enable = "sse")]
2788 | unsafe fn test_mm_ucomigt_ss() { |
2789 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2790 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2791 | |
2792 | let ee = &[0i32, 0, 1, 0]; |
2793 | |
2794 | for i in 0..4 { |
2795 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2796 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2797 | |
2798 | let r = _mm_ucomigt_ss(a, b); |
2799 | |
2800 | assert_eq!( |
2801 | ee[i], r, |
2802 | "_mm_ucomigt_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2803 | a, b, r, ee[i], i |
2804 | ); |
2805 | } |
2806 | } |
2807 | |
#[simd_test(enable = "sse")]
2809 | unsafe fn test_mm_ucomige_ss() { |
2810 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2811 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2812 | |
2813 | let ee = &[1i32, 0, 1, 0]; |
2814 | |
2815 | for i in 0..4 { |
2816 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2817 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2818 | |
2819 | let r = _mm_ucomige_ss(a, b); |
2820 | |
2821 | assert_eq!( |
2822 | ee[i], r, |
2823 | "_mm_ucomige_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2824 | a, b, r, ee[i], i |
2825 | ); |
2826 | } |
2827 | } |
2828 | |
#[simd_test(enable = "sse")]
2830 | unsafe fn test_mm_ucomineq_ss() { |
2831 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2832 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2833 | |
2834 | let ee = &[0i32, 1, 1, 1]; |
2835 | |
2836 | for i in 0..4 { |
2837 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2838 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2839 | |
2840 | let r = _mm_ucomineq_ss(a, b); |
2841 | |
2842 | assert_eq!( |
2843 | ee[i], r, |
2844 | "_mm_ucomineq_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2845 | a, b, r, ee[i], i |
2846 | ); |
2847 | } |
2848 | } |
2849 | |
#[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
#[simd_test(enable = "sse")]
#[cfg_attr(miri, ignore)] // Uses _mm_setcsr, which is not supported by Miri
unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
// If one of the arguments is a quiet NaN, `comieq_ss` should signal an
// Invalid Operation Exception while `ucomieq_ss` should not.
2856 | let aa = &[3.0f32, NAN, 23.0, NAN]; |
2857 | let bb = &[3.0f32, 47.5, NAN, NAN]; |
2858 | |
2859 | let ee = &[1i32, 0, 0, 0]; |
2860 | let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception? |
2861 | |
2862 | for i in 0..4 { |
2863 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2864 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2865 | |
2866 | _MM_SET_EXCEPTION_STATE(0); |
2867 | let r1 = _mm_comieq_ss(*black_box(&a), b); |
2868 | let s1 = _MM_GET_EXCEPTION_STATE(); |
2869 | |
2870 | _MM_SET_EXCEPTION_STATE(0); |
2871 | let r2 = _mm_ucomieq_ss(*black_box(&a), b); |
2872 | let s2 = _MM_GET_EXCEPTION_STATE(); |
2873 | |
assert_eq!(
ee[i], r1,
"_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
a, b, r1, ee[i], i
);
assert_eq!(
ee[i], r2,
"_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
a, b, r2, ee[i], i
);
assert_eq!(
s1,
exc[i] * _MM_EXCEPT_INVALID,
"_mm_comieq_ss() set exception flags: {} (i={})",
s1,
i
);
assert_eq!(
s2,
0, // ucomieq_ss should not signal an exception
"_mm_ucomieq_ss() set exception flags: {} (i={})",
s2,
i
);
2898 | } |
2899 | } |
2900 | |
#[simd_test(enable = "sse")]
2902 | unsafe fn test_mm_cvtss_si32() { |
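// `_mm_cvtss_si32` rounds with the current rounding mode (round to
// nearest even by default); NaN and out-of-range inputs return i32::MIN.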
2903 | let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1]; |
2904 | let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520]; |
2905 | for i in 0..inputs.len() { |
2906 | let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0); |
2907 | let e = result[i]; |
2908 | let r = _mm_cvtss_si32(x); |
2909 | assert_eq!( |
2910 | e, r, |
2911 | "TestCase # {} _mm_cvtss_si32( {:?}) = {}, expected: {}" , |
2912 | i, x, r, e |
2913 | ); |
2914 | } |
2915 | } |
2916 | |
#[simd_test(enable = "sse")]
2918 | unsafe fn test_mm_cvttss_si32() { |
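// `_mm_cvttss_si32` truncates toward zero (e.g. -5.99 -> -5); NaN and
// out-of-range inputs return i32::MIN.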
2919 | let inputs = &[ |
2920 | (42.0f32, 42i32), |
2921 | (-31.4, -31), |
2922 | (-33.5, -33), |
2923 | (-34.5, -34), |
2924 | (10.999, 10), |
2925 | (-5.99, -5), |
2926 | (4.0e10, i32::MIN), |
2927 | (4.0e-10, 0), |
2928 | (NAN, i32::MIN), |
2929 | (2147483500.1, 2147483520), |
2930 | ]; |
2931 | for i in 0..inputs.len() { |
2932 | let (xi, e) = inputs[i]; |
2933 | let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); |
2934 | let r = _mm_cvttss_si32(x); |
2935 | assert_eq!( |
2936 | e, r, |
2937 | "TestCase # {} _mm_cvttss_si32( {:?}) = {}, expected: {}" , |
2938 | i, x, r, e |
2939 | ); |
2940 | } |
2941 | } |
2942 | |
#[simd_test(enable = "sse")]
2944 | unsafe fn test_mm_cvtsi32_ss() { |
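// 322223333 needs more than the 24 significand bits of an f32, so the
// conversion rounds to the nearest representable value; the literal
// 322223330.0 parses to that same f32.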
2945 | let inputs = &[ |
2946 | (4555i32, 4555.0f32), |
2947 | (322223333, 322223330.0), |
2948 | (-432, -432.0), |
2949 | (-322223333, -322223330.0), |
2950 | ]; |
2951 | |
2952 | for i in 0..inputs.len() { |
2953 | let (x, f) = inputs[i]; |
2954 | let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
2955 | let r = _mm_cvtsi32_ss(a, x); |
2956 | let e = _mm_setr_ps(f, 6.0, 7.0, 8.0); |
2957 | assert_eq_m128(e, r); |
2958 | } |
2959 | } |
2960 | |
#[simd_test(enable = "sse")]
2962 | unsafe fn test_mm_cvtss_f32() { |
2963 | let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0); |
2964 | assert_eq!(_mm_cvtss_f32(a), 312.0134); |
2965 | } |
2966 | |
#[simd_test(enable = "sse")]
2968 | unsafe fn test_mm_set_ss() { |
2969 | let r = _mm_set_ss(black_box(4.25)); |
2970 | assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0)); |
2971 | } |
2972 | |
#[simd_test(enable = "sse")]
2974 | unsafe fn test_mm_set1_ps() { |
2975 | let r1 = _mm_set1_ps(black_box(4.25)); |
2976 | let r2 = _mm_set_ps1(black_box(4.25)); |
2977 | assert_eq!(get_m128(r1, 0), 4.25); |
2978 | assert_eq!(get_m128(r1, 1), 4.25); |
2979 | assert_eq!(get_m128(r1, 2), 4.25); |
2980 | assert_eq!(get_m128(r1, 3), 4.25); |
2981 | assert_eq!(get_m128(r2, 0), 4.25); |
2982 | assert_eq!(get_m128(r2, 1), 4.25); |
2983 | assert_eq!(get_m128(r2, 2), 4.25); |
2984 | assert_eq!(get_m128(r2, 3), 4.25); |
2985 | } |
2986 | |
#[simd_test(enable = "sse")]
2988 | unsafe fn test_mm_set_ps() { |
2989 | let r = _mm_set_ps( |
2990 | black_box(1.0), |
2991 | black_box(2.0), |
2992 | black_box(3.0), |
2993 | black_box(4.0), |
2994 | ); |
2995 | assert_eq!(get_m128(r, 0), 4.0); |
2996 | assert_eq!(get_m128(r, 1), 3.0); |
2997 | assert_eq!(get_m128(r, 2), 2.0); |
2998 | assert_eq!(get_m128(r, 3), 1.0); |
2999 | } |
3000 | |
#[simd_test(enable = "sse")]
3002 | unsafe fn test_mm_setr_ps() { |
3003 | let r = _mm_setr_ps( |
3004 | black_box(1.0), |
3005 | black_box(2.0), |
3006 | black_box(3.0), |
3007 | black_box(4.0), |
3008 | ); |
3009 | assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0)); |
3010 | } |
3011 | |
#[simd_test(enable = "sse")]
3013 | unsafe fn test_mm_setzero_ps() { |
3014 | let r = *black_box(&_mm_setzero_ps()); |
3015 | assert_eq_m128(r, _mm_set1_ps(0.0)); |
3016 | } |
3017 | |
#[simd_test(enable = "sse")]
3019 | unsafe fn test_mm_shuffle() { |
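// `_MM_SHUFFLE(z, y, x, w)` packs four 2-bit lane indices, with `z`
// ending up in the two most significant bits.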
3020 | assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11); |
3021 | assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00); |
3022 | assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01); |
3023 | } |
3024 | |
#[simd_test(enable = "sse")]
3026 | unsafe fn test_mm_shuffle_ps() { |
3027 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3028 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
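// Mask 0b00_01_01_11: the low half of the result takes a[3] and a[1],
// the high half takes b[1] and b[0].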
3029 | let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b); |
3030 | assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0)); |
3031 | } |
3032 | |
#[simd_test(enable = "sse")]
3034 | unsafe fn test_mm_unpackhi_ps() { |
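// Interleaves the upper halves of `a` and `b`: [a2, b2, a3, b3].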
3035 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3036 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3037 | let r = _mm_unpackhi_ps(a, b); |
3038 | assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0)); |
3039 | } |
3040 | |
#[simd_test(enable = "sse")]
3042 | unsafe fn test_mm_unpacklo_ps() { |
3043 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3044 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3045 | let r = _mm_unpacklo_ps(a, b); |
3046 | assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0)); |
3047 | } |
3048 | |
#[simd_test(enable = "sse")]
3050 | unsafe fn test_mm_movehl_ps() { |
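// The high half of `b` becomes the low half of the result; the high
// half of `a` is kept.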
3051 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3052 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3053 | let r = _mm_movehl_ps(a, b); |
3054 | assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0)); |
3055 | } |
3056 | |
#[simd_test(enable = "sse")]
3058 | unsafe fn test_mm_movelh_ps() { |
3059 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3060 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3061 | let r = _mm_movelh_ps(a, b); |
3062 | assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0)); |
3063 | } |
3064 | |
#[simd_test(enable = "sse")]
3066 | unsafe fn test_mm_load_ss() { |
3067 | let a = 42.0f32; |
3068 | let r = _mm_load_ss(&a as *const f32); |
3069 | assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0)); |
3070 | } |
3071 | |
#[simd_test(enable = "sse")]
3073 | unsafe fn test_mm_load1_ps() { |
3074 | let a = 42.0f32; |
3075 | let r = _mm_load1_ps(&a as *const f32); |
3076 | assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0)); |
3077 | } |
3078 | |
#[simd_test(enable = "sse")]
3080 | unsafe fn test_mm_load_ps() { |
3081 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; |
3082 | |
3083 | let mut p = vals.as_ptr(); |
3084 | let mut fixup = 0.0f32; |
3085 | |
3086 | // Make sure p is aligned, otherwise we might get a |
3087 | // (signal: 11, SIGSEGV: invalid memory reference) |
3088 | |
3089 | let unalignment = (p as usize) & 0xf; |
3090 | if unalignment != 0 { |
3091 | let delta = (16 - unalignment) >> 2; |
3092 | fixup = delta as f32; |
3093 | p = p.add(delta); |
3094 | } |
3095 | |
3096 | let r = _mm_load_ps(p); |
3097 | let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup)); |
3098 | assert_eq_m128(r, e); |
3099 | } |
3100 | |
#[simd_test(enable = "sse")]
3102 | unsafe fn test_mm_loadu_ps() { |
3103 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; |
3104 | let p = vals.as_ptr().add(3); |
3105 | let r = _mm_loadu_ps(black_box(p)); |
3106 | assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0)); |
3107 | } |
3108 | |
#[simd_test(enable = "sse")]
3110 | unsafe fn test_mm_loadr_ps() { |
3111 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; |
3112 | |
3113 | let mut p = vals.as_ptr(); |
3114 | let mut fixup = 0.0f32; |
3115 | |
3116 | // Make sure p is aligned, otherwise we might get a |
3117 | // (signal: 11, SIGSEGV: invalid memory reference) |
3118 | |
3119 | let unalignment = (p as usize) & 0xf; |
3120 | if unalignment != 0 { |
3121 | let delta = (16 - unalignment) >> 2; |
3122 | fixup = delta as f32; |
3123 | p = p.add(delta); |
3124 | } |
3125 | |
3126 | let r = _mm_loadr_ps(p); |
3127 | let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup)); |
3128 | assert_eq_m128(r, e); |
3129 | } |
3130 | |
#[simd_test(enable = "sse2")]
3132 | unsafe fn test_mm_loadu_si64() { |
3133 | let a = _mm_setr_epi64x(5, 6); |
3134 | let r = _mm_loadu_si64(&a as *const _ as *const _); |
3135 | assert_eq_m128i(r, _mm_setr_epi64x(5, 0)); |
3136 | } |
3137 | |
#[simd_test(enable = "sse")]
3139 | unsafe fn test_mm_store_ss() { |
3140 | let mut vals = [0.0f32; 8]; |
3141 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3142 | _mm_store_ss(vals.as_mut_ptr().add(1), a); |
3143 | |
3144 | assert_eq!(vals[0], 0.0); |
3145 | assert_eq!(vals[1], 1.0); |
3146 | assert_eq!(vals[2], 0.0); |
3147 | } |
3148 | |
#[simd_test(enable = "sse")]
3150 | unsafe fn test_mm_store1_ps() { |
3151 | let mut vals = [0.0f32; 8]; |
3152 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3153 | |
3154 | let mut ofs = 0; |
3155 | let mut p = vals.as_mut_ptr(); |
3156 | |
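// Align p to 16-byte boundary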
3157 | if (p as usize) & 0xf != 0 { |
3158 | ofs = (16 - ((p as usize) & 0xf)) >> 2; |
3159 | p = p.add(ofs); |
3160 | } |
3161 | |
3162 | _mm_store1_ps(p, *black_box(&a)); |
3163 | |
3164 | if ofs > 0 { |
3165 | assert_eq!(vals[ofs - 1], 0.0); |
3166 | } |
3167 | assert_eq!(vals[ofs + 0], 1.0); |
3168 | assert_eq!(vals[ofs + 1], 1.0); |
3169 | assert_eq!(vals[ofs + 2], 1.0); |
3170 | assert_eq!(vals[ofs + 3], 1.0); |
3171 | assert_eq!(vals[ofs + 4], 0.0); |
3172 | } |
3173 | |
#[simd_test(enable = "sse")]
3175 | unsafe fn test_mm_store_ps() { |
3176 | let mut vals = [0.0f32; 8]; |
3177 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3178 | |
3179 | let mut ofs = 0; |
3180 | let mut p = vals.as_mut_ptr(); |
3181 | |
3182 | // Align p to 16-byte boundary |
3183 | if (p as usize) & 0xf != 0 { |
3184 | ofs = (16 - ((p as usize) & 0xf)) >> 2; |
3185 | p = p.add(ofs); |
3186 | } |
3187 | |
3188 | _mm_store_ps(p, *black_box(&a)); |
3189 | |
3190 | if ofs > 0 { |
3191 | assert_eq!(vals[ofs - 1], 0.0); |
3192 | } |
3193 | assert_eq!(vals[ofs + 0], 1.0); |
3194 | assert_eq!(vals[ofs + 1], 2.0); |
3195 | assert_eq!(vals[ofs + 2], 3.0); |
3196 | assert_eq!(vals[ofs + 3], 4.0); |
3197 | assert_eq!(vals[ofs + 4], 0.0); |
3198 | } |
3199 | |
#[simd_test(enable = "sse")]
3201 | unsafe fn test_mm_storer_ps() { |
3202 | let mut vals = [0.0f32; 8]; |
3203 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3204 | |
3205 | let mut ofs = 0; |
3206 | let mut p = vals.as_mut_ptr(); |
3207 | |
3208 | // Align p to 16-byte boundary |
3209 | if (p as usize) & 0xf != 0 { |
3210 | ofs = (16 - ((p as usize) & 0xf)) >> 2; |
3211 | p = p.add(ofs); |
3212 | } |
3213 | |
3214 | _mm_storer_ps(p, *black_box(&a)); |
3215 | |
3216 | if ofs > 0 { |
3217 | assert_eq!(vals[ofs - 1], 0.0); |
3218 | } |
3219 | assert_eq!(vals[ofs + 0], 4.0); |
3220 | assert_eq!(vals[ofs + 1], 3.0); |
3221 | assert_eq!(vals[ofs + 2], 2.0); |
3222 | assert_eq!(vals[ofs + 3], 1.0); |
3223 | assert_eq!(vals[ofs + 4], 0.0); |
3224 | } |
3225 | |
#[simd_test(enable = "sse")]
3227 | unsafe fn test_mm_storeu_ps() { |
3228 | let mut vals = [0.0f32; 8]; |
3229 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3230 | |
3231 | let mut ofs = 0; |
3232 | let mut p = vals.as_mut_ptr(); |
3233 | |
3234 | // Make sure p is **not** aligned to 16-byte boundary |
3235 | if (p as usize) & 0xf == 0 { |
3236 | ofs = 1; |
3237 | p = p.add(1); |
3238 | } |
3239 | |
3240 | _mm_storeu_ps(p, *black_box(&a)); |
3241 | |
3242 | if ofs > 0 { |
3243 | assert_eq!(vals[ofs - 1], 0.0); |
3244 | } |
3245 | assert_eq!(vals[ofs + 0], 1.0); |
3246 | assert_eq!(vals[ofs + 1], 2.0); |
3247 | assert_eq!(vals[ofs + 2], 3.0); |
3248 | assert_eq!(vals[ofs + 3], 4.0); |
3249 | assert_eq!(vals[ofs + 4], 0.0); |
3250 | } |
3251 | |
#[simd_test(enable = "sse")]
3253 | unsafe fn test_mm_move_ss() { |
3254 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3255 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3256 | |
3257 | let r = _mm_move_ss(a, b); |
3258 | let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); |
3259 | assert_eq_m128(e, r); |
3260 | } |
3261 | |
#[simd_test(enable = "sse")]
3263 | unsafe fn test_mm_movemask_ps() { |
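// `movmskps` gathers the sign bit of each lane into the low four bits
// of the result.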
3264 | let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0)); |
3265 | assert_eq!(r, 0b0101); |
3266 | |
3267 | let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0)); |
3268 | assert_eq!(r, 0b0111); |
3269 | } |
3270 | |
#[simd_test(enable = "sse")]
// Miri cannot support this until it is clear how it fits in the Rust memory model
#[cfg_attr(miri, ignore)]
3274 | unsafe fn test_mm_sfence() { |
3275 | _mm_sfence(); |
3276 | } |
3277 | |
#[allow(deprecated)] // FIXME: This tests deprecated functions whose use is immediate UB
#[simd_test(enable = "sse")]
#[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
3281 | unsafe fn test_mm_getcsr_setcsr_1() { |
3282 | let saved_csr = _mm_getcsr(); |
3283 | |
3284 | let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0); |
3285 | let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0); |
3286 | |
3287 | _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); |
3288 | let r = _mm_mul_ps(*black_box(&a), *black_box(&b)); |
3289 | |
3290 | _mm_setcsr(saved_csr); |
3291 | |
3292 | let exp = _mm_setr_ps(0.0, 0.0, 0.0, 1.0); |
assert_eq_m128(r, exp); // the denormal product is flushed to zero
3294 | } |
3295 | |
#[allow(deprecated)] // FIXME: This tests deprecated functions whose use is immediate UB
#[simd_test(enable = "sse")]
#[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
3299 | unsafe fn test_mm_getcsr_setcsr_2() { |
3300 | // Same as _mm_setcsr_1 test, but with opposite flag value. |
3301 | |
3302 | let saved_csr = _mm_getcsr(); |
3303 | |
3304 | let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0); |
3305 | let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0); |
3306 | |
3307 | _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); |
3308 | let r = _mm_mul_ps(*black_box(&a), *black_box(&b)); |
3309 | |
3310 | _mm_setcsr(saved_csr); |
3311 | |
3312 | let exp = _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0); |
3313 | assert_eq_m128(r, exp); // first component is a denormalized f32 |
3314 | } |
3315 | |
#[allow(deprecated)] // FIXME: This tests deprecated functions whose use is immediate UB
#[simd_test(enable = "sse")]
#[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
3319 | unsafe fn test_mm_getcsr_setcsr_underflow() { |
3320 | _MM_SET_EXCEPTION_STATE(0); |
3321 | |
3322 | let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0); |
3323 | let b = _mm_setr_ps(1e-5, 0.0, 0.0, 1.0); |
3324 | |
3325 | assert_eq!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure |
3326 | |
3327 | let r = _mm_mul_ps(*black_box(&a), *black_box(&b)); |
3328 | |
3329 | let exp = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0); |
3330 | assert_eq_m128(r, exp); |
3331 | |
3332 | let underflow = _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0; |
assert!(underflow);
3334 | } |
3335 | |
#[simd_test(enable = "sse")]
3337 | unsafe fn test_MM_TRANSPOSE4_PS() { |
3338 | let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3339 | let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3340 | let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0); |
3341 | let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0); |
3342 | |
3343 | _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d); |
3344 | |
3345 | assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0)); |
3346 | assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0)); |
3347 | assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0)); |
3348 | assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0)); |
3349 | } |
3350 | |
#[repr(align(16))]
3352 | struct Memory { |
3353 | pub data: [f32; 4], |
3354 | } |
3355 | |
#[simd_test(enable = "sse")]
// Miri cannot support this until it is clear how it fits in the Rust memory model
// (non-temporal store)
#[cfg_attr(miri, ignore)]
3360 | unsafe fn test_mm_stream_ps() { |
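// `movntps` performs a non-temporal store that bypasses the cache
// hierarchy; it requires a 16-byte aligned destination, hence the
// aligned `Memory` wrapper above.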
3361 | let a = _mm_set1_ps(7.0); |
3362 | let mut mem = Memory { data: [-1.0; 4] }; |
3363 | |
3364 | _mm_stream_ps(&mut mem.data[0] as *mut f32, a); |
3365 | for i in 0..4 { |
3366 | assert_eq!(mem.data[i], get_m128(a, i)); |
3367 | } |
3368 | } |
3369 | } |
3370 | |