1 | //! Streaming SIMD Extensions (SSE) |
2 | |
3 | use crate::{ |
4 | core_arch::{simd::*, x86::*}, |
5 | intrinsics::simd::*, |
6 | mem, ptr, |
7 | }; |
8 | |
#[cfg(test)]
10 | use stdarch_test::assert_instr; |
11 | |
/// Adds the first component of `a` and `b`; the other components are copied
/// from `a`.
14 | /// |
15 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss) |
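///
/// Illustrative sketch (not a doctest); values assume an SSE-capable x86
/// target:
///
/// ```text
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
/// // Only the lowest lane is added: result is [11.0, 2.0, 3.0, 4.0].
/// let r = _mm_add_ss(a, b);
/// ```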
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
    addss(a, b)
}
23 | |
24 | /// Adds __m128 vectors. |
25 | /// |
26 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
    simd_add(a, b)
}
34 | |
/// Subtracts the first component of `b` from `a`; the other components are
/// copied from `a`.
37 | /// |
38 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
    subss(a, b)
}
46 | |
47 | /// Subtracts __m128 vectors. |
48 | /// |
49 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
    simd_sub(a, b)
}
57 | |
/// Multiplies the first component of `a` and `b`; the other components are
/// copied from `a`.
60 | /// |
61 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
    mulss(a, b)
}
69 | |
70 | /// Multiplies __m128 vectors. |
71 | /// |
72 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
    simd_mul(a, b)
}
80 | |
/// Divides the first component of `a` by `b`; the other components are
/// copied from `a`.
83 | /// |
84 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
    divss(a, b)
}
92 | |
93 | /// Divides __m128 vectors. |
94 | /// |
95 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
    simd_div(a, b)
}
103 | |
/// Returns the square root of the first single-precision (32-bit)
/// floating-point element in `a`; the other elements are unchanged.
106 | /// |
107 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
112 | pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 { |
113 | sqrtss(a) |
114 | } |
115 | |
116 | /// Returns the square root of packed single-precision (32-bit) floating-point |
117 | /// elements in `a`. |
118 | /// |
119 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
124 | pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 { |
125 | sqrtps(a) |
126 | } |
127 | |
/// Returns the approximate reciprocal of the first single-precision
/// (32-bit) floating-point element in `a`; the other elements are unchanged.
130 | /// |
131 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
136 | pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 { |
137 | rcpss(a) |
138 | } |
139 | |
140 | /// Returns the approximate reciprocal of packed single-precision (32-bit) |
141 | /// floating-point elements in `a`. |
142 | /// |
143 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps) |
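///
/// Illustrative sketch (not a doctest). Intel documents the maximum relative
/// error of this approximation as at most `1.5 * 2^-12`:
///
/// ```text
/// let a = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
/// // r is approximately [0.5, 0.25, 0.125, 0.0625]; each lane is within
/// // a relative error of 1.5 * 2^-12 of the exact reciprocal.
/// let r = _mm_rcp_ps(a);
/// ```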
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
148 | pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 { |
149 | rcpps(a) |
150 | } |
151 | |
/// Returns the approximate reciprocal square root of the first single-precision
/// (32-bit) floating-point element in `a`; the other elements are unchanged.
154 | /// |
155 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
160 | pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 { |
161 | rsqrtss(a) |
162 | } |
163 | |
164 | /// Returns the approximate reciprocal square root of packed single-precision |
165 | /// (32-bit) floating-point elements in `a`. |
166 | /// |
167 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
172 | pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 { |
173 | rsqrtps(a) |
174 | } |
175 | |
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and returns the minimum value in the first element of the return
/// value; the other elements are copied from `a`.
179 | /// |
180 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
185 | pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 { |
186 | minss(a, b) |
187 | } |
188 | |
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and returns the corresponding minimum values.
191 | /// |
192 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps) |
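///
/// Note that `minps` is not symmetric in the presence of NaN: when a lane
/// comparison is unordered, the lane from the second operand is returned.
/// Illustrative sketch (not a doctest):
///
/// ```text
/// let a = _mm_set1_ps(f32::NAN);
/// let b = _mm_set1_ps(1.0);
/// // All lanes of r are 1.0, but _mm_min_ps(b, a) would yield NaN lanes.
/// let r = _mm_min_ps(a, b);
/// ```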
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
197 | pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 { |
198 | // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`. |
199 | minps(a, b) |
200 | } |
201 | |
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and returns the maximum value in the first element of the return
/// value; the other elements are copied from `a`.
205 | /// |
206 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
211 | pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 { |
212 | maxss(a, b) |
213 | } |
214 | |
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and returns the corresponding maximum values.
217 | /// |
218 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
223 | pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 { |
224 | // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`. |
225 | maxps(a, b) |
226 | } |
227 | |
228 | /// Bitwise AND of packed single-precision (32-bit) floating-point elements. |
229 | /// |
230 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps) |
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `and` instructions, so ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_and(a, b))
}
244 | |
245 | /// Bitwise AND-NOT of packed single-precision (32-bit) floating-point |
246 | /// elements. |
247 | /// |
248 | /// Computes `!a & b` for each bit in `a` and `b`. |
249 | /// |
250 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps) |
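///
/// A common use is clearing masked-off lanes. Illustrative sketch (not a
/// doctest; `x` and `y` stand for arbitrary `__m128` values):
///
/// ```text
/// let mask = _mm_cmplt_ps(x, _mm_setzero_ps()); // all-ones where x < 0
/// // Keeps the lanes of y where the mask is zero, i.e. where x is not
/// // less than zero.
/// let r = _mm_andnot_ps(mask, y);
/// ```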
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `not` and `and` instructions, so ignore
// it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andnps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    let mask: __m128i = mem::transmute(i32x4::splat(-1));
    mem::transmute(simd_and(simd_xor(mask, a), b))
}
266 | |
267 | /// Bitwise OR of packed single-precision (32-bit) floating-point elements. |
268 | /// |
269 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps) |
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `or` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(orps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_or(a, b))
}
283 | |
284 | /// Bitwise exclusive OR of packed single-precision (32-bit) floating-point |
285 | /// elements. |
286 | /// |
287 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps) |
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `xor` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(xorps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_xor(a, b))
}
301 | |
302 | /// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of |
303 | /// the result will be `0xffffffff` if the two inputs are equal, or `0` |
304 | /// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`. |
305 | /// |
306 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss) |
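///
/// Illustrative sketch (not a doctest):
///
/// ```text
/// let a = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
/// let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// // Lowest lane is all-ones (1.0 == 1.0); the upper lanes are 5.0, 6.0,
/// // 7.0, copied from `a`.
/// let r = _mm_cmpeq_ss(a, b);
/// ```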
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 0)
}
314 | |
315 | /// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits |
316 | /// of the result will be `0xffffffff` if `a.extract(0)` is less than |
317 | /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the |
318 | /// upper 96 bits of `a`. |
319 | /// |
320 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 1)
}
328 | |
329 | /// Compares the lowest `f32` of both inputs for less than or equal. The lowest |
330 | /// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than |
331 | /// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result |
332 | /// are the upper 96 bits of `a`. |
333 | /// |
334 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 2)
}
342 | |
343 | /// Compares the lowest `f32` of both inputs for greater than. The lowest 32 |
344 | /// bits of the result will be `0xffffffff` if `a.extract(0)` is greater |
345 | /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result |
346 | /// are the upper 96 bits of `a`. |
347 | /// |
348 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
353 | pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 { |
354 | simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3]) |
355 | } |
356 | |
357 | /// Compares the lowest `f32` of both inputs for greater than or equal. The |
358 | /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is |
359 | /// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits |
360 | /// of the result are the upper 96 bits of `a`. |
361 | /// |
362 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
367 | pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 { |
368 | simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3]) |
369 | } |
370 | |
371 | /// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits |
372 | /// of the result will be `0xffffffff` if `a.extract(0)` is not equal to |
373 | /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the |
374 | /// upper 96 bits of `a`. |
375 | /// |
376 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 4)
}
384 | |
385 | /// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32 |
386 | /// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than |
387 | /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the |
388 | /// upper 96 bits of `a`. |
389 | /// |
390 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 5)
}
398 | |
399 | /// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The |
400 | /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not |
401 | /// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits |
402 | /// of the result are the upper 96 bits of `a`. |
403 | /// |
404 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 6)
}
412 | |
413 | /// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32 |
414 | /// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater |
415 | /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are |
416 | /// the upper 96 bits of `a`. |
417 | /// |
418 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
423 | pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 { |
424 | simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3]) |
425 | } |
426 | |
427 | /// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The |
428 | /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not |
429 | /// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 |
430 | /// bits of the result are the upper 96 bits of `a`. |
431 | /// |
432 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
437 | pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 { |
438 | simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3]) |
439 | } |
440 | |
/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
/// the result will be `0xffffffff` if neither `a.extract(0)` nor
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
445 | /// |
446 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 7)
}
454 | |
/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
/// of the result will be `0xffffffff` if either `a.extract(0)` or
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
459 | /// |
460 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 3)
}
468 | |
469 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
470 | /// The result in the output vector will be `0xffffffff` if the input elements |
471 | /// were equal, or `0` otherwise. |
472 | /// |
473 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 0)
}
481 | |
482 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
483 | /// The result in the output vector will be `0xffffffff` if the input element |
484 | /// in `a` is less than the corresponding element in `b`, or `0` otherwise. |
485 | /// |
486 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 1)
}
494 | |
495 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
496 | /// The result in the output vector will be `0xffffffff` if the input element |
497 | /// in `a` is less than or equal to the corresponding element in `b`, or `0` |
498 | /// otherwise. |
499 | /// |
500 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 2)
}
508 | |
509 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
510 | /// The result in the output vector will be `0xffffffff` if the input element |
511 | /// in `a` is greater than the corresponding element in `b`, or `0` otherwise. |
512 | /// |
513 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(b, a, 1)
}
521 | |
522 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
523 | /// The result in the output vector will be `0xffffffff` if the input element |
524 | /// in `a` is greater than or equal to the corresponding element in `b`, or `0` |
525 | /// otherwise. |
526 | /// |
527 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(b, a, 2)
}
535 | |
536 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
537 | /// The result in the output vector will be `0xffffffff` if the input elements |
538 | /// are **not** equal, or `0` otherwise. |
539 | /// |
540 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 4)
}
548 | |
549 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
550 | /// The result in the output vector will be `0xffffffff` if the input element |
551 | /// in `a` is **not** less than the corresponding element in `b`, or `0` |
552 | /// otherwise. |
553 | /// |
554 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 5)
}
562 | |
563 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
564 | /// The result in the output vector will be `0xffffffff` if the input element |
565 | /// in `a` is **not** less than or equal to the corresponding element in `b`, or |
566 | /// `0` otherwise. |
567 | /// |
568 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 6)
}
576 | |
577 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
578 | /// The result in the output vector will be `0xffffffff` if the input element |
579 | /// in `a` is **not** greater than the corresponding element in `b`, or `0` |
580 | /// otherwise. |
581 | /// |
582 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(b, a, 5)
}
590 | |
591 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
592 | /// The result in the output vector will be `0xffffffff` if the input element |
593 | /// in `a` is **not** greater than or equal to the corresponding element in `b`, |
594 | /// or `0` otherwise. |
595 | /// |
596 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(b, a, 6)
}
604 | |
605 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
606 | /// Returns four floats that have one of two possible bit patterns. The element |
607 | /// in the output vector will be `0xffffffff` if the input elements in `a` and |
608 | /// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise. |
609 | /// |
610 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(b, a, 7)
}
618 | |
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// Returns four floats that have one of two possible bit patterns. The element
/// in the output vector will be `0xffffffff` if the input elements in `a` and
/// `b` are unordered (i.e., at least one of them is a NaN), or 0 otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(b, a, 3)
}
632 | |
633 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
634 | /// `1` if they are equal, or `0` otherwise. |
635 | /// |
636 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
641 | pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 { |
642 | comieq_ss(a, b) |
643 | } |
644 | |
645 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
646 | /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. |
647 | /// |
648 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
653 | pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 { |
654 | comilt_ss(a, b) |
655 | } |
656 | |
657 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
658 | /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` |
659 | /// otherwise. |
660 | /// |
661 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
666 | pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 { |
667 | comile_ss(a, b) |
668 | } |
669 | |
670 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
671 | /// `1` if the value from `a` is greater than the one from `b`, or `0` |
672 | /// otherwise. |
673 | /// |
674 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
679 | pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { |
680 | comigt_ss(a, b) |
681 | } |
682 | |
683 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
684 | /// `1` if the value from `a` is greater than or equal to the one from `b`, or |
685 | /// `0` otherwise. |
686 | /// |
687 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
692 | pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 { |
693 | comige_ss(a, b) |
694 | } |
695 | |
696 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
697 | /// `1` if they are **not** equal, or `0` otherwise. |
698 | /// |
699 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
704 | pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { |
705 | comineq_ss(a, b) |
706 | } |
707 | |
708 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
709 | /// `1` if they are equal, or `0` otherwise. This instruction will not signal |
710 | /// an exception if either argument is a quiet NaN. |
711 | /// |
712 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
717 | pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { |
718 | ucomieq_ss(a, b) |
719 | } |
720 | |
721 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
722 | /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. |
723 | /// This instruction will not signal an exception if either argument is a quiet |
724 | /// NaN. |
725 | /// |
726 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
731 | pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { |
732 | ucomilt_ss(a, b) |
733 | } |
734 | |
735 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
736 | /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` |
737 | /// otherwise. This instruction will not signal an exception if either argument |
738 | /// is a quiet NaN. |
739 | /// |
740 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
745 | pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { |
746 | ucomile_ss(a, b) |
747 | } |
748 | |
749 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
750 | /// `1` if the value from `a` is greater than the one from `b`, or `0` |
751 | /// otherwise. This instruction will not signal an exception if either argument |
752 | /// is a quiet NaN. |
753 | /// |
754 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
759 | pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { |
760 | ucomigt_ss(a, b) |
761 | } |
762 | |
763 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
764 | /// `1` if the value from `a` is greater than or equal to the one from `b`, or |
765 | /// `0` otherwise. This instruction will not signal an exception if either |
766 | /// argument is a quiet NaN. |
767 | /// |
768 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
773 | pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { |
774 | ucomige_ss(a, b) |
775 | } |
776 | |
777 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
778 | /// `1` if they are **not** equal, or `0` otherwise. This instruction will not |
779 | /// signal an exception if either argument is a quiet NaN. |
780 | /// |
781 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
786 | pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { |
787 | ucomineq_ss(a, b) |
788 | } |
789 | |
790 | /// Converts the lowest 32 bit float in the input vector to a 32 bit integer. |
791 | /// |
792 | /// The result is rounded according to the current rounding mode. If the result |
793 | /// cannot be represented as a 32 bit integer the result will be `0x8000_0000` |
794 | /// (`i32::MIN`). |
795 | /// |
796 | /// This corresponds to the `CVTSS2SI` instruction (with 32 bit output). |
797 | /// |
798 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32) |
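///
/// Illustrative sketch (not a doctest); assumes the default
/// round-to-nearest-even rounding mode:
///
/// ```text
/// let a = _mm_setr_ps(2.5, 0.0, 0.0, 0.0);
/// // 2.5 rounds to the nearest even integer: 2, not 3.
/// let r = _mm_cvtss_si32(a); // r == 2
/// ```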
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
803 | pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 { |
804 | cvtss2si(a) |
805 | } |
806 | |
807 | /// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html). |
808 | /// |
809 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
814 | pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 { |
815 | _mm_cvtss_si32(a) |
816 | } |
817 | |
/// Converts the lowest 32 bit float in the input vector to a 32 bit integer
/// with truncation.
821 | /// |
822 | /// The result is rounded always using truncation (round towards zero). If the |
823 | /// result cannot be represented as a 32 bit integer the result will be |
824 | /// `0x8000_0000` (`i32::MIN`). |
825 | /// |
826 | /// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output). |
827 | /// |
828 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32) |
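///
/// Illustrative sketch (not a doctest):
///
/// ```text
/// let a = _mm_setr_ps(2.9, 0.0, 0.0, 0.0);
/// // Truncates toward zero regardless of the current rounding mode.
/// let r = _mm_cvttss_si32(a); // r == 2
/// ```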
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
833 | pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 { |
834 | cvttss2si(a) |
835 | } |
836 | |
837 | /// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html). |
838 | /// |
839 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
844 | pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 { |
845 | _mm_cvttss_si32(a) |
846 | } |
847 | |
848 | /// Extracts the lowest 32 bit float from the input vector. |
849 | /// |
850 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32) |
#[inline]
#[target_feature(enable = "sse")]
// No point in using assert_instrs. In Unix x86_64 calling convention this is a
// no-op, and on Windows it's just a `mov`.
#[stable(feature = "simd_x86", since = "1.27.0")]
856 | pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 { |
857 | simd_extract!(a, 0) |
858 | } |
859 | |
860 | /// Converts a 32 bit integer to a 32 bit float. The result vector is the input |
861 | /// vector `a` with the lowest 32 bit float replaced by the converted integer. |
862 | /// |
863 | /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit |
864 | /// input). |
865 | /// |
866 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
871 | pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 { |
872 | cvtsi2ss(a, b) |
873 | } |
874 | |
875 | /// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html). |
876 | /// |
877 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
882 | pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 { |
883 | _mm_cvtsi32_ss(a, b) |
884 | } |
885 | |
886 | /// Construct a `__m128` with the lowest element set to `a` and the rest set to |
887 | /// zero. |
888 | /// |
889 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
894 | pub unsafe fn _mm_set_ss(a: f32) -> __m128 { |
895 | __m128(a, 0.0, 0.0, 0.0) |
896 | } |
897 | |
/// Construct a `__m128` with all elements set to `a`.
899 | /// |
900 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
905 | pub unsafe fn _mm_set1_ps(a: f32) -> __m128 { |
906 | __m128(a, a, a, a) |
907 | } |
908 | |
909 | /// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html) |
910 | /// |
911 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
916 | pub unsafe fn _mm_set_ps1(a: f32) -> __m128 { |
917 | _mm_set1_ps(a) |
918 | } |
919 | |
920 | /// Construct a `__m128` from four floating point values highest to lowest. |
921 | /// |
922 | /// Note that `a` will be the highest 32 bits of the result, and `d` the |
923 | /// lowest. This matches the standard way of writing bit patterns on x86: |
924 | /// |
925 | /// ```text |
926 | /// bit 127 .. 96 95 .. 64 63 .. 32 31 .. 0 |
927 | /// +---------+---------+---------+---------+ |
928 | /// | a | b | c | d | result |
929 | /// +---------+---------+---------+---------+ |
930 | /// ``` |
931 | /// |
932 | /// Alternatively: |
933 | /// |
934 | /// ```text |
935 | /// let v = _mm_set_ps(d, c, b, a); |
936 | /// ``` |
937 | /// |
938 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
943 | pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { |
944 | __m128(d, c, b, a) |
945 | } |
946 | |
947 | /// Construct a `__m128` from four floating point values lowest to highest. |
948 | /// |
949 | /// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32 |
950 | /// bits of the result, and `d` the highest. |
951 | /// |
952 | /// ```text |
953 | /// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d)); |
954 | /// ``` |
955 | /// |
956 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, any(target_os = "windows", target_arch = "x86_64")),
    assert_instr(unpcklps)
)]
// On a 32-bit architecture on non-Windows it just copies the operands from the stack.
#[cfg_attr(
    all(test, all(not(target_os = "windows"), target_arch = "x86")),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
969 | pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { |
970 | __m128(a, b, c, d) |
971 | } |
972 | |
973 | /// Construct a `__m128` with all elements initialized to zero. |
974 | /// |
975 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
980 | pub unsafe fn _mm_setzero_ps() -> __m128 { |
981 | __m128(0.0, 0.0, 0.0, 0.0) |
982 | } |
983 | |
984 | /// A utility function for creating masks to use with Intel shuffle and |
985 | /// permute intrinsics. |
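///
/// For example, `_MM_SHUFFLE(3, 2, 1, 0)` packs its arguments two bits each,
/// with the first argument occupying the highest bits:
///
/// ```text
/// // (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0b11_10_01_00 == 228
/// let mask = _MM_SHUFFLE(3, 2, 1, 0);
/// ```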
#[inline]
#[allow(non_snake_case)]
#[unstable(feature = "stdarch_x86_mm_shuffle", issue = "111147")]
pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
    ((z << 6) | (y << 4) | (x << 2) | w) as i32
}
992 | |
993 | /// Shuffles packed single-precision (32-bit) floating-point elements in `a` and |
994 | /// `b` using `MASK`. |
995 | /// |
/// The lower half of the result takes values from `a` and the upper half from
/// `b`. The mask is split into four 2-bit fields, each of which indexes an
/// element in the corresponding input.
998 | /// |
999 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps) |
1000 | /// |
1001 | /// Note that there appears to be a mistake within Intel's Intrinsics Guide. |
1002 | /// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32` |
1003 | /// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_). |
1004 | /// Performing an implicit type conversion between an unsigned integer and a signed integer |
1005 | /// does not cause a problem in C, however Rust's commitment to strong typing does not allow this. |
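///
/// Illustrative sketch (not a doctest):
///
/// ```text
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// // Lower two fields select elements 0 and 1 of `a`, upper two fields
/// // select elements 2 and 3 of `b`: result is [1.0, 2.0, 7.0, 8.0].
/// let r = _mm_shuffle_ps::<{ _MM_SHUFFLE(3, 2, 1, 0) }>(a, b);
/// ```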
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
1011 | pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 { |
1012 | static_assert_uimm_bits!(MASK, 8); |
1013 | simd_shuffle!( |
1014 | a, |
1015 | b, |
1016 | [ |
1017 | MASK as u32 & 0b11, |
1018 | (MASK as u32 >> 2) & 0b11, |
1019 | ((MASK as u32 >> 4) & 0b11) + 4, |
1020 | ((MASK as u32 >> 6) & 0b11) + 4, |
1021 | ], |
1022 | ) |
1023 | } |
1024 | |
/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the higher half of `a` and `b`.
1027 | /// |
1028 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps) |
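///
/// Illustrative sketch (not a doctest):
///
/// ```text
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// // Interleaves the upper halves: [3.0, 7.0, 4.0, 8.0].
/// let r = _mm_unpackhi_ps(a, b);
/// ```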
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
1033 | pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 { |
1034 | simd_shuffle!(a, b, [2, 6, 3, 7]) |
1035 | } |
1036 | |
/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the lower half of `a` and `b`.
1039 | /// |
1040 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
1045 | pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 { |
1046 | simd_shuffle!(a, b, [0, 4, 1, 5]) |
1047 | } |
1048 | |
/// Combines the higher half of `a` and `b`. The higher half of `b` occupies
/// the lower half of the result.
1051 | /// |
1052 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps) |
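///
/// Illustrative sketch (not a doctest):
///
/// ```text
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// // Upper half of `b`, then upper half of `a`: [7.0, 8.0, 3.0, 4.0].
/// let r = _mm_movehl_ps(a, b);
/// ```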
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO: figure out why this compiles to a different instruction on Windows.
    simd_shuffle!(a, b, [6, 7, 2, 3])
}
1061 | |
/// Combines the lower half of `a` and `b`. The lower half of `b` occupies
/// the higher half of the result.
1064 | /// |
1065 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
1070 | pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 { |
1071 | simd_shuffle!(a, b, [0, 1, 4, 5]) |
1072 | } |
1073 | |
1074 | /// Returns a mask of the most significant bit of each element in `a`. |
1075 | /// |
1076 | /// The mask is stored in the 4 least significant bits of the return value. |
1077 | /// All other bits are set to `0`. |
1078 | /// |
1079 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps) |
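///
/// Illustrative sketch (not a doctest):
///
/// ```text
/// let a = _mm_setr_ps(-1.0, 2.0, -3.0, 4.0);
/// // Sign bits of lanes 0 and 2 are set: m == 0b0101.
/// let m = _mm_movemask_ps(a);
/// ```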
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    let mask: i32x4 = simd_lt(transmute(a), i32x4::splat(0));
    simd_bitmask::<i32x4, u8>(mask).into()
}
1090 | |
1091 | /// Construct a `__m128` with the lowest element read from `p` and the other |
1092 | /// elements set to zero. |
1093 | /// |
1094 | /// This corresponds to instructions `VMOVSS` / `MOVSS`. |
1095 | /// |
1096 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
1101 | pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { |
1102 | __m128(*p, 0.0, 0.0, 0.0) |
1103 | } |
1104 | |
1105 | /// Construct a `__m128` by duplicating the value read from `p` into all |
1106 | /// elements. |
1107 | /// |
1108 | /// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some |
1109 | /// shuffling. |
1110 | /// |
1111 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
1116 | pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 { |
1117 | let a: f32 = *p; |
1118 | __m128(a, a, a, a) |
1119 | } |
1120 | |
1121 | /// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html) |
1122 | /// |
1123 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
1128 | pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { |
1129 | _mm_load1_ps(p) |
1130 | } |
1131 | |
1132 | /// Loads four `f32` values from *aligned* memory into a `__m128`. If the |
1133 | /// pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1134 | /// protection fault will be triggered (fatal program crash). |
1135 | /// |
1136 | /// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned |
1137 | /// memory. |
1138 | /// |
1139 | /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. |
1140 | /// |
1141 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps) |
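///
/// A minimal sketch of obtaining suitably aligned storage (not a doctest;
/// the `Aligned` wrapper type is illustrative only):
///
/// ```text
/// #[repr(align(16))]
/// struct Aligned([f32; 4]);
///
/// let data = Aligned([1.0, 2.0, 3.0, 4.0]);
/// let r = unsafe { _mm_load_ps(data.0.as_ptr()) };
/// ```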
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
1147 | pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { |
1148 | *(p as *const __m128) |
1149 | } |
1150 | |
/// Loads four `f32` values from memory into a `__m128`. There are no
/// restrictions on memory alignment. For aligned memory
/// [`_mm_load_ps`](fn._mm_load_ps.html) may be faster.
1156 | /// |
1157 | /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. |
1158 | /// |
1159 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps) |
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
    // Note: Using `*p` would require `f32` alignment, but `movups` has no
    // alignment restrictions.
    let mut dst: __m128 = _mm_undefined_ps();
    ptr::copy_nonoverlapping(
        p as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128>(),
    );
    dst
}
1175 | |
1176 | /// Loads four `f32` values from aligned memory into a `__m128` in reverse |
1177 | /// order. |
1178 | /// |
1179 | /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1180 | /// protection fault will be triggered (fatal program crash). |
1181 | /// |
1182 | /// Functionally equivalent to the following code sequence (assuming `p` |
1183 | /// satisfies the alignment restrictions): |
1184 | /// |
1185 | /// ```text |
1186 | /// let a0 = *p; |
1187 | /// let a1 = *p.add(1); |
1188 | /// let a2 = *p.add(2); |
1189 | /// let a3 = *p.add(3); |
1190 | /// __m128::new(a3, a2, a1, a0) |
1191 | /// ``` |
1192 | /// |
1193 | /// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some |
1194 | /// shuffling. |
1195 | /// |
1196 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps) |
1197 | #[inline ] |
1198 | #[target_feature (enable = "sse" )] |
1199 | #[cfg_attr (test, assert_instr(movaps))] |
1200 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1201 | pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { |
1202 | let a: __m128 = _mm_load_ps(p); |
1203 | simd_shuffle!(a, a, [3, 2, 1, 0]) |
1204 | } |
1205 | |
/// Loads unaligned 64 bits of integer data from memory into a new vector.
1207 | /// |
1208 | /// `mem_addr` does not need to be aligned on any particular boundary. |
1209 | /// |
1210 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64) |
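///
/// For example (illustrative only), the low 64 bits of the result hold the
/// eight bytes read from `mem_addr` and the high 64 bits are zero:
///
/// ```rust,ignore
/// let bytes: [u8; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
/// let v = unsafe { _mm_loadu_si64(bytes.as_ptr()) };
/// ```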
1211 | #[inline ] |
1212 | #[target_feature (enable = "sse" )] |
1213 | #[stable (feature = "simd_x86_mm_loadu_si64" , since = "1.46.0" )] |
1214 | pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i { |
transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
1216 | } |
1217 | |
1218 | /// Stores the lowest 32 bit float of `a` into memory. |
1219 | /// |
1220 | /// This intrinsic corresponds to the `MOVSS` instruction. |
1221 | /// |
1222 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss) |
1223 | #[inline ] |
1224 | #[target_feature (enable = "sse" )] |
1225 | #[cfg_attr (test, assert_instr(movss))] |
1226 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1227 | pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { |
1228 | *p = simd_extract!(a, 0); |
1229 | } |
1230 | |
1231 | /// Stores the lowest 32 bit float of `a` repeated four times into *aligned* |
1232 | /// memory. |
1233 | /// |
1234 | /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1235 | /// protection fault will be triggered (fatal program crash). |
1236 | /// |
1237 | /// Functionally equivalent to the following code sequence (assuming `p` |
1238 | /// satisfies the alignment restrictions): |
1239 | /// |
1240 | /// ```text |
1241 | /// let x = a.extract(0); |
1242 | /// *p = x; |
1243 | /// *p.add(1) = x; |
1244 | /// *p.add(2) = x; |
1245 | /// *p.add(3) = x; |
1246 | /// ``` |
1247 | /// |
1248 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps) |
1249 | #[inline ] |
1250 | #[target_feature (enable = "sse" )] |
1251 | #[cfg_attr (test, assert_instr(movaps))] |
1252 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1253 | #[allow (clippy::cast_ptr_alignment)] |
1254 | pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { |
1255 | let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]); |
1256 | *(p as *mut __m128) = b; |
1257 | } |
1258 | |
1259 | /// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html) |
1260 | /// |
1261 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1) |
1262 | #[inline ] |
1263 | #[target_feature (enable = "sse" )] |
1264 | #[cfg_attr (test, assert_instr(movaps))] |
1265 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1266 | pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { |
1267 | _mm_store1_ps(p, a); |
1268 | } |
1269 | |
1270 | /// Stores four 32-bit floats into *aligned* memory. |
1271 | /// |
1272 | /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1273 | /// protection fault will be triggered (fatal program crash). |
1274 | /// |
1275 | /// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned |
1276 | /// memory. |
1277 | /// |
1278 | /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. |
1279 | /// |
1280 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps) |
1281 | #[inline ] |
1282 | #[target_feature (enable = "sse" )] |
1283 | #[cfg_attr (test, assert_instr(movaps))] |
1284 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1285 | #[allow (clippy::cast_ptr_alignment)] |
1286 | pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { |
1287 | *(p as *mut __m128) = a; |
1288 | } |
1289 | |
1290 | /// Stores four 32-bit floats into memory. There are no restrictions on memory |
1291 | /// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be |
1292 | /// faster. |
1293 | /// |
1294 | /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. |
1295 | /// |
1296 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps) |
1297 | #[inline ] |
1298 | #[target_feature (enable = "sse" )] |
1299 | #[cfg_attr (test, assert_instr(movups))] |
1300 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1301 | pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { |
ptr::copy_nonoverlapping(
ptr::addr_of!(a) as *const u8,
p as *mut u8,
mem::size_of::<__m128>(),
);
1307 | } |
1308 | |
1309 | /// Stores four 32-bit floats into *aligned* memory in reverse order. |
1310 | /// |
1311 | /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1312 | /// protection fault will be triggered (fatal program crash). |
1313 | /// |
1314 | /// Functionally equivalent to the following code sequence (assuming `p` |
1315 | /// satisfies the alignment restrictions): |
1316 | /// |
1317 | /// ```text |
1318 | /// *p = a.extract(3); |
1319 | /// *p.add(1) = a.extract(2); |
1320 | /// *p.add(2) = a.extract(1); |
1321 | /// *p.add(3) = a.extract(0); |
1322 | /// ``` |
1323 | /// |
1324 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps) |
1325 | #[inline ] |
1326 | #[target_feature (enable = "sse" )] |
1327 | #[cfg_attr (test, assert_instr(movaps))] |
1328 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1329 | #[allow (clippy::cast_ptr_alignment)] |
1330 | pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { |
1331 | let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]); |
1332 | *(p as *mut __m128) = b; |
1333 | } |
1334 | |
1335 | /// Returns a `__m128` with the first component from `b` and the remaining |
1336 | /// components from `a`. |
1337 | /// |
1338 | /// In other words for any `a` and `b`: |
1339 | /// ```text |
1340 | /// _mm_move_ss(a, b) == a.replace(0, b.extract(0)) |
1341 | /// ``` |
1342 | /// |
1343 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss) |
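///
/// For example (illustrative only):
///
/// ```rust,ignore
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// // r == [5.0, 2.0, 3.0, 4.0]
/// let r = _mm_move_ss(a, b);
/// ```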
1344 | #[inline ] |
1345 | #[target_feature (enable = "sse" )] |
1346 | #[cfg_attr (test, assert_instr(movss))] |
1347 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1348 | pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { |
1349 | simd_shuffle!(a, b, [4, 1, 2, 3]) |
1350 | } |
1351 | |
1352 | /// Performs a serializing operation on all non-temporal ("streaming") store instructions that |
1353 | /// were issued by the current thread prior to this instruction. |
1354 | /// |
1355 | /// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is |
1356 | /// ordered before any load or store instruction which follows the fence in |
1357 | /// synchronization order. |
1358 | /// |
1359 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence) |
1360 | /// (but note that Intel is only documenting the hardware-level concerns related to this |
1361 | /// instruction; the Intel documentation does not take into account the extra concerns that arise |
1362 | /// because the Rust memory model is different from the x86 memory model.) |
1363 | /// |
1364 | /// # Safety of non-temporal stores |
1365 | /// |
1366 | /// After using any non-temporal store intrinsic, but before any other access to the memory that the |
1367 | /// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the |
1368 | /// intrinsic. |
1369 | /// |
/// Non-temporal stores behave very differently from regular stores. For the purpose of the Rust
1371 | /// memory model, these stores are happening asynchronously in a background thread. This means a |
1372 | /// non-temporal store can cause data races with other accesses, even other accesses on the same |
1373 | /// thread. It also means that cross-thread synchronization does not work as expected: let's say the |
1374 | /// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The |
1375 | /// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not |
1376 | /// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize |
1377 | /// with all the non-temporal stores previously started on this thread, which means in particular |
1378 | /// that subsequent synchronization with other threads will then work as intended again. |
1379 | /// |
1380 | /// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your |
1381 | /// code jumps back to code outside your library. This ensures all stores inside your function |
1382 | /// are synchronized-before the return, and thus transitively synchronized-before everything |
1383 | /// the caller does after your function returns. |
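///
/// A minimal sketch of that pattern (the hypothetical `fill_streaming` helper
/// is illustrative only; `dst` is assumed to be 16-byte aligned and `len` a
/// multiple of 4):
///
/// ```rust,ignore
/// unsafe fn fill_streaming(dst: *mut f32, len: usize, v: __m128) {
///     for i in (0..len).step_by(4) {
///         _mm_stream_ps(dst.add(i), v); // non-temporal store
///     }
///     // Synchronize all streaming stores before control leaves this function.
///     _mm_sfence();
/// }
/// ```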
1384 | // |
1385 | // The following is not a doc comment since it's not clear whether we want to put this into the |
1386 | // docs, but it should be written out somewhere. |
1387 | // |
1388 | // Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot |
1389 | // inspect, and that behave like the following functions. This explains where the docs above come |
1390 | // from. |
1391 | // ``` |
1392 | // #[thread_local] |
// static mut PENDING_NONTEMP_WRITES: AtomicUsize = AtomicUsize::new(0);
1394 | // |
1395 | // pub unsafe fn nontemporal_store<T>(ptr: *mut T, val: T) { |
1396 | // PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed); |
1397 | // // Spawn a thread that will eventually do our write. |
1398 | // // We need to fetch a pointer to this thread's pending-write |
1399 | // // counter, so that we can access it from the background thread. |
1400 | // let pending_writes = addr_of!(PENDING_NONTEMP_WRITES); |
1401 | // // If this was actual Rust code we'd have to do some extra work |
1402 | // // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here. |
1403 | // std::thread::spawn(move || { |
1404 | // // Do the write in the background thread. |
1405 | // ptr.write(val); |
1406 | // // Register the write as done. Crucially, this is `Release`, so it |
// // syncs-with the `Acquire` in `sfence`.
1408 | // (&*pending_writes).fetch_sub(1, Release); |
1409 | // }); |
1410 | // } |
1411 | // |
1412 | // pub fn sfence() { |
1413 | // unsafe { |
1414 | // // Wait until there are no more pending writes. |
1415 | // while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {} |
1416 | // } |
1417 | // } |
1418 | // ``` |
1419 | #[inline ] |
1420 | #[target_feature (enable = "sse" )] |
1421 | #[cfg_attr (test, assert_instr(sfence))] |
1422 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1423 | pub unsafe fn _mm_sfence() { |
1424 | sfence() |
1425 | } |
1426 | |
1427 | /// Gets the unsigned 32-bit value of the MXCSR control and status register. |
1428 | /// |
1429 | /// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust |
1430 | /// floating-point operations may or may not result in this register getting updated with exception |
1431 | /// state, and the register can change between two invocations of this function even when no |
1432 | /// floating-point operations appear in the source code (since floating-point operations appearing |
1433 | /// earlier or later can be reordered). |
1434 | /// |
1435 | /// If you need to perform some floating-point operations and check whether they raised an |
1436 | /// exception, use an inline assembly block for the entire sequence of operations. |
1437 | /// |
1438 | /// For more info see [`_mm_setcsr`](fn._mm_setcsr.html) |
1439 | /// |
1440 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr) |
1441 | #[inline ] |
1442 | #[target_feature (enable = "sse" )] |
1443 | #[cfg_attr (test, assert_instr(stmxcsr))] |
1444 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1445 | #[deprecated ( |
1446 | since = "1.75.0" , |
1447 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1448 | )] |
1449 | pub unsafe fn _mm_getcsr() -> u32 { |
1450 | let mut result: i32 = 0_i32; |
1451 | stmxcsr(ptr::addr_of_mut!(result) as *mut i8); |
1452 | result as u32 |
1453 | } |
1454 | |
1455 | /// Sets the MXCSR register with the 32-bit unsigned integer value. |
1456 | /// |
1457 | /// This register controls how SIMD instructions handle floating point |
1458 | /// operations. Modifying this register only affects the current thread. |
1459 | /// |
1460 | /// It contains several groups of flags: |
1461 | /// |
/// * *Exception flags* report which exceptions occurred since they were last
/// reset.
///
/// * *Masking flags* can be used to mask (ignore) certain exceptions. By
/// default these flags are all set to 1, so all exceptions are masked. When an
/// exception is masked, the processor simply sets the exception flag and
/// continues the operation. If the exception is unmasked, the flag is also set
/// but additionally an exception handler is invoked.
1471 | /// |
1472 | /// * *Rounding mode flags* control the rounding mode of floating point |
1473 | /// instructions. |
1474 | /// |
1475 | /// * The *denormals-are-zero mode flag* turns all numbers which would be |
1476 | /// denormalized (exponent bits are all zeros) into zeros. |
1477 | /// |
1478 | /// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to |
1479 | /// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and |
1480 | /// will optimize accordingly. This even applies when the register is altered and later reset to its |
1481 | /// original value without any floating-point operations appearing in the source code between those |
1482 | /// operations (since floating-point operations appearing earlier or later can be reordered). |
1483 | /// |
/// If you need to perform some floating-point operations under different masking flags, rounding
1485 | /// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the |
1486 | /// original MXCSR register state before the end of the block. |
1487 | /// |
1488 | /// ## Exception Flags |
1489 | /// |
1490 | /// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing |
1491 | /// Infinity by Infinity). |
1492 | /// |
1493 | /// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized |
1494 | /// number. Mainly this can cause loss of precision. |
1495 | /// |
1496 | /// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred. |
1497 | /// |
/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
/// result was too large to be represented (e.g., an `f32` with absolute value
/// greater than `2^128`).
///
/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
/// result was too small to be represented in a normalized way (e.g., an `f32`
/// with absolute value smaller than `2^-126`).
1507 | /// |
1508 | /// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a. |
1509 | /// precision exception). This means some precision was lost due to rounding. |
/// For example, the fraction `1/3` cannot be represented accurately in a
/// 32-bit or 64-bit float and computing it would cause this exception to be
1512 | /// raised. Precision exceptions are very common, so they are usually masked. |
1513 | /// |
1514 | /// Exception flags can be read and set using the convenience functions |
1515 | /// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to |
1516 | /// check if an operation caused some overflow: |
1517 | /// |
1518 | /// ```rust,ignore |
1519 | /// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags |
1520 | /// // perform calculations |
1521 | /// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 { |
1522 | /// // handle overflow |
1523 | /// } |
1524 | /// ``` |
1525 | /// |
1526 | /// ## Masking Flags |
1527 | /// |
1528 | /// There is one masking flag for each exception flag: `_MM_MASK_INVALID`, |
1529 | /// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`, |
1530 | /// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`. |
1531 | /// |
1532 | /// A single masking bit can be set via |
1533 | /// |
1534 | /// ```rust,ignore |
1535 | /// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW); |
1536 | /// ``` |
1537 | /// |
1538 | /// However, since mask bits are by default all set to 1, it is more common to |
1539 | /// want to *disable* certain bits. For example, to unmask the underflow |
1540 | /// exception, use: |
1541 | /// |
1542 | /// ```rust,ignore |
/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception
/// ```
1546 | /// |
/// Warning: an unmasked exception will cause an exception handler to be
/// called. The standard handler will simply terminate the process. So, in this
/// case any underflow exception would terminate the current process with
/// something like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1552 | /// |
1553 | /// ## Rounding Mode |
1554 | /// |
/// The rounding mode is described using two bits. It can be read and set using
1556 | /// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and |
1557 | /// `_MM_SET_ROUNDING_MODE(mode)`. |
1558 | /// |
1559 | /// The rounding modes are: |
1560 | /// |
/// * `_MM_ROUND_NEAREST`: (default) Round to the value closest to the
/// infinite-precision result. If two values are equally close, round to even
/// (i.e., the least significant bit will be zero).
1564 | /// |
1565 | /// * `_MM_ROUND_DOWN`: Round toward negative Infinity. |
1566 | /// |
1567 | /// * `_MM_ROUND_UP`: Round toward positive Infinity. |
1568 | /// |
1569 | /// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate). |
1570 | /// |
1571 | /// Example: |
1572 | /// |
1573 | /// ```rust,ignore |
1574 | /// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN) |
1575 | /// ``` |
1576 | /// |
1577 | /// ## Denormals-are-zero/Flush-to-zero Mode |
1578 | /// |
1579 | /// If this bit is set, values that would be denormalized will be set to zero |
1580 | /// instead. This is turned off by default. |
1581 | /// |
1582 | /// You can read and enable/disable this mode via the helper functions |
1583 | /// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`: |
1584 | /// |
1585 | /// ```rust,ignore |
1586 | /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default) |
1587 | /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on |
1588 | /// ``` |
1589 | /// |
1590 | /// |
1591 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr) |
1592 | #[inline ] |
1593 | #[target_feature (enable = "sse" )] |
1594 | #[cfg_attr (test, assert_instr(ldmxcsr))] |
1595 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1596 | #[deprecated ( |
1597 | since = "1.75.0" , |
1598 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1599 | )] |
1600 | pub unsafe fn _mm_setcsr(val: u32) { |
1601 | ldmxcsr(ptr::addr_of!(val) as *const i8); |
1602 | } |
1603 | |
1604 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1605 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1606 | pub const _MM_EXCEPT_INVALID: u32 = 0x0001; |
1607 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1608 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1609 | pub const _MM_EXCEPT_DENORM: u32 = 0x0002; |
1610 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1611 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1612 | pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004; |
1613 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1614 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1615 | pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008; |
1616 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1617 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1618 | pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010; |
1619 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1620 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1621 | pub const _MM_EXCEPT_INEXACT: u32 = 0x0020; |
1622 | /// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html) |
1623 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1624 | pub const _MM_EXCEPT_MASK: u32 = 0x003f; |
1625 | |
1626 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1627 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1628 | pub const _MM_MASK_INVALID: u32 = 0x0080; |
1629 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1630 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1631 | pub const _MM_MASK_DENORM: u32 = 0x0100; |
1632 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1633 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1634 | pub const _MM_MASK_DIV_ZERO: u32 = 0x0200; |
1635 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1636 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1637 | pub const _MM_MASK_OVERFLOW: u32 = 0x0400; |
1638 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1639 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1640 | pub const _MM_MASK_UNDERFLOW: u32 = 0x0800; |
1641 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1642 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1643 | pub const _MM_MASK_INEXACT: u32 = 0x1000; |
1644 | /// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html) |
1645 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1646 | pub const _MM_MASK_MASK: u32 = 0x1f80; |
1647 | |
1648 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1649 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1650 | pub const _MM_ROUND_NEAREST: u32 = 0x0000; |
1651 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1652 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1653 | pub const _MM_ROUND_DOWN: u32 = 0x2000; |
1654 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1655 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1656 | pub const _MM_ROUND_UP: u32 = 0x4000; |
1657 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1658 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1659 | pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000; |
1660 | |
1661 | /// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html) |
1662 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1663 | pub const _MM_ROUND_MASK: u32 = 0x6000; |
1664 | |
1665 | /// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html) |
1666 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1667 | pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000; |
1668 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1669 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1670 | pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000; |
1671 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1672 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1673 | pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; |
1674 | |
1675 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1676 | /// |
1677 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK) |
1678 | #[inline ] |
1679 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1680 | #[allow (non_snake_case)] |
1681 | #[target_feature (enable = "sse" )] |
1682 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1683 | #[deprecated ( |
1684 | since = "1.75.0" , |
1685 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1686 | )] |
1687 | pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { |
1688 | _mm_getcsr() & _MM_MASK_MASK |
1689 | } |
1690 | |
1691 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1692 | /// |
1693 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE) |
1694 | #[inline ] |
1695 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1696 | #[allow (non_snake_case)] |
1697 | #[target_feature (enable = "sse" )] |
1698 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1699 | #[deprecated ( |
1700 | since = "1.75.0" , |
1701 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1702 | )] |
1703 | pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { |
1704 | _mm_getcsr() & _MM_EXCEPT_MASK |
1705 | } |
1706 | |
1707 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1708 | /// |
1709 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE) |
1710 | #[inline ] |
1711 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1712 | #[allow (non_snake_case)] |
1713 | #[target_feature (enable = "sse" )] |
1714 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1715 | #[deprecated ( |
1716 | since = "1.75.0" , |
1717 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1718 | )] |
1719 | pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { |
1720 | _mm_getcsr() & _MM_FLUSH_ZERO_MASK |
1721 | } |
1722 | |
1723 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1724 | /// |
1725 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE) |
1726 | #[inline ] |
1727 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1728 | #[allow (non_snake_case)] |
1729 | #[target_feature (enable = "sse" )] |
1730 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1731 | #[deprecated ( |
1732 | since = "1.75.0" , |
1733 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1734 | )] |
1735 | pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { |
1736 | _mm_getcsr() & _MM_ROUND_MASK |
1737 | } |
1738 | |
1739 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1740 | /// |
1741 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK) |
1742 | #[inline ] |
1743 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1744 | #[allow (non_snake_case)] |
1745 | #[target_feature (enable = "sse" )] |
1746 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1747 | #[deprecated ( |
1748 | since = "1.75.0" , |
1749 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1750 | )] |
1751 | pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { |
1752 | _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x) |
1753 | } |
1754 | |
1755 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1756 | /// |
1757 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE) |
1758 | #[inline ] |
1759 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1760 | #[allow (non_snake_case)] |
1761 | #[target_feature (enable = "sse" )] |
1762 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1763 | #[deprecated ( |
1764 | since = "1.75.0" , |
1765 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1766 | )] |
1767 | pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { |
1768 | _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x) |
1769 | } |
1770 | |
1771 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1772 | /// |
1773 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE) |
1774 | #[inline ] |
1775 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1776 | #[allow (non_snake_case)] |
1777 | #[target_feature (enable = "sse" )] |
1778 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1779 | #[deprecated ( |
1780 | since = "1.75.0" , |
1781 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1782 | )] |
1783 | pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { |
1784 | let val: u32 = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x; |
1786 | _mm_setcsr(val) |
1787 | } |
1788 | |
1789 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1790 | /// |
1791 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE) |
1792 | #[inline ] |
1793 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1794 | #[allow (non_snake_case)] |
1795 | #[target_feature (enable = "sse" )] |
1796 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1797 | #[deprecated ( |
1798 | since = "1.75.0" , |
1799 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1800 | )] |
1801 | pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) { |
1802 | _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x) |
1803 | } |
1804 | |
1805 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1806 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1807 | pub const _MM_HINT_T0: i32 = 3; |
1808 | |
1809 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1810 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1811 | pub const _MM_HINT_T1: i32 = 2; |
1812 | |
1813 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1814 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1815 | pub const _MM_HINT_T2: i32 = 1; |
1816 | |
1817 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1818 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1819 | pub const _MM_HINT_NTA: i32 = 0; |
1820 | |
1821 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1822 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1823 | pub const _MM_HINT_ET0: i32 = 7; |
1824 | |
1825 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1826 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1827 | pub const _MM_HINT_ET1: i32 = 6; |
1828 | |
/// Fetches the cache line that contains address `p` using the given `STRATEGY`.
1830 | /// |
1831 | /// The `STRATEGY` must be one of: |
1832 | /// |
1833 | /// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the |
1834 | /// cache hierarchy. |
1835 | /// |
1836 | /// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher. |
1837 | /// |
1838 | /// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or |
1839 | /// an implementation-specific choice (e.g., L2 if there is no L3). |
1840 | /// |
/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
/// non-temporal access (NTA) hint. The data may be placed somewhere closer
/// than main memory but outside of the cache hierarchy. This is used to
/// reduce access latency without polluting the cache.
1845 | /// |
1846 | /// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and |
1847 | /// [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0` |
1848 | /// and `_MM_HINT_T1` but indicate an anticipation to write to the address. |
1849 | /// |
1850 | /// The actual implementation depends on the particular CPU. This instruction |
1851 | /// is considered a hint, so the CPU is also free to simply ignore the request. |
1852 | /// |
1853 | /// The amount of prefetched data depends on the cache line size of the |
1854 | /// specific CPU, but it will be at least 32 bytes. |
1855 | /// |
1856 | /// Common caveats: |
1857 | /// |
1858 | /// * Most modern CPUs already automatically prefetch data based on predicted |
1859 | /// access patterns. |
1860 | /// |
1861 | /// * Data is usually not fetched if this would cause a TLB miss or a page |
1862 | /// fault. |
1863 | /// |
1864 | /// * Too much prefetching can cause unnecessary cache evictions. |
1865 | /// |
1866 | /// * Prefetching may also fail if there are not enough memory-subsystem |
1867 | /// resources (e.g., request buffers). |
1868 | /// |
1869 | /// |
1870 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch) |
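///
/// For example (illustrative only), hinting that a buffer is about to be read:
///
/// ```rust,ignore
/// let data: [f32; 1024] = [0.0; 1024];
/// // Hint that the cache line holding the start of `data` will be read soon.
/// unsafe { _mm_prefetch::<_MM_HINT_T0>(data.as_ptr() as *const i8) };
/// ```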
1871 | #[inline ] |
1872 | #[target_feature (enable = "sse" )] |
1873 | #[cfg_attr (test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))] |
1874 | #[cfg_attr (test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))] |
1875 | #[cfg_attr (test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))] |
1876 | #[cfg_attr (test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))] |
1877 | #[rustc_legacy_const_generics (1)] |
1878 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1879 | pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) { |
1880 | // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache). |
1881 | // `locality` and `rw` are based on our `STRATEGY`. |
prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
1883 | } |
1884 | |
/// Returns a vector of type `__m128` with indeterminate elements.
1886 | /// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`]. |
1887 | /// In practice, this is equivalent to [`mem::zeroed`]. |
1888 | /// |
1889 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps) |
1890 | #[inline ] |
1891 | #[target_feature (enable = "sse" )] |
1892 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1893 | pub unsafe fn _mm_undefined_ps() -> __m128 { |
1894 | _mm_set1_ps(0.0) |
1895 | } |
1896 | |
/// Transposes the 4x4 matrix formed by the 4 rows of `__m128` in place.
1898 | /// |
1899 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS) |
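///
/// For example (illustrative only):
///
/// ```rust,ignore
/// let mut row0 = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let mut row1 = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// let mut row2 = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
/// let mut row3 = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
/// _MM_TRANSPOSE4_PS(&mut row0, &mut row1, &mut row2, &mut row3);
/// // row0 == [1.0, 5.0, 9.0, 13.0]
/// // row1 == [2.0, 6.0, 10.0, 14.0], and so on.
/// ```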
1900 | #[inline ] |
1901 | #[allow (non_snake_case)] |
1902 | #[target_feature (enable = "sse" )] |
1903 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1904 | pub unsafe fn _MM_TRANSPOSE4_PS( |
1905 | row0: &mut __m128, |
1906 | row1: &mut __m128, |
1907 | row2: &mut __m128, |
1908 | row3: &mut __m128, |
1909 | ) { |
1910 | let tmp0: __m128 = _mm_unpacklo_ps(*row0, *row1); |
1911 | let tmp2: __m128 = _mm_unpacklo_ps(*row2, *row3); |
1912 | let tmp1: __m128 = _mm_unpackhi_ps(*row0, *row1); |
1913 | let tmp3: __m128 = _mm_unpackhi_ps(*row2, *row3); |
1914 | |
*row0 = _mm_movelh_ps(tmp0, tmp2);
*row1 = _mm_movehl_ps(tmp2, tmp0);
*row2 = _mm_movelh_ps(tmp1, tmp3);
*row3 = _mm_movehl_ps(tmp3, tmp1);
1919 | } |
1920 | |
1921 | #[allow (improper_ctypes)] |
1922 | extern "C" { |
1923 | #[link_name = "llvm.x86.sse.add.ss" ] |
1924 | fn addss(a: __m128, b: __m128) -> __m128; |
1925 | #[link_name = "llvm.x86.sse.sub.ss" ] |
1926 | fn subss(a: __m128, b: __m128) -> __m128; |
1927 | #[link_name = "llvm.x86.sse.mul.ss" ] |
1928 | fn mulss(a: __m128, b: __m128) -> __m128; |
1929 | #[link_name = "llvm.x86.sse.div.ss" ] |
1930 | fn divss(a: __m128, b: __m128) -> __m128; |
1931 | #[link_name = "llvm.x86.sse.sqrt.ss" ] |
1932 | fn sqrtss(a: __m128) -> __m128; |
1933 | #[link_name = "llvm.x86.sse.sqrt.ps" ] |
1934 | fn sqrtps(a: __m128) -> __m128; |
1935 | #[link_name = "llvm.x86.sse.rcp.ss" ] |
1936 | fn rcpss(a: __m128) -> __m128; |
1937 | #[link_name = "llvm.x86.sse.rcp.ps" ] |
1938 | fn rcpps(a: __m128) -> __m128; |
1939 | #[link_name = "llvm.x86.sse.rsqrt.ss" ] |
1940 | fn rsqrtss(a: __m128) -> __m128; |
1941 | #[link_name = "llvm.x86.sse.rsqrt.ps" ] |
1942 | fn rsqrtps(a: __m128) -> __m128; |
1943 | #[link_name = "llvm.x86.sse.min.ss" ] |
1944 | fn minss(a: __m128, b: __m128) -> __m128; |
1945 | #[link_name = "llvm.x86.sse.min.ps" ] |
1946 | fn minps(a: __m128, b: __m128) -> __m128; |
1947 | #[link_name = "llvm.x86.sse.max.ss" ] |
1948 | fn maxss(a: __m128, b: __m128) -> __m128; |
1949 | #[link_name = "llvm.x86.sse.max.ps" ] |
1950 | fn maxps(a: __m128, b: __m128) -> __m128; |
1951 | #[link_name = "llvm.x86.sse.cmp.ps" ] |
1952 | fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128; |
1953 | #[link_name = "llvm.x86.sse.comieq.ss" ] |
1954 | fn comieq_ss(a: __m128, b: __m128) -> i32; |
1955 | #[link_name = "llvm.x86.sse.comilt.ss" ] |
1956 | fn comilt_ss(a: __m128, b: __m128) -> i32; |
1957 | #[link_name = "llvm.x86.sse.comile.ss" ] |
1958 | fn comile_ss(a: __m128, b: __m128) -> i32; |
1959 | #[link_name = "llvm.x86.sse.comigt.ss" ] |
1960 | fn comigt_ss(a: __m128, b: __m128) -> i32; |
1961 | #[link_name = "llvm.x86.sse.comige.ss" ] |
1962 | fn comige_ss(a: __m128, b: __m128) -> i32; |
1963 | #[link_name = "llvm.x86.sse.comineq.ss" ] |
1964 | fn comineq_ss(a: __m128, b: __m128) -> i32; |
1965 | #[link_name = "llvm.x86.sse.ucomieq.ss" ] |
1966 | fn ucomieq_ss(a: __m128, b: __m128) -> i32; |
1967 | #[link_name = "llvm.x86.sse.ucomilt.ss" ] |
1968 | fn ucomilt_ss(a: __m128, b: __m128) -> i32; |
1969 | #[link_name = "llvm.x86.sse.ucomile.ss" ] |
1970 | fn ucomile_ss(a: __m128, b: __m128) -> i32; |
1971 | #[link_name = "llvm.x86.sse.ucomigt.ss" ] |
1972 | fn ucomigt_ss(a: __m128, b: __m128) -> i32; |
1973 | #[link_name = "llvm.x86.sse.ucomige.ss" ] |
1974 | fn ucomige_ss(a: __m128, b: __m128) -> i32; |
1975 | #[link_name = "llvm.x86.sse.ucomineq.ss" ] |
1976 | fn ucomineq_ss(a: __m128, b: __m128) -> i32; |
1977 | #[link_name = "llvm.x86.sse.cvtss2si" ] |
1978 | fn cvtss2si(a: __m128) -> i32; |
1979 | #[link_name = "llvm.x86.sse.cvttss2si" ] |
1980 | fn cvttss2si(a: __m128) -> i32; |
1981 | #[link_name = "llvm.x86.sse.cvtsi2ss" ] |
1982 | fn cvtsi2ss(a: __m128, b: i32) -> __m128; |
1983 | #[link_name = "llvm.x86.sse.sfence" ] |
1984 | fn sfence(); |
1985 | #[link_name = "llvm.x86.sse.stmxcsr" ] |
1986 | fn stmxcsr(p: *mut i8); |
1987 | #[link_name = "llvm.x86.sse.ldmxcsr" ] |
1988 | fn ldmxcsr(p: *const i8); |
1989 | #[link_name = "llvm.prefetch" ] |
1990 | fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32); |
1991 | #[link_name = "llvm.x86.sse.cmp.ss" ] |
1992 | fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128; |
1993 | } |
1994 | |
1995 | /// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint. |
1996 | /// |
1997 | /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection |
1998 | /// exception _may_ be generated. |
1999 | /// |
2000 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps) |
2001 | /// |
2002 | /// # Safety of non-temporal stores |
2003 | /// |
2004 | /// After using this intrinsic, but before any other access to the memory that this intrinsic |
2005 | /// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In |
2006 | /// particular, functions that call this intrinsic should generally call `_mm_sfence` before they |
2007 | /// return. |
2008 | /// |
2009 | /// See [`_mm_sfence`] for details. |
2010 | #[inline ] |
2011 | #[target_feature (enable = "sse" )] |
2012 | #[cfg_attr (test, assert_instr(movntps))] |
2013 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
2014 | #[allow (clippy::cast_ptr_alignment)] |
2015 | pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { |
intrinsics::nontemporal_store(mem_addr as *mut __m128, a);
2017 | } |
2018 | |
2019 | #[cfg (test)] |
2020 | mod tests { |
2021 | use crate::{hint::black_box, mem::transmute, ptr}; |
2022 | use std::{boxed, f32::NAN}; |
2023 | use stdarch_test::simd_test; |
2024 | |
2025 | use crate::core_arch::{simd::*, x86::*}; |
2026 | |
2027 | #[simd_test(enable = "sse" )] |
2028 | unsafe fn test_mm_add_ps() { |
2029 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2030 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2031 | let r = _mm_add_ps(a, b); |
2032 | assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0)); |
2033 | } |
2034 | |
2035 | #[simd_test(enable = "sse" )] |
2036 | unsafe fn test_mm_add_ss() { |
2037 | let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0); |
2038 | let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0); |
2039 | let r = _mm_add_ss(a, b); |
2040 | assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0)); |
2041 | } |
2042 | |
2043 | #[simd_test(enable = "sse" )] |
2044 | unsafe fn test_mm_sub_ps() { |
2045 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2046 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2047 | let r = _mm_sub_ps(a, b); |
2048 | assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0)); |
2049 | } |
2050 | |
2051 | #[simd_test(enable = "sse" )] |
2052 | unsafe fn test_mm_sub_ss() { |
2053 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2054 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2055 | let r = _mm_sub_ss(a, b); |
2056 | assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0)); |
2057 | } |
2058 | |
2059 | #[simd_test(enable = "sse" )] |
2060 | unsafe fn test_mm_mul_ps() { |
2061 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2062 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2063 | let r = _mm_mul_ps(a, b); |
2064 | assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0)); |
2065 | } |
2066 | |
2067 | #[simd_test(enable = "sse" )] |
2068 | unsafe fn test_mm_mul_ss() { |
2069 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2070 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2071 | let r = _mm_mul_ss(a, b); |
2072 | assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0)); |
2073 | } |
2074 | |
2075 | #[simd_test(enable = "sse" )] |
2076 | unsafe fn test_mm_div_ps() { |
2077 | let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0); |
2078 | let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0); |
2079 | let r = _mm_div_ps(a, b); |
2080 | assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0)); |
2081 | } |
2082 | |
2083 | #[simd_test(enable = "sse" )] |
2084 | unsafe fn test_mm_div_ss() { |
2085 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2086 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2087 | let r = _mm_div_ss(a, b); |
2088 | assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0)); |
2089 | } |
2090 | |
2091 | #[simd_test(enable = "sse" )] |
2092 | unsafe fn test_mm_sqrt_ss() { |
2093 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2094 | let r = _mm_sqrt_ss(a); |
2095 | let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0); |
2096 | assert_eq_m128(r, e); |
2097 | } |
2098 | |
2099 | #[simd_test(enable = "sse" )] |
2100 | unsafe fn test_mm_sqrt_ps() { |
2101 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2102 | let r = _mm_sqrt_ps(a); |
2103 | let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0); |
2104 | assert_eq_m128(r, e); |
2105 | } |
2106 | |
2107 | #[simd_test(enable = "sse" )] |
2108 | unsafe fn test_mm_rcp_ss() { |
2109 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2110 | let r = _mm_rcp_ss(a); |
2111 | let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0); |
2112 | let rel_err = 0.00048828125; |
2113 | assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err); |
2114 | for i in 1..4 { |
2115 | assert_eq!(get_m128(r, i), get_m128(e, i)); |
2116 | } |
2117 | } |
2118 | |
2119 | #[simd_test(enable = "sse" )] |
2120 | unsafe fn test_mm_rcp_ps() { |
2121 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2122 | let r = _mm_rcp_ps(a); |
2123 | let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215); |
2124 | let rel_err = 0.00048828125; |
2125 | for i in 0..4 { |
2126 | assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); |
2127 | } |
2128 | } |
2129 | |
2130 | #[simd_test(enable = "sse" )] |
2131 | unsafe fn test_mm_rsqrt_ss() { |
2132 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2133 | let r = _mm_rsqrt_ss(a); |
2134 | let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0); |
2135 | let rel_err = 0.00048828125; |
2136 | for i in 0..4 { |
2137 | assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); |
2138 | } |
2139 | } |
2140 | |
2141 | #[simd_test(enable = "sse" )] |
2142 | unsafe fn test_mm_rsqrt_ps() { |
2143 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2144 | let r = _mm_rsqrt_ps(a); |
2145 | let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845); |
2146 | let rel_err = 0.00048828125; |
2147 | for i in 0..4 { |
2148 | assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); |
2149 | } |
2150 | } |
2151 | |
2152 | #[simd_test(enable = "sse" )] |
2153 | unsafe fn test_mm_min_ss() { |
2154 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2155 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2156 | let r = _mm_min_ss(a, b); |
2157 | assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); |
2158 | } |
2159 | |
2160 | #[simd_test(enable = "sse" )] |
2161 | unsafe fn test_mm_min_ps() { |
2162 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2163 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2164 | let r = _mm_min_ps(a, b); |
2165 | assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); |
2166 | |
// `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
// is lowered by the llvm codegen backend to the `llvm.minnum.v*` llvm intrinsic. This
// intrinsic doesn't specify how -0.0 is handled. Unfortunately it happens to behave
// differently from the `minps` x86 instruction on x86. The `llvm.minnum.v*` llvm
// intrinsic would set `r1` equal to `a` and `r2` equal to `b`.
2172 | let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); |
2173 | let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); |
2174 | let r1: [u8; 16] = transmute(_mm_min_ps(a, b)); |
2175 | let r2: [u8; 16] = transmute(_mm_min_ps(b, a)); |
2176 | let a: [u8; 16] = transmute(a); |
2177 | let b: [u8; 16] = transmute(b); |
2178 | assert_eq!(r1, b); |
2179 | assert_eq!(r2, a); |
2180 | assert_ne!(a, b); // sanity check that -0.0 is actually present |
2181 | } |
2182 | |
2183 | #[simd_test(enable = "sse" )] |
2184 | unsafe fn test_mm_max_ss() { |
2185 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2186 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2187 | let r = _mm_max_ss(a, b); |
2188 | assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0)); |
2189 | } |
2190 | |
2191 | #[simd_test(enable = "sse" )] |
2192 | unsafe fn test_mm_max_ps() { |
2193 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2194 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2195 | let r = _mm_max_ps(a, b); |
2196 | assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0)); |
2197 | |
2198 | // Check SSE-specific semantics for -0.0 handling. |
2199 | let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); |
2200 | let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); |
2201 | let r1: [u8; 16] = transmute(_mm_max_ps(a, b)); |
2202 | let r2: [u8; 16] = transmute(_mm_max_ps(b, a)); |
2203 | let a: [u8; 16] = transmute(a); |
2204 | let b: [u8; 16] = transmute(b); |
2205 | assert_eq!(r1, b); |
2206 | assert_eq!(r2, a); |
2207 | assert_ne!(a, b); // sanity check that -0.0 is actually present |
2208 | } |
2209 | |
2210 | #[simd_test(enable = "sse" )] |
2211 | unsafe fn test_mm_and_ps() { |
2212 | let a = transmute(u32x4::splat(0b0011)); |
2213 | let b = transmute(u32x4::splat(0b0101)); |
2214 | let r = _mm_and_ps(*black_box(&a), *black_box(&b)); |
2215 | let e = transmute(u32x4::splat(0b0001)); |
2216 | assert_eq_m128(r, e); |
2217 | } |
2218 | |
2219 | #[simd_test(enable = "sse" )] |
2220 | unsafe fn test_mm_andnot_ps() { |
2221 | let a = transmute(u32x4::splat(0b0011)); |
2222 | let b = transmute(u32x4::splat(0b0101)); |
2223 | let r = _mm_andnot_ps(*black_box(&a), *black_box(&b)); |
2224 | let e = transmute(u32x4::splat(0b0100)); |
2225 | assert_eq_m128(r, e); |
2226 | } |
2227 | |
2228 | #[simd_test(enable = "sse" )] |
2229 | unsafe fn test_mm_or_ps() { |
2230 | let a = transmute(u32x4::splat(0b0011)); |
2231 | let b = transmute(u32x4::splat(0b0101)); |
2232 | let r = _mm_or_ps(*black_box(&a), *black_box(&b)); |
2233 | let e = transmute(u32x4::splat(0b0111)); |
2234 | assert_eq_m128(r, e); |
2235 | } |
2236 | |
2237 | #[simd_test(enable = "sse" )] |
2238 | unsafe fn test_mm_xor_ps() { |
2239 | let a = transmute(u32x4::splat(0b0011)); |
2240 | let b = transmute(u32x4::splat(0b0101)); |
2241 | let r = _mm_xor_ps(*black_box(&a), *black_box(&b)); |
2242 | let e = transmute(u32x4::splat(0b0110)); |
2243 | assert_eq_m128(r, e); |
2244 | } |
2245 | |
2246 | #[simd_test(enable = "sse" )] |
2247 | unsafe fn test_mm_cmpeq_ss() { |
2248 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2249 | let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0); |
2250 | let r: u32x4 = transmute(_mm_cmpeq_ss(a, b)); |
2251 | let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0)); |
2252 | assert_eq!(r, e); |
2253 | |
2254 | let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2255 | let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2)); |
2256 | let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0)); |
2257 | assert_eq!(r2, e2); |
2258 | } |
2259 | |
2260 | #[simd_test(enable = "sse" )] |
2261 | unsafe fn test_mm_cmplt_ss() { |
2262 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2263 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2264 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2265 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2266 | |
2267 | let b1 = 0u32; // a.extract(0) < b.extract(0) |
2268 | let c1 = 0u32; // a.extract(0) < c.extract(0) |
2269 | let d1 = !0u32; // a.extract(0) < d.extract(0) |
2270 | |
2271 | let rb: u32x4 = transmute(_mm_cmplt_ss(a, b)); |
2272 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2273 | assert_eq!(rb, eb); |
2274 | |
2275 | let rc: u32x4 = transmute(_mm_cmplt_ss(a, c)); |
2276 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2277 | assert_eq!(rc, ec); |
2278 | |
2279 | let rd: u32x4 = transmute(_mm_cmplt_ss(a, d)); |
2280 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2281 | assert_eq!(rd, ed); |
2282 | } |
2283 | |
2284 | #[simd_test(enable = "sse" )] |
2285 | unsafe fn test_mm_cmple_ss() { |
2286 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2287 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2288 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2289 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2290 | |
2291 | let b1 = 0u32; // a.extract(0) <= b.extract(0) |
2292 | let c1 = !0u32; // a.extract(0) <= c.extract(0) |
2293 | let d1 = !0u32; // a.extract(0) <= d.extract(0) |
2294 | |
2295 | let rb: u32x4 = transmute(_mm_cmple_ss(a, b)); |
2296 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2297 | assert_eq!(rb, eb); |
2298 | |
2299 | let rc: u32x4 = transmute(_mm_cmple_ss(a, c)); |
2300 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2301 | assert_eq!(rc, ec); |
2302 | |
2303 | let rd: u32x4 = transmute(_mm_cmple_ss(a, d)); |
2304 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2305 | assert_eq!(rd, ed); |
2306 | } |
2307 | |
2308 | #[simd_test(enable = "sse" )] |
2309 | unsafe fn test_mm_cmpgt_ss() { |
2310 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2311 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2312 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2313 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2314 | |
2315 | let b1 = !0u32; // a.extract(0) > b.extract(0) |
2316 | let c1 = 0u32; // a.extract(0) > c.extract(0) |
2317 | let d1 = 0u32; // a.extract(0) > d.extract(0) |
2318 | |
2319 | let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b)); |
2320 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2321 | assert_eq!(rb, eb); |
2322 | |
2323 | let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c)); |
2324 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2325 | assert_eq!(rc, ec); |
2326 | |
2327 | let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d)); |
2328 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2329 | assert_eq!(rd, ed); |
2330 | } |
2331 | |
2332 | #[simd_test(enable = "sse" )] |
2333 | unsafe fn test_mm_cmpge_ss() { |
2334 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2335 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2336 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2337 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2338 | |
2339 | let b1 = !0u32; // a.extract(0) >= b.extract(0) |
2340 | let c1 = !0u32; // a.extract(0) >= c.extract(0) |
2341 | let d1 = 0u32; // a.extract(0) >= d.extract(0) |
2342 | |
2343 | let rb: u32x4 = transmute(_mm_cmpge_ss(a, b)); |
2344 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2345 | assert_eq!(rb, eb); |
2346 | |
2347 | let rc: u32x4 = transmute(_mm_cmpge_ss(a, c)); |
2348 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2349 | assert_eq!(rc, ec); |
2350 | |
2351 | let rd: u32x4 = transmute(_mm_cmpge_ss(a, d)); |
2352 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2353 | assert_eq!(rd, ed); |
2354 | } |
2355 | |
2356 | #[simd_test(enable = "sse" )] |
2357 | unsafe fn test_mm_cmpneq_ss() { |
2358 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2359 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2360 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2361 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2362 | |
2363 | let b1 = !0u32; // a.extract(0) != b.extract(0) |
2364 | let c1 = 0u32; // a.extract(0) != c.extract(0) |
2365 | let d1 = !0u32; // a.extract(0) != d.extract(0) |
2366 | |
2367 | let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b)); |
2368 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2369 | assert_eq!(rb, eb); |
2370 | |
2371 | let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c)); |
2372 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2373 | assert_eq!(rc, ec); |
2374 | |
2375 | let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d)); |
2376 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2377 | assert_eq!(rd, ed); |
2378 | } |
2379 | |
2380 | #[simd_test(enable = "sse" )] |
2381 | unsafe fn test_mm_cmpnlt_ss() { |
2382 | // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there |
2383 | // must be a difference. It may have to do with behavior in the |
2384 | // presence of NaNs (signaling or quiet). If so, we should add tests |
2385 | // for those. |
2386 | |
2387 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2388 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2389 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2390 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2391 | |
2392 | let b1 = !0u32; // a.extract(0) >= b.extract(0) |
2393 | let c1 = !0u32; // a.extract(0) >= c.extract(0) |
2394 | let d1 = 0u32; // a.extract(0) >= d.extract(0) |
2395 | |
2396 | let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b)); |
2397 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2398 | assert_eq!(rb, eb); |
2399 | |
2400 | let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c)); |
2401 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2402 | assert_eq!(rc, ec); |
2403 | |
2404 | let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d)); |
2405 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2406 | assert_eq!(rd, ed); |
2407 | } |
2408 | |
2409 | #[simd_test(enable = "sse" )] |
2410 | unsafe fn test_mm_cmpnle_ss() { |
// TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
// must be a difference. It may have to do with behavior in the
// presence of NaNs (signaling or quiet). If so, we should add tests
// for those.
2415 | |
2416 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2417 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2418 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2419 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2420 | |
2421 | let b1 = !0u32; // a.extract(0) > b.extract(0) |
2422 | let c1 = 0u32; // a.extract(0) > c.extract(0) |
2423 | let d1 = 0u32; // a.extract(0) > d.extract(0) |
2424 | |
2425 | let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b)); |
2426 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2427 | assert_eq!(rb, eb); |
2428 | |
2429 | let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c)); |
2430 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2431 | assert_eq!(rc, ec); |
2432 | |
2433 | let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d)); |
2434 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2435 | assert_eq!(rd, ed); |
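// Per Intel's documentation, the NLE predicate compares true for
// unordered operands; this NaN check distinguishes `_mm_cmpnle_ss`
// from `_mm_cmpgt_ss`, which would compare false here.
let n = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
let rn: u32x4 = transmute(_mm_cmpnle_ss(a, n));
let en: u32x4 = transmute(_mm_setr_ps(f32::from_bits(!0u32), 2.0, 3.0, 4.0));
assert_eq!(rn, en);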
2436 | } |
2437 | |
2438 | #[simd_test(enable = "sse" )] |
2439 | unsafe fn test_mm_cmpngt_ss() { |
// This test is nearly identical to the one for `_mm_cmple_ss`: the two
// intrinsics differ only for unordered (NaN) operands, where
// `_mm_cmpngt_ss` compares true and `_mm_cmple_ss` compares false. A
// NaN case is checked at the end of this test.
2444 | |
2445 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2446 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2447 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2448 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2449 | |
2450 | let b1 = 0u32; // a.extract(0) <= b.extract(0) |
2451 | let c1 = !0u32; // a.extract(0) <= c.extract(0) |
2452 | let d1 = !0u32; // a.extract(0) <= d.extract(0) |
2453 | |
2454 | let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b)); |
2455 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2456 | assert_eq!(rb, eb); |
2457 | |
2458 | let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c)); |
2459 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2460 | assert_eq!(rc, ec); |
2461 | |
2462 | let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d)); |
2463 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2464 | assert_eq!(rd, ed); |
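// Per Intel's documentation, the NGT predicate compares true for
// unordered operands; this NaN check distinguishes `_mm_cmpngt_ss`
// from `_mm_cmple_ss`, which would compare false here.
let n = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
let rn: u32x4 = transmute(_mm_cmpngt_ss(a, n));
let en: u32x4 = transmute(_mm_setr_ps(f32::from_bits(!0u32), 2.0, 3.0, 4.0));
assert_eq!(rn, en);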
2465 | } |
2466 | |
2467 | #[simd_test(enable = "sse" )] |
2468 | unsafe fn test_mm_cmpnge_ss() { |
// This test is nearly identical to the one for `_mm_cmplt_ss`: the two
// intrinsics differ only for unordered (NaN) operands, where
// `_mm_cmpnge_ss` compares true and `_mm_cmplt_ss` compares false. A
// NaN case is checked at the end of this test.
2473 | |
2474 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2475 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2476 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2477 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2478 | |
2479 | let b1 = 0u32; // a.extract(0) < b.extract(0) |
2480 | let c1 = 0u32; // a.extract(0) < c.extract(0) |
2481 | let d1 = !0u32; // a.extract(0) < d.extract(0) |
2482 | |
2483 | let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b)); |
2484 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2485 | assert_eq!(rb, eb); |
2486 | |
2487 | let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c)); |
2488 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2489 | assert_eq!(rc, ec); |
2490 | |
2491 | let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d)); |
2492 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2493 | assert_eq!(rd, ed); |
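// Per Intel's documentation, the NGE predicate compares true for
// unordered operands; this NaN check distinguishes `_mm_cmpnge_ss`
// from `_mm_cmplt_ss`, which would compare false here.
let n = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
let rn: u32x4 = transmute(_mm_cmpnge_ss(a, n));
let en: u32x4 = transmute(_mm_setr_ps(f32::from_bits(!0u32), 2.0, 3.0, 4.0));
assert_eq!(rn, en);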
2494 | } |
2495 | |
2496 | #[simd_test(enable = "sse" )] |
2497 | unsafe fn test_mm_cmpord_ss() { |
2498 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2499 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2500 | let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); |
2501 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2502 | |
2503 | let b1 = !0u32; // a.extract(0) ord b.extract(0) |
2504 | let c1 = 0u32; // a.extract(0) ord c.extract(0) |
2505 | let d1 = !0u32; // a.extract(0) ord d.extract(0) |
2506 | |
2507 | let rb: u32x4 = transmute(_mm_cmpord_ss(a, b)); |
2508 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2509 | assert_eq!(rb, eb); |
2510 | |
2511 | let rc: u32x4 = transmute(_mm_cmpord_ss(a, c)); |
2512 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2513 | assert_eq!(rc, ec); |
2514 | |
2515 | let rd: u32x4 = transmute(_mm_cmpord_ss(a, d)); |
2516 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2517 | assert_eq!(rd, ed); |
2518 | } |
2519 | |
2520 | #[simd_test(enable = "sse" )] |
2521 | unsafe fn test_mm_cmpunord_ss() { |
2522 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2523 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2524 | let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); |
2525 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2526 | |
2527 | let b1 = 0u32; // a.extract(0) unord b.extract(0) |
2528 | let c1 = !0u32; // a.extract(0) unord c.extract(0) |
2529 | let d1 = 0u32; // a.extract(0) unord d.extract(0) |
2530 | |
2531 | let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b)); |
2532 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2533 | assert_eq!(rb, eb); |
2534 | |
2535 | let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c)); |
2536 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2537 | assert_eq!(rc, ec); |
2538 | |
2539 | let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d)); |
2540 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2541 | assert_eq!(rd, ed); |
2542 | } |
2543 | |
2544 | #[simd_test(enable = "sse" )] |
2545 | unsafe fn test_mm_cmpeq_ps() { |
2546 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2547 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); |
2548 | let tru = !0u32; |
2549 | let fls = 0u32; |
2550 | |
2551 | let e = u32x4::new(fls, fls, tru, fls); |
2552 | let r: u32x4 = transmute(_mm_cmpeq_ps(a, b)); |
2553 | assert_eq!(r, e); |
2554 | } |
2555 | |
2556 | #[simd_test(enable = "sse" )] |
2557 | unsafe fn test_mm_cmplt_ps() { |
2558 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2559 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); |
2560 | let tru = !0u32; |
2561 | let fls = 0u32; |
2562 | |
2563 | let e = u32x4::new(tru, fls, fls, fls); |
2564 | let r: u32x4 = transmute(_mm_cmplt_ps(a, b)); |
2565 | assert_eq!(r, e); |
2566 | } |
2567 | |
2568 | #[simd_test(enable = "sse" )] |
2569 | unsafe fn test_mm_cmple_ps() { |
2570 | let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0); |
2571 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); |
2572 | let tru = !0u32; |
2573 | let fls = 0u32; |
2574 | |
2575 | let e = u32x4::new(tru, fls, tru, fls); |
2576 | let r: u32x4 = transmute(_mm_cmple_ps(a, b)); |
2577 | assert_eq!(r, e); |
2578 | } |
2579 | |
2580 | #[simd_test(enable = "sse" )] |
2581 | unsafe fn test_mm_cmpgt_ps() { |
2582 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2583 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0); |
2584 | let tru = !0u32; |
2585 | let fls = 0u32; |
2586 | |
2587 | let e = u32x4::new(fls, tru, fls, fls); |
2588 | let r: u32x4 = transmute(_mm_cmpgt_ps(a, b)); |
2589 | assert_eq!(r, e); |
2590 | } |
2591 | |
2592 | #[simd_test(enable = "sse" )] |
2593 | unsafe fn test_mm_cmpge_ps() { |
2594 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2595 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0); |
2596 | let tru = !0u32; |
2597 | let fls = 0u32; |
2598 | |
2599 | let e = u32x4::new(fls, tru, tru, fls); |
2600 | let r: u32x4 = transmute(_mm_cmpge_ps(a, b)); |
2601 | assert_eq!(r, e); |
2602 | } |
2603 | |
2604 | #[simd_test(enable = "sse" )] |
2605 | unsafe fn test_mm_cmpneq_ps() { |
2606 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2607 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); |
2608 | let tru = !0u32; |
2609 | let fls = 0u32; |
2610 | |
2611 | let e = u32x4::new(tru, tru, fls, tru); |
2612 | let r: u32x4 = transmute(_mm_cmpneq_ps(a, b)); |
2613 | assert_eq!(r, e); |
2614 | } |
2615 | |
2616 | #[simd_test(enable = "sse" )] |
2617 | unsafe fn test_mm_cmpnlt_ps() { |
2618 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2619 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); |
2620 | let tru = !0u32; |
2621 | let fls = 0u32; |
2622 | |
2623 | let e = u32x4::new(fls, tru, tru, tru); |
2624 | let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b)); |
2625 | assert_eq!(r, e); |
2626 | } |
2627 | |
2628 | #[simd_test(enable = "sse" )] |
2629 | unsafe fn test_mm_cmpnle_ps() { |
2630 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2631 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); |
2632 | let tru = !0u32; |
2633 | let fls = 0u32; |
2634 | |
2635 | let e = u32x4::new(fls, tru, fls, tru); |
2636 | let r: u32x4 = transmute(_mm_cmpnle_ps(a, b)); |
2637 | assert_eq!(r, e); |
2638 | } |
2639 | |
2640 | #[simd_test(enable = "sse" )] |
2641 | unsafe fn test_mm_cmpngt_ps() { |
2642 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2643 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); |
2644 | let tru = !0u32; |
2645 | let fls = 0u32; |
2646 | |
2647 | let e = u32x4::new(tru, fls, tru, tru); |
2648 | let r: u32x4 = transmute(_mm_cmpngt_ps(a, b)); |
2649 | assert_eq!(r, e); |
2650 | } |
2651 | |
2652 | #[simd_test(enable = "sse" )] |
2653 | unsafe fn test_mm_cmpnge_ps() { |
2654 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2655 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); |
2656 | let tru = !0u32; |
2657 | let fls = 0u32; |
2658 | |
2659 | let e = u32x4::new(tru, fls, fls, tru); |
2660 | let r: u32x4 = transmute(_mm_cmpnge_ps(a, b)); |
2661 | assert_eq!(r, e); |
2662 | } |
2663 | |
2664 | #[simd_test(enable = "sse" )] |
2665 | unsafe fn test_mm_cmpord_ps() { |
2666 | let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); |
2667 | let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); |
2668 | let tru = !0u32; |
2669 | let fls = 0u32; |
2670 | |
2671 | let e = u32x4::new(tru, fls, fls, fls); |
2672 | let r: u32x4 = transmute(_mm_cmpord_ps(a, b)); |
2673 | assert_eq!(r, e); |
2674 | } |
2675 | |
2676 | #[simd_test(enable = "sse" )] |
2677 | unsafe fn test_mm_cmpunord_ps() { |
2678 | let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); |
2679 | let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); |
2680 | let tru = !0u32; |
2681 | let fls = 0u32; |
2682 | |
2683 | let e = u32x4::new(fls, tru, tru, tru); |
2684 | let r: u32x4 = transmute(_mm_cmpunord_ps(a, b)); |
2685 | assert_eq!(r, e); |
2686 | } |
2687 | |
2688 | #[simd_test(enable = "sse" )] |
2689 | unsafe fn test_mm_comieq_ss() { |
2690 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2691 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2692 | |
2693 | let ee = &[1i32, 0, 0, 0]; |
2694 | |
2695 | for i in 0..4 { |
2696 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2697 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2698 | |
2699 | let r = _mm_comieq_ss(a, b); |
2700 | |
2701 | assert_eq!( |
2702 | ee[i], r, |
2703 | "_mm_comieq_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2704 | a, b, r, ee[i], i |
2705 | ); |
2706 | } |
2707 | } |
2708 | |
2709 | #[simd_test(enable = "sse" )] |
2710 | unsafe fn test_mm_comilt_ss() { |
2711 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2712 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2713 | |
2714 | let ee = &[0i32, 1, 0, 0]; |
2715 | |
2716 | for i in 0..4 { |
2717 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2718 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2719 | |
2720 | let r = _mm_comilt_ss(a, b); |
2721 | |
2722 | assert_eq!( |
2723 | ee[i], r, |
2724 | "_mm_comilt_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2725 | a, b, r, ee[i], i |
2726 | ); |
2727 | } |
2728 | } |
2729 | |
2730 | #[simd_test(enable = "sse" )] |
2731 | unsafe fn test_mm_comile_ss() { |
2732 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2733 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2734 | |
2735 | let ee = &[1i32, 1, 0, 0]; |
2736 | |
2737 | for i in 0..4 { |
2738 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2739 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2740 | |
2741 | let r = _mm_comile_ss(a, b); |
2742 | |
2743 | assert_eq!( |
2744 | ee[i], r, |
2745 | "_mm_comile_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2746 | a, b, r, ee[i], i |
2747 | ); |
2748 | } |
2749 | } |
2750 | |
2751 | #[simd_test(enable = "sse" )] |
unsafe fn test_mm_comige_ss() {
2753 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2754 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2755 | |
2756 | let ee = &[1i32, 0, 1, 0]; |
2757 | |
2758 | for i in 0..4 { |
2759 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2760 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2761 | |
2762 | let r = _mm_comige_ss(a, b); |
2763 | |
2764 | assert_eq!( |
2765 | ee[i], r, |
2766 | "_mm_comige_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2767 | a, b, r, ee[i], i |
2768 | ); |
2769 | } |
2770 | } |
2771 | |
2772 | #[simd_test(enable = "sse" )] |
2773 | unsafe fn test_mm_comineq_ss() { |
2774 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2775 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2776 | |
2777 | let ee = &[0i32, 1, 1, 1]; |
2778 | |
2779 | for i in 0..4 { |
2780 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2781 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2782 | |
2783 | let r = _mm_comineq_ss(a, b); |
2784 | |
2785 | assert_eq!( |
2786 | ee[i], r, |
2787 | "_mm_comineq_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2788 | a, b, r, ee[i], i |
2789 | ); |
2790 | } |
2791 | } |
2792 | |
2793 | #[simd_test(enable = "sse" )] |
2794 | unsafe fn test_mm_ucomieq_ss() { |
2795 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2796 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2797 | |
2798 | let ee = &[1i32, 0, 0, 0]; |
2799 | |
2800 | for i in 0..4 { |
2801 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2802 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2803 | |
2804 | let r = _mm_ucomieq_ss(a, b); |
2805 | |
2806 | assert_eq!( |
2807 | ee[i], r, |
2808 | "_mm_ucomieq_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2809 | a, b, r, ee[i], i |
2810 | ); |
2811 | } |
2812 | } |
2813 | |
2814 | #[simd_test(enable = "sse" )] |
2815 | unsafe fn test_mm_ucomilt_ss() { |
2816 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2817 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2818 | |
2819 | let ee = &[0i32, 1, 0, 0]; |
2820 | |
2821 | for i in 0..4 { |
2822 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2823 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2824 | |
2825 | let r = _mm_ucomilt_ss(a, b); |
2826 | |
2827 | assert_eq!( |
2828 | ee[i], r, |
2829 | "_mm_ucomilt_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2830 | a, b, r, ee[i], i |
2831 | ); |
2832 | } |
2833 | } |
2834 | |
2835 | #[simd_test(enable = "sse" )] |
2836 | unsafe fn test_mm_ucomile_ss() { |
2837 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2838 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2839 | |
2840 | let ee = &[1i32, 1, 0, 0]; |
2841 | |
2842 | for i in 0..4 { |
2843 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2844 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2845 | |
2846 | let r = _mm_ucomile_ss(a, b); |
2847 | |
2848 | assert_eq!( |
2849 | ee[i], r, |
2850 | "_mm_ucomile_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2851 | a, b, r, ee[i], i |
2852 | ); |
2853 | } |
2854 | } |
2855 | |
2856 | #[simd_test(enable = "sse" )] |
2857 | unsafe fn test_mm_ucomigt_ss() { |
2858 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2859 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2860 | |
2861 | let ee = &[0i32, 0, 1, 0]; |
2862 | |
2863 | for i in 0..4 { |
2864 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2865 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2866 | |
2867 | let r = _mm_ucomigt_ss(a, b); |
2868 | |
2869 | assert_eq!( |
2870 | ee[i], r, |
2871 | "_mm_ucomigt_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2872 | a, b, r, ee[i], i |
2873 | ); |
2874 | } |
2875 | } |
2876 | |
2877 | #[simd_test(enable = "sse" )] |
2878 | unsafe fn test_mm_ucomige_ss() { |
2879 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2880 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2881 | |
2882 | let ee = &[1i32, 0, 1, 0]; |
2883 | |
2884 | for i in 0..4 { |
2885 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2886 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2887 | |
2888 | let r = _mm_ucomige_ss(a, b); |
2889 | |
2890 | assert_eq!( |
2891 | ee[i], r, |
2892 | "_mm_ucomige_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2893 | a, b, r, ee[i], i |
2894 | ); |
2895 | } |
2896 | } |
2897 | |
2898 | #[simd_test(enable = "sse" )] |
2899 | unsafe fn test_mm_ucomineq_ss() { |
2900 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2901 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2902 | |
2903 | let ee = &[0i32, 1, 1, 1]; |
2904 | |
2905 | for i in 0..4 { |
2906 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2907 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2908 | |
2909 | let r = _mm_ucomineq_ss(a, b); |
2910 | |
2911 | assert_eq!( |
2912 | ee[i], r, |
2913 | "_mm_ucomineq_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2914 | a, b, r, ee[i], i |
2915 | ); |
2916 | } |
2917 | } |
2918 | |
2919 | #[allow (deprecated)] // FIXME: This test uses deprecated CSR access functions |
2920 | #[simd_test(enable = "sse" )] |
2921 | #[cfg_attr (miri, ignore)] // Uses _mm_setcsr, which is not supported by Miri |
2922 | unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() { |
// If one of the arguments is a quiet NaN, `comieq_ss` should signal an
// Invalid Operation Exception while `ucomieq_ss` should not.
2925 | let aa = &[3.0f32, NAN, 23.0, NAN]; |
2926 | let bb = &[3.0f32, 47.5, NAN, NAN]; |
2927 | |
2928 | let ee = &[1i32, 0, 0, 0]; |
2929 | let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception? |
2930 | |
2931 | for i in 0..4 { |
2932 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2933 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2934 | |
2935 | _MM_SET_EXCEPTION_STATE(0); |
2936 | let r1 = _mm_comieq_ss(*black_box(&a), b); |
2937 | let s1 = _MM_GET_EXCEPTION_STATE(); |
2938 | |
2939 | _MM_SET_EXCEPTION_STATE(0); |
2940 | let r2 = _mm_ucomieq_ss(*black_box(&a), b); |
2941 | let s2 = _MM_GET_EXCEPTION_STATE(); |
2942 | |
2943 | assert_eq!( |
2944 | ee[i], r1, |
2945 | "_mm_comeq_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2946 | a, b, r1, ee[i], i |
2947 | ); |
2948 | assert_eq!( |
2949 | ee[i], r2, |
2950 | "_mm_ucomeq_ss( {:?}, {:?}) = {}, expected: {} (i= {})" , |
2951 | a, b, r2, ee[i], i |
2952 | ); |
2953 | assert_eq!( |
2954 | s1, |
2955 | exc[i] * _MM_EXCEPT_INVALID, |
2956 | "_mm_comieq_ss() set exception flags: {} (i= {})" , |
2957 | s1, |
2958 | i |
2959 | ); |
2960 | assert_eq!( |
2961 | s2, |
2962 | 0, // ucomieq_ss should not signal an exception |
2963 | "_mm_ucomieq_ss() set exception flags: {} (i= {})" , |
2964 | s2, |
2965 | i |
2966 | ); |
2967 | } |
2968 | } |
2969 | |
2970 | #[simd_test(enable = "sse" )] |
2971 | unsafe fn test_mm_cvtss_si32() { |
2972 | let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1]; |
2973 | let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520]; |
2974 | for i in 0..inputs.len() { |
2975 | let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0); |
2976 | let e = result[i]; |
2977 | let r = _mm_cvtss_si32(x); |
2978 | assert_eq!( |
2979 | e, r, |
2980 | "TestCase # {} _mm_cvtss_si32( {:?}) = {}, expected: {}" , |
2981 | i, x, r, e |
2982 | ); |
2983 | } |
2984 | } |
2985 | |
2986 | #[simd_test(enable = "sse" )] |
2987 | unsafe fn test_mm_cvttss_si32() { |
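// `_mm_cvttss_si32` truncates toward zero (-5.99 -> -5, 10.999 -> 10)
// instead of rounding; NaN and out-of-range inputs still produce the
// "integer indefinite" value i32::MIN.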
2988 | let inputs = &[ |
2989 | (42.0f32, 42i32), |
2990 | (-31.4, -31), |
2991 | (-33.5, -33), |
2992 | (-34.5, -34), |
2993 | (10.999, 10), |
2994 | (-5.99, -5), |
2995 | (4.0e10, i32::MIN), |
2996 | (4.0e-10, 0), |
2997 | (NAN, i32::MIN), |
2998 | (2147483500.1, 2147483520), |
2999 | ]; |
3000 | for (i, &(xi, e)) in inputs.iter().enumerate() { |
3001 | let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); |
3002 | let r = _mm_cvttss_si32(x); |
3003 | assert_eq!( |
3004 | e, r, |
3005 | "TestCase # {} _mm_cvttss_si32( {:?}) = {}, expected: {}" , |
3006 | i, x, r, e |
3007 | ); |
3008 | } |
3009 | } |
3010 | |
3011 | #[simd_test(enable = "sse" )] |
3012 | unsafe fn test_mm_cvtsi32_ss() { |
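// An i32 whose magnitude needs more than 24 significand bits has no
// exact f32 representation; `_mm_cvtsi32_ss` rounds to the nearest
// representable value, so 322223333 and the literal 322223330.0
// denote the same f32.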
3013 | let inputs = &[ |
3014 | (4555i32, 4555.0f32), |
3015 | (322223333, 322223330.0), |
3016 | (-432, -432.0), |
3017 | (-322223333, -322223330.0), |
3018 | ]; |
3019 | |
3020 | for &(x, f) in inputs.iter() { |
3021 | let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3022 | let r = _mm_cvtsi32_ss(a, x); |
3023 | let e = _mm_setr_ps(f, 6.0, 7.0, 8.0); |
3024 | assert_eq_m128(e, r); |
3025 | } |
3026 | } |
3027 | |
3028 | #[simd_test(enable = "sse" )] |
3029 | unsafe fn test_mm_cvtss_f32() { |
3030 | let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0); |
3031 | assert_eq!(_mm_cvtss_f32(a), 312.0134); |
3032 | } |
3033 | |
3034 | #[simd_test(enable = "sse" )] |
3035 | unsafe fn test_mm_set_ss() { |
3036 | let r = _mm_set_ss(black_box(4.25)); |
3037 | assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0)); |
3038 | } |
3039 | |
3040 | #[simd_test(enable = "sse" )] |
3041 | unsafe fn test_mm_set1_ps() { |
3042 | let r1 = _mm_set1_ps(black_box(4.25)); |
3043 | let r2 = _mm_set_ps1(black_box(4.25)); |
3044 | assert_eq!(get_m128(r1, 0), 4.25); |
3045 | assert_eq!(get_m128(r1, 1), 4.25); |
3046 | assert_eq!(get_m128(r1, 2), 4.25); |
3047 | assert_eq!(get_m128(r1, 3), 4.25); |
3048 | assert_eq!(get_m128(r2, 0), 4.25); |
3049 | assert_eq!(get_m128(r2, 1), 4.25); |
3050 | assert_eq!(get_m128(r2, 2), 4.25); |
3051 | assert_eq!(get_m128(r2, 3), 4.25); |
3052 | } |
3053 | |
3054 | #[simd_test(enable = "sse" )] |
3055 | unsafe fn test_mm_set_ps() { |
3056 | let r = _mm_set_ps( |
3057 | black_box(1.0), |
3058 | black_box(2.0), |
3059 | black_box(3.0), |
3060 | black_box(4.0), |
3061 | ); |
3062 | assert_eq!(get_m128(r, 0), 4.0); |
3063 | assert_eq!(get_m128(r, 1), 3.0); |
3064 | assert_eq!(get_m128(r, 2), 2.0); |
3065 | assert_eq!(get_m128(r, 3), 1.0); |
3066 | } |
3067 | |
3068 | #[simd_test(enable = "sse" )] |
3069 | unsafe fn test_mm_setr_ps() { |
3070 | let r = _mm_setr_ps( |
3071 | black_box(1.0), |
3072 | black_box(2.0), |
3073 | black_box(3.0), |
3074 | black_box(4.0), |
3075 | ); |
3076 | assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0)); |
3077 | } |
3078 | |
3079 | #[simd_test(enable = "sse" )] |
3080 | unsafe fn test_mm_setzero_ps() { |
3081 | let r = *black_box(&_mm_setzero_ps()); |
3082 | assert_eq_m128(r, _mm_set1_ps(0.0)); |
3083 | } |
3084 | |
3085 | #[simd_test(enable = "sse" )] |
3086 | unsafe fn test_mm_shuffle() { |
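// `_MM_SHUFFLE(z, y, x, w)` packs the four 2-bit lane selectors into
// a single control byte laid out as 0bzz_yy_xx_ww, as the cases below
// spell out.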
3087 | assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11); |
3088 | assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00); |
3089 | assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01); |
3090 | } |
3091 | |
3092 | #[simd_test(enable = "sse" )] |
3093 | unsafe fn test_mm_shuffle_ps() { |
3094 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3095 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3096 | let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b); |
3097 | assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0)); |
3098 | } |
3099 | |
3100 | #[simd_test(enable = "sse" )] |
3101 | unsafe fn test_mm_unpackhi_ps() { |
3102 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3103 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3104 | let r = _mm_unpackhi_ps(a, b); |
3105 | assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0)); |
3106 | } |
3107 | |
3108 | #[simd_test(enable = "sse" )] |
3109 | unsafe fn test_mm_unpacklo_ps() { |
3110 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3111 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3112 | let r = _mm_unpacklo_ps(a, b); |
3113 | assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0)); |
3114 | } |
3115 | |
3116 | #[simd_test(enable = "sse" )] |
3117 | unsafe fn test_mm_movehl_ps() { |
3118 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3119 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3120 | let r = _mm_movehl_ps(a, b); |
3121 | assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0)); |
3122 | } |
3123 | |
3124 | #[simd_test(enable = "sse" )] |
3125 | unsafe fn test_mm_movelh_ps() { |
3126 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3127 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3128 | let r = _mm_movelh_ps(a, b); |
3129 | assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0)); |
3130 | } |
3131 | |
3132 | #[simd_test(enable = "sse" )] |
3133 | unsafe fn test_mm_load_ss() { |
3134 | let a = 42.0f32; |
3135 | let r = _mm_load_ss(ptr::addr_of!(a)); |
3136 | assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0)); |
3137 | } |
3138 | |
3139 | #[simd_test(enable = "sse" )] |
3140 | unsafe fn test_mm_load1_ps() { |
3141 | let a = 42.0f32; |
3142 | let r = _mm_load1_ps(ptr::addr_of!(a)); |
3143 | assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0)); |
3144 | } |
3145 | |
3146 | #[simd_test(enable = "sse" )] |
3147 | unsafe fn test_mm_load_ps() { |
3148 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; |
3149 | |
3150 | let mut p = vals.as_ptr(); |
3151 | let mut fixup = 0.0f32; |
3152 | |
// `_mm_load_ps` requires a 16-byte-aligned pointer, otherwise we might
// get a (signal: 11, SIGSEGV: invalid memory reference). Round `p` up
// to the next 16-byte boundary and record the number of skipped f32
// elements in `fixup` so the expected values can be adjusted.
3155 | |
3156 | let unalignment = (p as usize) & 0xf; |
3157 | if unalignment != 0 { |
3158 | let delta = (16 - unalignment) >> 2; |
3159 | fixup = delta as f32; |
3160 | p = p.add(delta); |
3161 | } |
3162 | |
3163 | let r = _mm_load_ps(p); |
3164 | let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup)); |
3165 | assert_eq_m128(r, e); |
3166 | } |
3167 | |
3168 | #[simd_test(enable = "sse" )] |
3169 | unsafe fn test_mm_loadu_ps() { |
3170 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; |
3171 | let p = vals.as_ptr().add(3); |
3172 | let r = _mm_loadu_ps(black_box(p)); |
3173 | assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0)); |
3174 | } |
3175 | |
3176 | #[simd_test(enable = "sse" )] |
3177 | unsafe fn test_mm_loadr_ps() { |
3178 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; |
3179 | |
3180 | let mut p = vals.as_ptr(); |
3181 | let mut fixup = 0.0f32; |
3182 | |
// `_mm_loadr_ps` requires a 16-byte-aligned pointer, otherwise we might
// get a (signal: 11, SIGSEGV: invalid memory reference). Round `p` up
// to the next 16-byte boundary and record the number of skipped f32
// elements in `fixup` so the expected values can be adjusted.
3185 | |
3186 | let unalignment = (p as usize) & 0xf; |
3187 | if unalignment != 0 { |
3188 | let delta = (16 - unalignment) >> 2; |
3189 | fixup = delta as f32; |
3190 | p = p.add(delta); |
3191 | } |
3192 | |
3193 | let r = _mm_loadr_ps(p); |
3194 | let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup)); |
3195 | assert_eq_m128(r, e); |
3196 | } |
3197 | |
3198 | #[simd_test(enable = "sse2" )] |
3199 | unsafe fn test_mm_loadu_si64() { |
3200 | let a = _mm_setr_epi64x(5, 6); |
3201 | let r = _mm_loadu_si64(ptr::addr_of!(a) as *const _); |
3202 | assert_eq_m128i(r, _mm_setr_epi64x(5, 0)); |
3203 | } |
3204 | |
3205 | #[simd_test(enable = "sse" )] |
3206 | unsafe fn test_mm_store_ss() { |
3207 | let mut vals = [0.0f32; 8]; |
3208 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3209 | _mm_store_ss(vals.as_mut_ptr().add(1), a); |
3210 | |
3211 | assert_eq!(vals[0], 0.0); |
3212 | assert_eq!(vals[1], 1.0); |
3213 | assert_eq!(vals[2], 0.0); |
3214 | } |
3215 | |
3216 | #[simd_test(enable = "sse" )] |
3217 | unsafe fn test_mm_store1_ps() { |
3218 | let mut vals = [0.0f32; 8]; |
3219 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3220 | |
3221 | let mut ofs = 0; |
3222 | let mut p = vals.as_mut_ptr(); |
3223 | |
3224 | if (p as usize) & 0xf != 0 { |
3225 | ofs = (16 - ((p as usize) & 0xf)) >> 2; |
3226 | p = p.add(ofs); |
3227 | } |
3228 | |
3229 | _mm_store1_ps(p, *black_box(&a)); |
3230 | |
3231 | if ofs > 0 { |
3232 | assert_eq!(vals[ofs - 1], 0.0); |
3233 | } |
3234 | assert_eq!(vals[ofs + 0], 1.0); |
3235 | assert_eq!(vals[ofs + 1], 1.0); |
3236 | assert_eq!(vals[ofs + 2], 1.0); |
3237 | assert_eq!(vals[ofs + 3], 1.0); |
3238 | assert_eq!(vals[ofs + 4], 0.0); |
3239 | } |
3240 | |
3241 | #[simd_test(enable = "sse" )] |
3242 | unsafe fn test_mm_store_ps() { |
3243 | let mut vals = [0.0f32; 8]; |
3244 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3245 | |
3246 | let mut ofs = 0; |
3247 | let mut p = vals.as_mut_ptr(); |
3248 | |
3249 | // Align p to 16-byte boundary |
3250 | if (p as usize) & 0xf != 0 { |
3251 | ofs = (16 - ((p as usize) & 0xf)) >> 2; |
3252 | p = p.add(ofs); |
3253 | } |
3254 | |
3255 | _mm_store_ps(p, *black_box(&a)); |
3256 | |
3257 | if ofs > 0 { |
3258 | assert_eq!(vals[ofs - 1], 0.0); |
3259 | } |
3260 | assert_eq!(vals[ofs + 0], 1.0); |
3261 | assert_eq!(vals[ofs + 1], 2.0); |
3262 | assert_eq!(vals[ofs + 2], 3.0); |
3263 | assert_eq!(vals[ofs + 3], 4.0); |
3264 | assert_eq!(vals[ofs + 4], 0.0); |
3265 | } |
3266 | |
3267 | #[simd_test(enable = "sse" )] |
3268 | unsafe fn test_mm_storer_ps() { |
3269 | let mut vals = [0.0f32; 8]; |
3270 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3271 | |
3272 | let mut ofs = 0; |
3273 | let mut p = vals.as_mut_ptr(); |
3274 | |
3275 | // Align p to 16-byte boundary |
3276 | if (p as usize) & 0xf != 0 { |
3277 | ofs = (16 - ((p as usize) & 0xf)) >> 2; |
3278 | p = p.add(ofs); |
3279 | } |
3280 | |
3281 | _mm_storer_ps(p, *black_box(&a)); |
3282 | |
3283 | if ofs > 0 { |
3284 | assert_eq!(vals[ofs - 1], 0.0); |
3285 | } |
3286 | assert_eq!(vals[ofs + 0], 4.0); |
3287 | assert_eq!(vals[ofs + 1], 3.0); |
3288 | assert_eq!(vals[ofs + 2], 2.0); |
3289 | assert_eq!(vals[ofs + 3], 1.0); |
3290 | assert_eq!(vals[ofs + 4], 0.0); |
3291 | } |
3292 | |
3293 | #[simd_test(enable = "sse" )] |
3294 | unsafe fn test_mm_storeu_ps() { |
3295 | let mut vals = [0.0f32; 8]; |
3296 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3297 | |
3298 | let mut ofs = 0; |
3299 | let mut p = vals.as_mut_ptr(); |
3300 | |
3301 | // Make sure p is **not** aligned to 16-byte boundary |
3302 | if (p as usize) & 0xf == 0 { |
3303 | ofs = 1; |
3304 | p = p.add(1); |
3305 | } |
3306 | |
3307 | _mm_storeu_ps(p, *black_box(&a)); |
3308 | |
3309 | if ofs > 0 { |
3310 | assert_eq!(vals[ofs - 1], 0.0); |
3311 | } |
3312 | assert_eq!(vals[ofs + 0], 1.0); |
3313 | assert_eq!(vals[ofs + 1], 2.0); |
3314 | assert_eq!(vals[ofs + 2], 3.0); |
3315 | assert_eq!(vals[ofs + 3], 4.0); |
3316 | assert_eq!(vals[ofs + 4], 0.0); |
3317 | } |
3318 | |
3319 | #[simd_test(enable = "sse" )] |
3320 | unsafe fn test_mm_move_ss() { |
3321 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3322 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3323 | |
3324 | let r = _mm_move_ss(a, b); |
3325 | let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); |
3326 | assert_eq_m128(e, r); |
3327 | } |
3328 | |
3329 | #[simd_test(enable = "sse" )] |
3330 | unsafe fn test_mm_movemask_ps() { |
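// `_mm_movemask_ps` collects the sign bit of each lane into the low
// four bits of the result (lane 0 -> bit 0); only lanes with the sign
// bit set (here the negative inputs) contribute a 1.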
3331 | let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0)); |
3332 | assert_eq!(r, 0b0101); |
3333 | |
3334 | let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0)); |
3335 | assert_eq!(r, 0b0111); |
3336 | } |
3337 | |
3338 | #[simd_test(enable = "sse" )] |
3339 | // Miri cannot support this until it is clear how it fits in the Rust memory model |
3340 | #[cfg_attr (miri, ignore)] |
3341 | unsafe fn test_mm_sfence() { |
3342 | _mm_sfence(); |
3343 | } |
3344 | |
3345 | #[allow (deprecated)] // FIXME: This tests functions that are immediate UB |
3346 | #[simd_test(enable = "sse" )] |
#[cfg_attr (miri, ignore)] // Miri does not support accessing the CSR
3348 | unsafe fn test_mm_getcsr_setcsr_1() { |
3349 | let saved_csr = _mm_getcsr(); |
3350 | |
3351 | let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0); |
3352 | let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0); |
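// The product is roughly 1.1e-39, below f32::MIN_POSITIVE (about
// 1.18e-38), so it is subnormal; with flush-to-zero enabled the
// multiplication result is flushed to 0.0.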
3353 | |
3354 | _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); |
3355 | let r = _mm_mul_ps(*black_box(&a), *black_box(&b)); |
3356 | |
3357 | _mm_setcsr(saved_csr); |
3358 | |
3359 | let exp = _mm_setr_ps(0.0, 0.0, 0.0, 1.0); |
assert_eq_m128(r, exp); // the subnormal product was flushed to 0.0
3361 | } |
3362 | |
3363 | #[allow (deprecated)] // FIXME: This tests functions that are immediate UB |
3364 | #[simd_test(enable = "sse" )] |
#[cfg_attr (miri, ignore)] // Miri does not support accessing the CSR
3366 | unsafe fn test_mm_getcsr_setcsr_2() { |
// Same as the `test_mm_getcsr_setcsr_1` test, but with the opposite flag value.
3368 | |
3369 | let saved_csr = _mm_getcsr(); |
3370 | |
3371 | let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0); |
3372 | let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0); |
3373 | |
3374 | _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); |
3375 | let r = _mm_mul_ps(*black_box(&a), *black_box(&b)); |
3376 | |
3377 | _mm_setcsr(saved_csr); |
3378 | |
3379 | let exp = _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0); |
3380 | assert_eq_m128(r, exp); // first component is a denormalized f32 |
3381 | } |
3382 | |
3383 | #[allow (deprecated)] // FIXME: This tests functions that are immediate UB |
3384 | #[simd_test(enable = "sse" )] |
#[cfg_attr (miri, ignore)] // Miri does not support accessing the CSR
3386 | unsafe fn test_mm_getcsr_setcsr_underflow() { |
3387 | _MM_SET_EXCEPTION_STATE(0); |
3388 | |
3389 | let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0); |
3390 | let b = _mm_setr_ps(1e-5, 0.0, 0.0, 1.0); |
3391 | |
3392 | assert_eq!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure |
3393 | |
3394 | let r = _mm_mul_ps(*black_box(&a), *black_box(&b)); |
3395 | |
3396 | let exp = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0); |
3397 | assert_eq_m128(r, exp); |
3398 | |
3399 | let underflow = _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0; |
3400 | assert!(underflow); |
3401 | } |
3402 | |
3403 | #[simd_test(enable = "sse" )] |
3404 | unsafe fn test_MM_TRANSPOSE4_PS() { |
3405 | let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3406 | let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3407 | let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0); |
3408 | let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0); |
3409 | |
3410 | _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d); |
3411 | |
3412 | assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0)); |
3413 | assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0)); |
3414 | assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0)); |
3415 | assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0)); |
3416 | } |
3417 | |
3418 | #[repr (align(16))] |
3419 | struct Memory { |
3420 | pub data: [f32; 4], |
3421 | } |
3422 | |
3423 | #[simd_test(enable = "sse" )] |
3424 | // Miri cannot support this until it is clear how it fits in the Rust memory model |
3425 | // (non-temporal store) |
3426 | #[cfg_attr (miri, ignore)] |
3427 | unsafe fn test_mm_stream_ps() { |
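// `_mm_stream_ps` performs a non-temporal store that bypasses the
// cache and requires a 16-byte-aligned destination; the
// `#[repr(align(16))]` `Memory` wrapper above provides that alignment.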
3428 | let a = _mm_set1_ps(7.0); |
3429 | let mut mem = Memory { data: [-1.0; 4] }; |
3430 | |
3431 | _mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a); |
3432 | for i in 0..4 { |
3433 | assert_eq!(mem.data[i], get_m128(a, i)); |
3434 | } |
3435 | } |
3436 | } |
3437 | |