//! Streaming SIMD Extensions (SSE)

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    intrinsics::sqrtf32,
    mem, ptr,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Adds the first component of `a` and `b`; the other components are copied
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) + _mm_cvtss_f32(b)) }
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_add(a, b) }
}
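
// Usage sketch (illustrative only; this helper is not part of the module's
// API): `_mm_add_ss` only touches lane 0, while `_mm_add_ps` adds all four
// lanes.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse")]
fn example_add_ss_vs_add_ps() {
    let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
    let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
    // Scalar add: lane 0 becomes 1.0 + 10.0; lanes 1..=3 are copied from `a`.
    let ss = _mm_add_ss(a, b);
    assert_eq!(_mm_cvtss_f32(ss), 11.0);
    // Packed add: every lane is summed.
    let ps = _mm_add_ps(a, b);
    assert_eq!(_mm_cvtss_f32(ps), 11.0);
}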

/// Subtracts the first component of `b` from `a`; the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) - _mm_cvtss_f32(b)) }
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_sub(a, b) }
}

/// Multiplies the first component of `a` and `b`; the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) * _mm_cvtss_f32(b)) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_mul(a, b) }
}

/// Divides the first component of `a` by `b`; the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) / _mm_cvtss_f32(b)) }
}

/// Divides packed single-precision (32-bit) floating-point elements in `a` by
/// those in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_div(a, b) }
}

/// Returns the square root of the first single-precision (32-bit)
/// floating-point element in `a`; the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_ss(a: __m128) -> __m128 {
    unsafe { simd_insert!(a, 0, sqrtf32(_mm_cvtss_f32(a))) }
}

/// Returns the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_ps(a: __m128) -> __m128 {
    unsafe { simd_fsqrt(a) }
}

/// Returns the approximate reciprocal of the first single-precision (32-bit)
/// floating-point element in `a`; the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rcp_ss(a: __m128) -> __m128 {
    unsafe { rcpss(a) }
}

/// Returns the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rcp_ps(a: __m128) -> __m128 {
    unsafe { rcpps(a) }
}

/// Returns the approximate reciprocal square root of the first
/// single-precision (32-bit) floating-point element in `a`; the other
/// elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rsqrt_ss(a: __m128) -> __m128 {
    unsafe { rsqrtss(a) }
}

/// Returns the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rsqrt_ps(a: __m128) -> __m128 {
    unsafe { rsqrtps(a) }
}
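
// Accuracy sketch (illustrative only): the `rcp`/`rsqrt` families trade
// precision for speed; Intel documents a maximum relative error of about
// 1.5 * 2^-12 for these approximations, so exact comparisons against
// `1.0 / x` will generally fail.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse")]
fn example_rcp_is_approximate() {
    let x = 4.0_f32;
    let approx = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x)));
    let exact = 1.0 / x;
    // Within the documented error bound, but not necessarily bit-exact.
    assert!((approx - exact).abs() <= exact * 1.5 / 4096.0);
}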

/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and returns the minimum value in the first element of the return
/// value; the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { minss(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns the corresponding minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test for why this can't be implemented using
    // `simd_fmin`.
    unsafe { minps(a, b) }
}

/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and returns the maximum value in the first element of the return
/// value; the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { maxss(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns the corresponding maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test for why this can't be implemented using
    // `simd_fmax`.
    unsafe { maxps(a, b) }
}
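
// NaN-handling sketch (illustrative only): like the hardware `minps`/`maxps`
// instructions, these intrinsics return the *second* operand whenever the
// lane comparison fails, so the result for a NaN input depends on operand
// order. This asymmetry is why the implementation cannot use
// `simd_fmin`/`simd_fmax`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse")]
fn example_min_with_nan() {
    let nan = _mm_set1_ps(f32::NAN);
    let one = _mm_set1_ps(1.0);
    // min(NaN, 1.0): the comparison fails, so the second operand (1.0) wins.
    assert_eq!(_mm_cvtss_f32(_mm_min_ps(nan, one)), 1.0);
    // min(1.0, NaN): the comparison also fails, so NaN comes through.
    assert!(_mm_cvtss_f32(_mm_min_ps(one, nan)).is_nan());
}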

/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `and` instructions, so ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
    unsafe {
        let a: __m128i = mem::transmute(a);
        let b: __m128i = mem::transmute(b);
        mem::transmute(simd_and(a, b))
    }
}

/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
/// elements.
///
/// Computes `!a & b` for each bit in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `not` and `and` instructions, so ignore
// it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andnps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
    unsafe {
        let a: __m128i = mem::transmute(a);
        let b: __m128i = mem::transmute(b);
        let mask: __m128i = mem::transmute(i32x4::splat(-1));
        mem::transmute(simd_and(simd_xor(mask, a), b))
    }
}
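
// A classic `_mm_andnot_ps` application (a minimal sketch): `-0.0` is exactly
// the sign-bit pattern, so `!sign_mask & x` clears every sign bit, computing
// a packed absolute value.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse")]
fn example_abs_via_andnot() {
    let x = _mm_setr_ps(-1.0, 2.0, -3.0, 4.0);
    let abs = _mm_andnot_ps(_mm_set1_ps(-0.0), x);
    let expected = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
    // `_mm_movemask_ps` of a full-lane equality mask is 0b1111.
    assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(abs, expected)), 0b1111);
}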

/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `or` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(orps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
    unsafe {
        let a: __m128i = mem::transmute(a);
        let b: __m128i = mem::transmute(b);
        mem::transmute(simd_or(a, b))
    }
}

/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
/// elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `xor` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(xorps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
    unsafe {
        let a: __m128i = mem::transmute(a);
        let b: __m128i = mem::transmute(b);
        mem::transmute(simd_xor(a, b))
    }
}

/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits
/// of the result will be `0xffffffff` if the two inputs are equal, or `0`
/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 0) }
}

/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
/// of the result will be `0xffffffff` if `a.extract(0)` is less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 1) }
}

/// Compares the lowest `f32` of both inputs for less than or equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 2) }
}

/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3]) }
}

/// Compares the lowest `f32` of both inputs for greater than or equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
/// bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3]) }
}

/// Compares the lowest `f32` of both inputs for inequality. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not equal to
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 4) }
}

/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 5) }
}

/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 6) }
}

/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest
/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// greater than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the
/// result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3]) }
}

/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal.
/// The lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
/// not greater than or equal to `b.extract(0)`, or `0` otherwise. The upper
/// 96 bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3]) }
}

/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits
/// of the result will be `0xffffffff` if neither `a.extract(0)` nor
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 7) }
}

/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32
/// bits of the result will be `0xffffffff` if either `a.extract(0)` or
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 3) }
}

/// Compares each of the four floats in `a` to the corresponding element in
/// `b`. The result in the output vector will be `0xffffffff` if the input
/// elements were equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(a, b, 0) }
}

/// Compares each of the four floats in `a` to the corresponding element in
/// `b`. The result in the output vector will be `0xffffffff` if the input
/// element in `a` is less than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(a, b, 1) }
}

/// Compares each of the four floats in `a` to the corresponding element in
/// `b`. The result in the output vector will be `0xffffffff` if the input
/// element in `a` is less than or equal to the corresponding element in `b`,
/// or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(a, b, 2) }
}

/// Compares each of the four floats in `a` to the corresponding element in
/// `b`. The result in the output vector will be `0xffffffff` if the input
/// element in `a` is greater than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(b, a, 1) }
}

/// Compares each of the four floats in `a` to the corresponding element in
/// `b`. The result in the output vector will be `0xffffffff` if the input
/// element in `a` is greater than or equal to the corresponding element in
/// `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(b, a, 2) }
}

/// Compares each of the four floats in `a` to the corresponding element in
/// `b`. The result in the output vector will be `0xffffffff` if the input
/// elements are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(a, b, 4) }
}

/// Compares each of the four floats in `a` to the corresponding element in
/// `b`. The result in the output vector will be `0xffffffff` if the input
/// element in `a` is **not** less than the corresponding element in `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(a, b, 5) }
}

/// Compares each of the four floats in `a` to the corresponding element in
/// `b`. The result in the output vector will be `0xffffffff` if the input
/// element in `a` is **not** less than or equal to the corresponding element
/// in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(a, b, 6) }
}

/// Compares each of the four floats in `a` to the corresponding element in
/// `b`. The result in the output vector will be `0xffffffff` if the input
/// element in `a` is **not** greater than the corresponding element in `b`,
/// or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(b, a, 5) }
}

/// Compares each of the four floats in `a` to the corresponding element in
/// `b`. The result in the output vector will be `0xffffffff` if the input
/// element in `a` is **not** greater than or equal to the corresponding
/// element in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(b, a, 6) }
}

/// Compares each of the four floats in `a` to the corresponding element in
/// `b`. Returns four floats that have one of two possible bit patterns. The
/// element in the output vector will be `0xffffffff` if the input elements in
/// `a` and `b` are ordered (i.e., neither of them is a NaN), or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(b, a, 7) }
}

/// Compares each of the four floats in `a` to the corresponding element in
/// `b`. Returns four floats that have one of two possible bit patterns. The
/// element in the output vector will be `0xffffffff` if the input elements in
/// `a` and `b` are unordered (i.e., at least one of them is a NaN), or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(b, a, 3) }
}
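
// The packed comparisons return all-ones/all-zeros lane masks, which compose
// with the bitwise operations above into a branchless per-lane select (a
// minimal sketch; `example_select_lt` is an illustrative helper, not an SSE
// intrinsic).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse")]
fn example_select_lt(a: __m128, b: __m128) -> __m128 {
    // Where a < b, the mask lane is all ones.
    let mask = _mm_cmplt_ps(a, b);
    // (mask & a) | (!mask & b): `a` where the comparison held, `b` elsewhere.
    _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b))
}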

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
    unsafe { comieq_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
    unsafe { comilt_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
    unsafe { comile_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
    unsafe { comigt_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
    unsafe { comige_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
    unsafe { comineq_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise. This instruction will not signal
/// an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
    unsafe { ucomieq_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
/// This instruction will not signal an exception if either argument is a
/// quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
    unsafe { ucomilt_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or
/// `0` otherwise. This instruction will not signal an exception if either
/// argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
    unsafe { ucomile_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either
/// argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
    unsafe { ucomigt_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise. This instruction will not signal an exception if either
/// argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
    unsafe { ucomige_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise. This instruction will not
/// signal an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
    unsafe { ucomineq_ss(a, b) }
}
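
// Usage sketch for the scalar comparisons (illustrative only): unlike the
// `cmp*_ss` family, these return `0`/`1` rather than a lane mask, and with a
// NaN operand the ordered predicates report `0`. `comiss` signals an
// invalid-operation exception on any NaN while `ucomiss` only does so for
// signaling NaNs, which matters only when floating-point exceptions are
// unmasked.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse")]
fn example_comi_with_nan() {
    let nan = _mm_set_ss(f32::NAN);
    let one = _mm_set_ss(1.0);
    assert_eq!(_mm_ucomieq_ss(nan, one), 0); // unordered, so not "equal"
    assert_eq!(_mm_ucomile_ss(one, one), 1); // 1.0 <= 1.0
}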

/// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
///
/// The result is rounded according to the current rounding mode. If the result
/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
/// (`i32::MIN`).
///
/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtss_si32(a: __m128) -> i32 {
    unsafe { cvtss2si(a) }
}

/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvt_ss2si(a: __m128) -> i32 {
    _mm_cvtss_si32(a)
}

/// Converts the lowest 32 bit float in the input vector to a 32 bit integer
/// with truncation.
///
/// The result is always rounded using truncation (round towards zero). If the
/// result cannot be represented as a 32 bit integer the result will be
/// `0x8000_0000` (`i32::MIN`).
///
/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttss_si32(a: __m128) -> i32 {
    unsafe { cvttss2si(a) }
}
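
// Rounding sketch (illustrative only): under the default MXCSR rounding mode
// (round to nearest, ties to even) `cvtss2si` rounds, while `cvttss2si`
// always truncates toward zero.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse")]
fn example_round_vs_truncate() {
    assert_eq!(_mm_cvtss_si32(_mm_set_ss(1.5)), 2); // ties go to even
    assert_eq!(_mm_cvttss_si32(_mm_set_ss(1.5)), 1); // fraction dropped
    assert_eq!(_mm_cvttss_si32(_mm_set_ss(-1.5)), -1); // toward zero
}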

/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtt_ss2si(a: __m128) -> i32 {
    _mm_cvttss_si32(a)
}

/// Extracts the lowest 32 bit float from the input vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32)
#[inline]
#[target_feature(enable = "sse")]
// No point in using assert_instrs. In the Unix x86_64 calling convention this
// is a no-op, and on MSVC it's just a `mov`.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtss_f32(a: __m128) -> f32 {
    unsafe { simd_extract!(a, 0) }
}

/// Converts a 32 bit integer to a 32 bit float. The result vector is the input
/// vector `a` with the lowest 32 bit float replaced by the converted integer.
///
/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
/// input).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
    unsafe { cvtsi2ss(a, b) }
}

/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
    _mm_cvtsi32_ss(a, b)
}

/// Construct a `__m128` with the lowest element set to `a` and the rest set to
/// zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_ss(a: f32) -> __m128 {
    __m128([a, 0.0, 0.0, 0.0])
}

/// Construct a `__m128` with all elements set to `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_ps(a: f32) -> __m128 {
    __m128([a, a, a, a])
}

/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_ps1(a: f32) -> __m128 {
    _mm_set1_ps(a)
}

/// Construct a `__m128` from four floating point values highest to lowest.
///
/// Note that `a` will be the highest 32 bits of the result, and `d` the
/// lowest. This matches the standard way of writing bit patterns on x86:
///
/// ```text
///  bit    127 .. 96  95 .. 64  63 .. 32  31 .. 0
///        +---------+---------+---------+---------+
///        |    a    |    b    |    c    |    d    |   result
///        +---------+---------+---------+---------+
/// ```
///
/// Alternatively:
///
/// ```text
/// let v = _mm_set_ps(d, c, b, a);
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    __m128([d, c, b, a])
}

/// Construct a `__m128` from four floating point values lowest to highest.
///
/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
/// bits of the result, and `d` the highest.
///
/// ```text
/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, any(target_env = "msvc", target_arch = "x86_64")),
    assert_instr(unpcklps)
)]
// On a 32-bit architecture on non-msvc it just copies the operands from the
// stack.
#[cfg_attr(
    all(test, all(not(target_env = "msvc"), target_arch = "x86")),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    __m128([a, b, c, d])
}
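
// Ordering sketch (illustrative only): `_mm_set_ps` lists lanes from highest
// to lowest while `_mm_setr_ps` lists them in memory order, so these two
// calls construct the same vector.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse")]
fn example_set_vs_setr() {
    let hi_to_lo = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
    let lo_to_hi = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
    // Every lane compares equal, so the movemask is all four bits.
    assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(hi_to_lo, lo_to_hi)), 0b1111);
}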

/// Construct a `__m128` with all elements initialized to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setzero_ps() -> __m128 {
    const { unsafe { mem::zeroed() } }
}

/// A utility function for creating masks to use with Intel shuffle and
/// permute intrinsics.
#[inline]
#[allow(non_snake_case)]
#[unstable(feature = "stdarch_x86_mm_shuffle", issue = "111147")]
pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
    ((z << 6) | (y << 4) | (x << 2) | w) as i32
}

/// Shuffles packed single-precision (32-bit) floating-point elements in `a`
/// and `b` using `MASK`.
///
/// The lower half of the result takes values from `a` and the higher half
/// from `b`. The mask is split into four 2-bit fields, each indexing one
/// element from its input.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps)
///
/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
/// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_).
/// Performing an implicit type conversion between an unsigned integer and a
/// signed integer does not cause a problem in C; however, Rust's commitment
/// to strong typing does not allow this.
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 4,
                ((MASK as u32 >> 6) & 0b11) + 4,
            ],
        )
    }
}
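
// Shuffle sketch (illustrative only): the two low mask fields index into `a`
// and the two high fields into `b`, each field being two bits. The constant
// below is `_MM_SHUFFLE(0, 1, 2, 3)` written out as a literal.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse")]
fn example_shuffle() {
    let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
    let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
    // Result lanes, low to high: a[3], a[2], b[1], b[0].
    let r = _mm_shuffle_ps::<0b00_01_10_11>(a, b);
    let expected = _mm_setr_ps(4.0, 3.0, 6.0, 5.0);
    assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(r, expected)), 0b1111);
}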

/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the higher half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) }
}

/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the lower half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) }
}

/// Combines the higher halves of `a` and `b`; the higher half of `b` occupies
/// the lower half of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movhlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO: figure out why this is a different instruction on MSVC.
    unsafe { simd_shuffle!(a, b, [6, 7, 2, 3]) }
}

/// Combines the lower halves of `a` and `b`; the lower half of `b` occupies
/// the higher half of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, b, [0, 1, 4, 5]) }
}

/// Returns a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 4 least significant bits of the return value.
/// All other bits are set to `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movemask_ps(a: __m128) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        let mask: i32x4 = simd_lt(transmute(a), i32x4::ZERO);
        simd_bitmask::<i32x4, u8>(mask).into()
    }
}
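
// Movemask sketch (illustrative only): each returned bit is the sign bit of
// one lane, so combining a packed comparison with `_mm_movemask_ps` answers
// "which lanes matched?" as a plain integer.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse")]
fn example_movemask() {
    let x = _mm_setr_ps(-1.0, 2.0, -3.0, 4.0);
    // Lanes 0 and 2 are negative, so bits 0 and 2 are set.
    assert_eq!(_mm_movemask_ps(x), 0b0101);
}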

/// Construct a `__m128` with the lowest element read from `p` and the other
/// elements set to zero.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
    __m128([*p, 0.0, 0.0, 0.0])
}

/// Construct a `__m128` by duplicating the value read from `p` into all
/// elements.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
    let a = *p;
    __m128([a, a, a, a])
}

/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
    _mm_load1_ps(p)
}

/// Loads four `f32` values from *aligned* memory into a `__m128`. If the
/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
    *(p as *const __m128)
}

/// Loads four `f32` values from memory into a `__m128`. There are no
/// restrictions on memory alignment. For aligned memory
/// [`_mm_load_ps`](fn._mm_load_ps.html) may be faster.
///
/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
    // Note: Using `*p` would require `f32` alignment, but `movups` has no
    // alignment restrictions.
    let mut dst = _mm_undefined_ps();
    ptr::copy_nonoverlapping(
        p as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128>(),
    );
    dst
}
1192 | |
1193 | /// Loads four `f32` values from aligned memory into a `__m128` in reverse |
1194 | /// order. |
1195 | /// |
1196 | /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1197 | /// protection fault will be triggered (fatal program crash). |
1198 | /// |
1199 | /// Functionally equivalent to the following code sequence (assuming `p` |
1200 | /// satisfies the alignment restrictions): |
1201 | /// |
1202 | /// ```text |
1203 | /// let a0 = *p; |
1204 | /// let a1 = *p.add(1); |
1205 | /// let a2 = *p.add(2); |
1206 | /// let a3 = *p.add(3); |
1207 | /// __m128::new(a3, a2, a1, a0) |
1208 | /// ``` |
1209 | /// |
1210 | /// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some |
1211 | /// shuffling. |
1212 | /// |
1213 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps) |
1214 | #[inline ] |
1215 | #[target_feature (enable = "sse" )] |
1216 | #[cfg_attr (test, assert_instr(movaps))] |
1217 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1218 | pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { |
1219 | let a: __m128 = _mm_load_ps(p); |
1220 | simd_shuffle!(a, a, [3, 2, 1, 0]) |
1221 | } |
1222 | |
1223 | /// Stores the lowest 32 bit float of `a` into memory. |
1224 | /// |
1225 | /// This intrinsic corresponds to the `MOVSS` instruction. |
1226 | /// |
1227 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss) |
1228 | #[inline ] |
1229 | #[target_feature (enable = "sse" )] |
1230 | #[cfg_attr (test, assert_instr(movss))] |
1231 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1232 | pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { |
1233 | *p = simd_extract!(a, 0); |
1234 | } |
1235 | |
1236 | /// Stores the lowest 32 bit float of `a` repeated four times into *aligned* |
1237 | /// memory. |
1238 | /// |
1239 | /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1240 | /// protection fault will be triggered (fatal program crash). |
1241 | /// |
1242 | /// Functionally equivalent to the following code sequence (assuming `p` |
1243 | /// satisfies the alignment restrictions): |
1244 | /// |
1245 | /// ```text |
1246 | /// let x = a.extract(0); |
1247 | /// *p = x; |
1248 | /// *p.add(1) = x; |
1249 | /// *p.add(2) = x; |
1250 | /// *p.add(3) = x; |
1251 | /// ``` |
1252 | /// |
1253 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps) |
1254 | #[inline ] |
1255 | #[target_feature (enable = "sse" )] |
1256 | #[cfg_attr (test, assert_instr(movaps))] |
1257 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1258 | #[allow (clippy::cast_ptr_alignment)] |
1259 | pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { |
1260 | let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]); |
1261 | *(p as *mut __m128) = b; |
1262 | } |
1263 | |
1264 | /// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html) |
1265 | /// |
1266 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1) |
1267 | #[inline ] |
1268 | #[target_feature (enable = "sse" )] |
1269 | #[cfg_attr (test, assert_instr(movaps))] |
1270 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1271 | pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { |
1272 | _mm_store1_ps(p, a); |
1273 | } |
1274 | |
1275 | /// Stores four 32-bit floats into *aligned* memory. |
1276 | /// |
1277 | /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1278 | /// protection fault will be triggered (fatal program crash). |
1279 | /// |
1280 | /// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned |
1281 | /// memory. |
1282 | /// |
1283 | /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. |
1284 | /// |
1285 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps) |
1286 | #[inline ] |
1287 | #[target_feature (enable = "sse" )] |
1288 | #[cfg_attr (test, assert_instr(movaps))] |
1289 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1290 | #[allow (clippy::cast_ptr_alignment)] |
1291 | pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { |
1292 | *(p as *mut __m128) = a; |
1293 | } |
1294 | |
1295 | /// Stores four 32-bit floats into memory. There are no restrictions on memory |
1296 | /// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be |
1297 | /// faster. |
1298 | /// |
1299 | /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. |
1300 | /// |
1301 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps) |
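///
/// A minimal round-trip sketch (not part of the original docs):
///
/// ```rust,ignore
/// let mut out = [0.0_f32; 4];
/// let v = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// // `out` needs no particular alignment here.
/// unsafe { _mm_storeu_ps(out.as_mut_ptr(), v) };
/// assert_eq!(out, [1.0, 2.0, 3.0, 4.0]);
/// ```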
1302 | #[inline ] |
1303 | #[target_feature (enable = "sse" )] |
1304 | #[cfg_attr (test, assert_instr(movups))] |
1305 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1306 | pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { |
ptr::copy_nonoverlapping(
ptr::addr_of!(a) as *const u8,
p as *mut u8,
mem::size_of::<__m128>(),
);
1312 | } |
1313 | |
1314 | /// Stores four 32-bit floats into *aligned* memory in reverse order. |
1315 | /// |
1316 | /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1317 | /// protection fault will be triggered (fatal program crash). |
1318 | /// |
1319 | /// Functionally equivalent to the following code sequence (assuming `p` |
1320 | /// satisfies the alignment restrictions): |
1321 | /// |
1322 | /// ```text |
1323 | /// *p = a.extract(3); |
1324 | /// *p.add(1) = a.extract(2); |
1325 | /// *p.add(2) = a.extract(1); |
1326 | /// *p.add(3) = a.extract(0); |
1327 | /// ``` |
1328 | /// |
1329 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps) |
1330 | #[inline ] |
1331 | #[target_feature (enable = "sse" )] |
1332 | #[cfg_attr (test, assert_instr(movaps))] |
1333 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1334 | #[allow (clippy::cast_ptr_alignment)] |
1335 | pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { |
1336 | let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]); |
1337 | *(p as *mut __m128) = b; |
1338 | } |
1339 | |
1340 | /// Returns a `__m128` with the first component from `b` and the remaining |
1341 | /// components from `a`. |
1342 | /// |
1343 | /// In other words for any `a` and `b`: |
1344 | /// ```text |
1345 | /// _mm_move_ss(a, b) == a.replace(0, b.extract(0)) |
1346 | /// ``` |
1347 | /// |
1348 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss) |
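///
/// A concrete sketch of the equivalence above (not part of the original
/// docs):
///
/// ```rust,ignore
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// // Lane 0 comes from `b`; lanes 1, 2, 3 come from `a`.
/// let r = _mm_move_ss(a, b); // == _mm_setr_ps(5.0, 2.0, 3.0, 4.0)
/// ```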
1349 | #[inline ] |
1350 | #[target_feature (enable = "sse" )] |
1351 | #[cfg_attr (test, assert_instr(movss))] |
1352 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1353 | pub fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { |
1354 | unsafe { simd_shuffle!(a, b, [4, 1, 2, 3]) } |
1355 | } |
1356 | |
1357 | /// Performs a serializing operation on all non-temporal ("streaming") store instructions that |
1358 | /// were issued by the current thread prior to this instruction. |
1359 | /// |
1360 | /// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is |
1361 | /// ordered before any load or store instruction which follows the fence in |
1362 | /// synchronization order. |
1363 | /// |
1364 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence) |
1365 | /// (but note that Intel is only documenting the hardware-level concerns related to this |
1366 | /// instruction; the Intel documentation does not take into account the extra concerns that arise |
1367 | /// because the Rust memory model is different from the x86 memory model.) |
1368 | /// |
1369 | /// # Safety of non-temporal stores |
1370 | /// |
1371 | /// After using any non-temporal store intrinsic, but before any other access to the memory that the |
1372 | /// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the |
1373 | /// intrinsic. |
1374 | /// |
/// Non-temporal stores behave very differently from regular stores. For the purpose of the Rust
1376 | /// memory model, these stores are happening asynchronously in a background thread. This means a |
1377 | /// non-temporal store can cause data races with other accesses, even other accesses on the same |
1378 | /// thread. It also means that cross-thread synchronization does not work as expected: let's say the |
1379 | /// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The |
1380 | /// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not |
1381 | /// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize |
1382 | /// with all the non-temporal stores previously started on this thread, which means in particular |
1383 | /// that subsequent synchronization with other threads will then work as intended again. |
1384 | /// |
1385 | /// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your |
1386 | /// code jumps back to code outside your library. This ensures all stores inside your function |
1387 | /// are synchronized-before the return, and thus transitively synchronized-before everything |
1388 | /// the caller does after your function returns. |
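///
/// A minimal sketch of that pattern (not part of the original docs), using a
/// hypothetical `fill_streaming` helper whose `dst` must be 16-byte aligned
/// and whose `len` must be a multiple of 4:
///
/// ```rust,ignore
/// unsafe fn fill_streaming(dst: *mut f32, v: __m128, len: usize) {
///     for i in (0..len).step_by(4) {
///         _mm_stream_ps(dst.add(i), v); // non-temporal store
///     }
///     // Fence before returning, so the caller observes all streamed stores.
///     _mm_sfence();
/// }
/// ```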
1389 | // |
1390 | // The following is not a doc comment since it's not clear whether we want to put this into the |
1391 | // docs, but it should be written out somewhere. |
1392 | // |
1393 | // Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot |
1394 | // inspect, and that behave like the following functions. This explains where the docs above come |
1395 | // from. |
1396 | // ``` |
1397 | // #[thread_local] |
// static mut PENDING_NONTEMP_WRITES: AtomicUsize = AtomicUsize::new(0);
1399 | // |
1400 | // pub unsafe fn nontemporal_store<T>(ptr: *mut T, val: T) { |
1401 | // PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed); |
1402 | // // Spawn a thread that will eventually do our write. |
1403 | // // We need to fetch a pointer to this thread's pending-write |
1404 | // // counter, so that we can access it from the background thread. |
1405 | // let pending_writes = addr_of!(PENDING_NONTEMP_WRITES); |
1406 | // // If this was actual Rust code we'd have to do some extra work |
1407 | // // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here. |
1408 | // std::thread::spawn(move || { |
1409 | // // Do the write in the background thread. |
1410 | // ptr.write(val); |
1411 | // // Register the write as done. Crucially, this is `Release`, so it |
// // syncs-with the `Acquire` in `sfence`.
1413 | // (&*pending_writes).fetch_sub(1, Release); |
1414 | // }); |
1415 | // } |
1416 | // |
1417 | // pub fn sfence() { |
1418 | // unsafe { |
1419 | // // Wait until there are no more pending writes. |
1420 | // while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {} |
1421 | // } |
1422 | // } |
1423 | // ``` |
1424 | #[inline ] |
1425 | #[target_feature (enable = "sse" )] |
1426 | #[cfg_attr (test, assert_instr(sfence))] |
1427 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1428 | pub unsafe fn _mm_sfence() { |
1429 | sfence() |
1430 | } |
1431 | |
1432 | /// Gets the unsigned 32-bit value of the MXCSR control and status register. |
1433 | /// |
1434 | /// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust |
1435 | /// floating-point operations may or may not result in this register getting updated with exception |
1436 | /// state, and the register can change between two invocations of this function even when no |
1437 | /// floating-point operations appear in the source code (since floating-point operations appearing |
1438 | /// earlier or later can be reordered). |
1439 | /// |
1440 | /// If you need to perform some floating-point operations and check whether they raised an |
1441 | /// exception, use an inline assembly block for the entire sequence of operations. |
1442 | /// |
1443 | /// For more info see [`_mm_setcsr`](fn._mm_setcsr.html) |
1444 | /// |
1445 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr) |
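///
/// A hypothetical sketch (not part of the original docs) of reading MXCSR via
/// inline assembly, as the deprecation note suggests:
///
/// ```rust,ignore
/// let mut mxcsr: u32 = 0;
/// unsafe {
///     // `stmxcsr` stores the MXCSR register into the pointed-to memory.
///     core::arch::asm!(
///         "stmxcsr [{0}]",
///         in(reg) core::ptr::addr_of_mut!(mxcsr),
///         options(nostack, preserves_flags),
///     );
/// }
/// ```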
1446 | #[inline ] |
1447 | #[target_feature (enable = "sse" )] |
1448 | #[cfg_attr (test, assert_instr(stmxcsr))] |
1449 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1450 | #[deprecated ( |
1451 | since = "1.75.0" , |
1452 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1453 | )] |
1454 | pub unsafe fn _mm_getcsr() -> u32 { |
1455 | unsafe { |
1456 | let mut result: i32 = 0_i32; |
1457 | stmxcsr(ptr::addr_of_mut!(result) as *mut i8); |
1458 | result as u32 |
1459 | } |
1460 | } |
1461 | |
1462 | /// Sets the MXCSR register with the 32-bit unsigned integer value. |
1463 | /// |
1464 | /// This register controls how SIMD instructions handle floating point |
1465 | /// operations. Modifying this register only affects the current thread. |
1466 | /// |
1467 | /// It contains several groups of flags: |
1468 | /// |
1469 | /// * *Exception flags* report which exceptions occurred since last they were reset. |
1470 | /// |
1471 | /// * *Masking flags* can be used to mask (ignore) certain exceptions. By default |
1472 | /// these flags are all set to 1, so all exceptions are masked. When |
1473 | /// an exception is masked, the processor simply sets the exception flag and |
1474 | /// continues the operation. If the exception is unmasked, the flag is also set |
1475 | /// but additionally an exception handler is invoked. |
1476 | /// |
1477 | /// * *Rounding mode flags* control the rounding mode of floating point |
1478 | /// instructions. |
1479 | /// |
1480 | /// * The *denormals-are-zero mode flag* turns all numbers which would be |
1481 | /// denormalized (exponent bits are all zeros) into zeros. |
1482 | /// |
1483 | /// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to |
1484 | /// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and |
1485 | /// will optimize accordingly. This even applies when the register is altered and later reset to its |
1486 | /// original value without any floating-point operations appearing in the source code between those |
1487 | /// operations (since floating-point operations appearing earlier or later can be reordered). |
1488 | /// |
/// If you need to perform some floating-point operations under different masking flags, rounding
1490 | /// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the |
1491 | /// original MXCSR register state before the end of the block. |
1492 | /// |
1493 | /// ## Exception Flags |
1494 | /// |
1495 | /// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing |
1496 | /// Infinity by Infinity). |
1497 | /// |
1498 | /// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized |
1499 | /// number. Mainly this can cause loss of precision. |
1500 | /// |
1501 | /// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred. |
1502 | /// |
1503 | /// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a |
1504 | /// result was too large to be represented (e.g., an `f32` with absolute |
1505 | /// value greater than `2^128`). |
1506 | /// |
1507 | /// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a |
1508 | /// result was too small to be represented in a normalized way (e.g., an |
1509 | /// `f32` with absolute value smaller than `2^-126`.) |
1510 | /// |
1511 | /// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a. |
1512 | /// precision exception). This means some precision was lost due to rounding. |
1513 | /// For example, the fraction `1/3` cannot be represented accurately in a |
1514 | /// 32 or 64 bit float and computing it would cause this exception to be |
1515 | /// raised. Precision exceptions are very common, so they are usually masked. |
1516 | /// |
1517 | /// Exception flags can be read and set using the convenience functions |
1518 | /// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to |
1519 | /// check if an operation caused some overflow: |
1520 | /// |
1521 | /// ```rust,ignore |
1522 | /// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags |
1523 | /// // perform calculations |
1524 | /// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 { |
1525 | /// // handle overflow |
1526 | /// } |
1527 | /// ``` |
1528 | /// |
1529 | /// ## Masking Flags |
1530 | /// |
1531 | /// There is one masking flag for each exception flag: `_MM_MASK_INVALID`, |
1532 | /// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`, |
1533 | /// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`. |
1534 | /// |
1535 | /// A single masking bit can be set via |
1536 | /// |
1537 | /// ```rust,ignore |
1538 | /// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW); |
1539 | /// ``` |
1540 | /// |
1541 | /// However, since mask bits are by default all set to 1, it is more common to |
1542 | /// want to *disable* certain bits. For example, to unmask the underflow |
1543 | /// exception, use: |
1544 | /// |
1545 | /// ```rust,ignore |
/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception
1548 | /// ``` |
1549 | /// |
/// Warning: an unmasked exception will cause an exception handler to be
/// called. The standard handler will simply terminate the process. So, in
/// this case any underflow exception would terminate the current process
/// with something like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1555 | /// |
1556 | /// ## Rounding Mode |
1557 | /// |
/// The rounding mode is described using two bits. It can be read and set using
1559 | /// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and |
1560 | /// `_MM_SET_ROUNDING_MODE(mode)`. |
1561 | /// |
1562 | /// The rounding modes are: |
1563 | /// |
1564 | /// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision |
1565 | /// value. If two values are equally close, round to even (i.e., least |
1566 | /// significant bit will be zero). |
1567 | /// |
1568 | /// * `_MM_ROUND_DOWN`: Round toward negative Infinity. |
1569 | /// |
1570 | /// * `_MM_ROUND_UP`: Round toward positive Infinity. |
1571 | /// |
1572 | /// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate). |
1573 | /// |
1574 | /// Example: |
1575 | /// |
1576 | /// ```rust,ignore |
1577 | /// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN) |
1578 | /// ``` |
1579 | /// |
1580 | /// ## Denormals-are-zero/Flush-to-zero Mode |
1581 | /// |
1582 | /// If this bit is set, values that would be denormalized will be set to zero |
1583 | /// instead. This is turned off by default. |
1584 | /// |
1585 | /// You can read and enable/disable this mode via the helper functions |
1586 | /// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`: |
1587 | /// |
1588 | /// ```rust,ignore |
1589 | /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default) |
1590 | /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on |
1591 | /// ``` |
1592 | /// |
1593 | /// |
1594 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr) |
1595 | #[inline ] |
1596 | #[target_feature (enable = "sse" )] |
1597 | #[cfg_attr (test, assert_instr(ldmxcsr))] |
1598 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1599 | #[deprecated ( |
1600 | since = "1.75.0" , |
1601 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1602 | )] |
1603 | pub unsafe fn _mm_setcsr(val: u32) { |
1604 | ldmxcsr(ptr::addr_of!(val) as *const i8); |
1605 | } |
1606 | |
1607 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1608 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1609 | pub const _MM_EXCEPT_INVALID: u32 = 0x0001; |
1610 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1611 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1612 | pub const _MM_EXCEPT_DENORM: u32 = 0x0002; |
1613 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1614 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1615 | pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004; |
1616 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1617 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1618 | pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008; |
1619 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1620 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1621 | pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010; |
1622 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1623 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1624 | pub const _MM_EXCEPT_INEXACT: u32 = 0x0020; |
1625 | /// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html) |
1626 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1627 | pub const _MM_EXCEPT_MASK: u32 = 0x003f; |
1628 | |
1629 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1630 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1631 | pub const _MM_MASK_INVALID: u32 = 0x0080; |
1632 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1633 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1634 | pub const _MM_MASK_DENORM: u32 = 0x0100; |
1635 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1636 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1637 | pub const _MM_MASK_DIV_ZERO: u32 = 0x0200; |
1638 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1639 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1640 | pub const _MM_MASK_OVERFLOW: u32 = 0x0400; |
1641 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1642 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1643 | pub const _MM_MASK_UNDERFLOW: u32 = 0x0800; |
1644 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1645 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1646 | pub const _MM_MASK_INEXACT: u32 = 0x1000; |
1647 | /// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html) |
1648 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1649 | pub const _MM_MASK_MASK: u32 = 0x1f80; |
1650 | |
1651 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1652 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1653 | pub const _MM_ROUND_NEAREST: u32 = 0x0000; |
1654 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1655 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1656 | pub const _MM_ROUND_DOWN: u32 = 0x2000; |
1657 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1658 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1659 | pub const _MM_ROUND_UP: u32 = 0x4000; |
1660 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1661 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1662 | pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000; |
1663 | |
1664 | /// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html) |
1665 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1666 | pub const _MM_ROUND_MASK: u32 = 0x6000; |
1667 | |
1668 | /// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html) |
1669 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1670 | pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000; |
1671 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1672 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1673 | pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000; |
1674 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1675 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1676 | pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; |
1677 | |
1678 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1679 | /// |
1680 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK) |
1681 | #[inline ] |
1682 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1683 | #[allow (non_snake_case)] |
1684 | #[target_feature (enable = "sse" )] |
1685 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1686 | #[deprecated ( |
1687 | since = "1.75.0" , |
1688 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1689 | )] |
1690 | pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { |
1691 | _mm_getcsr() & _MM_MASK_MASK |
1692 | } |
1693 | |
1694 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1695 | /// |
1696 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE) |
1697 | #[inline ] |
1698 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1699 | #[allow (non_snake_case)] |
1700 | #[target_feature (enable = "sse" )] |
1701 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1702 | #[deprecated ( |
1703 | since = "1.75.0" , |
1704 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1705 | )] |
1706 | pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { |
1707 | _mm_getcsr() & _MM_EXCEPT_MASK |
1708 | } |
1709 | |
1710 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1711 | /// |
1712 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE) |
1713 | #[inline ] |
1714 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1715 | #[allow (non_snake_case)] |
1716 | #[target_feature (enable = "sse" )] |
1717 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1718 | #[deprecated ( |
1719 | since = "1.75.0" , |
1720 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1721 | )] |
1722 | pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { |
1723 | _mm_getcsr() & _MM_FLUSH_ZERO_MASK |
1724 | } |
1725 | |
1726 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1727 | /// |
1728 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE) |
1729 | #[inline ] |
1730 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1731 | #[allow (non_snake_case)] |
1732 | #[target_feature (enable = "sse" )] |
1733 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1734 | #[deprecated ( |
1735 | since = "1.75.0" , |
1736 | note = "see `_mm_getcsr` documentation - use inline assembly instead" |
1737 | )] |
1738 | pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { |
1739 | _mm_getcsr() & _MM_ROUND_MASK |
1740 | } |
1741 | |
1742 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1743 | /// |
1744 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK) |
1745 | #[inline ] |
1746 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1747 | #[allow (non_snake_case)] |
1748 | #[target_feature (enable = "sse" )] |
1749 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1750 | #[deprecated ( |
1751 | since = "1.75.0" , |
1752 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1753 | )] |
1754 | pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { |
1755 | _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | (x & _MM_MASK_MASK)) |
1756 | } |
1757 | |
1758 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1759 | /// |
1760 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE) |
1761 | #[inline ] |
1762 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1763 | #[allow (non_snake_case)] |
1764 | #[target_feature (enable = "sse" )] |
1765 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1766 | #[deprecated ( |
1767 | since = "1.75.0" , |
1768 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1769 | )] |
1770 | pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { |
1771 | _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | (x & _MM_EXCEPT_MASK)) |
1772 | } |
1773 | |
1774 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1775 | /// |
1776 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE) |
1777 | #[inline ] |
1778 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1779 | #[allow (non_snake_case)] |
1780 | #[target_feature (enable = "sse" )] |
1781 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1782 | #[deprecated ( |
1783 | since = "1.75.0" , |
1784 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1785 | )] |
1786 | pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { |
1787 | _mm_setcsr((_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | (x & _MM_FLUSH_ZERO_MASK)) |
1788 | } |
1789 | |
1790 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) |
1791 | /// |
1792 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE) |
1793 | #[inline ] |
1794 | #[allow (deprecated)] // Deprecated function implemented on top of deprecated function |
1795 | #[allow (non_snake_case)] |
1796 | #[target_feature (enable = "sse" )] |
1797 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1798 | #[deprecated ( |
1799 | since = "1.75.0" , |
1800 | note = "see `_mm_setcsr` documentation - use inline assembly instead" |
1801 | )] |
1802 | pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) { |
1803 | _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | (x & _MM_ROUND_MASK)) |
1804 | } |
1805 | |
1806 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1807 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1808 | pub const _MM_HINT_T0: i32 = 3; |
1809 | |
1810 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1811 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1812 | pub const _MM_HINT_T1: i32 = 2; |
1813 | |
1814 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1815 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1816 | pub const _MM_HINT_T2: i32 = 1; |
1817 | |
1818 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1819 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1820 | pub const _MM_HINT_NTA: i32 = 0; |
1821 | |
1822 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1823 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1824 | pub const _MM_HINT_ET0: i32 = 7; |
1825 | |
1826 | /// See [`_mm_prefetch`](fn._mm_prefetch.html). |
1827 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1828 | pub const _MM_HINT_ET1: i32 = 6; |
1829 | |
1830 | /// Fetch the cache line that contains address `p` using the given `STRATEGY`. |
1831 | /// |
1832 | /// The `STRATEGY` must be one of: |
1833 | /// |
1834 | /// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the |
1835 | /// cache hierarchy. |
1836 | /// |
1837 | /// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher. |
1838 | /// |
1839 | /// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or |
1840 | /// an implementation-specific choice (e.g., L2 if there is no L3). |
1841 | /// |
1842 | /// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the |
1843 | /// non-temporal access (NTA) hint. It may be a place closer than main memory |
1844 | /// but outside of the cache hierarchy. This is used to reduce access latency |
1845 | /// without polluting the cache. |
1846 | /// |
1847 | /// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and |
1848 | /// [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0` |
1849 | /// and `_MM_HINT_T1` but indicate an anticipation to write to the address. |
1850 | /// |
1851 | /// The actual implementation depends on the particular CPU. This instruction |
1852 | /// is considered a hint, so the CPU is also free to simply ignore the request. |
1853 | /// |
1854 | /// The amount of prefetched data depends on the cache line size of the |
1855 | /// specific CPU, but it will be at least 32 bytes. |
1856 | /// |
1857 | /// Common caveats: |
1858 | /// |
1859 | /// * Most modern CPUs already automatically prefetch data based on predicted |
1860 | /// access patterns. |
1861 | /// |
1862 | /// * Data is usually not fetched if this would cause a TLB miss or a page |
1863 | /// fault. |
1864 | /// |
1865 | /// * Too much prefetching can cause unnecessary cache evictions. |
1866 | /// |
1867 | /// * Prefetching may also fail if there are not enough memory-subsystem |
1868 | /// resources (e.g., request buffers). |
1869 | /// |
1870 | /// |
1871 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch) |
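///
/// A minimal sketch (not part of the original docs): hinting at data several
/// cache lines ahead while walking a buffer:
///
/// ```rust,ignore
/// unsafe fn process(data: &[i8]) {
///     for chunk in data.chunks(64) {
///         // Hint that the data ~512 bytes ahead will be needed soon.
///         // `wrapping_add` avoids forming an out-of-bounds pointer near the
///         // end of the buffer; the prefetch itself is only a hint.
///         _mm_prefetch(chunk.as_ptr().wrapping_add(512), _MM_HINT_T0);
///         // ... process `chunk` ...
///     }
/// }
/// ```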
1872 | #[inline ] |
1873 | #[target_feature (enable = "sse" )] |
1874 | #[cfg_attr (test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))] |
1875 | #[cfg_attr (test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))] |
1876 | #[cfg_attr (test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))] |
1877 | #[cfg_attr (test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))] |
1878 | #[rustc_legacy_const_generics (1)] |
1879 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1880 | pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) { |
1881 | static_assert_uimm_bits!(STRATEGY, 3); |
1882 | // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache). |
1883 | // `locality` and `rw` are based on our `STRATEGY`. |
prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
1885 | } |
1886 | |
/// Returns a vector of type `__m128` with indeterminate elements.
1888 | /// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`]. |
1889 | /// In practice, this is equivalent to [`mem::zeroed`]. |
1890 | /// |
1891 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps) |
1892 | #[inline ] |
1893 | #[target_feature (enable = "sse" )] |
1894 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1895 | pub fn _mm_undefined_ps() -> __m128 { |
1896 | const { unsafe { mem::zeroed() } } |
1897 | } |
1898 | |
/// Transposes the 4x4 matrix formed by 4 rows of `__m128` in place.
1900 | /// |
1901 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS) |
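///
/// A minimal usage sketch (not part of the original docs):
///
/// ```rust,ignore
/// let mut row0 = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let mut row1 = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// let mut row2 = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
/// let mut row3 = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
/// _MM_TRANSPOSE4_PS(&mut row0, &mut row1, &mut row2, &mut row3);
/// // row0 == _mm_setr_ps(1.0, 5.0, 9.0, 13.0), and so on.
/// ```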
1902 | #[inline ] |
1903 | #[allow (non_snake_case)] |
1904 | #[target_feature (enable = "sse" )] |
1905 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
1906 | pub fn _MM_TRANSPOSE4_PS( |
1907 | row0: &mut __m128, |
1908 | row1: &mut __m128, |
1909 | row2: &mut __m128, |
1910 | row3: &mut __m128, |
1911 | ) { |
1912 | let tmp0: __m128 = _mm_unpacklo_ps(*row0, *row1); |
1913 | let tmp2: __m128 = _mm_unpacklo_ps(*row2, *row3); |
1914 | let tmp1: __m128 = _mm_unpackhi_ps(*row0, *row1); |
1915 | let tmp3: __m128 = _mm_unpackhi_ps(*row2, *row3); |
1916 | |
*row0 = _mm_movelh_ps(tmp0, tmp2);
*row1 = _mm_movehl_ps(tmp2, tmp0);
*row2 = _mm_movelh_ps(tmp1, tmp3);
*row3 = _mm_movehl_ps(tmp3, tmp1);
1921 | } |
1922 | |
1923 | #[allow (improper_ctypes)] |
1924 | unsafe extern "C" { |
1925 | #[link_name = "llvm.x86.sse.rcp.ss" ] |
unsafe fn rcpss(a: __m128) -> __m128;
#[link_name = "llvm.x86.sse.rcp.ps" ]
unsafe fn rcpps(a: __m128) -> __m128;
#[link_name = "llvm.x86.sse.rsqrt.ss" ]
unsafe fn rsqrtss(a: __m128) -> __m128;
#[link_name = "llvm.x86.sse.rsqrt.ps" ]
unsafe fn rsqrtps(a: __m128) -> __m128;
#[link_name = "llvm.x86.sse.min.ss" ]
unsafe fn minss(a: __m128, b: __m128) -> __m128;
#[link_name = "llvm.x86.sse.min.ps" ]
unsafe fn minps(a: __m128, b: __m128) -> __m128;
#[link_name = "llvm.x86.sse.max.ss" ]
unsafe fn maxss(a: __m128, b: __m128) -> __m128;
#[link_name = "llvm.x86.sse.max.ps" ]
unsafe fn maxps(a: __m128, b: __m128) -> __m128;
#[link_name = "llvm.x86.sse.cmp.ps" ]
unsafe fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
#[link_name = "llvm.x86.sse.comieq.ss" ]
unsafe fn comieq_ss(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.sse.comilt.ss" ]
unsafe fn comilt_ss(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.sse.comile.ss" ]
unsafe fn comile_ss(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.sse.comigt.ss" ]
unsafe fn comigt_ss(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.sse.comige.ss" ]
unsafe fn comige_ss(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.sse.comineq.ss" ]
unsafe fn comineq_ss(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.sse.ucomieq.ss" ]
unsafe fn ucomieq_ss(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.sse.ucomilt.ss" ]
unsafe fn ucomilt_ss(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.sse.ucomile.ss" ]
unsafe fn ucomile_ss(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.sse.ucomigt.ss" ]
unsafe fn ucomigt_ss(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.sse.ucomige.ss" ]
unsafe fn ucomige_ss(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.sse.ucomineq.ss" ]
unsafe fn ucomineq_ss(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.sse.cvtss2si" ]
unsafe fn cvtss2si(a: __m128) -> i32;
#[link_name = "llvm.x86.sse.cvttss2si" ]
unsafe fn cvttss2si(a: __m128) -> i32;
#[link_name = "llvm.x86.sse.cvtsi2ss" ]
unsafe fn cvtsi2ss(a: __m128, b: i32) -> __m128;
#[link_name = "llvm.x86.sse.sfence" ]
unsafe fn sfence();
#[link_name = "llvm.x86.sse.stmxcsr" ]
unsafe fn stmxcsr(p: *mut i8);
#[link_name = "llvm.x86.sse.ldmxcsr" ]
unsafe fn ldmxcsr(p: *const i8);
#[link_name = "llvm.prefetch" ]
unsafe fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
#[link_name = "llvm.x86.sse.cmp.ss" ]
unsafe fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
1983 | } |
1984 | |
1985 | /// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint. |
1986 | /// |
1987 | /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection |
1988 | /// exception _may_ be generated. |
1989 | /// |
1990 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps) |
1991 | /// |
1992 | /// # Safety of non-temporal stores |
1993 | /// |
1994 | /// After using this intrinsic, but before any other access to the memory that this intrinsic |
1995 | /// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In |
1996 | /// particular, functions that call this intrinsic should generally call `_mm_sfence` before they |
1997 | /// return. |
1998 | /// |
1999 | /// See [`_mm_sfence`] for details. |
2000 | #[inline ] |
2001 | #[target_feature (enable = "sse" )] |
2002 | #[cfg_attr (test, assert_instr(movntps))] |
2003 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
2004 | #[allow (clippy::cast_ptr_alignment)] |
2005 | pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { |
2006 | crate::arch::asm!( |
2007 | vps!("movntps" , ",{a}" ), |
2008 | p = in(reg) mem_addr, |
2009 | a = in(xmm_reg) a, |
2010 | options(nostack, preserves_flags), |
2011 | ); |
2012 | } |
2013 | |
2014 | #[cfg (test)] |
2015 | mod tests { |
2016 | use crate::{hint::black_box, mem::transmute, ptr}; |
2017 | use std::boxed; |
2018 | use stdarch_test::simd_test; |
2019 | |
2020 | use crate::core_arch::{simd::*, x86::*}; |
2021 | |
2022 | const NAN: f32 = f32::NAN; |
2023 | |
2024 | #[simd_test(enable = "sse" )] |
2025 | unsafe fn test_mm_add_ps() { |
2026 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2027 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2028 | let r = _mm_add_ps(a, b); |
2029 | assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0)); |
2030 | } |
2031 | |
2032 | #[simd_test(enable = "sse" )] |
2033 | unsafe fn test_mm_add_ss() { |
2034 | let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0); |
2035 | let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0); |
2036 | let r = _mm_add_ss(a, b); |
2037 | assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0)); |
2038 | } |
2039 | |
2040 | #[simd_test(enable = "sse" )] |
2041 | unsafe fn test_mm_sub_ps() { |
2042 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2043 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2044 | let r = _mm_sub_ps(a, b); |
2045 | assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0)); |
2046 | } |
2047 | |
2048 | #[simd_test(enable = "sse" )] |
2049 | unsafe fn test_mm_sub_ss() { |
2050 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2051 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2052 | let r = _mm_sub_ss(a, b); |
2053 | assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0)); |
2054 | } |
2055 | |
2056 | #[simd_test(enable = "sse" )] |
2057 | unsafe fn test_mm_mul_ps() { |
2058 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2059 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2060 | let r = _mm_mul_ps(a, b); |
2061 | assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0)); |
2062 | } |
2063 | |
2064 | #[simd_test(enable = "sse" )] |
2065 | unsafe fn test_mm_mul_ss() { |
2066 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2067 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2068 | let r = _mm_mul_ss(a, b); |
2069 | assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0)); |
2070 | } |
2071 | |
2072 | #[simd_test(enable = "sse" )] |
2073 | unsafe fn test_mm_div_ps() { |
2074 | let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0); |
2075 | let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0); |
2076 | let r = _mm_div_ps(a, b); |
2077 | assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0)); |
2078 | } |
2079 | |
2080 | #[simd_test(enable = "sse" )] |
2081 | unsafe fn test_mm_div_ss() { |
2082 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2083 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2084 | let r = _mm_div_ss(a, b); |
2085 | assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0)); |
2086 | } |
2087 | |
2088 | #[simd_test(enable = "sse" )] |
2089 | unsafe fn test_mm_sqrt_ss() { |
2090 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2091 | let r = _mm_sqrt_ss(a); |
2092 | let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0); |
2093 | assert_eq_m128(r, e); |
2094 | } |
2095 | |
2096 | #[simd_test(enable = "sse" )] |
2097 | unsafe fn test_mm_sqrt_ps() { |
2098 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2099 | let r = _mm_sqrt_ps(a); |
2100 | let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0); |
2101 | assert_eq_m128(r, e); |
2102 | } |
2103 | |
2104 | #[simd_test(enable = "sse" )] |
2105 | unsafe fn test_mm_rcp_ss() { |
2106 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2107 | let r = _mm_rcp_ss(a); |
2108 | let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0); |
2109 | let rel_err = 0.00048828125; |
2110 | assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err); |
2111 | for i in 1..4 { |
2112 | assert_eq!(get_m128(r, i), get_m128(e, i)); |
2113 | } |
2114 | } |
2115 | |
2116 | #[simd_test(enable = "sse" )] |
2117 | unsafe fn test_mm_rcp_ps() { |
2118 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2119 | let r = _mm_rcp_ps(a); |
2120 | let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215); |
2121 | let rel_err = 0.00048828125; |
2122 | for i in 0..4 { |
2123 | assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); |
2124 | } |
2125 | } |
2126 | |
2127 | #[simd_test(enable = "sse" )] |
2128 | unsafe fn test_mm_rsqrt_ss() { |
2129 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2130 | let r = _mm_rsqrt_ss(a); |
2131 | let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0); |
2132 | let rel_err = 0.00048828125; |
2133 | for i in 0..4 { |
2134 | assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); |
2135 | } |
2136 | } |
2137 | |
2138 | #[simd_test(enable = "sse" )] |
2139 | unsafe fn test_mm_rsqrt_ps() { |
2140 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); |
2141 | let r = _mm_rsqrt_ps(a); |
2142 | let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845); |
2143 | let rel_err = 0.00048828125; |
2144 | for i in 0..4 { |
2145 | assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); |
2146 | } |
2147 | } |
2148 | |
2149 | #[simd_test(enable = "sse" )] |
2150 | unsafe fn test_mm_min_ss() { |
2151 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2152 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2153 | let r = _mm_min_ss(a, b); |
2154 | assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); |
2155 | } |
2156 | |
2157 | #[simd_test(enable = "sse" )] |
2158 | unsafe fn test_mm_min_ps() { |
2159 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2160 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2161 | let r = _mm_min_ps(a, b); |
2162 | assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); |
2163 | |
// `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
// is lowered by the llvm codegen backend to the `llvm.minnum.v*` llvm intrinsic. This intrinsic
// doesn't specify how -0.0 is handled. Unfortunately it happens to behave differently from
// the `minps` x86 instruction on x86: `llvm.minnum.v*` would make `r1` equal to `a` and
// `r2` equal to `b`, while `minps` yields `r1 == b` and `r2 == a` (as asserted below).
2169 | let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); |
2170 | let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); |
2171 | let r1: [u8; 16] = transmute(_mm_min_ps(a, b)); |
2172 | let r2: [u8; 16] = transmute(_mm_min_ps(b, a)); |
2173 | let a: [u8; 16] = transmute(a); |
2174 | let b: [u8; 16] = transmute(b); |
2175 | assert_eq!(r1, b); |
2176 | assert_eq!(r2, a); |
2177 | assert_ne!(a, b); // sanity check that -0.0 is actually present |
2178 | } |
2179 | |
2180 | #[simd_test(enable = "sse" )] |
2181 | unsafe fn test_mm_max_ss() { |
2182 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2183 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2184 | let r = _mm_max_ss(a, b); |
2185 | assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0)); |
2186 | } |
2187 | |
2188 | #[simd_test(enable = "sse" )] |
2189 | unsafe fn test_mm_max_ps() { |
2190 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); |
2191 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); |
2192 | let r = _mm_max_ps(a, b); |
2193 | assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0)); |
2194 | |
2195 | // Check SSE-specific semantics for -0.0 handling. |
2196 | let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); |
2197 | let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); |
2198 | let r1: [u8; 16] = transmute(_mm_max_ps(a, b)); |
2199 | let r2: [u8; 16] = transmute(_mm_max_ps(b, a)); |
2200 | let a: [u8; 16] = transmute(a); |
2201 | let b: [u8; 16] = transmute(b); |
2202 | assert_eq!(r1, b); |
2203 | assert_eq!(r2, a); |
2204 | assert_ne!(a, b); // sanity check that -0.0 is actually present |
2205 | } |
2206 | |
2207 | #[simd_test(enable = "sse" )] |
2208 | unsafe fn test_mm_and_ps() { |
2209 | let a = transmute(u32x4::splat(0b0011)); |
2210 | let b = transmute(u32x4::splat(0b0101)); |
2211 | let r = _mm_and_ps(*black_box(&a), *black_box(&b)); |
2212 | let e = transmute(u32x4::splat(0b0001)); |
2213 | assert_eq_m128(r, e); |
2214 | } |
2215 | |
2216 | #[simd_test(enable = "sse" )] |
2217 | unsafe fn test_mm_andnot_ps() { |
2218 | let a = transmute(u32x4::splat(0b0011)); |
2219 | let b = transmute(u32x4::splat(0b0101)); |
2220 | let r = _mm_andnot_ps(*black_box(&a), *black_box(&b)); |
2221 | let e = transmute(u32x4::splat(0b0100)); |
2222 | assert_eq_m128(r, e); |
2223 | } |
2224 | |
2225 | #[simd_test(enable = "sse" )] |
2226 | unsafe fn test_mm_or_ps() { |
2227 | let a = transmute(u32x4::splat(0b0011)); |
2228 | let b = transmute(u32x4::splat(0b0101)); |
2229 | let r = _mm_or_ps(*black_box(&a), *black_box(&b)); |
2230 | let e = transmute(u32x4::splat(0b0111)); |
2231 | assert_eq_m128(r, e); |
2232 | } |
2233 | |
2234 | #[simd_test(enable = "sse" )] |
2235 | unsafe fn test_mm_xor_ps() { |
2236 | let a = transmute(u32x4::splat(0b0011)); |
2237 | let b = transmute(u32x4::splat(0b0101)); |
2238 | let r = _mm_xor_ps(*black_box(&a), *black_box(&b)); |
2239 | let e = transmute(u32x4::splat(0b0110)); |
2240 | assert_eq_m128(r, e); |
2241 | } |
2242 | |
2243 | #[simd_test(enable = "sse" )] |
2244 | unsafe fn test_mm_cmpeq_ss() { |
2245 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2246 | let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0); |
2247 | let r: u32x4 = transmute(_mm_cmpeq_ss(a, b)); |
2248 | let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0)); |
2249 | assert_eq!(r, e); |
2250 | |
2251 | let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2252 | let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2)); |
2253 | let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0)); |
2254 | assert_eq!(r2, e2); |
2255 | } |
2256 | |
2257 | #[simd_test(enable = "sse" )] |
2258 | unsafe fn test_mm_cmplt_ss() { |
2259 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2260 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2261 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2262 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2263 | |
2264 | let b1 = 0u32; // a.extract(0) < b.extract(0) |
2265 | let c1 = 0u32; // a.extract(0) < c.extract(0) |
2266 | let d1 = !0u32; // a.extract(0) < d.extract(0) |
2267 | |
2268 | let rb: u32x4 = transmute(_mm_cmplt_ss(a, b)); |
2269 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2270 | assert_eq!(rb, eb); |
2271 | |
2272 | let rc: u32x4 = transmute(_mm_cmplt_ss(a, c)); |
2273 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2274 | assert_eq!(rc, ec); |
2275 | |
2276 | let rd: u32x4 = transmute(_mm_cmplt_ss(a, d)); |
2277 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2278 | assert_eq!(rd, ed); |
2279 | } |
2280 | |
2281 | #[simd_test(enable = "sse" )] |
2282 | unsafe fn test_mm_cmple_ss() { |
2283 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2284 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2285 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2286 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2287 | |
2288 | let b1 = 0u32; // a.extract(0) <= b.extract(0) |
2289 | let c1 = !0u32; // a.extract(0) <= c.extract(0) |
2290 | let d1 = !0u32; // a.extract(0) <= d.extract(0) |
2291 | |
2292 | let rb: u32x4 = transmute(_mm_cmple_ss(a, b)); |
2293 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2294 | assert_eq!(rb, eb); |
2295 | |
2296 | let rc: u32x4 = transmute(_mm_cmple_ss(a, c)); |
2297 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2298 | assert_eq!(rc, ec); |
2299 | |
2300 | let rd: u32x4 = transmute(_mm_cmple_ss(a, d)); |
2301 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2302 | assert_eq!(rd, ed); |
2303 | } |
2304 | |
2305 | #[simd_test(enable = "sse" )] |
2306 | unsafe fn test_mm_cmpgt_ss() { |
2307 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2308 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2309 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2310 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2311 | |
2312 | let b1 = !0u32; // a.extract(0) > b.extract(0) |
2313 | let c1 = 0u32; // a.extract(0) > c.extract(0) |
2314 | let d1 = 0u32; // a.extract(0) > d.extract(0) |
2315 | |
2316 | let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b)); |
2317 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2318 | assert_eq!(rb, eb); |
2319 | |
2320 | let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c)); |
2321 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2322 | assert_eq!(rc, ec); |
2323 | |
2324 | let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d)); |
2325 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2326 | assert_eq!(rd, ed); |
2327 | } |
2328 | |
2329 | #[simd_test(enable = "sse" )] |
2330 | unsafe fn test_mm_cmpge_ss() { |
2331 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2332 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2333 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2334 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2335 | |
2336 | let b1 = !0u32; // a.extract(0) >= b.extract(0) |
2337 | let c1 = !0u32; // a.extract(0) >= c.extract(0) |
2338 | let d1 = 0u32; // a.extract(0) >= d.extract(0) |
2339 | |
2340 | let rb: u32x4 = transmute(_mm_cmpge_ss(a, b)); |
2341 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2342 | assert_eq!(rb, eb); |
2343 | |
2344 | let rc: u32x4 = transmute(_mm_cmpge_ss(a, c)); |
2345 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2346 | assert_eq!(rc, ec); |
2347 | |
2348 | let rd: u32x4 = transmute(_mm_cmpge_ss(a, d)); |
2349 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2350 | assert_eq!(rd, ed); |
2351 | } |
2352 | |
2353 | #[simd_test(enable = "sse" )] |
2354 | unsafe fn test_mm_cmpneq_ss() { |
2355 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2356 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2357 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2358 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2359 | |
2360 | let b1 = !0u32; // a.extract(0) != b.extract(0) |
2361 | let c1 = 0u32; // a.extract(0) != c.extract(0) |
2362 | let d1 = !0u32; // a.extract(0) != d.extract(0) |
2363 | |
2364 | let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b)); |
2365 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2366 | assert_eq!(rb, eb); |
2367 | |
2368 | let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c)); |
2369 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2370 | assert_eq!(rc, ec); |
2371 | |
2372 | let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d)); |
2373 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2374 | assert_eq!(rd, ed); |
2375 | } |
2376 | |
2377 | #[simd_test(enable = "sse" )] |
2378 | unsafe fn test_mm_cmpnlt_ss() { |
2379 | // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there |
2380 | // must be a difference. It may have to do with behavior in the |
2381 | // presence of NaNs (signaling or quiet). If so, we should add tests |
2382 | // for those. |
2383 | |
2384 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2385 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2386 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2387 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2388 | |
2389 | let b1 = !0u32; // a.extract(0) >= b.extract(0) |
2390 | let c1 = !0u32; // a.extract(0) >= c.extract(0) |
2391 | let d1 = 0u32; // a.extract(0) >= d.extract(0) |
2392 | |
2393 | let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b)); |
2394 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2395 | assert_eq!(rb, eb); |
2396 | |
2397 | let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c)); |
2398 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2399 | assert_eq!(rc, ec); |
2400 | |
2401 | let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d)); |
2402 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2403 | assert_eq!(rd, ed); |
2404 | } |
2405 | |
2406 | #[simd_test(enable = "sse" )] |
2407 | unsafe fn test_mm_cmpnle_ss() { |
// TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
// must be a difference. It may have to do with behavior in the
// presence of NaNs (signaling or quiet). If so, we should add tests
// for those.
2412 | |
2413 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2414 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2415 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2416 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2417 | |
2418 | let b1 = !0u32; // a.extract(0) > b.extract(0) |
2419 | let c1 = 0u32; // a.extract(0) > c.extract(0) |
2420 | let d1 = 0u32; // a.extract(0) > d.extract(0) |
2421 | |
2422 | let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b)); |
2423 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2424 | assert_eq!(rb, eb); |
2425 | |
2426 | let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c)); |
2427 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2428 | assert_eq!(rc, ec); |
2429 | |
2430 | let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d)); |
2431 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2432 | assert_eq!(rd, ed); |
2433 | } |
2434 | |
2435 | #[simd_test(enable = "sse" )] |
2436 | unsafe fn test_mm_cmpngt_ss() { |
// For non-NaN inputs, `_mm_cmpngt_ss` (!(a > b)) is equivalent to
// `_mm_cmple_ss` (a <= b), so this test mirrors the `_mm_cmple_ss` one.
// The two differ only for unordered (NaN) operands, where NGT yields
// all-ones and LE yields all-zeros; see the NaN test below.
2441 | |
2442 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2443 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2444 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2445 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2446 | |
let b1 = 0u32; // !(a.extract(0) > b.extract(0))
let c1 = !0u32; // !(a.extract(0) > c.extract(0))
let d1 = !0u32; // !(a.extract(0) > d.extract(0))
2450 | |
2451 | let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b)); |
2452 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2453 | assert_eq!(rb, eb); |
2454 | |
2455 | let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c)); |
2456 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2457 | assert_eq!(rc, ec); |
2458 | |
2459 | let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d)); |
2460 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2461 | assert_eq!(rd, ed); |
2462 | } |
2463 | |
2464 | #[simd_test(enable = "sse" )] |
2465 | unsafe fn test_mm_cmpnge_ss() { |
// For non-NaN inputs, `_mm_cmpnge_ss` (!(a >= b)) is equivalent to
// `_mm_cmplt_ss` (a < b), so this test mirrors the `_mm_cmplt_ss` one.
// The two differ only for unordered (NaN) operands, where NGE yields
// all-ones and LT yields all-zeros; see the NaN test below.
2470 | |
2471 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2472 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2473 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); |
2474 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2475 | |
let b1 = 0u32; // !(a.extract(0) >= b.extract(0))
let c1 = 0u32; // !(a.extract(0) >= c.extract(0))
let d1 = !0u32; // !(a.extract(0) >= d.extract(0))
2479 | |
2480 | let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b)); |
2481 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2482 | assert_eq!(rb, eb); |
2483 | |
2484 | let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c)); |
2485 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2486 | assert_eq!(rc, ec); |
2487 | |
2488 | let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d)); |
2489 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2490 | assert_eq!(rd, ed); |
2491 | } |
2492 | |
2493 | #[simd_test(enable = "sse" )] |
2494 | unsafe fn test_mm_cmpord_ss() { |
2495 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2496 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2497 | let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); |
2498 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2499 | |
2500 | let b1 = !0u32; // a.extract(0) ord b.extract(0) |
2501 | let c1 = 0u32; // a.extract(0) ord c.extract(0) |
2502 | let d1 = !0u32; // a.extract(0) ord d.extract(0) |
2503 | |
2504 | let rb: u32x4 = transmute(_mm_cmpord_ss(a, b)); |
2505 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2506 | assert_eq!(rb, eb); |
2507 | |
2508 | let rc: u32x4 = transmute(_mm_cmpord_ss(a, c)); |
2509 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2510 | assert_eq!(rc, ec); |
2511 | |
2512 | let rd: u32x4 = transmute(_mm_cmpord_ss(a, d)); |
2513 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2514 | assert_eq!(rd, ed); |
2515 | } |
2516 | |
2517 | #[simd_test(enable = "sse" )] |
2518 | unsafe fn test_mm_cmpunord_ss() { |
2519 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
2520 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); |
2521 | let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); |
2522 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); |
2523 | |
2524 | let b1 = 0u32; // a.extract(0) unord b.extract(0) |
2525 | let c1 = !0u32; // a.extract(0) unord c.extract(0) |
2526 | let d1 = 0u32; // a.extract(0) unord d.extract(0) |
2527 | |
2528 | let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b)); |
2529 | let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); |
2530 | assert_eq!(rb, eb); |
2531 | |
2532 | let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c)); |
2533 | let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); |
2534 | assert_eq!(rc, ec); |
2535 | |
2536 | let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d)); |
2537 | let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); |
2538 | assert_eq!(rd, ed); |
2539 | } |
2540 | |
2541 | #[simd_test(enable = "sse" )] |
2542 | unsafe fn test_mm_cmpeq_ps() { |
2543 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2544 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); |
2545 | let tru = !0u32; |
2546 | let fls = 0u32; |
2547 | |
2548 | let e = u32x4::new(fls, fls, tru, fls); |
2549 | let r: u32x4 = transmute(_mm_cmpeq_ps(a, b)); |
2550 | assert_eq!(r, e); |
2551 | } |
2552 | |
2553 | #[simd_test(enable = "sse" )] |
2554 | unsafe fn test_mm_cmplt_ps() { |
2555 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2556 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); |
2557 | let tru = !0u32; |
2558 | let fls = 0u32; |
2559 | |
2560 | let e = u32x4::new(tru, fls, fls, fls); |
2561 | let r: u32x4 = transmute(_mm_cmplt_ps(a, b)); |
2562 | assert_eq!(r, e); |
2563 | } |
2564 | |
2565 | #[simd_test(enable = "sse" )] |
2566 | unsafe fn test_mm_cmple_ps() { |
2567 | let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0); |
2568 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); |
2569 | let tru = !0u32; |
2570 | let fls = 0u32; |
2571 | |
2572 | let e = u32x4::new(tru, fls, tru, fls); |
2573 | let r: u32x4 = transmute(_mm_cmple_ps(a, b)); |
2574 | assert_eq!(r, e); |
2575 | } |
2576 | |
2577 | #[simd_test(enable = "sse" )] |
2578 | unsafe fn test_mm_cmpgt_ps() { |
2579 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2580 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0); |
2581 | let tru = !0u32; |
2582 | let fls = 0u32; |
2583 | |
2584 | let e = u32x4::new(fls, tru, fls, fls); |
2585 | let r: u32x4 = transmute(_mm_cmpgt_ps(a, b)); |
2586 | assert_eq!(r, e); |
2587 | } |
2588 | |
2589 | #[simd_test(enable = "sse" )] |
2590 | unsafe fn test_mm_cmpge_ps() { |
2591 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2592 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0); |
2593 | let tru = !0u32; |
2594 | let fls = 0u32; |
2595 | |
2596 | let e = u32x4::new(fls, tru, tru, fls); |
2597 | let r: u32x4 = transmute(_mm_cmpge_ps(a, b)); |
2598 | assert_eq!(r, e); |
2599 | } |
2600 | |
2601 | #[simd_test(enable = "sse" )] |
2602 | unsafe fn test_mm_cmpneq_ps() { |
2603 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2604 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); |
2605 | let tru = !0u32; |
2606 | let fls = 0u32; |
2607 | |
2608 | let e = u32x4::new(tru, tru, fls, tru); |
2609 | let r: u32x4 = transmute(_mm_cmpneq_ps(a, b)); |
2610 | assert_eq!(r, e); |
2611 | } |
2612 | |
2613 | #[simd_test(enable = "sse" )] |
2614 | unsafe fn test_mm_cmpnlt_ps() { |
2615 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2616 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); |
2617 | let tru = !0u32; |
2618 | let fls = 0u32; |
2619 | |
2620 | let e = u32x4::new(fls, tru, tru, tru); |
2621 | let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b)); |
2622 | assert_eq!(r, e); |
2623 | } |
2624 | |
2625 | #[simd_test(enable = "sse" )] |
2626 | unsafe fn test_mm_cmpnle_ps() { |
2627 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2628 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); |
2629 | let tru = !0u32; |
2630 | let fls = 0u32; |
2631 | |
2632 | let e = u32x4::new(fls, tru, fls, tru); |
2633 | let r: u32x4 = transmute(_mm_cmpnle_ps(a, b)); |
2634 | assert_eq!(r, e); |
2635 | } |
2636 | |
2637 | #[simd_test(enable = "sse" )] |
2638 | unsafe fn test_mm_cmpngt_ps() { |
2639 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2640 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); |
2641 | let tru = !0u32; |
2642 | let fls = 0u32; |
2643 | |
2644 | let e = u32x4::new(tru, fls, tru, tru); |
2645 | let r: u32x4 = transmute(_mm_cmpngt_ps(a, b)); |
2646 | assert_eq!(r, e); |
2647 | } |
2648 | |
2649 | #[simd_test(enable = "sse" )] |
2650 | unsafe fn test_mm_cmpnge_ps() { |
2651 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); |
2652 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); |
2653 | let tru = !0u32; |
2654 | let fls = 0u32; |
2655 | |
2656 | let e = u32x4::new(tru, fls, fls, tru); |
2657 | let r: u32x4 = transmute(_mm_cmpnge_ps(a, b)); |
2658 | assert_eq!(r, e); |
2659 | } |
2660 | |
2661 | #[simd_test(enable = "sse" )] |
2662 | unsafe fn test_mm_cmpord_ps() { |
2663 | let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); |
2664 | let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); |
2665 | let tru = !0u32; |
2666 | let fls = 0u32; |
2667 | |
2668 | let e = u32x4::new(tru, fls, fls, fls); |
2669 | let r: u32x4 = transmute(_mm_cmpord_ps(a, b)); |
2670 | assert_eq!(r, e); |
2671 | } |
2672 | |
2673 | #[simd_test(enable = "sse" )] |
2674 | unsafe fn test_mm_cmpunord_ps() { |
2675 | let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); |
2676 | let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); |
2677 | let tru = !0u32; |
2678 | let fls = 0u32; |
2679 | |
2680 | let e = u32x4::new(fls, tru, tru, tru); |
2681 | let r: u32x4 = transmute(_mm_cmpunord_ps(a, b)); |
2682 | assert_eq!(r, e); |
2683 | } |
2684 | |
2685 | #[simd_test(enable = "sse" )] |
2686 | unsafe fn test_mm_comieq_ss() { |
2687 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2688 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2689 | |
2690 | let ee = &[1i32, 0, 0, 0]; |
2691 | |
2692 | for i in 0..4 { |
2693 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2694 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2695 | |
2696 | let r = _mm_comieq_ss(a, b); |
2697 | |
2698 | assert_eq!( |
2699 | ee[i], r, |
2700 | "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})" , |
2701 | a, b, r, ee[i], i |
2702 | ); |
2703 | } |
2704 | } |
2705 | |
2706 | #[simd_test(enable = "sse" )] |
2707 | unsafe fn test_mm_comilt_ss() { |
2708 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2709 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2710 | |
2711 | let ee = &[0i32, 1, 0, 0]; |
2712 | |
2713 | for i in 0..4 { |
2714 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2715 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2716 | |
2717 | let r = _mm_comilt_ss(a, b); |
2718 | |
2719 | assert_eq!( |
2720 | ee[i], r, |
2721 | "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})" , |
2722 | a, b, r, ee[i], i |
2723 | ); |
2724 | } |
2725 | } |
2726 | |
2727 | #[simd_test(enable = "sse" )] |
2728 | unsafe fn test_mm_comile_ss() { |
2729 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2730 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2731 | |
2732 | let ee = &[1i32, 1, 0, 0]; |
2733 | |
2734 | for i in 0..4 { |
2735 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2736 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2737 | |
2738 | let r = _mm_comile_ss(a, b); |
2739 | |
2740 | assert_eq!( |
2741 | ee[i], r, |
2742 | "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})" , |
2743 | a, b, r, ee[i], i |
2744 | ); |
2745 | } |
2746 | } |
2747 | |
2748 | #[simd_test(enable = "sse" )] |
unsafe fn test_mm_comige_ss() {
2750 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2751 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2752 | |
2753 | let ee = &[1i32, 0, 1, 0]; |
2754 | |
2755 | for i in 0..4 { |
2756 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2757 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2758 | |
2759 | let r = _mm_comige_ss(a, b); |
2760 | |
2761 | assert_eq!( |
2762 | ee[i], r, |
2763 | "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})" , |
2764 | a, b, r, ee[i], i |
2765 | ); |
2766 | } |
2767 | } |
2768 | |
2769 | #[simd_test(enable = "sse" )] |
2770 | unsafe fn test_mm_comineq_ss() { |
2771 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2772 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2773 | |
2774 | let ee = &[0i32, 1, 1, 1]; |
2775 | |
2776 | for i in 0..4 { |
2777 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2778 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2779 | |
2780 | let r = _mm_comineq_ss(a, b); |
2781 | |
2782 | assert_eq!( |
2783 | ee[i], r, |
2784 | "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})" , |
2785 | a, b, r, ee[i], i |
2786 | ); |
2787 | } |
2788 | } |
2789 | |
2790 | #[simd_test(enable = "sse" )] |
2791 | unsafe fn test_mm_ucomieq_ss() { |
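// `ucomi*` differs from `comi*` only in exception behavior: `ucomi*`
// raises the invalid-operation exception only for signaling NaNs,
// while `comi*` raises it for any NaN; the returned results match.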
2792 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2793 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2794 | |
2795 | let ee = &[1i32, 0, 0, 0]; |
2796 | |
2797 | for i in 0..4 { |
2798 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2799 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2800 | |
2801 | let r = _mm_ucomieq_ss(a, b); |
2802 | |
2803 | assert_eq!( |
2804 | ee[i], r, |
2805 | "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})" , |
2806 | a, b, r, ee[i], i |
2807 | ); |
2808 | } |
2809 | } |
2810 | |
2811 | #[simd_test(enable = "sse" )] |
2812 | unsafe fn test_mm_ucomilt_ss() { |
2813 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2814 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2815 | |
2816 | let ee = &[0i32, 1, 0, 0]; |
2817 | |
2818 | for i in 0..4 { |
2819 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2820 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2821 | |
2822 | let r = _mm_ucomilt_ss(a, b); |
2823 | |
2824 | assert_eq!( |
2825 | ee[i], r, |
2826 | "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})" , |
2827 | a, b, r, ee[i], i |
2828 | ); |
2829 | } |
2830 | } |
2831 | |
2832 | #[simd_test(enable = "sse" )] |
2833 | unsafe fn test_mm_ucomile_ss() { |
2834 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2835 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2836 | |
2837 | let ee = &[1i32, 1, 0, 0]; |
2838 | |
2839 | for i in 0..4 { |
2840 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2841 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2842 | |
2843 | let r = _mm_ucomile_ss(a, b); |
2844 | |
2845 | assert_eq!( |
2846 | ee[i], r, |
2847 | "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})" , |
2848 | a, b, r, ee[i], i |
2849 | ); |
2850 | } |
2851 | } |
2852 | |
2853 | #[simd_test(enable = "sse" )] |
2854 | unsafe fn test_mm_ucomigt_ss() { |
2855 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2856 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2857 | |
2858 | let ee = &[0i32, 0, 1, 0]; |
2859 | |
2860 | for i in 0..4 { |
2861 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2862 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2863 | |
2864 | let r = _mm_ucomigt_ss(a, b); |
2865 | |
2866 | assert_eq!( |
2867 | ee[i], r, |
2868 | "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})" , |
2869 | a, b, r, ee[i], i |
2870 | ); |
2871 | } |
2872 | } |
2873 | |
2874 | #[simd_test(enable = "sse" )] |
2875 | unsafe fn test_mm_ucomige_ss() { |
2876 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2877 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2878 | |
2879 | let ee = &[1i32, 0, 1, 0]; |
2880 | |
2881 | for i in 0..4 { |
2882 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2883 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2884 | |
2885 | let r = _mm_ucomige_ss(a, b); |
2886 | |
2887 | assert_eq!( |
2888 | ee[i], r, |
2889 | "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})" , |
2890 | a, b, r, ee[i], i |
2891 | ); |
2892 | } |
2893 | } |
2894 | |
2895 | #[simd_test(enable = "sse" )] |
2896 | unsafe fn test_mm_ucomineq_ss() { |
2897 | let aa = &[3.0f32, 12.0, 23.0, NAN]; |
2898 | let bb = &[3.0f32, 47.5, 1.5, NAN]; |
2899 | |
2900 | let ee = &[0i32, 1, 1, 1]; |
2901 | |
2902 | for i in 0..4 { |
2903 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); |
2904 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); |
2905 | |
2906 | let r = _mm_ucomineq_ss(a, b); |
2907 | |
2908 | assert_eq!( |
2909 | ee[i], r, |
2910 | "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})" , |
2911 | a, b, r, ee[i], i |
2912 | ); |
2913 | } |
2914 | } |
2915 | |
2916 | #[simd_test(enable = "sse" )] |
2917 | unsafe fn test_mm_cvtss_si32() { |
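// Conversion uses the current MXCSR rounding mode (round-to-nearest-
// even by default); NaN and out-of-range inputs yield the "integer
// indefinite" value, i32::MIN.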
2918 | let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1]; |
2919 | let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520]; |
2920 | for i in 0..inputs.len() { |
2921 | let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0); |
2922 | let e = result[i]; |
2923 | let r = _mm_cvtss_si32(x); |
2924 | assert_eq!( |
2925 | e, r, |
2926 | "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}" , |
2927 | i, x, r, e |
2928 | ); |
2929 | } |
2930 | } |
2931 | |
2932 | #[simd_test(enable = "sse" )] |
2933 | unsafe fn test_mm_cvttss_si32() { |
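// The `tt` variant truncates toward zero regardless of the rounding
// mode; NaN and out-of-range inputs yield i32::MIN.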
2934 | let inputs = &[ |
2935 | (42.0f32, 42i32), |
2936 | (-31.4, -31), |
2937 | (-33.5, -33), |
2938 | (-34.5, -34), |
2939 | (10.999, 10), |
2940 | (-5.99, -5), |
2941 | (4.0e10, i32::MIN), |
2942 | (4.0e-10, 0), |
2943 | (NAN, i32::MIN), |
2944 | (2147483500.1, 2147483520), |
2945 | ]; |
2946 | for (i, &(xi, e)) in inputs.iter().enumerate() { |
2947 | let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); |
2948 | let r = _mm_cvttss_si32(x); |
2949 | assert_eq!( |
2950 | e, r, |
2951 | "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}" , |
2952 | i, x, r, e |
2953 | ); |
2954 | } |
2955 | } |
2956 | |
2957 | #[simd_test(enable = "sse" )] |
2958 | unsafe fn test_mm_cvtsi32_ss() { |
2959 | let inputs = &[ |
2960 | (4555i32, 4555.0f32), |
2961 | (322223333, 322223330.0), |
2962 | (-432, -432.0), |
2963 | (-322223333, -322223330.0), |
2964 | ]; |
2965 | |
2966 | for &(x, f) in inputs.iter() { |
2967 | let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
2968 | let r = _mm_cvtsi32_ss(a, x); |
2969 | let e = _mm_setr_ps(f, 6.0, 7.0, 8.0); |
2970 | assert_eq_m128(e, r); |
2971 | } |
2972 | } |
2973 | |
2974 | #[simd_test(enable = "sse" )] |
2975 | unsafe fn test_mm_cvtss_f32() { |
2976 | let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0); |
2977 | assert_eq!(_mm_cvtss_f32(a), 312.0134); |
2978 | } |
2979 | |
2980 | #[simd_test(enable = "sse" )] |
2981 | unsafe fn test_mm_set_ss() { |
2982 | let r = _mm_set_ss(black_box(4.25)); |
2983 | assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0)); |
2984 | } |
2985 | |
2986 | #[simd_test(enable = "sse" )] |
2987 | unsafe fn test_mm_set1_ps() { |
2988 | let r1 = _mm_set1_ps(black_box(4.25)); |
2989 | let r2 = _mm_set_ps1(black_box(4.25)); |
2990 | assert_eq!(get_m128(r1, 0), 4.25); |
2991 | assert_eq!(get_m128(r1, 1), 4.25); |
2992 | assert_eq!(get_m128(r1, 2), 4.25); |
2993 | assert_eq!(get_m128(r1, 3), 4.25); |
2994 | assert_eq!(get_m128(r2, 0), 4.25); |
2995 | assert_eq!(get_m128(r2, 1), 4.25); |
2996 | assert_eq!(get_m128(r2, 2), 4.25); |
2997 | assert_eq!(get_m128(r2, 3), 4.25); |
2998 | } |
2999 | |
3000 | #[simd_test(enable = "sse" )] |
3001 | unsafe fn test_mm_set_ps() { |
3002 | let r = _mm_set_ps( |
3003 | black_box(1.0), |
3004 | black_box(2.0), |
3005 | black_box(3.0), |
3006 | black_box(4.0), |
3007 | ); |
3008 | assert_eq!(get_m128(r, 0), 4.0); |
3009 | assert_eq!(get_m128(r, 1), 3.0); |
3010 | assert_eq!(get_m128(r, 2), 2.0); |
3011 | assert_eq!(get_m128(r, 3), 1.0); |
3012 | } |
3013 | |
3014 | #[simd_test(enable = "sse" )] |
3015 | unsafe fn test_mm_setr_ps() { |
3016 | let r = _mm_setr_ps( |
3017 | black_box(1.0), |
3018 | black_box(2.0), |
3019 | black_box(3.0), |
3020 | black_box(4.0), |
3021 | ); |
3022 | assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0)); |
3023 | } |
3024 | |
3025 | #[simd_test(enable = "sse" )] |
3026 | unsafe fn test_mm_setzero_ps() { |
3027 | let r = *black_box(&_mm_setzero_ps()); |
3028 | assert_eq_m128(r, _mm_set1_ps(0.0)); |
3029 | } |
3030 | |
3031 | #[simd_test(enable = "sse" )] |
3032 | unsafe fn test_mm_shuffle() { |
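// `_MM_SHUFFLE(z, y, x, w)` packs four 2-bit lane indices into one
// control byte: (z << 6) | (y << 4) | (x << 2) | w.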
3033 | assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11); |
3034 | assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00); |
3035 | assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01); |
3036 | } |
3037 | |
3038 | #[simd_test(enable = "sse" )] |
3039 | unsafe fn test_mm_shuffle_ps() { |
3040 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3041 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
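// Each 2-bit field of the mask selects a lane: the two low result
// lanes come from `a`, the two high ones from `b`. Mask 0b00_01_01_11
// thus picks a[3], a[1], b[1], b[0].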
3042 | let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b); |
3043 | assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0)); |
3044 | } |
3045 | |
3046 | #[simd_test(enable = "sse" )] |
3047 | unsafe fn test_mm_unpackhi_ps() { |
3048 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3049 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3050 | let r = _mm_unpackhi_ps(a, b); |
3051 | assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0)); |
3052 | } |
3053 | |
3054 | #[simd_test(enable = "sse" )] |
3055 | unsafe fn test_mm_unpacklo_ps() { |
3056 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3057 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3058 | let r = _mm_unpacklo_ps(a, b); |
3059 | assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0)); |
3060 | } |
3061 | |
3062 | #[simd_test(enable = "sse" )] |
3063 | unsafe fn test_mm_movehl_ps() { |
3064 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3065 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3066 | let r = _mm_movehl_ps(a, b); |
3067 | assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0)); |
3068 | } |
3069 | |
3070 | #[simd_test(enable = "sse" )] |
3071 | unsafe fn test_mm_movelh_ps() { |
3072 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3073 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3074 | let r = _mm_movelh_ps(a, b); |
3075 | assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0)); |
3076 | } |
3077 | |
3078 | #[simd_test(enable = "sse" )] |
3079 | unsafe fn test_mm_load_ss() { |
3080 | let a = 42.0f32; |
3081 | let r = _mm_load_ss(ptr::addr_of!(a)); |
3082 | assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0)); |
3083 | } |
3084 | |
3085 | #[simd_test(enable = "sse" )] |
3086 | unsafe fn test_mm_load1_ps() { |
3087 | let a = 42.0f32; |
3088 | let r = _mm_load1_ps(ptr::addr_of!(a)); |
3089 | assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0)); |
3090 | } |
3091 | |
3092 | #[simd_test(enable = "sse" )] |
3093 | unsafe fn test_mm_load_ps() { |
3094 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; |
3095 | |
3096 | let mut p = vals.as_ptr(); |
3097 | let mut fixup = 0.0f32; |
3098 | |
// `_mm_load_ps` requires a 16-byte-aligned pointer; an unaligned load
// may fault (signal: 11, SIGSEGV: invalid memory reference). Round `p`
// up to the next 16-byte boundary and record how many `f32`s were
// skipped in `fixup`.
3101 | |
3102 | let unalignment = (p as usize) & 0xf; |
3103 | if unalignment != 0 { |
3104 | let delta = (16 - unalignment) >> 2; |
3105 | fixup = delta as f32; |
3106 | p = p.add(delta); |
3107 | } |
3108 | |
3109 | let r = _mm_load_ps(p); |
3110 | let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup)); |
3111 | assert_eq_m128(r, e); |
3112 | } |
3113 | |
3114 | #[simd_test(enable = "sse" )] |
3115 | unsafe fn test_mm_loadu_ps() { |
3116 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; |
3117 | let p = vals.as_ptr().add(3); |
3118 | let r = _mm_loadu_ps(black_box(p)); |
3119 | assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0)); |
3120 | } |
3121 | |
3122 | #[simd_test(enable = "sse" )] |
3123 | unsafe fn test_mm_loadr_ps() { |
3124 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; |
3125 | |
3126 | let mut p = vals.as_ptr(); |
3127 | let mut fixup = 0.0f32; |
3128 | |
// `_mm_loadr_ps` requires a 16-byte-aligned pointer; an unaligned load
// may fault (signal: 11, SIGSEGV: invalid memory reference). Round `p`
// up to the next 16-byte boundary and record how many `f32`s were
// skipped in `fixup`.
3131 | |
3132 | let unalignment = (p as usize) & 0xf; |
3133 | if unalignment != 0 { |
3134 | let delta = (16 - unalignment) >> 2; |
3135 | fixup = delta as f32; |
3136 | p = p.add(delta); |
3137 | } |
3138 | |
3139 | let r = _mm_loadr_ps(p); |
3140 | let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup)); |
3141 | assert_eq_m128(r, e); |
3142 | } |
3143 | |
3144 | #[simd_test(enable = "sse" )] |
3145 | unsafe fn test_mm_store_ss() { |
3146 | let mut vals = [0.0f32; 8]; |
3147 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3148 | _mm_store_ss(vals.as_mut_ptr().add(1), a); |
3149 | |
3150 | assert_eq!(vals[0], 0.0); |
3151 | assert_eq!(vals[1], 1.0); |
3152 | assert_eq!(vals[2], 0.0); |
3153 | } |
3154 | |
3155 | #[simd_test(enable = "sse" )] |
3156 | unsafe fn test_mm_store1_ps() { |
3157 | let mut vals = [0.0f32; 8]; |
3158 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3159 | |
3160 | let mut ofs = 0; |
3161 | let mut p = vals.as_mut_ptr(); |
3162 | |
// Align p to 16-byte boundary
if (p as usize) & 0xf != 0 {
3164 | ofs = (16 - ((p as usize) & 0xf)) >> 2; |
3165 | p = p.add(ofs); |
3166 | } |
3167 | |
3168 | _mm_store1_ps(p, *black_box(&a)); |
3169 | |
3170 | if ofs > 0 { |
3171 | assert_eq!(vals[ofs - 1], 0.0); |
3172 | } |
3173 | assert_eq!(vals[ofs + 0], 1.0); |
3174 | assert_eq!(vals[ofs + 1], 1.0); |
3175 | assert_eq!(vals[ofs + 2], 1.0); |
3176 | assert_eq!(vals[ofs + 3], 1.0); |
3177 | assert_eq!(vals[ofs + 4], 0.0); |
3178 | } |
3179 | |
3180 | #[simd_test(enable = "sse" )] |
3181 | unsafe fn test_mm_store_ps() { |
3182 | let mut vals = [0.0f32; 8]; |
3183 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3184 | |
3185 | let mut ofs = 0; |
3186 | let mut p = vals.as_mut_ptr(); |
3187 | |
3188 | // Align p to 16-byte boundary |
3189 | if (p as usize) & 0xf != 0 { |
3190 | ofs = (16 - ((p as usize) & 0xf)) >> 2; |
3191 | p = p.add(ofs); |
3192 | } |
3193 | |
3194 | _mm_store_ps(p, *black_box(&a)); |
3195 | |
3196 | if ofs > 0 { |
3197 | assert_eq!(vals[ofs - 1], 0.0); |
3198 | } |
3199 | assert_eq!(vals[ofs + 0], 1.0); |
3200 | assert_eq!(vals[ofs + 1], 2.0); |
3201 | assert_eq!(vals[ofs + 2], 3.0); |
3202 | assert_eq!(vals[ofs + 3], 4.0); |
3203 | assert_eq!(vals[ofs + 4], 0.0); |
3204 | } |
3205 | |
3206 | #[simd_test(enable = "sse" )] |
3207 | unsafe fn test_mm_storer_ps() { |
3208 | let mut vals = [0.0f32; 8]; |
3209 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3210 | |
3211 | let mut ofs = 0; |
3212 | let mut p = vals.as_mut_ptr(); |
3213 | |
3214 | // Align p to 16-byte boundary |
3215 | if (p as usize) & 0xf != 0 { |
3216 | ofs = (16 - ((p as usize) & 0xf)) >> 2; |
3217 | p = p.add(ofs); |
3218 | } |
3219 | |
3220 | _mm_storer_ps(p, *black_box(&a)); |
3221 | |
3222 | if ofs > 0 { |
3223 | assert_eq!(vals[ofs - 1], 0.0); |
3224 | } |
3225 | assert_eq!(vals[ofs + 0], 4.0); |
3226 | assert_eq!(vals[ofs + 1], 3.0); |
3227 | assert_eq!(vals[ofs + 2], 2.0); |
3228 | assert_eq!(vals[ofs + 3], 1.0); |
3229 | assert_eq!(vals[ofs + 4], 0.0); |
3230 | } |
3231 | |
3232 | #[simd_test(enable = "sse" )] |
3233 | unsafe fn test_mm_storeu_ps() { |
3234 | let mut vals = [0.0f32; 8]; |
3235 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3236 | |
3237 | let mut ofs = 0; |
3238 | let mut p = vals.as_mut_ptr(); |
3239 | |
3240 | // Make sure p is **not** aligned to 16-byte boundary |
3241 | if (p as usize) & 0xf == 0 { |
3242 | ofs = 1; |
3243 | p = p.add(1); |
3244 | } |
3245 | |
3246 | _mm_storeu_ps(p, *black_box(&a)); |
3247 | |
3248 | if ofs > 0 { |
3249 | assert_eq!(vals[ofs - 1], 0.0); |
3250 | } |
3251 | assert_eq!(vals[ofs + 0], 1.0); |
3252 | assert_eq!(vals[ofs + 1], 2.0); |
3253 | assert_eq!(vals[ofs + 2], 3.0); |
3254 | assert_eq!(vals[ofs + 3], 4.0); |
3255 | assert_eq!(vals[ofs + 4], 0.0); |
3256 | } |
3257 | |
3258 | #[simd_test(enable = "sse" )] |
3259 | unsafe fn test_mm_move_ss() { |
3260 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3261 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3262 | |
3263 | let r = _mm_move_ss(a, b); |
3264 | let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); |
3265 | assert_eq_m128(e, r); |
3266 | } |
3267 | |
3268 | #[simd_test(enable = "sse" )] |
3269 | unsafe fn test_mm_movemask_ps() { |
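// `_mm_movemask_ps` collects the sign bit of each lane into the low
// four bits of the result (lane 0 -> bit 0, ..., lane 3 -> bit 3).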
3270 | let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0)); |
3271 | assert_eq!(r, 0b0101); |
3272 | |
3273 | let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0)); |
3274 | assert_eq!(r, 0b0111); |
3275 | } |
3276 | |
3277 | #[simd_test(enable = "sse" )] |
3278 | // Miri cannot support this until it is clear how it fits in the Rust memory model |
3279 | #[cfg_attr (miri, ignore)] |
3280 | unsafe fn test_mm_sfence() { |
3281 | _mm_sfence(); |
3282 | } |
3283 | |
3284 | #[simd_test(enable = "sse" )] |
3285 | unsafe fn test_MM_TRANSPOSE4_PS() { |
3286 | let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
3287 | let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); |
3288 | let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0); |
3289 | let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0); |
3290 | |
3291 | _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d); |
3292 | |
3293 | assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0)); |
3294 | assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0)); |
3295 | assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0)); |
3296 | assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0)); |
3297 | } |
3298 | |
3299 | #[repr (align(16))] |
3300 | struct Memory { |
3301 | pub data: [f32; 4], |
3302 | } |
3303 | |
3304 | #[simd_test(enable = "sse" )] |
3305 | // Miri cannot support this until it is clear how it fits in the Rust memory model |
3306 | // (non-temporal store) |
3307 | #[cfg_attr (miri, ignore)] |
3308 | unsafe fn test_mm_stream_ps() { |
3309 | let a = _mm_set1_ps(7.0); |
3310 | let mut mem = Memory { data: [-1.0; 4] }; |
3311 | |
3312 | _mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a); |
3313 | for i in 0..4 { |
3314 | assert_eq!(mem.data[i], get_m128(a, i)); |
3315 | } |
3316 | } |
3317 | } |
3318 | |