1//! Streaming SIMD Extensions (SSE)
2
3use crate::{
4 core_arch::{simd::*, x86::*},
5 intrinsics::simd::*,
6 mem, ptr,
7};
8
9#[cfg(test)]
10use stdarch_test::assert_instr;
11
/// Adds the first component of `a` and `b`, the other components are copied
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
    // Lowers to ADDSS: lane 0 becomes `a0 + b0`, lanes 1-3 pass through `a`.
    addss(a, b)
}
23
24/// Adds __m128 vectors.
25///
26/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps)
27#[inline]
28#[target_feature(enable = "sse")]
29#[cfg_attr(test, assert_instr(addps))]
30#[stable(feature = "simd_x86", since = "1.27.0")]
31pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
32 simd_add(x:a, y:b)
33}
34
/// Subtracts the first component of `b` from `a`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
    // Lowers to SUBSS: lane 0 becomes `a0 - b0`, lanes 1-3 pass through `a`.
    subss(a, b)
}
46
47/// Subtracts __m128 vectors.
48///
49/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps)
50#[inline]
51#[target_feature(enable = "sse")]
52#[cfg_attr(test, assert_instr(subps))]
53#[stable(feature = "simd_x86", since = "1.27.0")]
54pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
55 simd_sub(lhs:a, rhs:b)
56}
57
/// Multiplies the first component of `a` and `b`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
    // Lowers to MULSS: lane 0 becomes `a0 * b0`, lanes 1-3 pass through `a`.
    mulss(a, b)
}
69
70/// Multiplies __m128 vectors.
71///
72/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps)
73#[inline]
74#[target_feature(enable = "sse")]
75#[cfg_attr(test, assert_instr(mulps))]
76#[stable(feature = "simd_x86", since = "1.27.0")]
77pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
78 simd_mul(x:a, y:b)
79}
80
/// Divides the first component of `a` by the first component of `b`, the
/// other components are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
    // Lowers to DIVSS: lane 0 becomes `a0 / b0`, lanes 1-3 pass through `a`.
    divss(a, b)
}
92
93/// Divides __m128 vectors.
94///
95/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps)
96#[inline]
97#[target_feature(enable = "sse")]
98#[cfg_attr(test, assert_instr(divps))]
99#[stable(feature = "simd_x86", since = "1.27.0")]
100pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
101 simd_div(lhs:a, rhs:b)
102}
103
/// Returns the square root of the first single-precision (32-bit)
/// floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 {
    // Lowers to SQRTSS; only lane 0 is replaced by sqrt(a0).
    sqrtss(a)
}
115
/// Returns the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 {
    // Lowers to SQRTPS; all four lanes are square-rooted.
    sqrtps(a)
}
127
/// Returns the approximate reciprocal of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 {
    // Lowers to RCPSS — a fast *approximation* of 1/a0, not an exact divide.
    rcpss(a)
}
139
/// Returns the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 {
    // Lowers to RCPPS — a fast *approximation* of 1/x per lane.
    rcpps(a)
}
151
/// Returns the approximate reciprocal square root of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 {
    // Lowers to RSQRTSS — approximate 1/sqrt(a0) in lane 0 only.
    rsqrtss(a)
}
163
/// Returns the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 {
    // Lowers to RSQRTPS — approximate 1/sqrt(x) per lane.
    rsqrtps(a)
}
175
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the minimum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
    // Lowers to MINSS; keeps the hardware's asymmetric NaN/±0.0 semantics.
    minss(a, b)
}
188
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`.
    // (MINPS is not symmetric for NaN and signed-zero operands, so the exact
    // instruction must be used rather than the generic float-min intrinsic.)
    minps(a, b)
}
201
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the maximum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
    // Lowers to MAXSS; keeps the hardware's asymmetric NaN/±0.0 semantics.
    maxss(a, b)
}
214
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`.
    // (Same asymmetry argument as `_mm_min_ps`, for MAXPS.)
    maxps(a, b)
}
227
228/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
229///
230/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps)
231#[inline]
232#[target_feature(enable = "sse")]
233// i586 only seems to generate plain `and` instructions, so ignore it.
234#[cfg_attr(
235 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
236 assert_instr(andps)
237)]
238#[stable(feature = "simd_x86", since = "1.27.0")]
239pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
240 let a: __m128i = mem::transmute(src:a);
241 let b: __m128i = mem::transmute(src:b);
242 mem::transmute(src:simd_and(x:a, y:b))
243}
244
245/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
246/// elements.
247///
248/// Computes `!a & b` for each bit in `a` and `b`.
249///
250/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps)
251#[inline]
252#[target_feature(enable = "sse")]
253// i586 only seems to generate plain `not` and `and` instructions, so ignore
254// it.
255#[cfg_attr(
256 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
257 assert_instr(andnps)
258)]
259#[stable(feature = "simd_x86", since = "1.27.0")]
260pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
261 let a: __m128i = mem::transmute(src:a);
262 let b: __m128i = mem::transmute(src:b);
263 let mask: __m128i = mem::transmute(src:i32x4::splat(-1));
264 mem::transmute(src:simd_and(x:simd_xor(mask, a), y:b))
265}
266
267/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
268///
269/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps)
270#[inline]
271#[target_feature(enable = "sse")]
272// i586 only seems to generate plain `or` instructions, so we ignore it.
273#[cfg_attr(
274 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
275 assert_instr(orps)
276)]
277#[stable(feature = "simd_x86", since = "1.27.0")]
278pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
279 let a: __m128i = mem::transmute(src:a);
280 let b: __m128i = mem::transmute(src:b);
281 mem::transmute(src:simd_or(x:a, y:b))
282}
283
284/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
285/// elements.
286///
287/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps)
288#[inline]
289#[target_feature(enable = "sse")]
290// i586 only seems to generate plain `xor` instructions, so we ignore it.
291#[cfg_attr(
292 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
293 assert_instr(xorps)
294)]
295#[stable(feature = "simd_x86", since = "1.27.0")]
296pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
297 let a: __m128i = mem::transmute(src:a);
298 let b: __m128i = mem::transmute(src:b);
299 mem::transmute(src:simd_xor(x:a, y:b))
300}
301
302/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
303/// the result will be `0xffffffff` if the two inputs are equal, or `0`
304/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
305///
306/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss)
307#[inline]
308#[target_feature(enable = "sse")]
309#[cfg_attr(test, assert_instr(cmpeqss))]
310#[stable(feature = "simd_x86", since = "1.27.0")]
311pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
312 cmpss(a, b, imm8:0)
313}
314
315/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
316/// of the result will be `0xffffffff` if `a.extract(0)` is less than
317/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
318/// upper 96 bits of `a`.
319///
320/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss)
321#[inline]
322#[target_feature(enable = "sse")]
323#[cfg_attr(test, assert_instr(cmpltss))]
324#[stable(feature = "simd_x86", since = "1.27.0")]
325pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
326 cmpss(a, b, imm8:1)
327}
328
329/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
330/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
331/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
332/// are the upper 96 bits of `a`.
333///
334/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss)
335#[inline]
336#[target_feature(enable = "sse")]
337#[cfg_attr(test, assert_instr(cmpless))]
338#[stable(feature = "simd_x86", since = "1.27.0")]
339pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
340 cmpss(a, b, imm8:2)
341}
342
/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
    // There is no GT predicate: `a > b` is computed as `b < a` (predicate 1)
    // with swapped operands — hence `assert_instr(cmpltss)` above. Because
    // the swap makes the upper lanes come from `b`, the shuffle re-inserts
    // only lane 0 of the comparison (index 4 = lane 0 of the second vector)
    // into `a`, preserving `a`'s upper 96 bits.
    simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3])
}
356
/// Compares the lowest `f32` of both inputs for greater than or equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
    // `a >= b` is computed as `b <= a` (predicate 2) with swapped operands;
    // the shuffle pulls lane 0 from the comparison result (index 4) and
    // lanes 1-3 from `a`, matching the `_ss` convention.
    simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3])
}
370
371/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
372/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
373/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
374/// upper 96 bits of `a`.
375///
376/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss)
377#[inline]
378#[target_feature(enable = "sse")]
379#[cfg_attr(test, assert_instr(cmpneqss))]
380#[stable(feature = "simd_x86", since = "1.27.0")]
381pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
382 cmpss(a, b, imm8:4)
383}
384
385/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
386/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
387/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
388/// upper 96 bits of `a`.
389///
390/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss)
391#[inline]
392#[target_feature(enable = "sse")]
393#[cfg_attr(test, assert_instr(cmpnltss))]
394#[stable(feature = "simd_x86", since = "1.27.0")]
395pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
396 cmpss(a, b, imm8:5)
397}
398
399/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
400/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
401/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
402/// of the result are the upper 96 bits of `a`.
403///
404/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss)
405#[inline]
406#[target_feature(enable = "sse")]
407#[cfg_attr(test, assert_instr(cmpnless))]
408#[stable(feature = "simd_x86", since = "1.27.0")]
409pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
410 cmpss(a, b, imm8:6)
411}
412
/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
/// the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
    // `!(a > b)` is computed as `!(b < a)` (predicate 5 = NLT) with swapped
    // operands; the shuffle re-inserts lane 0 of the comparison (index 4)
    // into `a`, preserving `a`'s upper lanes.
    simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3])
}
426
/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
/// bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
    // `!(a >= b)` is computed as `!(b <= a)` (predicate 6 = NLE) with swapped
    // operands; the shuffle re-inserts lane 0 of the comparison (index 4)
    // into `a`, preserving `a`'s upper lanes.
    simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3])
}
440
441/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
442/// the result will be `0xffffffff` if neither of `a.extract(0)` or
443/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
444/// are the upper 96 bits of `a`.
445///
446/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss)
447#[inline]
448#[target_feature(enable = "sse")]
449#[cfg_attr(test, assert_instr(cmpordss))]
450#[stable(feature = "simd_x86", since = "1.27.0")]
451pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
452 cmpss(a, b, imm8:7)
453}
454
455/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
456/// of the result will be `0xffffffff` if any of `a.extract(0)` or
457/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
458/// are the upper 96 bits of `a`.
459///
460/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss)
461#[inline]
462#[target_feature(enable = "sse")]
463#[cfg_attr(test, assert_instr(cmpunordss))]
464#[stable(feature = "simd_x86", since = "1.27.0")]
465pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
466 cmpss(a, b, imm8:3)
467}
468
469/// Compares each of the four floats in `a` to the corresponding element in `b`.
470/// The result in the output vector will be `0xffffffff` if the input elements
471/// were equal, or `0` otherwise.
472///
473/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps)
474#[inline]
475#[target_feature(enable = "sse")]
476#[cfg_attr(test, assert_instr(cmpeqps))]
477#[stable(feature = "simd_x86", since = "1.27.0")]
478pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
479 cmpps(a, b, imm8:0)
480}
481
482/// Compares each of the four floats in `a` to the corresponding element in `b`.
483/// The result in the output vector will be `0xffffffff` if the input element
484/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
485///
486/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps)
487#[inline]
488#[target_feature(enable = "sse")]
489#[cfg_attr(test, assert_instr(cmpltps))]
490#[stable(feature = "simd_x86", since = "1.27.0")]
491pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
492 cmpps(a, b, imm8:1)
493}
494
495/// Compares each of the four floats in `a` to the corresponding element in `b`.
496/// The result in the output vector will be `0xffffffff` if the input element
497/// in `a` is less than or equal to the corresponding element in `b`, or `0`
498/// otherwise.
499///
500/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps)
501#[inline]
502#[target_feature(enable = "sse")]
503#[cfg_attr(test, assert_instr(cmpleps))]
504#[stable(feature = "simd_x86", since = "1.27.0")]
505pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
506 cmpps(a, b, imm8:2)
507}
508
509/// Compares each of the four floats in `a` to the corresponding element in `b`.
510/// The result in the output vector will be `0xffffffff` if the input element
511/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
512///
513/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps)
514#[inline]
515#[target_feature(enable = "sse")]
516#[cfg_attr(test, assert_instr(cmpltps))]
517#[stable(feature = "simd_x86", since = "1.27.0")]
518pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
519 cmpps(a:b, b:a, imm8:1)
520}
521
522/// Compares each of the four floats in `a` to the corresponding element in `b`.
523/// The result in the output vector will be `0xffffffff` if the input element
524/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
525/// otherwise.
526///
527/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps)
528#[inline]
529#[target_feature(enable = "sse")]
530#[cfg_attr(test, assert_instr(cmpleps))]
531#[stable(feature = "simd_x86", since = "1.27.0")]
532pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
533 cmpps(a:b, b:a, imm8:2)
534}
535
536/// Compares each of the four floats in `a` to the corresponding element in `b`.
537/// The result in the output vector will be `0xffffffff` if the input elements
538/// are **not** equal, or `0` otherwise.
539///
540/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps)
541#[inline]
542#[target_feature(enable = "sse")]
543#[cfg_attr(test, assert_instr(cmpneqps))]
544#[stable(feature = "simd_x86", since = "1.27.0")]
545pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
546 cmpps(a, b, imm8:4)
547}
548
549/// Compares each of the four floats in `a` to the corresponding element in `b`.
550/// The result in the output vector will be `0xffffffff` if the input element
551/// in `a` is **not** less than the corresponding element in `b`, or `0`
552/// otherwise.
553///
554/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps)
555#[inline]
556#[target_feature(enable = "sse")]
557#[cfg_attr(test, assert_instr(cmpnltps))]
558#[stable(feature = "simd_x86", since = "1.27.0")]
559pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
560 cmpps(a, b, imm8:5)
561}
562
563/// Compares each of the four floats in `a` to the corresponding element in `b`.
564/// The result in the output vector will be `0xffffffff` if the input element
565/// in `a` is **not** less than or equal to the corresponding element in `b`, or
566/// `0` otherwise.
567///
568/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps)
569#[inline]
570#[target_feature(enable = "sse")]
571#[cfg_attr(test, assert_instr(cmpnleps))]
572#[stable(feature = "simd_x86", since = "1.27.0")]
573pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
574 cmpps(a, b, imm8:6)
575}
576
577/// Compares each of the four floats in `a` to the corresponding element in `b`.
578/// The result in the output vector will be `0xffffffff` if the input element
579/// in `a` is **not** greater than the corresponding element in `b`, or `0`
580/// otherwise.
581///
582/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps)
583#[inline]
584#[target_feature(enable = "sse")]
585#[cfg_attr(test, assert_instr(cmpnltps))]
586#[stable(feature = "simd_x86", since = "1.27.0")]
587pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
588 cmpps(a:b, b:a, imm8:5)
589}
590
591/// Compares each of the four floats in `a` to the corresponding element in `b`.
592/// The result in the output vector will be `0xffffffff` if the input element
593/// in `a` is **not** greater than or equal to the corresponding element in `b`,
594/// or `0` otherwise.
595///
596/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
597#[inline]
598#[target_feature(enable = "sse")]
599#[cfg_attr(test, assert_instr(cmpnleps))]
600#[stable(feature = "simd_x86", since = "1.27.0")]
601pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
602 cmpps(a:b, b:a, imm8:6)
603}
604
605/// Compares each of the four floats in `a` to the corresponding element in `b`.
606/// Returns four floats that have one of two possible bit patterns. The element
607/// in the output vector will be `0xffffffff` if the input elements in `a` and
608/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
609///
610/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
611#[inline]
612#[target_feature(enable = "sse")]
613#[cfg_attr(test, assert_instr(cmpordps))]
614#[stable(feature = "simd_x86", since = "1.27.0")]
615pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
616 cmpps(a:b, b:a, imm8:7)
617}
618
619/// Compares each of the four floats in `a` to the corresponding element in `b`.
620/// Returns four floats that have one of two possible bit patterns. The element
621/// in the output vector will be `0xffffffff` if the input elements in `a` and
622/// `b` are unordered (i.e., at least on of them is a NaN), or 0 otherwise.
623///
624/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
625#[inline]
626#[target_feature(enable = "sse")]
627#[cfg_attr(test, assert_instr(cmpunordps))]
628#[stable(feature = "simd_x86", since = "1.27.0")]
629pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
630 cmpps(a:b, b:a, imm8:3)
631}
632
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to COMISS (signalling compare) followed by a flag test.
    comieq_ss(a, b)
}
644
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to COMISS (signalling compare) followed by a flag test.
    comilt_ss(a, b)
}
656
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to COMISS (signalling compare) followed by a flag test.
    comile_ss(a, b)
}
669
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to COMISS (signalling compare) followed by a flag test.
    comigt_ss(a, b)
}
682
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to COMISS (signalling compare) followed by a flag test.
    comige_ss(a, b)
}
695
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to COMISS (signalling compare) followed by a flag test.
    comineq_ss(a, b)
}
707
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise. This instruction will not signal
/// an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to UCOMISS — the quiet (non-signalling) variant of COMISS.
    ucomieq_ss(a, b)
}
720
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
/// This instruction will not signal an exception if either argument is a quiet
/// NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to UCOMISS — the quiet (non-signalling) variant of COMISS.
    ucomilt_ss(a, b)
}
734
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to UCOMISS — the quiet (non-signalling) variant of COMISS.
    ucomile_ss(a, b)
}
748
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
    // Delegates to the `ucomigt_ss` helper in scope (lowers to `ucomiss`).
    ucomigt_ss(a, b)
}
762
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise. This instruction will not signal an exception if either
/// argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
    // Delegates to the `ucomige_ss` helper in scope (lowers to `ucomiss`).
    ucomige_ss(a, b)
}
776
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise. This instruction will not
/// signal an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
    // Delegates to the `ucomineq_ss` helper in scope (lowers to `ucomiss`).
    ucomineq_ss(a, b)
}
789
/// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
///
/// The result is rounded according to the current rounding mode. If the result
/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
/// (`i32::MIN`).
///
/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 {
    // `cvtss2si` is the raw intrinsic wrapper in scope in this module.
    cvtss2si(a)
}
806
/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 {
    // Legacy spelling kept for Intel header compatibility; same semantics.
    _mm_cvtss_si32(a)
}
817
/// Converts the lowest 32 bit float in the input vector to a 32 bit integer
/// with
/// truncation.
///
/// The result is rounded always using truncation (round towards zero). If the
/// result cannot be represented as a 32 bit integer the result will be
/// `0x8000_0000` (`i32::MIN`).
///
/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 {
    // Truncating variant: `cvttss2si` ignores the MXCSR rounding mode.
    cvttss2si(a)
}
836
/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 {
    // Legacy spelling kept for Intel header compatibility; same semantics.
    _mm_cvttss_si32(a)
}
847
/// Extracts the lowest 32 bit float from the input vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32)
#[inline]
#[target_feature(enable = "sse")]
// No point in using assert_instrs. In Unix x86_64 calling convention this is a
// no-op, and on Windows it's just a `mov`.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 {
    // Lane 0 is the lowest float in the vector's memory order.
    simd_extract!(a, 0)
}
859
/// Converts a 32 bit integer to a 32 bit float. The result vector is the input
/// vector `a` with the lowest 32 bit float replaced by the converted integer.
///
/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
/// input).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
    // `cvtsi2ss` is the raw intrinsic wrapper in scope in this module.
    cvtsi2ss(a, b)
}
874
/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
    // Legacy spelling kept for Intel header compatibility; same semantics.
    _mm_cvtsi32_ss(a, b)
}
885
/// Construct a `__m128` with the lowest element set to `a` and the rest set to
/// zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ss(a: f32) -> __m128 {
    // Tuple field 0 is the lowest lane; the rest are zeroed.
    __m128(a, 0.0, 0.0, 0.0)
}
897
/// Construct a `__m128` with all element set to `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_ps(a: f32) -> __m128 {
    // Broadcast `a` into all four lanes.
    __m128(a, a, a, a)
}
908
/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ps1(a: f32) -> __m128 {
    // Legacy spelling kept for Intel header compatibility; same semantics.
    _mm_set1_ps(a)
}
919
/// Construct a `__m128` from four floating point values highest to lowest.
///
/// Note that `a` will be the highest 32 bits of the result, and `d` the
/// lowest. This matches the standard way of writing bit patterns on x86:
///
/// ```text
///  bit    127 .. 96  95 .. 64  63 .. 32  31 .. 0
///        +---------+---------+---------+---------+
///        |    a    |    b    |    c    |    d    |   result
///        +---------+---------+---------+---------+
/// ```
///
/// Alternatively:
///
/// ```text
/// let v = _mm_set_ps(d, c, b, a);
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    // The tuple constructor takes lanes lowest-first, hence the reversal.
    __m128(d, c, b, a)
}
946
/// Construct a `__m128` from four floating point values lowest to highest.
///
/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
/// bits of the result, and `d` the highest.
///
/// ```text
/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, any(target_os = "windows", target_arch = "x86_64")),
    assert_instr(unpcklps)
)]
// On a 32-bit architecture on non-Windows it just copies the operands from the stack.
#[cfg_attr(
    all(test, all(not(target_os = "windows"), target_arch = "x86")),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    // "r" = reversed relative to `_mm_set_ps`: arguments are already in
    // memory (lowest-first) order, so no reordering is needed.
    __m128(a, b, c, d)
}
972
/// Construct a `__m128` with all elements initialized to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setzero_ps() -> __m128 {
    // Typically lowered to `xorps xmm, xmm` (see `assert_instr` above).
    __m128(0.0, 0.0, 0.0, 0.0)
}
983
984/// A utility function for creating masks to use with Intel shuffle and
985/// permute intrinsics.
986#[inline]
987#[allow(non_snake_case)]
988#[unstable(feature = "stdarch_x86_mm_shuffle", issue = "111147")]
989pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
990 ((z << 6) | (y << 4) | (x << 2) | w) as i32
991}
992
/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
/// `b` using `MASK`.
///
/// The lower half of result takes values from `a` and the higher half from
/// `b`. Mask is split to 2 control bits each to index the element from inputs.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps)
///
/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
/// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_).
/// Performing an implicit type conversion between an unsigned integer and a signed integer
/// does not cause a problem in C, however Rust's commitment to strong typing does not allow this.
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
    // The immediate must fit in 8 bits (four 2-bit selectors).
    static_assert_uimm_bits!(MASK, 8);
    // `simd_shuffle!` indexes the concatenation of `a` (0..=3) and `b`
    // (4..=7), so the two upper selectors are offset by 4 to pick from `b`.
    simd_shuffle!(
        a,
        b,
        [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11) + 4,
            ((MASK as u32 >> 6) & 0b11) + 4,
        ],
    )
}
1024
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the higher half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a2, b2, a3, b3] (indices 4..=7 pick from `b`).
    simd_shuffle!(a, b, [2, 6, 3, 7])
}
1036
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the lower half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a0, b0, a1, b1] (indices 4..=7 pick from `b`).
    simd_shuffle!(a, b, [0, 4, 1, 5])
}
1048
/// Combine higher half of `a` and `b`. The higher half of `b` occupies the
/// lower half of result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO; figure why this is a different instruction on Windows?
    // Result lanes: [b2, b3, a2, a3] (indices 4..=7 pick from `b`).
    simd_shuffle!(a, b, [6, 7, 2, 3])
}
1061
/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
/// higher half of result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a0, a1, b0, b1] (indices 4..=7 pick from `b`).
    simd_shuffle!(a, b, [0, 1, 4, 5])
}
1073
1074/// Returns a mask of the most significant bit of each element in `a`.
1075///
1076/// The mask is stored in the 4 least significant bits of the return value.
1077/// All other bits are set to `0`.
1078///
1079/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
1080#[inline]
1081#[target_feature(enable = "sse")]
1082#[cfg_attr(test, assert_instr(movmskps))]
1083#[stable(feature = "simd_x86", since = "1.27.0")]
1084pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
1085 // Propagate the highest bit to the rest, because simd_bitmask
1086 // requires all-1 or all-0.
1087 let mask: i32x4 = simd_lt(x:transmute(a), y:i32x4::splat(0));
1088 simd_bitmask::<i32x4, u8>(mask).into()
1089}
1090
/// Construct a `__m128` with the lowest element read from `p` and the other
/// elements set to zero.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
    // Single `f32` read into lane 0; remaining lanes zeroed.
    __m128(*p, 0.0, 0.0, 0.0)
}
1104
1105/// Construct a `__m128` by duplicating the value read from `p` into all
1106/// elements.
1107///
1108/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
1109/// shuffling.
1110///
1111/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps)
1112#[inline]
1113#[target_feature(enable = "sse")]
1114#[cfg_attr(test, assert_instr(movss))]
1115#[stable(feature = "simd_x86", since = "1.27.0")]
1116pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
1117 let a: f32 = *p;
1118 __m128(a, a, a, a)
1119}
1120
/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
    // Legacy spelling kept for Intel header compatibility; same semantics.
    _mm_load1_ps(p)
}
1131
/// Loads four `f32` values from *aligned* memory into a `__m128`. If the
/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
    // Aligned 128-bit read: the `*const f32 -> *const __m128` cast raises
    // the alignment requirement to 16 bytes (caller's responsibility).
    *(p as *const __m128)
}
1150
1151/// Loads four `f32` values from memory into a `__m128`. There are no
1152/// restrictions
1153/// on memory alignment. For aligned memory
1154/// [`_mm_load_ps`](fn._mm_load_ps.html)
1155/// may be faster.
1156///
1157/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1158///
1159/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps)
1160#[inline]
1161#[target_feature(enable = "sse")]
1162#[cfg_attr(test, assert_instr(movups))]
1163#[stable(feature = "simd_x86", since = "1.27.0")]
1164pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
1165 // Note: Using `*p` would require `f32` alignment, but `movups` has no
1166 // alignment restrictions.
1167 let mut dst: __m128 = _mm_undefined_ps();
1168 ptr::copy_nonoverlapping(
1169 src:p as *const u8,
1170 dst:ptr::addr_of_mut!(dst) as *mut u8,
1171 count:mem::size_of::<__m128>(),
1172 );
1173 dst
1174}
1175
1176/// Loads four `f32` values from aligned memory into a `__m128` in reverse
1177/// order.
1178///
1179/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1180/// protection fault will be triggered (fatal program crash).
1181///
1182/// Functionally equivalent to the following code sequence (assuming `p`
1183/// satisfies the alignment restrictions):
1184///
1185/// ```text
1186/// let a0 = *p;
1187/// let a1 = *p.add(1);
1188/// let a2 = *p.add(2);
1189/// let a3 = *p.add(3);
1190/// __m128::new(a3, a2, a1, a0)
1191/// ```
1192///
1193/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
1194/// shuffling.
1195///
1196/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps)
1197#[inline]
1198#[target_feature(enable = "sse")]
1199#[cfg_attr(test, assert_instr(movaps))]
1200#[stable(feature = "simd_x86", since = "1.27.0")]
1201pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
1202 let a: __m128 = _mm_load_ps(p);
1203 simd_shuffle!(a, a, [3, 2, 1, 0])
1204}
1205
1206/// Loads unaligned 64-bits of integer data from memory into new vector.
1207///
1208/// `mem_addr` does not need to be aligned on any particular boundary.
1209///
1210/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
1211#[inline]
1212#[target_feature(enable = "sse")]
1213#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
1214pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
1215 transmute(src:i64x2::new(x0:ptr::read_unaligned(mem_addr as *const i64), x1:0))
1216}
1217
/// Stores the lowest 32 bit float of `a` into memory.
///
/// This intrinsic corresponds to the `MOVSS` instruction.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
    // Extract lane 0 and write it as a plain `f32` store.
    *p = simd_extract!(a, 0);
}
1230
1231/// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
1232/// memory.
1233///
1234/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1235/// protection fault will be triggered (fatal program crash).
1236///
1237/// Functionally equivalent to the following code sequence (assuming `p`
1238/// satisfies the alignment restrictions):
1239///
1240/// ```text
1241/// let x = a.extract(0);
1242/// *p = x;
1243/// *p.add(1) = x;
1244/// *p.add(2) = x;
1245/// *p.add(3) = x;
1246/// ```
1247///
1248/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps)
1249#[inline]
1250#[target_feature(enable = "sse")]
1251#[cfg_attr(test, assert_instr(movaps))]
1252#[stable(feature = "simd_x86", since = "1.27.0")]
1253#[allow(clippy::cast_ptr_alignment)]
1254pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
1255 let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]);
1256 *(p as *mut __m128) = b;
1257}
1258
/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
    // Legacy spelling kept for Intel header compatibility; same semantics.
    _mm_store1_ps(p, a);
}
1269
/// Stores four 32-bit floats into *aligned* memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
    // Aligned 128-bit store; the pointer cast raises the alignment
    // requirement to 16 bytes (caller's responsibility).
    *(p as *mut __m128) = a;
}
1289
1290/// Stores four 32-bit floats into memory. There are no restrictions on memory
1291/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
1292/// faster.
1293///
1294/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1295///
1296/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps)
1297#[inline]
1298#[target_feature(enable = "sse")]
1299#[cfg_attr(test, assert_instr(movups))]
1300#[stable(feature = "simd_x86", since = "1.27.0")]
1301pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
1302 ptr::copy_nonoverlapping(
1303 src:ptr::addr_of!(a) as *const u8,
1304 dst:p as *mut u8,
1305 count:mem::size_of::<__m128>(),
1306 );
1307}
1308
1309/// Stores four 32-bit floats into *aligned* memory in reverse order.
1310///
1311/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1312/// protection fault will be triggered (fatal program crash).
1313///
1314/// Functionally equivalent to the following code sequence (assuming `p`
1315/// satisfies the alignment restrictions):
1316///
1317/// ```text
1318/// *p = a.extract(3);
1319/// *p.add(1) = a.extract(2);
1320/// *p.add(2) = a.extract(1);
1321/// *p.add(3) = a.extract(0);
1322/// ```
1323///
1324/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps)
1325#[inline]
1326#[target_feature(enable = "sse")]
1327#[cfg_attr(test, assert_instr(movaps))]
1328#[stable(feature = "simd_x86", since = "1.27.0")]
1329#[allow(clippy::cast_ptr_alignment)]
1330pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
1331 let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]);
1332 *(p as *mut __m128) = b;
1333}
1334
/// Returns a `__m128` with the first component from `b` and the remaining
/// components from `a`.
///
/// In other words for any `a` and `b`:
/// ```text
/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
    // Index 4 selects lane 0 of `b`; indices 1..=3 keep lanes of `a`.
    simd_shuffle!(a, b, [4, 1, 2, 3])
}
1351
/// Performs a serializing operation on all non-temporal ("streaming") store instructions that
/// were issued by the current thread prior to this instruction.
///
/// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is
/// ordered before any load or store instruction which follows the fence in
/// synchronization order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence)
/// (but note that Intel is only documenting the hardware-level concerns related to this
/// instruction; the Intel documentation does not take into account the extra concerns that arise
/// because the Rust memory model is different from the x86 memory model.)
///
/// # Safety of non-temporal stores
///
/// After using any non-temporal store intrinsic, but before any other access to the memory that the
/// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the
/// intrinsic.
///
/// Non-temporal stores behave very different from regular stores. For the purpose of the Rust
/// memory model, these stores are happening asynchronously in a background thread. This means a
/// non-temporal store can cause data races with other accesses, even other accesses on the same
/// thread. It also means that cross-thread synchronization does not work as expected: let's say the
/// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The
/// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not
/// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize
/// with all the non-temporal stores previously started on this thread, which means in particular
/// that subsequent synchronization with other threads will then work as intended again.
///
/// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your
/// code jumps back to code outside your library. This ensures all stores inside your function
/// are synchronized-before the return, and thus transitively synchronized-before everything
/// the caller does after your function returns.
//
// The following is not a doc comment since it's not clear whether we want to put this into the
// docs, but it should be written out somewhere.
//
// Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot
// inspect, and that behave like the following functions. This explains where the docs above come
// from.
// ```
// #[thread_local]
// static mut PENDING_NONTEMP_WRITES = AtomicUsize::new(0);
//
// pub unsafe fn nontemporal_store<T>(ptr: *mut T, val: T) {
//     PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed);
//     // Spawn a thread that will eventually do our write.
//     // We need to fetch a pointer to this thread's pending-write
//     // counter, so that we can access it from the background thread.
//     let pending_writes = addr_of!(PENDING_NONTEMP_WRITES);
//     // If this was actual Rust code we'd have to do some extra work
//     // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here.
//     std::thread::spawn(move || {
//         // Do the write in the background thread.
//         ptr.write(val);
//         // Register the write as done. Crucially, this is `Release`, so it
//         // syncs-with the `Acquire in `sfence`.
//         (&*pending_writes).fetch_sub(1, Release);
//     });
// }
//
// pub fn sfence() {
//     unsafe {
//         // Wait until there are no more pending writes.
//         while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {}
//     }
// }
// ```
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sfence() {
    // Single `sfence` instruction via the intrinsic wrapper in scope.
    sfence()
}
1426
1427/// Gets the unsigned 32-bit value of the MXCSR control and status register.
1428///
1429/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust
1430/// floating-point operations may or may not result in this register getting updated with exception
1431/// state, and the register can change between two invocations of this function even when no
1432/// floating-point operations appear in the source code (since floating-point operations appearing
1433/// earlier or later can be reordered).
1434///
1435/// If you need to perform some floating-point operations and check whether they raised an
1436/// exception, use an inline assembly block for the entire sequence of operations.
1437///
1438/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
1439///
1440/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr)
1441#[inline]
1442#[target_feature(enable = "sse")]
1443#[cfg_attr(test, assert_instr(stmxcsr))]
1444#[stable(feature = "simd_x86", since = "1.27.0")]
1445#[deprecated(
1446 since = "1.75.0",
1447 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1448)]
1449pub unsafe fn _mm_getcsr() -> u32 {
1450 let mut result: i32 = 0_i32;
1451 stmxcsr(ptr::addr_of_mut!(result) as *mut i8);
1452 result as u32
1453}
1454
1455/// Sets the MXCSR register with the 32-bit unsigned integer value.
1456///
1457/// This register controls how SIMD instructions handle floating point
1458/// operations. Modifying this register only affects the current thread.
1459///
1460/// It contains several groups of flags:
1461///
1462/// * *Exception flags* report which exceptions occurred since last they were
1463/// reset.
1464///
1465/// * *Masking flags* can be used to mask (ignore) certain exceptions. By
1466/// default
1467/// these flags are all set to 1, so all exceptions are masked. When an
1468/// an exception is masked, the processor simply sets the exception flag and
1469/// continues the operation. If the exception is unmasked, the flag is also set
1470/// but additionally an exception handler is invoked.
1471///
1472/// * *Rounding mode flags* control the rounding mode of floating point
1473/// instructions.
1474///
1475/// * The *denormals-are-zero mode flag* turns all numbers which would be
1476/// denormalized (exponent bits are all zeros) into zeros.
1477///
1478/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to
1479/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and
1480/// will optimize accordingly. This even applies when the register is altered and later reset to its
1481/// original value without any floating-point operations appearing in the source code between those
1482/// operations (since floating-point operations appearing earlier or later can be reordered).
1483///
1484/// If you need to perform some floating-point operations under a different masking flags, rounding
1485/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the
1486/// original MXCSR register state before the end of the block.
1487///
1488/// ## Exception Flags
1489///
1490/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
1491/// Infinity by Infinity).
1492///
1493/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
1494/// number. Mainly this can cause loss of precision.
1495///
1496/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
1497///
1498/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
1499/// result was too large to be represented (e.g., an `f32` with absolute
1500/// value
1501/// greater than `2^128`).
1502///
1503/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
1504/// result was too small to be represented in a normalized way (e.g., an
1505/// `f32`
1506/// with absulte value smaller than `2^-126`.)
1507///
1508/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
1509/// precision exception). This means some precision was lost due to rounding.
1510/// For example, the fraction `1/3` cannot be represented accurately in a
1511/// 32 or 64 bit float and computing it would cause this exception to be
1512/// raised. Precision exceptions are very common, so they are usually masked.
1513///
1514/// Exception flags can be read and set using the convenience functions
1515/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
1516/// check if an operation caused some overflow:
1517///
1518/// ```rust,ignore
1519/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
1520/// // perform calculations
1521/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
1522/// // handle overflow
1523/// }
1524/// ```
1525///
1526/// ## Masking Flags
1527///
1528/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
1529/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
1530/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
1531///
1532/// A single masking bit can be set via
1533///
1534/// ```rust,ignore
1535/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
1536/// ```
1537///
1538/// However, since mask bits are by default all set to 1, it is more common to
1539/// want to *disable* certain bits. For example, to unmask the underflow
1540/// exception, use:
1541///
1542/// ```rust,ignore
1543/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow
1544/// exception
1545/// ```
1546///
1547/// Warning: an unmasked exception will cause an exception handler to be
1548/// called.
1549/// The standard handler will simply terminate the process. So, in this case
1550/// any underflow exception would terminate the current process with something
1551/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1552///
1553/// ## Rounding Mode
1554///
1555/// The rounding mode is describe using two bits. It can be read and set using
1556/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
1557/// `_MM_SET_ROUNDING_MODE(mode)`.
1558///
1559/// The rounding modes are:
1560///
1561/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
1562/// value. If two values are equally close, round to even (i.e., least
1563/// significant bit will be zero).
1564///
1565/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
1566///
1567/// * `_MM_ROUND_UP`: Round toward positive Infinity.
1568///
1569/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
1570///
1571/// Example:
1572///
1573/// ```rust,ignore
1574/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
1575/// ```
1576///
1577/// ## Denormals-are-zero/Flush-to-zero Mode
1578///
1579/// If this bit is set, values that would be denormalized will be set to zero
1580/// instead. This is turned off by default.
1581///
1582/// You can read and enable/disable this mode via the helper functions
1583/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
1584///
1585/// ```rust,ignore
1586/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
1587/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
1588/// ```
1589///
1590///
1591/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr)
1592#[inline]
1593#[target_feature(enable = "sse")]
1594#[cfg_attr(test, assert_instr(ldmxcsr))]
1595#[stable(feature = "simd_x86", since = "1.27.0")]
1596#[deprecated(
1597 since = "1.75.0",
1598 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1599)]
1600pub unsafe fn _mm_setcsr(val: u32) {
1601 ldmxcsr(ptr::addr_of!(val) as *const i8);
1602}
1603
// MXCSR exception-status flags (bits 0-5). The processor sets these sticky
// bits when the corresponding floating-point exception occurs.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
// Union of all six exception-status bits above.
/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_MASK: u32 = 0x003f;

// MXCSR exception-mask flags (bits 7-12). A set bit masks (suppresses) the
// corresponding exception; all are set by default.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INVALID: u32 = 0x0080;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DENORM: u32 = 0x0100;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INEXACT: u32 = 0x1000;
// Union of all six exception-mask bits above.
/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_MASK: u32 = 0x1f80;

// MXCSR rounding-control field (bits 13-14).
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_NEAREST: u32 = 0x0000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_DOWN: u32 = 0x2000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_UP: u32 = 0x4000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;

/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_MASK: u32 = 0x6000;

// MXCSR flush-to-zero control (bit 15).
/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
1674
1675/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1676///
1677/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK)
1678#[inline]
1679#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1680#[allow(non_snake_case)]
1681#[target_feature(enable = "sse")]
1682#[stable(feature = "simd_x86", since = "1.27.0")]
1683#[deprecated(
1684 since = "1.75.0",
1685 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1686)]
1687pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
1688 _mm_getcsr() & _MM_MASK_MASK
1689}
1690
1691/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1692///
1693/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE)
1694#[inline]
1695#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1696#[allow(non_snake_case)]
1697#[target_feature(enable = "sse")]
1698#[stable(feature = "simd_x86", since = "1.27.0")]
1699#[deprecated(
1700 since = "1.75.0",
1701 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1702)]
1703pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
1704 _mm_getcsr() & _MM_EXCEPT_MASK
1705}
1706
1707/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1708///
1709/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE)
1710#[inline]
1711#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1712#[allow(non_snake_case)]
1713#[target_feature(enable = "sse")]
1714#[stable(feature = "simd_x86", since = "1.27.0")]
1715#[deprecated(
1716 since = "1.75.0",
1717 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1718)]
1719pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
1720 _mm_getcsr() & _MM_FLUSH_ZERO_MASK
1721}
1722
1723/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1724///
1725/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE)
1726#[inline]
1727#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1728#[allow(non_snake_case)]
1729#[target_feature(enable = "sse")]
1730#[stable(feature = "simd_x86", since = "1.27.0")]
1731#[deprecated(
1732 since = "1.75.0",
1733 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1734)]
1735pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
1736 _mm_getcsr() & _MM_ROUND_MASK
1737}
1738
1739/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1740///
1741/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK)
1742#[inline]
1743#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1744#[allow(non_snake_case)]
1745#[target_feature(enable = "sse")]
1746#[stable(feature = "simd_x86", since = "1.27.0")]
1747#[deprecated(
1748 since = "1.75.0",
1749 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1750)]
1751pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
1752 _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
1753}
1754
1755/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1756///
1757/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE)
1758#[inline]
1759#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1760#[allow(non_snake_case)]
1761#[target_feature(enable = "sse")]
1762#[stable(feature = "simd_x86", since = "1.27.0")]
1763#[deprecated(
1764 since = "1.75.0",
1765 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1766)]
1767pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
1768 _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
1769}
1770
1771/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1772///
1773/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE)
1774#[inline]
1775#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1776#[allow(non_snake_case)]
1777#[target_feature(enable = "sse")]
1778#[stable(feature = "simd_x86", since = "1.27.0")]
1779#[deprecated(
1780 since = "1.75.0",
1781 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1782)]
1783pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
1784 let val: u32 = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
1785 // println!("setting csr={:x}", val);
1786 _mm_setcsr(val)
1787}
1788
1789/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1790///
1791/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE)
1792#[inline]
1793#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1794#[allow(non_snake_case)]
1795#[target_feature(enable = "sse")]
1796#[stable(feature = "simd_x86", since = "1.27.0")]
1797#[deprecated(
1798 since = "1.75.0",
1799 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1800)]
1801pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
1802 _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
1803}
1804
// Prefetch hint encodings for `_mm_prefetch`. As used by that function,
// bits 0-1 select the locality and bit 2 indicates write intent.
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T0: i32 = 3;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T1: i32 = 2;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T2: i32 = 1;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_NTA: i32 = 0;

// The `ET` hints are the `T` hints plus write intent (bit 2 set).
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET0: i32 = 7;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET1: i32 = 6;
1828
1829/// Fetch the cache line that contains address `p` using the given `STRATEGY`.
1830///
1831/// The `STRATEGY` must be one of:
1832///
1833/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
1834/// cache hierarchy.
1835///
1836/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
1837///
1838/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
1839/// an implementation-specific choice (e.g., L2 if there is no L3).
1840///
1841/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
1842/// non-temporal access (NTA) hint. It may be a place closer than main memory
1843/// but outside of the cache hierarchy. This is used to reduce access latency
1844/// without polluting the cache.
1845///
1846/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
1847/// [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
1848/// and `_MM_HINT_T1` but indicate an anticipation to write to the address.
1849///
1850/// The actual implementation depends on the particular CPU. This instruction
1851/// is considered a hint, so the CPU is also free to simply ignore the request.
1852///
1853/// The amount of prefetched data depends on the cache line size of the
1854/// specific CPU, but it will be at least 32 bytes.
1855///
1856/// Common caveats:
1857///
1858/// * Most modern CPUs already automatically prefetch data based on predicted
1859/// access patterns.
1860///
1861/// * Data is usually not fetched if this would cause a TLB miss or a page
1862/// fault.
1863///
1864/// * Too much prefetching can cause unnecessary cache evictions.
1865///
1866/// * Prefetching may also fail if there are not enough memory-subsystem
1867/// resources (e.g., request buffers).
1868///
1869///
1870/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
1871#[inline]
1872#[target_feature(enable = "sse")]
1873#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
1874#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
1875#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
1876#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
1877#[rustc_legacy_const_generics(1)]
1878#[stable(feature = "simd_x86", since = "1.27.0")]
1879pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
1880 // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
1881 // `locality` and `rw` are based on our `STRATEGY`.
1882 prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, ty:1);
1883}
1884
1885/// Returns vector of type __m128 with indeterminate elements.
1886/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
1887/// In practice, this is equivalent to [`mem::zeroed`].
1888///
1889/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
1890#[inline]
1891#[target_feature(enable = "sse")]
1892#[stable(feature = "simd_x86", since = "1.27.0")]
1893pub unsafe fn _mm_undefined_ps() -> __m128 {
1894 _mm_set1_ps(0.0)
1895}
1896
1897/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
1898///
1899/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS)
1900#[inline]
1901#[allow(non_snake_case)]
1902#[target_feature(enable = "sse")]
1903#[stable(feature = "simd_x86", since = "1.27.0")]
1904pub unsafe fn _MM_TRANSPOSE4_PS(
1905 row0: &mut __m128,
1906 row1: &mut __m128,
1907 row2: &mut __m128,
1908 row3: &mut __m128,
1909) {
1910 let tmp0: __m128 = _mm_unpacklo_ps(*row0, *row1);
1911 let tmp2: __m128 = _mm_unpacklo_ps(*row2, *row3);
1912 let tmp1: __m128 = _mm_unpackhi_ps(*row0, *row1);
1913 let tmp3: __m128 = _mm_unpackhi_ps(*row2, *row3);
1914
1915 *row0 = _mm_movelh_ps(a:tmp0, b:tmp2);
1916 *row1 = _mm_movehl_ps(a:tmp2, b:tmp0);
1917 *row2 = _mm_movelh_ps(a:tmp1, b:tmp3);
1918 *row3 = _mm_movehl_ps(a:tmp3, b:tmp1);
1919}
1920
// Declarations of the LLVM intrinsics that back the SSE functions above.
// The `link_name` attributes bind each item to the corresponding
// `llvm.x86.sse.*` (or generic `llvm.prefetch`) intrinsic.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse.add.ss"]
    fn addss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sub.ss"]
    fn subss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.mul.ss"]
    fn mulss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.div.ss"]
    fn divss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ss"]
    fn sqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ps"]
    fn sqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ss"]
    fn rcpss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ps"]
    fn rcpps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ss"]
    fn rsqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ps"]
    fn rsqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ss"]
    fn minss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ps"]
    fn minps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ss"]
    fn maxss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ps"]
    fn maxps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    // Ordered scalar comparisons (COMISS).
    #[link_name = "llvm.x86.sse.comieq.ss"]
    fn comieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comilt.ss"]
    fn comilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comile.ss"]
    fn comile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comigt.ss"]
    fn comigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comige.ss"]
    fn comige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comineq.ss"]
    fn comineq_ss(a: __m128, b: __m128) -> i32;
    // Unordered scalar comparisons (UCOMISS).
    #[link_name = "llvm.x86.sse.ucomieq.ss"]
    fn ucomieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomilt.ss"]
    fn ucomilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomile.ss"]
    fn ucomile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomigt.ss"]
    fn ucomigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomige.ss"]
    fn ucomige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomineq.ss"]
    fn ucomineq_ss(a: __m128, b: __m128) -> i32;
    // f32 <-> i32 conversions.
    #[link_name = "llvm.x86.sse.cvtss2si"]
    fn cvtss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvttss2si"]
    fn cvttss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtsi2ss"]
    fn cvtsi2ss(a: __m128, b: i32) -> __m128;
    // Fence and MXCSR register access.
    #[link_name = "llvm.x86.sse.sfence"]
    fn sfence();
    #[link_name = "llvm.x86.sse.stmxcsr"]
    fn stmxcsr(p: *mut i8);
    #[link_name = "llvm.x86.sse.ldmxcsr"]
    fn ldmxcsr(p: *const i8);
    #[link_name = "llvm.prefetch"]
    fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
}
1994
1995/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
1996///
1997/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
1998/// exception _may_ be generated.
1999///
2000/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps)
2001///
2002/// # Safety of non-temporal stores
2003///
2004/// After using this intrinsic, but before any other access to the memory that this intrinsic
2005/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
2006/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
2007/// return.
2008///
2009/// See [`_mm_sfence`] for details.
2010#[inline]
2011#[target_feature(enable = "sse")]
2012#[cfg_attr(test, assert_instr(movntps))]
2013#[stable(feature = "simd_x86", since = "1.27.0")]
2014#[allow(clippy::cast_ptr_alignment)]
2015pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
2016 intrinsics::nontemporal_store(ptr:mem_addr as *mut __m128, val:a);
2017}
2018
2019#[cfg(test)]
2020mod tests {
2021 use crate::{hint::black_box, mem::transmute, ptr};
2022 use std::{boxed, f32::NAN};
2023 use stdarch_test::simd_test;
2024
2025 use crate::core_arch::{simd::*, x86::*};
2026
    // Tests for the packed (`_ps`: all four lanes) and scalar (`_ss`: lowest
    // lane only, upper lanes copied from `a`) arithmetic intrinsics.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_add_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_add_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
    }

    // NOTE: this test builds its vectors with `_mm_set_ps` (high-to-low lane
    // order), unlike its siblings which use `_mm_setr_ps`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_add_ss() {
        let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_add_ss(a, b);
        assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sub_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_sub_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sub_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_sub_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_mul_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_mul_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_mul_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_mul_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_div_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
        let r = _mm_div_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_div_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_div_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
    }
2090
    // Tests for sqrt and the approximate reciprocal / reciprocal-sqrt
    // intrinsics. `rcp`/`rsqrt` results are approximations, so they are
    // checked with `assert_approx_eq!` against the documented relative error.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sqrt_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_sqrt_ss(a);
        let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sqrt_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_sqrt_ps(a);
        let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rcp_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rcp_ss(a);
        let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
        let rel_err = 0.00048828125;
        // Lane 0 is approximate; the upper lanes are copied from `a` exactly.
        assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err);
        for i in 1..4 {
            assert_eq!(get_m128(r, i), get_m128(e, i));
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rcp_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rcp_ps(a);
        let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rsqrt_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rsqrt_ss(a);
        let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rsqrt_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rsqrt_ps(a);
        let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }
2151
    // Tests for min/max intrinsics, including the SSE-specific handling of
    // signed zeros (min/max on x86 are not symmetric in their operands when
    // -0.0 is involved).
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_min_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_min_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_min_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_min_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));

        // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
        // is lowered by the llvm codegen backend to `llvm.minnum.v*` llvm intrinsic. This intrinsic
        // doesn't specify how -0.0 is handled. Unfortunately it happens to behave different from
        // the `minps` x86 instruction on x86. The `llvm.minnum.v*` llvm intrinsic equals
        // `r1` to `a` and `r2` to `b`.
        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
        // Compare bit patterns (as bytes), since -0.0 == 0.0 numerically.
        let r1: [u8; 16] = transmute(_mm_min_ps(a, b));
        let r2: [u8; 16] = transmute(_mm_min_ps(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_max_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_max_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_max_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_max_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));

        // Check SSE-specific semantics for -0.0 handling.
        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_max_ps(a, b));
        let r2: [u8; 16] = transmute(_mm_max_ps(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }
2209
    // Tests for the bitwise intrinsics. Inputs are built as integer bit
    // patterns and transmuted to `__m128`; `black_box` prevents the operands
    // from being const-folded away.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_and_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_and_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0001));
        assert_eq_m128(r, e);
    }

    // `andnot` computes `!a & b`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_andnot_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_andnot_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0100));
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_or_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_or_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0111));
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_xor_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_xor_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0110));
        assert_eq_m128(r, e);
    }
2245
    // Tests for the scalar comparison intrinsics. Lane 0 of the result is an
    // all-ones (`!0u32`) or all-zeros mask; the upper lanes are copied from
    // `a`. Results are compared as `u32x4` bit patterns.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpeq_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
        let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
        let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0));
        assert_eq!(r, e);

        let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
        let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0));
        assert_eq!(r2, e2);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmplt_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) < b.extract(0)
        let c1 = 0u32; // a.extract(0) < c.extract(0)
        let d1 = !0u32; // a.extract(0) < d.extract(0)

        let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmple_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) <= b.extract(0)
        let c1 = !0u32; // a.extract(0) <= c.extract(0)
        let d1 = !0u32; // a.extract(0) <= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpgt_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) > b.extract(0)
        let c1 = 0u32; // a.extract(0) > c.extract(0)
        let d1 = 0u32; // a.extract(0) > d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2331
2332 #[simd_test(enable = "sse")]
2333 unsafe fn test_mm_cmpge_ss() {
2334 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2335 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2336 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2337 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2338
2339 let b1 = !0u32; // a.extract(0) >= b.extract(0)
2340 let c1 = !0u32; // a.extract(0) >= c.extract(0)
2341 let d1 = 0u32; // a.extract(0) >= d.extract(0)
2342
2343 let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
2344 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2345 assert_eq!(rb, eb);
2346
2347 let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
2348 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2349 assert_eq!(rc, ec);
2350
2351 let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
2352 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2353 assert_eq!(rd, ed);
2354 }
2355
2356 #[simd_test(enable = "sse")]
2357 unsafe fn test_mm_cmpneq_ss() {
2358 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2359 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2360 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2361 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2362
2363 let b1 = !0u32; // a.extract(0) != b.extract(0)
2364 let c1 = 0u32; // a.extract(0) != c.extract(0)
2365 let d1 = !0u32; // a.extract(0) != d.extract(0)
2366
2367 let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
2368 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2369 assert_eq!(rb, eb);
2370
2371 let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
2372 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2373 assert_eq!(rc, ec);
2374
2375 let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
2376 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2377 assert_eq!(rd, ed);
2378 }
2379
2380 #[simd_test(enable = "sse")]
2381 unsafe fn test_mm_cmpnlt_ss() {
2382 // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there
2383 // must be a difference. It may have to do with behavior in the
2384 // presence of NaNs (signaling or quiet). If so, we should add tests
2385 // for those.
2386
2387 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2388 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2389 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2390 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2391
2392 let b1 = !0u32; // a.extract(0) >= b.extract(0)
2393 let c1 = !0u32; // a.extract(0) >= c.extract(0)
2394 let d1 = 0u32; // a.extract(0) >= d.extract(0)
2395
2396 let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
2397 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2398 assert_eq!(rb, eb);
2399
2400 let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
2401 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2402 assert_eq!(rc, ec);
2403
2404 let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
2405 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2406 assert_eq!(rd, ed);
2407 }
2408
2409 #[simd_test(enable = "sse")]
2410 unsafe fn test_mm_cmpnle_ss() {
2411 // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
2412 // must be a difference. It may have to do with behavior in the
2413 // presence
2414 // of NaNs (signaling or quiet). If so, we should add tests for those.
2415
2416 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2417 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2418 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2419 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2420
2421 let b1 = !0u32; // a.extract(0) > b.extract(0)
2422 let c1 = 0u32; // a.extract(0) > c.extract(0)
2423 let d1 = 0u32; // a.extract(0) > d.extract(0)
2424
2425 let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
2426 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2427 assert_eq!(rb, eb);
2428
2429 let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
2430 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2431 assert_eq!(rc, ec);
2432
2433 let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
2434 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2435 assert_eq!(rd, ed);
2436 }
2437
2438 #[simd_test(enable = "sse")]
2439 unsafe fn test_mm_cmpngt_ss() {
2440 // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there
2441 // must be a difference. It may have to do with behavior in the
2442 // presence of NaNs (signaling or quiet). If so, we should add tests
2443 // for those.
2444
2445 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2446 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2447 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2448 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2449
2450 let b1 = 0u32; // a.extract(0) <= b.extract(0)
2451 let c1 = !0u32; // a.extract(0) <= c.extract(0)
2452 let d1 = !0u32; // a.extract(0) <= d.extract(0)
2453
2454 let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
2455 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2456 assert_eq!(rb, eb);
2457
2458 let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
2459 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2460 assert_eq!(rc, ec);
2461
2462 let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
2463 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2464 assert_eq!(rd, ed);
2465 }
2466
2467 #[simd_test(enable = "sse")]
2468 unsafe fn test_mm_cmpnge_ss() {
2469 // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there
2470 // must be a difference. It may have to do with behavior in the
2471 // presence of NaNs (signaling or quiet). If so, we should add tests
2472 // for those.
2473
2474 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2475 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2476 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2477 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2478
2479 let b1 = 0u32; // a.extract(0) < b.extract(0)
2480 let c1 = 0u32; // a.extract(0) < c.extract(0)
2481 let d1 = !0u32; // a.extract(0) < d.extract(0)
2482
2483 let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
2484 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2485 assert_eq!(rb, eb);
2486
2487 let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
2488 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2489 assert_eq!(rc, ec);
2490
2491 let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
2492 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2493 assert_eq!(rd, ed);
2494 }
2495
2496 #[simd_test(enable = "sse")]
2497 unsafe fn test_mm_cmpord_ss() {
2498 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2499 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2500 let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2501 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2502
2503 let b1 = !0u32; // a.extract(0) ord b.extract(0)
2504 let c1 = 0u32; // a.extract(0) ord c.extract(0)
2505 let d1 = !0u32; // a.extract(0) ord d.extract(0)
2506
2507 let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
2508 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2509 assert_eq!(rb, eb);
2510
2511 let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
2512 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2513 assert_eq!(rc, ec);
2514
2515 let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
2516 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2517 assert_eq!(rd, ed);
2518 }
2519
2520 #[simd_test(enable = "sse")]
2521 unsafe fn test_mm_cmpunord_ss() {
2522 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2523 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2524 let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2525 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2526
2527 let b1 = 0u32; // a.extract(0) unord b.extract(0)
2528 let c1 = !0u32; // a.extract(0) unord c.extract(0)
2529 let d1 = 0u32; // a.extract(0) unord d.extract(0)
2530
2531 let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
2532 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2533 assert_eq!(rb, eb);
2534
2535 let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
2536 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2537 assert_eq!(rc, ec);
2538
2539 let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
2540 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2541 assert_eq!(rd, ed);
2542 }
2543
2544 #[simd_test(enable = "sse")]
2545 unsafe fn test_mm_cmpeq_ps() {
2546 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2547 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2548 let tru = !0u32;
2549 let fls = 0u32;
2550
2551 let e = u32x4::new(fls, fls, tru, fls);
2552 let r: u32x4 = transmute(_mm_cmpeq_ps(a, b));
2553 assert_eq!(r, e);
2554 }
2555
2556 #[simd_test(enable = "sse")]
2557 unsafe fn test_mm_cmplt_ps() {
2558 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2559 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2560 let tru = !0u32;
2561 let fls = 0u32;
2562
2563 let e = u32x4::new(tru, fls, fls, fls);
2564 let r: u32x4 = transmute(_mm_cmplt_ps(a, b));
2565 assert_eq!(r, e);
2566 }
2567
2568 #[simd_test(enable = "sse")]
2569 unsafe fn test_mm_cmple_ps() {
2570 let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
2571 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2572 let tru = !0u32;
2573 let fls = 0u32;
2574
2575 let e = u32x4::new(tru, fls, tru, fls);
2576 let r: u32x4 = transmute(_mm_cmple_ps(a, b));
2577 assert_eq!(r, e);
2578 }
2579
2580 #[simd_test(enable = "sse")]
2581 unsafe fn test_mm_cmpgt_ps() {
2582 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2583 let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2584 let tru = !0u32;
2585 let fls = 0u32;
2586
2587 let e = u32x4::new(fls, tru, fls, fls);
2588 let r: u32x4 = transmute(_mm_cmpgt_ps(a, b));
2589 assert_eq!(r, e);
2590 }
2591
2592 #[simd_test(enable = "sse")]
2593 unsafe fn test_mm_cmpge_ps() {
2594 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2595 let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2596 let tru = !0u32;
2597 let fls = 0u32;
2598
2599 let e = u32x4::new(fls, tru, tru, fls);
2600 let r: u32x4 = transmute(_mm_cmpge_ps(a, b));
2601 assert_eq!(r, e);
2602 }
2603
2604 #[simd_test(enable = "sse")]
2605 unsafe fn test_mm_cmpneq_ps() {
2606 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2607 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2608 let tru = !0u32;
2609 let fls = 0u32;
2610
2611 let e = u32x4::new(tru, tru, fls, tru);
2612 let r: u32x4 = transmute(_mm_cmpneq_ps(a, b));
2613 assert_eq!(r, e);
2614 }
2615
2616 #[simd_test(enable = "sse")]
2617 unsafe fn test_mm_cmpnlt_ps() {
2618 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2619 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2620 let tru = !0u32;
2621 let fls = 0u32;
2622
2623 let e = u32x4::new(fls, tru, tru, tru);
2624 let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b));
2625 assert_eq!(r, e);
2626 }
2627
2628 #[simd_test(enable = "sse")]
2629 unsafe fn test_mm_cmpnle_ps() {
2630 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2631 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2632 let tru = !0u32;
2633 let fls = 0u32;
2634
2635 let e = u32x4::new(fls, tru, fls, tru);
2636 let r: u32x4 = transmute(_mm_cmpnle_ps(a, b));
2637 assert_eq!(r, e);
2638 }
2639
2640 #[simd_test(enable = "sse")]
2641 unsafe fn test_mm_cmpngt_ps() {
2642 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2643 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2644 let tru = !0u32;
2645 let fls = 0u32;
2646
2647 let e = u32x4::new(tru, fls, tru, tru);
2648 let r: u32x4 = transmute(_mm_cmpngt_ps(a, b));
2649 assert_eq!(r, e);
2650 }
2651
2652 #[simd_test(enable = "sse")]
2653 unsafe fn test_mm_cmpnge_ps() {
2654 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2655 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2656 let tru = !0u32;
2657 let fls = 0u32;
2658
2659 let e = u32x4::new(tru, fls, fls, tru);
2660 let r: u32x4 = transmute(_mm_cmpnge_ps(a, b));
2661 assert_eq!(r, e);
2662 }
2663
2664 #[simd_test(enable = "sse")]
2665 unsafe fn test_mm_cmpord_ps() {
2666 let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2667 let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2668 let tru = !0u32;
2669 let fls = 0u32;
2670
2671 let e = u32x4::new(tru, fls, fls, fls);
2672 let r: u32x4 = transmute(_mm_cmpord_ps(a, b));
2673 assert_eq!(r, e);
2674 }
2675
2676 #[simd_test(enable = "sse")]
2677 unsafe fn test_mm_cmpunord_ps() {
2678 let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2679 let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2680 let tru = !0u32;
2681 let fls = 0u32;
2682
2683 let e = u32x4::new(fls, tru, tru, tru);
2684 let r: u32x4 = transmute(_mm_cmpunord_ps(a, b));
2685 assert_eq!(r, e);
2686 }
2687
2688 #[simd_test(enable = "sse")]
2689 unsafe fn test_mm_comieq_ss() {
2690 let aa = &[3.0f32, 12.0, 23.0, NAN];
2691 let bb = &[3.0f32, 47.5, 1.5, NAN];
2692
2693 let ee = &[1i32, 0, 0, 0];
2694
2695 for i in 0..4 {
2696 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2697 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2698
2699 let r = _mm_comieq_ss(a, b);
2700
2701 assert_eq!(
2702 ee[i], r,
2703 "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2704 a, b, r, ee[i], i
2705 );
2706 }
2707 }
2708
2709 #[simd_test(enable = "sse")]
2710 unsafe fn test_mm_comilt_ss() {
2711 let aa = &[3.0f32, 12.0, 23.0, NAN];
2712 let bb = &[3.0f32, 47.5, 1.5, NAN];
2713
2714 let ee = &[0i32, 1, 0, 0];
2715
2716 for i in 0..4 {
2717 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2718 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2719
2720 let r = _mm_comilt_ss(a, b);
2721
2722 assert_eq!(
2723 ee[i], r,
2724 "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2725 a, b, r, ee[i], i
2726 );
2727 }
2728 }
2729
2730 #[simd_test(enable = "sse")]
2731 unsafe fn test_mm_comile_ss() {
2732 let aa = &[3.0f32, 12.0, 23.0, NAN];
2733 let bb = &[3.0f32, 47.5, 1.5, NAN];
2734
2735 let ee = &[1i32, 1, 0, 0];
2736
2737 for i in 0..4 {
2738 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2739 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2740
2741 let r = _mm_comile_ss(a, b);
2742
2743 assert_eq!(
2744 ee[i], r,
2745 "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2746 a, b, r, ee[i], i
2747 );
2748 }
2749 }
2750
2751 #[simd_test(enable = "sse")]
2752 unsafe fn test_mm_comigt_ss() {
2753 let aa = &[3.0f32, 12.0, 23.0, NAN];
2754 let bb = &[3.0f32, 47.5, 1.5, NAN];
2755
2756 let ee = &[1i32, 0, 1, 0];
2757
2758 for i in 0..4 {
2759 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2760 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2761
2762 let r = _mm_comige_ss(a, b);
2763
2764 assert_eq!(
2765 ee[i], r,
2766 "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2767 a, b, r, ee[i], i
2768 );
2769 }
2770 }
2771
2772 #[simd_test(enable = "sse")]
2773 unsafe fn test_mm_comineq_ss() {
2774 let aa = &[3.0f32, 12.0, 23.0, NAN];
2775 let bb = &[3.0f32, 47.5, 1.5, NAN];
2776
2777 let ee = &[0i32, 1, 1, 1];
2778
2779 for i in 0..4 {
2780 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2781 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2782
2783 let r = _mm_comineq_ss(a, b);
2784
2785 assert_eq!(
2786 ee[i], r,
2787 "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2788 a, b, r, ee[i], i
2789 );
2790 }
2791 }
2792
2793 #[simd_test(enable = "sse")]
2794 unsafe fn test_mm_ucomieq_ss() {
2795 let aa = &[3.0f32, 12.0, 23.0, NAN];
2796 let bb = &[3.0f32, 47.5, 1.5, NAN];
2797
2798 let ee = &[1i32, 0, 0, 0];
2799
2800 for i in 0..4 {
2801 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2802 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2803
2804 let r = _mm_ucomieq_ss(a, b);
2805
2806 assert_eq!(
2807 ee[i], r,
2808 "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2809 a, b, r, ee[i], i
2810 );
2811 }
2812 }
2813
2814 #[simd_test(enable = "sse")]
2815 unsafe fn test_mm_ucomilt_ss() {
2816 let aa = &[3.0f32, 12.0, 23.0, NAN];
2817 let bb = &[3.0f32, 47.5, 1.5, NAN];
2818
2819 let ee = &[0i32, 1, 0, 0];
2820
2821 for i in 0..4 {
2822 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2823 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2824
2825 let r = _mm_ucomilt_ss(a, b);
2826
2827 assert_eq!(
2828 ee[i], r,
2829 "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2830 a, b, r, ee[i], i
2831 );
2832 }
2833 }
2834
2835 #[simd_test(enable = "sse")]
2836 unsafe fn test_mm_ucomile_ss() {
2837 let aa = &[3.0f32, 12.0, 23.0, NAN];
2838 let bb = &[3.0f32, 47.5, 1.5, NAN];
2839
2840 let ee = &[1i32, 1, 0, 0];
2841
2842 for i in 0..4 {
2843 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2844 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2845
2846 let r = _mm_ucomile_ss(a, b);
2847
2848 assert_eq!(
2849 ee[i], r,
2850 "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2851 a, b, r, ee[i], i
2852 );
2853 }
2854 }
2855
2856 #[simd_test(enable = "sse")]
2857 unsafe fn test_mm_ucomigt_ss() {
2858 let aa = &[3.0f32, 12.0, 23.0, NAN];
2859 let bb = &[3.0f32, 47.5, 1.5, NAN];
2860
2861 let ee = &[0i32, 0, 1, 0];
2862
2863 for i in 0..4 {
2864 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2865 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2866
2867 let r = _mm_ucomigt_ss(a, b);
2868
2869 assert_eq!(
2870 ee[i], r,
2871 "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2872 a, b, r, ee[i], i
2873 );
2874 }
2875 }
2876
2877 #[simd_test(enable = "sse")]
2878 unsafe fn test_mm_ucomige_ss() {
2879 let aa = &[3.0f32, 12.0, 23.0, NAN];
2880 let bb = &[3.0f32, 47.5, 1.5, NAN];
2881
2882 let ee = &[1i32, 0, 1, 0];
2883
2884 for i in 0..4 {
2885 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2886 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2887
2888 let r = _mm_ucomige_ss(a, b);
2889
2890 assert_eq!(
2891 ee[i], r,
2892 "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2893 a, b, r, ee[i], i
2894 );
2895 }
2896 }
2897
2898 #[simd_test(enable = "sse")]
2899 unsafe fn test_mm_ucomineq_ss() {
2900 let aa = &[3.0f32, 12.0, 23.0, NAN];
2901 let bb = &[3.0f32, 47.5, 1.5, NAN];
2902
2903 let ee = &[0i32, 1, 1, 1];
2904
2905 for i in 0..4 {
2906 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2907 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2908
2909 let r = _mm_ucomineq_ss(a, b);
2910
2911 assert_eq!(
2912 ee[i], r,
2913 "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2914 a, b, r, ee[i], i
2915 );
2916 }
2917 }
2918
2919 #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
2920 #[simd_test(enable = "sse")]
2921 #[cfg_attr(miri, ignore)] // Uses _mm_setcsr, which is not supported by Miri
2922 unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
2923 // If one of the arguments is a quiet NaN `comieq_ss` should signal an
2924 // Invalid Operation Exception while `ucomieq_ss` should not.
2925 let aa = &[3.0f32, NAN, 23.0, NAN];
2926 let bb = &[3.0f32, 47.5, NAN, NAN];
2927
2928 let ee = &[1i32, 0, 0, 0];
2929 let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception?
2930
2931 for i in 0..4 {
2932 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2933 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2934
2935 _MM_SET_EXCEPTION_STATE(0);
2936 let r1 = _mm_comieq_ss(*black_box(&a), b);
2937 let s1 = _MM_GET_EXCEPTION_STATE();
2938
2939 _MM_SET_EXCEPTION_STATE(0);
2940 let r2 = _mm_ucomieq_ss(*black_box(&a), b);
2941 let s2 = _MM_GET_EXCEPTION_STATE();
2942
2943 assert_eq!(
2944 ee[i], r1,
2945 "_mm_comeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2946 a, b, r1, ee[i], i
2947 );
2948 assert_eq!(
2949 ee[i], r2,
2950 "_mm_ucomeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2951 a, b, r2, ee[i], i
2952 );
2953 assert_eq!(
2954 s1,
2955 exc[i] * _MM_EXCEPT_INVALID,
2956 "_mm_comieq_ss() set exception flags: {} (i={})",
2957 s1,
2958 i
2959 );
2960 assert_eq!(
2961 s2,
2962 0, // ucomieq_ss should not signal an exception
2963 "_mm_ucomieq_ss() set exception flags: {} (i={})",
2964 s2,
2965 i
2966 );
2967 }
2968 }
2969
2970 #[simd_test(enable = "sse")]
2971 unsafe fn test_mm_cvtss_si32() {
2972 let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
2973 let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520];
2974 for i in 0..inputs.len() {
2975 let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
2976 let e = result[i];
2977 let r = _mm_cvtss_si32(x);
2978 assert_eq!(
2979 e, r,
2980 "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
2981 i, x, r, e
2982 );
2983 }
2984 }
2985
2986 #[simd_test(enable = "sse")]
2987 unsafe fn test_mm_cvttss_si32() {
2988 let inputs = &[
2989 (42.0f32, 42i32),
2990 (-31.4, -31),
2991 (-33.5, -33),
2992 (-34.5, -34),
2993 (10.999, 10),
2994 (-5.99, -5),
2995 (4.0e10, i32::MIN),
2996 (4.0e-10, 0),
2997 (NAN, i32::MIN),
2998 (2147483500.1, 2147483520),
2999 ];
3000 for (i, &(xi, e)) in inputs.iter().enumerate() {
3001 let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
3002 let r = _mm_cvttss_si32(x);
3003 assert_eq!(
3004 e, r,
3005 "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
3006 i, x, r, e
3007 );
3008 }
3009 }
3010
3011 #[simd_test(enable = "sse")]
3012 unsafe fn test_mm_cvtsi32_ss() {
3013 let inputs = &[
3014 (4555i32, 4555.0f32),
3015 (322223333, 322223330.0),
3016 (-432, -432.0),
3017 (-322223333, -322223330.0),
3018 ];
3019
3020 for &(x, f) in inputs.iter() {
3021 let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3022 let r = _mm_cvtsi32_ss(a, x);
3023 let e = _mm_setr_ps(f, 6.0, 7.0, 8.0);
3024 assert_eq_m128(e, r);
3025 }
3026 }
3027
3028 #[simd_test(enable = "sse")]
3029 unsafe fn test_mm_cvtss_f32() {
3030 let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
3031 assert_eq!(_mm_cvtss_f32(a), 312.0134);
3032 }
3033
3034 #[simd_test(enable = "sse")]
3035 unsafe fn test_mm_set_ss() {
3036 let r = _mm_set_ss(black_box(4.25));
3037 assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0));
3038 }
3039
3040 #[simd_test(enable = "sse")]
3041 unsafe fn test_mm_set1_ps() {
3042 let r1 = _mm_set1_ps(black_box(4.25));
3043 let r2 = _mm_set_ps1(black_box(4.25));
3044 assert_eq!(get_m128(r1, 0), 4.25);
3045 assert_eq!(get_m128(r1, 1), 4.25);
3046 assert_eq!(get_m128(r1, 2), 4.25);
3047 assert_eq!(get_m128(r1, 3), 4.25);
3048 assert_eq!(get_m128(r2, 0), 4.25);
3049 assert_eq!(get_m128(r2, 1), 4.25);
3050 assert_eq!(get_m128(r2, 2), 4.25);
3051 assert_eq!(get_m128(r2, 3), 4.25);
3052 }
3053
3054 #[simd_test(enable = "sse")]
3055 unsafe fn test_mm_set_ps() {
3056 let r = _mm_set_ps(
3057 black_box(1.0),
3058 black_box(2.0),
3059 black_box(3.0),
3060 black_box(4.0),
3061 );
3062 assert_eq!(get_m128(r, 0), 4.0);
3063 assert_eq!(get_m128(r, 1), 3.0);
3064 assert_eq!(get_m128(r, 2), 2.0);
3065 assert_eq!(get_m128(r, 3), 1.0);
3066 }
3067
3068 #[simd_test(enable = "sse")]
3069 unsafe fn test_mm_setr_ps() {
3070 let r = _mm_setr_ps(
3071 black_box(1.0),
3072 black_box(2.0),
3073 black_box(3.0),
3074 black_box(4.0),
3075 );
3076 assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
3077 }
3078
3079 #[simd_test(enable = "sse")]
3080 unsafe fn test_mm_setzero_ps() {
3081 let r = *black_box(&_mm_setzero_ps());
3082 assert_eq_m128(r, _mm_set1_ps(0.0));
3083 }
3084
3085 #[simd_test(enable = "sse")]
3086 unsafe fn test_mm_shuffle() {
3087 assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11);
3088 assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
3089 assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
3090 }
3091
3092 #[simd_test(enable = "sse")]
3093 unsafe fn test_mm_shuffle_ps() {
3094 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3095 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3096 let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b);
3097 assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0));
3098 }
3099
3100 #[simd_test(enable = "sse")]
3101 unsafe fn test_mm_unpackhi_ps() {
3102 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3103 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3104 let r = _mm_unpackhi_ps(a, b);
3105 assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0));
3106 }
3107
3108 #[simd_test(enable = "sse")]
3109 unsafe fn test_mm_unpacklo_ps() {
3110 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3111 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3112 let r = _mm_unpacklo_ps(a, b);
3113 assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0));
3114 }
3115
3116 #[simd_test(enable = "sse")]
3117 unsafe fn test_mm_movehl_ps() {
3118 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3119 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3120 let r = _mm_movehl_ps(a, b);
3121 assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0));
3122 }
3123
3124 #[simd_test(enable = "sse")]
3125 unsafe fn test_mm_movelh_ps() {
3126 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3127 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3128 let r = _mm_movelh_ps(a, b);
3129 assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
3130 }
3131
3132 #[simd_test(enable = "sse")]
3133 unsafe fn test_mm_load_ss() {
3134 let a = 42.0f32;
3135 let r = _mm_load_ss(ptr::addr_of!(a));
3136 assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
3137 }
3138
3139 #[simd_test(enable = "sse")]
3140 unsafe fn test_mm_load1_ps() {
3141 let a = 42.0f32;
3142 let r = _mm_load1_ps(ptr::addr_of!(a));
3143 assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
3144 }
3145
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_load_ps() {
        // `_mm_load_ps` requires a 16-byte-aligned pointer, so advance `p`
        // to the next aligned element and compensate the expected values
        // by the number of skipped elements (`fixup`).
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        let mut fixup = 0.0f32;

        // Make sure p is aligned, otherwise we might get a
        // (signal: 11, SIGSEGV: invalid memory reference)

        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            // Number of f32 elements (4 bytes each) to skip for alignment.
            let delta = (16 - unalignment) >> 2;
            fixup = delta as f32;
            p = p.add(delta);
        }

        let r = _mm_load_ps(p);
        let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }
3167
3168 #[simd_test(enable = "sse")]
3169 unsafe fn test_mm_loadu_ps() {
3170 let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3171 let p = vals.as_ptr().add(3);
3172 let r = _mm_loadu_ps(black_box(p));
3173 assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
3174 }
3175
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_loadr_ps() {
        // `_mm_loadr_ps` loads four aligned f32s in reversed lane order.
        // Advance `p` to a 16-byte boundary and compensate the expected
        // values by the number of skipped elements (`fixup`).
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        let mut fixup = 0.0f32;

        // Make sure p is aligned, otherwise we might get a
        // (signal: 11, SIGSEGV: invalid memory reference)

        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            // Number of f32 elements (4 bytes each) to skip for alignment.
            let delta = (16 - unalignment) >> 2;
            fixup = delta as f32;
            p = p.add(delta);
        }

        let r = _mm_loadr_ps(p);
        let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }
3197
3198 #[simd_test(enable = "sse2")]
3199 unsafe fn test_mm_loadu_si64() {
3200 let a = _mm_setr_epi64x(5, 6);
3201 let r = _mm_loadu_si64(ptr::addr_of!(a) as *const _);
3202 assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
3203 }
3204
3205 #[simd_test(enable = "sse")]
3206 unsafe fn test_mm_store_ss() {
3207 let mut vals = [0.0f32; 8];
3208 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3209 _mm_store_ss(vals.as_mut_ptr().add(1), a);
3210
3211 assert_eq!(vals[0], 0.0);
3212 assert_eq!(vals[1], 1.0);
3213 assert_eq!(vals[2], 0.0);
3214 }
3215
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store1_ps() {
        // `_mm_store1_ps` needs a 16-byte-aligned pointer and stores lane 0
        // of `a` into all four destination elements.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Advance `p` (by `ofs` f32 elements) to the next 16-byte boundary.
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_store1_ps(p, *black_box(&a));

        // Elements just outside the stored range must be untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 1.0);
        assert_eq!(vals[ofs + 2], 1.0);
        assert_eq!(vals[ofs + 3], 1.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3240
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store_ps() {
        // `_mm_store_ps` needs a 16-byte-aligned pointer and stores all four
        // lanes in order.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_store_ps(p, *black_box(&a));

        // Elements just outside the stored range must be untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3266
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storer_ps() {
        // `_mm_storer_ps` needs a 16-byte-aligned pointer and stores the
        // four lanes in reversed order.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_storer_ps(p, *black_box(&a));

        // Elements just outside the stored range must be untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 4.0);
        assert_eq!(vals[ofs + 1], 3.0);
        assert_eq!(vals[ofs + 2], 2.0);
        assert_eq!(vals[ofs + 3], 1.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3292
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storeu_ps() {
        // `_mm_storeu_ps` tolerates unaligned pointers, so deliberately
        // misalign the destination before storing.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Make sure p is **not** aligned to 16-byte boundary
        if (p as usize) & 0xf == 0 {
            ofs = 1;
            p = p.add(1);
        }

        _mm_storeu_ps(p, *black_box(&a));

        // Elements just outside the stored range must be untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3318
3319 #[simd_test(enable = "sse")]
3320 unsafe fn test_mm_move_ss() {
3321 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3322 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3323
3324 let r = _mm_move_ss(a, b);
3325 let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
3326 assert_eq_m128(e, r);
3327 }
3328
3329 #[simd_test(enable = "sse")]
3330 unsafe fn test_mm_movemask_ps() {
3331 let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0));
3332 assert_eq!(r, 0b0101);
3333
3334 let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0));
3335 assert_eq!(r, 0b0111);
3336 }
3337
    #[simd_test(enable = "sse")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_sfence() {
        // Smoke test: the store fence has no observable result to assert on;
        // just check that executing it does not fault.
        _mm_sfence();
    }
3344
    #[allow(deprecated)] // FIXME: This tests functions that are immediate UB
    #[simd_test(enable = "sse")]
    #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
    unsafe fn test_mm_getcsr_setcsr_1() {
        // With flush-to-zero enabled, a multiply whose result would be a
        // denormal f32 (~1.1e-39) must be flushed to +0.0.
        let saved_csr = _mm_getcsr();

        let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
        let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);

        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
        let r = _mm_mul_ps(*black_box(&a), *black_box(&b));

        // Restore the control register so later tests see the original state.
        _mm_setcsr(saved_csr);

        let exp = _mm_setr_ps(0.0, 0.0, 0.0, 1.0);
        assert_eq_m128(r, exp); // first component is a denormalized f32
    }
3362
    #[allow(deprecated)] // FIXME: This tests functions that are immediate UB
    #[simd_test(enable = "sse")]
    #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
    unsafe fn test_mm_getcsr_setcsr_2() {
        // Same as _mm_setcsr_1 test, but with opposite flag value.

        // Save the MXCSR so the mode change below cannot leak into other tests.
        let saved_csr = _mm_getcsr();

        // 1.1e-36 * 0.001 underflows into the subnormal range of f32.
        let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
        let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);

        // With flush-to-zero OFF, the subnormal product is preserved.
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
        let r = _mm_mul_ps(*black_box(&a), *black_box(&b));

        // Restore the saved register before asserting, so a failure still
        // leaves the CSR in its original state.
        _mm_setcsr(saved_csr);

        let exp = _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0);
        assert_eq_m128(r, exp); // first component is a denormalized f32
    }
3382
    #[allow(deprecated)] // FIXME: This tests functions that are immediate UB
    #[simd_test(enable = "sse")]
    #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
    unsafe fn test_mm_getcsr_setcsr_underflow() {
        // Clear all pending exception flags so that any underflow observed
        // below must have been raised by the multiply in this test.
        _MM_SET_EXCEPTION_STATE(0);

        // 1.1e-36 * 1e-5 = 1.1e-41, a subnormal f32; producing it raises
        // the underflow exception flag.
        let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
        let b = _mm_setr_ps(1e-5, 0.0, 0.0, 1.0);

        assert_eq!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure

        let r = _mm_mul_ps(*black_box(&a), *black_box(&b));

        let exp = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0);
        assert_eq_m128(r, exp);

        // The sticky underflow flag must now be set in the exception state.
        let underflow = _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0;
        assert!(underflow);
    }
3402
3403 #[simd_test(enable = "sse")]
3404 unsafe fn test_MM_TRANSPOSE4_PS() {
3405 let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3406 let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3407 let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
3408 let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
3409
3410 _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d);
3411
3412 assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0));
3413 assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0));
3414 assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0));
3415 assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0));
3416 }
3417
    // 16-byte-aligned backing store for the `_mm_stream_ps` test below;
    // that intrinsic requires its destination pointer to be 16-byte aligned.
    #[repr(align(16))]
    struct Memory {
        pub data: [f32; 4],
    }
3422
3423 #[simd_test(enable = "sse")]
3424 // Miri cannot support this until it is clear how it fits in the Rust memory model
3425 // (non-temporal store)
3426 #[cfg_attr(miri, ignore)]
3427 unsafe fn test_mm_stream_ps() {
3428 let a = _mm_set1_ps(7.0);
3429 let mut mem = Memory { data: [-1.0; 4] };
3430
3431 _mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
3432 for i in 0..4 {
3433 assert_eq!(mem.data[i], get_m128(a, i));
3434 }
3435 }
3436}
3437