1//! Streaming SIMD Extensions (SSE)
2
3use crate::{
4 core_arch::{simd::*, simd_llvm::*, x86::*},
5 intrinsics, mem, ptr,
6};
7
8#[cfg(test)]
9use stdarch_test::assert_instr;
10
/// Adds the first component of `a` and `b`, the other components are copied
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
    // Delegates to the `addss` intrinsic wrapper (declared elsewhere in this
    // file); lowers to the scalar `addss` instruction per `assert_instr`.
    addss(a, b)
}
22
23/// Adds __m128 vectors.
24///
25/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps)
26#[inline]
27#[target_feature(enable = "sse")]
28#[cfg_attr(test, assert_instr(addps))]
29#[stable(feature = "simd_x86", since = "1.27.0")]
30pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
31 simd_add(x:a, y:b)
32}
33
/// Subtracts the first component of `b` from `a`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
    // Delegates to the `subss` intrinsic wrapper (declared elsewhere in this
    // file); low lane becomes `a[0] - b[0]`.
    subss(a, b)
}
45
46/// Subtracts __m128 vectors.
47///
48/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps)
49#[inline]
50#[target_feature(enable = "sse")]
51#[cfg_attr(test, assert_instr(subps))]
52#[stable(feature = "simd_x86", since = "1.27.0")]
53pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
54 simd_sub(x:a, y:b)
55}
56
/// Multiplies the first component of `a` and `b`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
    // Delegates to the `mulss` intrinsic wrapper (declared elsewhere in this
    // file); lowers to the scalar `mulss` instruction per `assert_instr`.
    mulss(a, b)
}
68
69/// Multiplies __m128 vectors.
70///
71/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps)
72#[inline]
73#[target_feature(enable = "sse")]
74#[cfg_attr(test, assert_instr(mulps))]
75#[stable(feature = "simd_x86", since = "1.27.0")]
76pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
77 simd_mul(x:a, y:b)
78}
79
/// Divides the first component of `a` by `b`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
    // DIVSS computes `a[0] / b[0]` (the doc comment above was corrected
    // accordingly — Intel defines dst[31:0] := a[31:0] / b[31:0]).
    divss(a, b)
}
91
92/// Divides __m128 vectors.
93///
94/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps)
95#[inline]
96#[target_feature(enable = "sse")]
97#[cfg_attr(test, assert_instr(divps))]
98#[stable(feature = "simd_x86", since = "1.27.0")]
99pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
100 simd_div(x:a, y:b)
101}
102
/// Returns the square root of the first single-precision (32-bit)
/// floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 {
    // Delegates to the `sqrtss` intrinsic wrapper (declared elsewhere in this
    // file); only lane 0 is affected.
    sqrtss(a)
}
114
/// Returns the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 {
    // Delegates to the `sqrtps` intrinsic wrapper (declared elsewhere in this
    // file); all four lanes are square-rooted.
    sqrtps(a)
}
126
/// Returns the approximate reciprocal of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 {
    // Hardware approximation (RCPSS), not an exact 1/x — delegates to the
    // `rcpss` intrinsic wrapper declared elsewhere in this file.
    rcpss(a)
}
138
/// Returns the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 {
    // Hardware approximation (RCPPS) applied to all four lanes.
    rcpps(a)
}
150
/// Returns the approximate reciprocal square root of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 {
    // Hardware approximation (RSQRTSS); only lane 0 is affected.
    rsqrtss(a)
}
162
/// Returns the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 {
    // Hardware approximation (RSQRTPS) applied to all four lanes.
    rsqrtps(a)
}
174
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the minimum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
    // Delegates to the `minss` intrinsic wrapper (declared elsewhere in this
    // file), preserving x86 MINSS semantics.
    minss(a, b)
}
187
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`.
    // (MINPS has its own semantics for NaN/zero operands, per the test the
    // comment above refers to — hence the direct intrinsic call.)
    minps(a, b)
}
200
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the maximum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
    // Delegates to the `maxss` intrinsic wrapper (declared elsewhere in this
    // file), preserving x86 MAXSS semantics.
    maxss(a, b)
}
213
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`.
    // (Same reasoning as `_mm_min_ps`: MAXPS semantics differ from a generic
    // float max for special operands, so call the intrinsic directly.)
    maxps(a, b)
}
226
227/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
228///
229/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps)
230#[inline]
231#[target_feature(enable = "sse")]
232// i586 only seems to generate plain `and` instructions, so ignore it.
233#[cfg_attr(
234 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
235 assert_instr(andps)
236)]
237#[stable(feature = "simd_x86", since = "1.27.0")]
238pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
239 let a: __m128i = mem::transmute(src:a);
240 let b: __m128i = mem::transmute(src:b);
241 mem::transmute(src:simd_and(x:a, y:b))
242}
243
244/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
245/// elements.
246///
247/// Computes `!a & b` for each bit in `a` and `b`.
248///
249/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps)
250#[inline]
251#[target_feature(enable = "sse")]
252// i586 only seems to generate plain `not` and `and` instructions, so ignore
253// it.
254#[cfg_attr(
255 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
256 assert_instr(andnps)
257)]
258#[stable(feature = "simd_x86", since = "1.27.0")]
259pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
260 let a: __m128i = mem::transmute(src:a);
261 let b: __m128i = mem::transmute(src:b);
262 let mask: __m128i = mem::transmute(src:i32x4::splat(-1));
263 mem::transmute(src:simd_and(x:simd_xor(mask, a), y:b))
264}
265
266/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
267///
268/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps)
269#[inline]
270#[target_feature(enable = "sse")]
271// i586 only seems to generate plain `or` instructions, so we ignore it.
272#[cfg_attr(
273 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
274 assert_instr(orps)
275)]
276#[stable(feature = "simd_x86", since = "1.27.0")]
277pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
278 let a: __m128i = mem::transmute(src:a);
279 let b: __m128i = mem::transmute(src:b);
280 mem::transmute(src:simd_or(x:a, y:b))
281}
282
283/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
284/// elements.
285///
286/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps)
287#[inline]
288#[target_feature(enable = "sse")]
289// i586 only seems to generate plain `xor` instructions, so we ignore it.
290#[cfg_attr(
291 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
292 assert_instr(xorps)
293)]
294#[stable(feature = "simd_x86", since = "1.27.0")]
295pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
296 let a: __m128i = mem::transmute(src:a);
297 let b: __m128i = mem::transmute(src:b);
298 mem::transmute(src:simd_xor(x:a, y:b))
299}
300
301/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
302/// the result will be `0xffffffff` if the two inputs are equal, or `0`
303/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
304///
305/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss)
306#[inline]
307#[target_feature(enable = "sse")]
308#[cfg_attr(test, assert_instr(cmpeqss))]
309#[stable(feature = "simd_x86", since = "1.27.0")]
310pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
311 cmpss(a, b, imm8:0)
312}
313
314/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
315/// of the result will be `0xffffffff` if `a.extract(0)` is less than
316/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
317/// upper 96 bits of `a`.
318///
319/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss)
320#[inline]
321#[target_feature(enable = "sse")]
322#[cfg_attr(test, assert_instr(cmpltss))]
323#[stable(feature = "simd_x86", since = "1.27.0")]
324pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
325 cmpss(a, b, imm8:1)
326}
327
328/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
329/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
330/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
331/// are the upper 96 bits of `a`.
332///
333/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss)
334#[inline]
335#[target_feature(enable = "sse")]
336#[cfg_attr(test, assert_instr(cmpless))]
337#[stable(feature = "simd_x86", since = "1.27.0")]
338pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
339 cmpss(a, b, imm8:2)
340}
341
/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
    // There is no "greater than" CMPSS predicate, so compare with the
    // operands swapped using predicate 1 ("less than"), then shuffle the
    // comparison result into lane 0 of `a`: index 4 selects lane 0 of the
    // second shuffle operand, indices 1..3 keep the upper lanes of `a`.
    simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3])
}
355
/// Compares the lowest `f32` of both inputs for greater than or equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
    // `a >= b` is computed as `b <= a` (CMPSS predicate 2) with swapped
    // operands; the shuffle moves the comparison result into lane 0 while
    // keeping lanes 1..3 of `a`.
    simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3])
}
369
370/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
371/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
372/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
373/// upper 96 bits of `a`.
374///
375/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss)
376#[inline]
377#[target_feature(enable = "sse")]
378#[cfg_attr(test, assert_instr(cmpneqss))]
379#[stable(feature = "simd_x86", since = "1.27.0")]
380pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
381 cmpss(a, b, imm8:4)
382}
383
384/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
385/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
386/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
387/// upper 96 bits of `a`.
388///
389/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss)
390#[inline]
391#[target_feature(enable = "sse")]
392#[cfg_attr(test, assert_instr(cmpnltss))]
393#[stable(feature = "simd_x86", since = "1.27.0")]
394pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
395 cmpss(a, b, imm8:5)
396}
397
398/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
399/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
400/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
401/// of the result are the upper 96 bits of `a`.
402///
403/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss)
404#[inline]
405#[target_feature(enable = "sse")]
406#[cfg_attr(test, assert_instr(cmpnless))]
407#[stable(feature = "simd_x86", since = "1.27.0")]
408pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
409 cmpss(a, b, imm8:6)
410}
411
/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
/// the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
    // "not (a > b)" is computed as "not (b < a)" (CMPSS predicate 5,
    // "not less than") with swapped operands; the shuffle moves the result
    // into lane 0 while keeping lanes 1..3 of `a`.
    simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3])
}
425
/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
/// bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
    // "not (a >= b)" is computed as "not (b <= a)" (CMPSS predicate 6,
    // "not less than or equal") with swapped operands; the shuffle moves the
    // result into lane 0 while keeping lanes 1..3 of `a`.
    simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3])
}
439
440/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
441/// the result will be `0xffffffff` if neither of `a.extract(0)` or
442/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
443/// are the upper 96 bits of `a`.
444///
445/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss)
446#[inline]
447#[target_feature(enable = "sse")]
448#[cfg_attr(test, assert_instr(cmpordss))]
449#[stable(feature = "simd_x86", since = "1.27.0")]
450pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
451 cmpss(a, b, imm8:7)
452}
453
454/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
455/// of the result will be `0xffffffff` if any of `a.extract(0)` or
456/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
457/// are the upper 96 bits of `a`.
458///
459/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss)
460#[inline]
461#[target_feature(enable = "sse")]
462#[cfg_attr(test, assert_instr(cmpunordss))]
463#[stable(feature = "simd_x86", since = "1.27.0")]
464pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
465 cmpss(a, b, imm8:3)
466}
467
468/// Compares each of the four floats in `a` to the corresponding element in `b`.
469/// The result in the output vector will be `0xffffffff` if the input elements
470/// were equal, or `0` otherwise.
471///
472/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps)
473#[inline]
474#[target_feature(enable = "sse")]
475#[cfg_attr(test, assert_instr(cmpeqps))]
476#[stable(feature = "simd_x86", since = "1.27.0")]
477pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
478 cmpps(a, b, imm8:0)
479}
480
481/// Compares each of the four floats in `a` to the corresponding element in `b`.
482/// The result in the output vector will be `0xffffffff` if the input element
483/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
484///
485/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps)
486#[inline]
487#[target_feature(enable = "sse")]
488#[cfg_attr(test, assert_instr(cmpltps))]
489#[stable(feature = "simd_x86", since = "1.27.0")]
490pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
491 cmpps(a, b, imm8:1)
492}
493
494/// Compares each of the four floats in `a` to the corresponding element in `b`.
495/// The result in the output vector will be `0xffffffff` if the input element
496/// in `a` is less than or equal to the corresponding element in `b`, or `0`
497/// otherwise.
498///
499/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps)
500#[inline]
501#[target_feature(enable = "sse")]
502#[cfg_attr(test, assert_instr(cmpleps))]
503#[stable(feature = "simd_x86", since = "1.27.0")]
504pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
505 cmpps(a, b, imm8:2)
506}
507
508/// Compares each of the four floats in `a` to the corresponding element in `b`.
509/// The result in the output vector will be `0xffffffff` if the input element
510/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
511///
512/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps)
513#[inline]
514#[target_feature(enable = "sse")]
515#[cfg_attr(test, assert_instr(cmpltps))]
516#[stable(feature = "simd_x86", since = "1.27.0")]
517pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
518 cmpps(a:b, b:a, imm8:1)
519}
520
521/// Compares each of the four floats in `a` to the corresponding element in `b`.
522/// The result in the output vector will be `0xffffffff` if the input element
523/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
524/// otherwise.
525///
526/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps)
527#[inline]
528#[target_feature(enable = "sse")]
529#[cfg_attr(test, assert_instr(cmpleps))]
530#[stable(feature = "simd_x86", since = "1.27.0")]
531pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
532 cmpps(a:b, b:a, imm8:2)
533}
534
535/// Compares each of the four floats in `a` to the corresponding element in `b`.
536/// The result in the output vector will be `0xffffffff` if the input elements
537/// are **not** equal, or `0` otherwise.
538///
539/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps)
540#[inline]
541#[target_feature(enable = "sse")]
542#[cfg_attr(test, assert_instr(cmpneqps))]
543#[stable(feature = "simd_x86", since = "1.27.0")]
544pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
545 cmpps(a, b, imm8:4)
546}
547
548/// Compares each of the four floats in `a` to the corresponding element in `b`.
549/// The result in the output vector will be `0xffffffff` if the input element
550/// in `a` is **not** less than the corresponding element in `b`, or `0`
551/// otherwise.
552///
553/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps)
554#[inline]
555#[target_feature(enable = "sse")]
556#[cfg_attr(test, assert_instr(cmpnltps))]
557#[stable(feature = "simd_x86", since = "1.27.0")]
558pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
559 cmpps(a, b, imm8:5)
560}
561
562/// Compares each of the four floats in `a` to the corresponding element in `b`.
563/// The result in the output vector will be `0xffffffff` if the input element
564/// in `a` is **not** less than or equal to the corresponding element in `b`, or
565/// `0` otherwise.
566///
567/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps)
568#[inline]
569#[target_feature(enable = "sse")]
570#[cfg_attr(test, assert_instr(cmpnleps))]
571#[stable(feature = "simd_x86", since = "1.27.0")]
572pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
573 cmpps(a, b, imm8:6)
574}
575
576/// Compares each of the four floats in `a` to the corresponding element in `b`.
577/// The result in the output vector will be `0xffffffff` if the input element
578/// in `a` is **not** greater than the corresponding element in `b`, or `0`
579/// otherwise.
580///
581/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps)
582#[inline]
583#[target_feature(enable = "sse")]
584#[cfg_attr(test, assert_instr(cmpnltps))]
585#[stable(feature = "simd_x86", since = "1.27.0")]
586pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
587 cmpps(a:b, b:a, imm8:5)
588}
589
590/// Compares each of the four floats in `a` to the corresponding element in `b`.
591/// The result in the output vector will be `0xffffffff` if the input element
592/// in `a` is **not** greater than or equal to the corresponding element in `b`,
593/// or `0` otherwise.
594///
595/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
596#[inline]
597#[target_feature(enable = "sse")]
598#[cfg_attr(test, assert_instr(cmpnleps))]
599#[stable(feature = "simd_x86", since = "1.27.0")]
600pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
601 cmpps(a:b, b:a, imm8:6)
602}
603
604/// Compares each of the four floats in `a` to the corresponding element in `b`.
605/// Returns four floats that have one of two possible bit patterns. The element
606/// in the output vector will be `0xffffffff` if the input elements in `a` and
607/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
608///
609/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
610#[inline]
611#[target_feature(enable = "sse")]
612#[cfg_attr(test, assert_instr(cmpordps))]
613#[stable(feature = "simd_x86", since = "1.27.0")]
614pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
615 cmpps(a:b, b:a, imm8:7)
616}
617
618/// Compares each of the four floats in `a` to the corresponding element in `b`.
619/// Returns four floats that have one of two possible bit patterns. The element
620/// in the output vector will be `0xffffffff` if the input elements in `a` and
621/// `b` are unordered (i.e., at least on of them is a NaN), or 0 otherwise.
622///
623/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
624#[inline]
625#[target_feature(enable = "sse")]
626#[cfg_attr(test, assert_instr(cmpunordps))]
627#[stable(feature = "simd_x86", since = "1.27.0")]
628pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
629 cmpps(a:b, b:a, imm8:3)
630}
631
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
    // Delegates to the `comieq_ss` intrinsic wrapper (declared elsewhere in
    // this file); lowers to `comiss` per `assert_instr`.
    comieq_ss(a, b)
}
643
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
    // Delegates to the `comilt_ss` intrinsic wrapper (declared elsewhere in
    // this file); lowers to `comiss` per `assert_instr`.
    comilt_ss(a, b)
}
655
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
    // Delegates to the `comile_ss` intrinsic wrapper (declared elsewhere in
    // this file); lowers to `comiss` per `assert_instr`.
    comile_ss(a, b)
}
668
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
    // Delegates to the `comigt_ss` intrinsic wrapper (declared elsewhere in
    // this file); lowers to `comiss` per `assert_instr`.
    comigt_ss(a, b)
}
681
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
    // Delegates to the `comige_ss` intrinsic wrapper (declared elsewhere in
    // this file); lowers to `comiss` per `assert_instr`.
    comige_ss(a, b)
}
694
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
    // Delegates to the `comineq_ss` intrinsic wrapper (declared elsewhere in
    // this file); lowers to `comiss` per `assert_instr`.
    comineq_ss(a, b)
}
706
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise. This instruction will not signal
/// an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
    // Delegates to the `ucomieq_ss` intrinsic wrapper (declared elsewhere in
    // this file); lowers to the non-signaling `ucomiss` per `assert_instr`.
    ucomieq_ss(a, b)
}
719
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
/// This instruction will not signal an exception if either argument is a quiet
/// NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
    // Delegates to the `ucomilt_ss` intrinsic wrapper (declared elsewhere in
    // this file); lowers to the non-signaling `ucomiss` per `assert_instr`.
    ucomilt_ss(a, b)
}
733
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
    // Delegates to the `ucomile_ss` intrinsic wrapper (declared elsewhere in
    // this file); lowers to the non-signaling `ucomiss` per `assert_instr`.
    ucomile_ss(a, b)
}
747
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
    // Forwards to the `ucomigt_ss` binding; codegen is checked to emit `ucomiss`.
    ucomigt_ss(a, b)
}
761
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise. This instruction will not signal an exception if either
/// argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
    // Forwards to the `ucomige_ss` binding; codegen is checked to emit `ucomiss`.
    ucomige_ss(a, b)
}
775
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise. This instruction will not
/// signal an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
    // Forwards to the `ucomineq_ss` binding; codegen is checked to emit `ucomiss`.
    ucomineq_ss(a, b)
}
788
/// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
///
/// The result is rounded according to the current rounding mode. If the result
/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
/// (`i32::MIN`).
///
/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 {
    // Forwards to the `cvtss2si` binding, which honors the current MXCSR
    // rounding mode (see `_mm_setcsr`).
    cvtss2si(a)
}
805
/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 {
    // Legacy spelling kept for Intel API compatibility; pure delegation.
    _mm_cvtss_si32(a)
}
816
/// Converts the lowest 32 bit float in the input vector to a 32 bit integer
/// with
/// truncation.
///
/// The result is rounded always using truncation (round towards zero). If the
/// result cannot be represented as a 32 bit integer the result will be
/// `0x8000_0000` (`i32::MIN`).
///
/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 {
    // Forwards to the `cvttss2si` binding (truncating conversion, independent
    // of the MXCSR rounding mode).
    cvttss2si(a)
}
835
/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 {
    // Legacy spelling kept for Intel API compatibility; pure delegation.
    _mm_cvttss_si32(a)
}
846
847/// Extracts the lowest 32 bit float from the input vector.
848///
849/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32)
850#[inline]
851#[target_feature(enable = "sse")]
852// No point in using assert_instrs. In Unix x86_64 calling convention this is a
853// no-op, and on Windows it's just a `mov`.
854#[stable(feature = "simd_x86", since = "1.27.0")]
855pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 {
856 simd_extract(x:a, idx:0)
857}
858
/// Converts a 32 bit integer to a 32 bit float. The result vector is the input
/// vector `a` with the lowest 32 bit float replaced by the converted integer.
///
/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
/// input).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
    // Forwards to the `cvtsi2ss` binding; only lane 0 of `a` is replaced.
    cvtsi2ss(a, b)
}
873
/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
    // Legacy spelling kept for Intel API compatibility; pure delegation.
    _mm_cvtsi32_ss(a, b)
}
884
/// Construct a `__m128` with the lowest element set to `a` and the rest set to
/// zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ss(a: f32) -> __m128 {
    // Tuple-struct construction; `a` lands in lane 0 (the low 32 bits).
    __m128(a, 0.0, 0.0, 0.0)
}
896
/// Construct a `__m128` with all element set to `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_ps(a: f32) -> __m128 {
    // Broadcast `a` into all four lanes; codegen is checked to emit `shufps`.
    __m128(a, a, a, a)
}
907
/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ps1(a: f32) -> __m128 {
    // Legacy spelling kept for Intel API compatibility; pure delegation.
    _mm_set1_ps(a)
}
918
/// Construct a `__m128` from four floating point values highest to lowest.
///
/// Note that `a` will be the highest 32 bits of the result, and `d` the
/// lowest. This matches the standard way of writing bit patterns on x86:
///
/// ```text
///  bit    127 .. 96  95 .. 64  63 .. 32  31 .. 0
///        +---------+---------+---------+---------+
///        |    a    |    b    |    c    |    d    |   result
///        +---------+---------+---------+---------+
/// ```
///
/// Alternatively:
///
/// ```text
/// let v = _mm_set_ps(d, c, b, a);
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    // Arguments are reversed because the tuple struct is laid out in memory
    // order (lane 0 first) while the Intel API takes highest-lane first.
    __m128(d, c, b, a)
}
945
/// Construct a `__m128` from four floating point values lowest to highest.
///
/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
/// bits of the result, and `d` the highest.
///
/// ```text
/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, any(target_os = "windows", target_arch = "x86_64")),
    assert_instr(unpcklps)
)]
// On a 32-bit architecture on non-Windows it just copies the operands from the stack.
#[cfg_attr(
    all(test, all(not(target_os = "windows"), target_arch = "x86")),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    // Arguments already match lane/memory order, so no reversal is needed
    // (contrast with `_mm_set_ps`).
    __m128(a, b, c, d)
}
971
/// Construct a `__m128` with all elements initialized to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setzero_ps() -> __m128 {
    // All-zero vector; codegen is checked to emit the idiomatic `xorps`.
    __m128(0.0, 0.0, 0.0, 0.0)
}
982
983/// A utility function for creating masks to use with Intel shuffle and
984/// permute intrinsics.
985#[inline]
986#[allow(non_snake_case)]
987#[unstable(feature = "stdarch", issue = "27731")]
988pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
989 ((z << 6) | (y << 4) | (x << 2) | w) as i32
990}
991
/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
/// `b` using `MASK`.
///
/// The lower half of result takes values from `a` and the higher half from
/// `b`. Mask is split to 2 control bits each to index the element from inputs.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps)
///
/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
/// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_).
/// Performing an implicit type conversion between an unsigned integer and a signed integer
/// does not cause a problem in C, however Rust's commitment to strong typing does not allow this.
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
    // The mask must fit in 8 bits: four 2-bit lane selectors.
    static_assert_uimm_bits!(MASK, 8);
    // Indices 0..=3 select lanes of `a`, 4..=7 select lanes of `b`; the low
    // two selectors pick from `a`, the high two from `b` (hence the `+ 4`).
    simd_shuffle!(
        a,
        b,
        [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11) + 4,
            ((MASK as u32 >> 6) & 0b11) + 4,
        ],
    )
}
1023
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the higher half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a2, b2, a3, b3] (indices 4..=7 refer to `b`).
    simd_shuffle!(a, b, [2, 6, 3, 7])
}
1035
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the lower half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a0, b0, a1, b1] (indices 4..=7 refer to `b`).
    simd_shuffle!(a, b, [0, 4, 1, 5])
}
1047
/// Combine higher half of `a` and `b`. The higher half of `b` occupies the
/// lower half of result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO; figure why this is a different instruction on Windows?
    // Result lanes: [b2, b3, a2, a3] (indices 4..=7 refer to `b`).
    simd_shuffle!(a, b, [6, 7, 2, 3])
}
1060
/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
/// higher half of result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a0, a1, b0, b1] (indices 4..=7 refer to `b`).
    simd_shuffle!(a, b, [0, 1, 4, 5])
}
1072
1073/// Returns a mask of the most significant bit of each element in `a`.
1074///
1075/// The mask is stored in the 4 least significant bits of the return value.
1076/// All other bits are set to `0`.
1077///
1078/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
1079#[inline]
1080#[target_feature(enable = "sse")]
1081#[cfg_attr(test, assert_instr(movmskps))]
1082#[stable(feature = "simd_x86", since = "1.27.0")]
1083pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
1084 // Propagate the highest bit to the rest, because simd_bitmask
1085 // requires all-1 or all-0.
1086 let mask: i32x4 = simd_lt(x:transmute(a), y:i32x4::splat(0));
1087 simd_bitmask::<i32x4, u8>(mask).into()
1088}
1089
/// Construct a `__m128` with the lowest element read from `p` and the other
/// elements set to zero.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
    // A single aligned `f32` read into lane 0; upper lanes zeroed.
    __m128(*p, 0.0, 0.0, 0.0)
}
1103
/// Construct a `__m128` by duplicating the value read from `p` into all
/// elements.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
    // Read once, then broadcast into all four lanes.
    let a: f32 = *p;
    __m128(a, a, a, a)
}
1119
/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
    // Legacy spelling kept for Intel API compatibility; pure delegation.
    _mm_load1_ps(p)
}
1130
/// Loads four `f32` values from *aligned* memory into a `__m128`. If the
/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
    // A plain 16-byte-aligned dereference; the caller guarantees alignment.
    *(p as *const __m128)
}
1149
1150/// Loads four `f32` values from memory into a `__m128`. There are no
1151/// restrictions
1152/// on memory alignment. For aligned memory
1153/// [`_mm_load_ps`](fn._mm_load_ps.html)
1154/// may be faster.
1155///
1156/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1157///
1158/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps)
1159#[inline]
1160#[target_feature(enable = "sse")]
1161#[cfg_attr(test, assert_instr(movups))]
1162#[stable(feature = "simd_x86", since = "1.27.0")]
1163pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
1164 // Note: Using `*p` would require `f32` alignment, but `movups` has no
1165 // alignment restrictions.
1166 let mut dst: __m128 = _mm_undefined_ps();
1167 ptr::copy_nonoverlapping(
1168 src:p as *const u8,
1169 &mut dst as *mut __m128 as *mut u8,
1170 count:mem::size_of::<__m128>(),
1171 );
1172 dst
1173}
1174
/// Loads four `f32` values from aligned memory into a `__m128` in reverse
/// order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let a0 = *p;
/// let a1 = *p.add(1);
/// let a2 = *p.add(2);
/// let a3 = *p.add(3);
/// __m128::new(a3, a2, a1, a0)
/// ```
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
    // Aligned load, then reverse the lane order in-register.
    let a: __m128 = _mm_load_ps(p);
    simd_shuffle!(a, a, [3, 2, 1, 0])
}
1204
1205/// Loads unaligned 64-bits of integer data from memory into new vector.
1206///
1207/// `mem_addr` does not need to be aligned on any particular boundary.
1208///
1209/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
1210#[inline]
1211#[target_feature(enable = "sse")]
1212#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
1213pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
1214 transmute(src:i64x2(ptr::read_unaligned(src:mem_addr as *const i64), 0))
1215}
1216
1217/// Stores the lowest 32 bit float of `a` into memory.
1218///
1219/// This intrinsic corresponds to the `MOVSS` instruction.
1220///
1221/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss)
1222#[inline]
1223#[target_feature(enable = "sse")]
1224#[cfg_attr(test, assert_instr(movss))]
1225#[stable(feature = "simd_x86", since = "1.27.0")]
1226pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
1227 *p = simd_extract(x:a, idx:0);
1228}
1229
/// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
/// memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let x = a.extract(0);
/// *p = x;
/// *p.add(1) = x;
/// *p.add(2) = x;
/// *p.add(3) = x;
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
    // Broadcast lane 0 across the vector, then do one aligned 16-byte store.
    let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]);
    *(p as *mut __m128) = b;
}
1257
/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
    // Legacy spelling kept for Intel API compatibility; pure delegation.
    _mm_store1_ps(p, a);
}
1268
/// Stores four 32-bit floats into *aligned* memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
    // A plain 16-byte-aligned store; the caller guarantees alignment.
    *(p as *mut __m128) = a;
}
1288
1289/// Stores four 32-bit floats into memory. There are no restrictions on memory
1290/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
1291/// faster.
1292///
1293/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1294///
1295/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps)
1296#[inline]
1297#[target_feature(enable = "sse")]
1298#[cfg_attr(test, assert_instr(movups))]
1299#[stable(feature = "simd_x86", since = "1.27.0")]
1300pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
1301 ptr::copy_nonoverlapping(
1302 &a as *const __m128 as *const u8,
1303 dst:p as *mut u8,
1304 count:mem::size_of::<__m128>(),
1305 );
1306}
1307
/// Stores four 32-bit floats into *aligned* memory in reverse order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// *p = a.extract(3);
/// *p.add(1) = a.extract(2);
/// *p.add(2) = a.extract(1);
/// *p.add(3) = a.extract(0);
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
    // Reverse lane order in-register, then do one aligned 16-byte store.
    let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]);
    *(p as *mut __m128) = b;
}
1333
/// Returns a `__m128` with the first component from `b` and the remaining
/// components from `a`.
///
/// In other words for any `a` and `b`:
/// ```text
/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
    // Index 4 selects lane 0 of `b`; lanes 1..=3 come from `a`.
    simd_shuffle!(a, b, [4, 1, 2, 3])
}
1350
/// Performs a serializing operation on all store-to-memory instructions that
/// were issued prior to this instruction.
///
/// Guarantees that every store instruction that precedes, in program order, is
/// globally visible before any store instruction which follows the fence in
/// program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sfence() {
    // Forwards to the `sfence` binding; codegen is checked to emit `sfence`.
    sfence()
}
1366
/// Gets the unsigned 32-bit value of the MXCSR control and status register.
///
/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust
/// floating-point operations may or may not result in this register getting updated with exception
/// state, and the register can change between two invocations of this function even when no
/// floating-point operations appear in the source code (since floating-point operations appearing
/// earlier or later can be reordered).
///
/// If you need to perform some floating-point operations and check whether they raised an
/// exception, use an inline assembly block for the entire sequence of operations.
///
/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(stmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[deprecated(
    since = "1.75.0",
    note = "see `_mm_getcsr` documentation - use inline assembly instead"
)]
pub unsafe fn _mm_getcsr() -> u32 {
    // `stmxcsr` writes MXCSR into the 32-bit buffer we pass; the `*mut i8`
    // cast matches the binding's expected pointer type.
    let mut result: i32 = 0_i32;
    stmxcsr(&mut result as *mut _ as *mut i8);
    result as u32
}
1394
/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
/// This register controls how SIMD instructions handle floating point
/// operations. Modifying this register only affects the current thread.
///
/// It contains several groups of flags:
///
/// * *Exception flags* report which exceptions occurred since last they were
///   reset.
///
/// * *Masking flags* can be used to mask (ignore) certain exceptions. By
///   default
///   these flags are all set to 1, so all exceptions are masked. When an
///   exception is masked, the processor simply sets the exception flag and
///   continues the operation. If the exception is unmasked, the flag is also set
///   but additionally an exception handler is invoked.
///
/// * *Rounding mode flags* control the rounding mode of floating point
///   instructions.
///
/// * The *denormals-are-zero mode flag* turns all numbers which would be
///   denormalized (exponent bits are all zeros) into zeros.
///
/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to
/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and
/// will optimize accordingly. This even applies when the register is altered and later reset to its
/// original value without any floating-point operations appearing in the source code between those
/// operations (since floating-point operations appearing earlier or later can be reordered).
///
/// If you need to perform some floating-point operations under a different masking flags, rounding
/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the
/// original MXCSR register state before the end of the block.
///
/// ## Exception Flags
///
/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
///   Infinity by Infinity).
///
/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
///   number. Mainly this can cause loss of precision.
///
/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
///
/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
///   result was too large to be represented (e.g., an `f32` with absolute
///   value
///   greater than `2^128`).
///
/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
///   result was too small to be represented in a normalized way (e.g., an
///   `f32`
///   with absolute value smaller than `2^-126`.)
///
/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
///   precision exception). This means some precision was lost due to rounding.
///   For example, the fraction `1/3` cannot be represented accurately in a
///   32 or 64 bit float and computing it would cause this exception to be
///   raised. Precision exceptions are very common, so they are usually masked.
///
/// Exception flags can be read and set using the convenience functions
/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
/// check if an operation caused some overflow:
///
/// ```rust,ignore
/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
/// // perform calculations
/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
///     // handle overflow
/// }
/// ```
///
/// ## Masking Flags
///
/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
///
/// A single masking bit can be set via
///
/// ```rust,ignore
/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
/// ```
///
/// However, since mask bits are by default all set to 1, it is more common to
/// want to *disable* certain bits. For example, to unmask the underflow
/// exception, use:
///
/// ```rust,ignore
/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow
/// exception
/// ```
///
/// Warning: an unmasked exception will cause an exception handler to be
/// called.
/// The standard handler will simply terminate the process. So, in this case
/// any underflow exception would terminate the current process with something
/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
///
/// ## Rounding Mode
///
/// The rounding mode is described using two bits. It can be read and set using
/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
/// `_MM_SET_ROUNDING_MODE(mode)`.
///
/// The rounding modes are:
///
/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
///   value. If two values are equally close, round to even (i.e., least
///   significant bit will be zero).
///
/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
///
/// * `_MM_ROUND_UP`: Round toward positive Infinity.
///
/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
///
/// Example:
///
/// ```rust,ignore
/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
/// ```
///
/// ## Denormals-are-zero/Flush-to-zero Mode
///
/// If this bit is set, values that would be denormalized will be set to zero
/// instead. This is turned off by default.
///
/// You can read and enable/disable this mode via the helper functions
/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
///
/// ```rust,ignore
/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
/// ```
///
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ldmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[deprecated(
    since = "1.75.0",
    note = "see `_mm_setcsr` documentation - use inline assembly instead"
)]
pub unsafe fn _mm_setcsr(val: u32) {
    // `ldmxcsr` loads MXCSR from the 32-bit value we point at; the `*const i8`
    // cast matches the binding's expected pointer type.
    ldmxcsr(&val as *const _ as *const i8);
}
1543
// MXCSR exception-state ("sticky") flags: these occupy the low six bits of the
// control/status register and are tested/cleared via the helpers below.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
// Union of all six exception-state bits (0x003f).
/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_MASK: u32 = 0x003f;
1565
// MXCSR exception-mask bits: one per exception flag above, shifted up by 7.
// A set mask bit suppresses (masks) the corresponding exception.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INVALID: u32 = 0x0080;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DENORM: u32 = 0x0100;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INEXACT: u32 = 0x1000;
// Union of all six exception-mask bits (0x1f80).
/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_MASK: u32 = 0x1f80;
1587
// MXCSR rounding-control field: a two-bit field (bits 13-14), so the four
// modes below are encoded as multiples of 0x2000.
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_NEAREST: u32 = 0x0000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_DOWN: u32 = 0x2000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_UP: u32 = 0x4000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;

// Mask covering both rounding-control bits.
/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_MASK: u32 = 0x6000;

// Flush-to-zero control: a single bit (bit 15), so the mask and the "on"
// value coincide.
/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
1614
1615/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1616///
1617/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK)
1618#[inline]
1619#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1620#[allow(non_snake_case)]
1621#[target_feature(enable = "sse")]
1622#[stable(feature = "simd_x86", since = "1.27.0")]
1623#[deprecated(
1624 since = "1.75.0",
1625 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1626)]
1627pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
1628 _mm_getcsr() & _MM_MASK_MASK
1629}
1630
1631/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1632///
1633/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE)
1634#[inline]
1635#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1636#[allow(non_snake_case)]
1637#[target_feature(enable = "sse")]
1638#[stable(feature = "simd_x86", since = "1.27.0")]
1639#[deprecated(
1640 since = "1.75.0",
1641 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1642)]
1643pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
1644 _mm_getcsr() & _MM_EXCEPT_MASK
1645}
1646
1647/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1648///
1649/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE)
1650#[inline]
1651#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1652#[allow(non_snake_case)]
1653#[target_feature(enable = "sse")]
1654#[stable(feature = "simd_x86", since = "1.27.0")]
1655#[deprecated(
1656 since = "1.75.0",
1657 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1658)]
1659pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
1660 _mm_getcsr() & _MM_FLUSH_ZERO_MASK
1661}
1662
1663/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1664///
1665/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE)
1666#[inline]
1667#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1668#[allow(non_snake_case)]
1669#[target_feature(enable = "sse")]
1670#[stable(feature = "simd_x86", since = "1.27.0")]
1671#[deprecated(
1672 since = "1.75.0",
1673 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1674)]
1675pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
1676 _mm_getcsr() & _MM_ROUND_MASK
1677}
1678
1679/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1680///
1681/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK)
1682#[inline]
1683#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1684#[allow(non_snake_case)]
1685#[target_feature(enable = "sse")]
1686#[stable(feature = "simd_x86", since = "1.27.0")]
1687#[deprecated(
1688 since = "1.75.0",
1689 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1690)]
1691pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
1692 _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
1693}
1694
1695/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1696///
1697/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE)
1698#[inline]
1699#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1700#[allow(non_snake_case)]
1701#[target_feature(enable = "sse")]
1702#[stable(feature = "simd_x86", since = "1.27.0")]
1703#[deprecated(
1704 since = "1.75.0",
1705 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1706)]
1707pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
1708 _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
1709}
1710
1711/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1712///
1713/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE)
1714#[inline]
1715#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1716#[allow(non_snake_case)]
1717#[target_feature(enable = "sse")]
1718#[stable(feature = "simd_x86", since = "1.27.0")]
1719#[deprecated(
1720 since = "1.75.0",
1721 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1722)]
1723pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
1724 let val: u32 = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
1725 // println!("setting csr={:x}", val);
1726 _mm_setcsr(val)
1727}
1728
1729/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1730///
1731/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE)
1732#[inline]
1733#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1734#[allow(non_snake_case)]
1735#[target_feature(enable = "sse")]
1736#[stable(feature = "simd_x86", since = "1.27.0")]
1737#[deprecated(
1738 since = "1.75.0",
1739 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1740)]
1741pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
1742 _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
1743}
1744
// Prefetch strategy constants for `_mm_prefetch`. As used by that function,
// bits 0-1 encode the locality and bit 2 the read/write intent, so the ET
// variants are the T variants with bit 2 set.
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T0: i32 = 3;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T1: i32 = 2;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T2: i32 = 1;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_NTA: i32 = 0;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET0: i32 = 7;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET1: i32 = 6;
1768
1769/// Fetch the cache line that contains address `p` using the given `STRATEGY`.
1770///
1771/// The `STRATEGY` must be one of:
1772///
1773/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
1774/// cache hierarchy.
1775///
1776/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
1777///
1778/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
1779/// an implementation-specific choice (e.g., L2 if there is no L3).
1780///
1781/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
1782/// non-temporal access (NTA) hint. It may be a place closer than main memory
1783/// but outside of the cache hierarchy. This is used to reduce access latency
1784/// without polluting the cache.
1785///
1786/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
1787/// [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
1788/// and `_MM_HINT_T1` but indicate an anticipation to write to the address.
1789///
1790/// The actual implementation depends on the particular CPU. This instruction
1791/// is considered a hint, so the CPU is also free to simply ignore the request.
1792///
1793/// The amount of prefetched data depends on the cache line size of the
1794/// specific CPU, but it will be at least 32 bytes.
1795///
1796/// Common caveats:
1797///
1798/// * Most modern CPUs already automatically prefetch data based on predicted
1799/// access patterns.
1800///
1801/// * Data is usually not fetched if this would cause a TLB miss or a page
1802/// fault.
1803///
1804/// * Too much prefetching can cause unnecessary cache evictions.
1805///
1806/// * Prefetching may also fail if there are not enough memory-subsystem
1807/// resources (e.g., request buffers).
1808///
1809///
1810/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
1811#[inline]
1812#[target_feature(enable = "sse")]
1813#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
1814#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
1815#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
1816#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
1817#[rustc_legacy_const_generics(1)]
1818#[stable(feature = "simd_x86", since = "1.27.0")]
1819pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
1820 // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
1821 // `locality` and `rw` are based on our `STRATEGY`.
1822 prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, ty:1);
1823}
1824
1825/// Returns vector of type __m128 with indeterminate elements.
1826/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
1827/// In practice, this is equivalent to [`mem::zeroed`].
1828///
1829/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
1830#[inline]
1831#[target_feature(enable = "sse")]
1832#[stable(feature = "simd_x86", since = "1.27.0")]
1833pub unsafe fn _mm_undefined_ps() -> __m128 {
1834 _mm_set1_ps(0.0)
1835}
1836
1837/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
1838///
1839/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS)
1840#[inline]
1841#[allow(non_snake_case)]
1842#[target_feature(enable = "sse")]
1843#[stable(feature = "simd_x86", since = "1.27.0")]
1844pub unsafe fn _MM_TRANSPOSE4_PS(
1845 row0: &mut __m128,
1846 row1: &mut __m128,
1847 row2: &mut __m128,
1848 row3: &mut __m128,
1849) {
1850 let tmp0: __m128 = _mm_unpacklo_ps(*row0, *row1);
1851 let tmp2: __m128 = _mm_unpacklo_ps(*row2, *row3);
1852 let tmp1: __m128 = _mm_unpackhi_ps(*row0, *row1);
1853 let tmp3: __m128 = _mm_unpackhi_ps(*row2, *row3);
1854
1855 *row0 = _mm_movelh_ps(a:tmp0, b:tmp2);
1856 *row1 = _mm_movehl_ps(a:tmp2, b:tmp0);
1857 *row2 = _mm_movelh_ps(a:tmp1, b:tmp3);
1858 *row3 = _mm_movehl_ps(a:tmp3, b:tmp1);
1859}
1860
// Raw LLVM intrinsic declarations backing the SSE wrappers above. The
// `link_name` strings are fixed by LLVM's intrinsic naming scheme and must
// not be altered.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse.add.ss"]
    fn addss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sub.ss"]
    fn subss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.mul.ss"]
    fn mulss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.div.ss"]
    fn divss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ss"]
    fn sqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ps"]
    fn sqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ss"]
    fn rcpss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ps"]
    fn rcpps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ss"]
    fn rsqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ps"]
    fn rsqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ss"]
    fn minss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ps"]
    fn minps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ss"]
    fn maxss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ps"]
    fn maxps(a: __m128, b: __m128) -> __m128;
    // `imm8` selects the comparison predicate.
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.sse.comieq.ss"]
    fn comieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comilt.ss"]
    fn comilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comile.ss"]
    fn comile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comigt.ss"]
    fn comigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comige.ss"]
    fn comige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comineq.ss"]
    fn comineq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomieq.ss"]
    fn ucomieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomilt.ss"]
    fn ucomilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomile.ss"]
    fn ucomile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomigt.ss"]
    fn ucomigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomige.ss"]
    fn ucomige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomineq.ss"]
    fn ucomineq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtss2si"]
    fn cvtss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvttss2si"]
    fn cvttss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtsi2ss"]
    fn cvtsi2ss(a: __m128, b: i32) -> __m128;
    #[link_name = "llvm.x86.sse.sfence"]
    fn sfence();
    #[link_name = "llvm.x86.sse.stmxcsr"]
    fn stmxcsr(p: *mut i8);
    #[link_name = "llvm.x86.sse.ldmxcsr"]
    fn ldmxcsr(p: *const i8);
    // Arguments are (address, rw, locality, cache type); see `_mm_prefetch`.
    #[link_name = "llvm.prefetch"]
    fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
}
1934
1935/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
1936///
1937/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
1938/// exception _may_ be generated.
1939///
1940/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps)
1941#[inline]
1942#[target_feature(enable = "sse")]
1943#[cfg_attr(test, assert_instr(movntps))]
1944#[stable(feature = "simd_x86", since = "1.27.0")]
1945#[allow(clippy::cast_ptr_alignment)]
1946pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
1947 intrinsics::nontemporal_store(ptr:mem_addr as *mut __m128, val:a);
1948}
1949
1950#[cfg(test)]
1951mod tests {
1952 use crate::{hint::black_box, mem::transmute};
1953 use std::{boxed, f32::NAN};
1954 use stdarch_test::simd_test;
1955
1956 use crate::core_arch::{simd::*, x86::*};
1957
    // Packed add: each of the four lanes is a[i] + b[i].
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_add_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_add_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
    }

    // Scalar add: only lane 0 is a[0] + b[0]; the upper lanes come from `a`.
    // `_mm_set_ps` takes arguments high-to-low, so lane 0 holds -10.0 / -5.0.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_add_ss() {
        let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_add_ss(a, b);
        assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
    }

    // Packed subtract: each lane is a[i] - b[i].
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sub_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_sub_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
    }

    // Scalar subtract: only lane 0 changes; upper lanes copied from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sub_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_sub_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
    }

    // Packed multiply: each lane is a[i] * b[i].
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_mul_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_mul_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
    }

    // Scalar multiply: only lane 0 changes; upper lanes copied from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_mul_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_mul_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
    }

    // Packed divide: each lane is a[i] / b[i]; operands chosen so every
    // quotient is exactly representable.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_div_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
        let r = _mm_div_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
    }

    // Scalar divide: only lane 0 changes; upper lanes copied from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_div_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_div_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
    }
2021
    // Scalar sqrt: only lane 0 becomes sqrt(a[0]); upper lanes copied from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sqrt_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_sqrt_ss(a);
        let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
        assert_eq_m128(r, e);
    }

    // Packed sqrt: every lane is sqrt(a[i]).
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sqrt_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_sqrt_ps(a);
        let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
        assert_eq_m128(r, e);
    }

    // Scalar reciprocal approximation: lane 0 is ~1/a[0] within the tolerance
    // below (rel_err = 2^-11); upper lanes are compared exactly since they are
    // copied from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rcp_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rcp_ss(a);
        let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
        let rel_err = 0.00048828125;
        assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err);
        for i in 1..4 {
            assert_eq!(get_m128(r, i), get_m128(e, i));
        }
    }

    // Packed reciprocal approximation: each lane is ~1/a[i] within tolerance.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rcp_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rcp_ps(a);
        let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }

    // Scalar reciprocal-sqrt approximation: lane 0 is ~1/sqrt(a[0]); the
    // expected upper lanes are exact copies of `a`, so the shared tolerance
    // loop still holds for them.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rsqrt_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rsqrt_ss(a);
        let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }

    // Packed reciprocal-sqrt approximation: each lane is ~1/sqrt(a[i]).
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rsqrt_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rsqrt_ps(a);
        let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }
2082
    // Scalar min: lane 0 is min(a[0], b[0]); upper lanes copied from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_min_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_min_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
    }

    // Packed min: each lane is min(a[i], b[i]), with SSE's asymmetric
    // treatment of signed zeros checked below.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_min_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_min_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));

        // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
        // is lowered by the llvm codegen backend to `llvm.minnum.v*` llvm intrinsic. This intrinsic
        // doesn't specify how -0.0 is handled. Unfortunately it happens to behave different from
        // the `minps` x86 instruction on x86. The `llvm.minnum.v*` llvm intrinsic equals
        // `r1` to `a` and `r2` to `b`.
        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_min_ps(a, b));
        let r2: [u8; 16] = transmute(_mm_min_ps(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        // `minps` returns the second operand when inputs compare equal, so the
        // result tracks the argument order (bitwise comparison catches -0.0).
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }

    // Scalar max: lane 0 is max(a[0], b[0]); upper lanes copied from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_max_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_max_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
    }

    // Packed max: each lane is max(a[i], b[i]).
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_max_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_max_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));

        // Check SSE-specific semantics for -0.0 handling.
        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_max_ps(a, b));
        let r2: [u8; 16] = transmute(_mm_max_ps(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        // As with `minps`, equal inputs yield the second operand.
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }
2140
    // Bitwise AND of the raw 128 bits; `black_box` keeps the operands from
    // being constant-folded so the actual instruction is exercised.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_and_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_and_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0001));
        assert_eq_m128(r, e);
    }

    // Bitwise AND-NOT: (!a) & b per bit.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_andnot_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_andnot_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0100));
        assert_eq_m128(r, e);
    }

    // Bitwise OR of the raw 128 bits.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_or_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_or_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0111));
        assert_eq_m128(r, e);
    }

    // Bitwise XOR of the raw 128 bits.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_xor_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_xor_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0110));
        assert_eq_m128(r, e);
    }
2176
    // Scalar compare-equal: lane 0 becomes an all-ones/all-zeros mask for
    // a[0] == b[0]; upper lanes are copied from `a`. Results are compared as
    // `u32x4` since the mask is a bit pattern, not a meaningful float.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpeq_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
        let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
        let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0));
        assert_eq!(r, e);

        let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
        let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0));
        assert_eq!(r2, e2);
    }

    // Scalar compare-less-than against three operands covering <, ==, and >
    // in lane 0; only the > case (1.0 < 2.0) yields the all-ones mask.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmplt_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) < b.extract(0)
        let c1 = 0u32; // a.extract(0) < c.extract(0)
        let d1 = !0u32; // a.extract(0) < d.extract(0)

        let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    // Scalar compare-less-or-equal: same three-operand scheme; the == and >
    // cases both produce the all-ones mask.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmple_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) <= b.extract(0)
        let c1 = !0u32; // a.extract(0) <= c.extract(0)
        let d1 = !0u32; // a.extract(0) <= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2238
    // Scalar compare-greater-than: only the b case (1.0 > 0.0) sets the mask.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpgt_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) > b.extract(0)
        let c1 = 0u32; // a.extract(0) > c.extract(0)
        let d1 = 0u32; // a.extract(0) > d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    // Scalar compare-greater-or-equal: the b (>) and c (==) cases set the mask.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpge_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) >= b.extract(0)
        let c1 = !0u32; // a.extract(0) >= c.extract(0)
        let d1 = 0u32; // a.extract(0) >= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    // Scalar compare-not-equal: the b (<) and d (>) cases set the mask.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpneq_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) != b.extract(0)
        let c1 = 0u32; // a.extract(0) != c.extract(0)
        let d1 = !0u32; // a.extract(0) != d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2310
    // Scalar compare-not-less-than. For these ordered operands the results
    // match `_mm_cmpge_ss`; NOTE(review): per Intel's docs the predicates
    // should differ only for unordered (NaN) inputs, where NLT yields
    // all-ones and GE all-zeros — TODO: add NaN cases to cover that.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnlt_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) >= b.extract(0)
        let c1 = !0u32; // a.extract(0) >= c.extract(0)
        let d1 = 0u32; // a.extract(0) >= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    // Scalar compare-not-less-or-equal. For these ordered operands the results
    // match `_mm_cmpgt_ss`; NOTE(review): the predicates should differ only
    // for unordered (NaN) inputs, where NLE yields all-ones and GT all-zeros
    // — TODO: add NaN cases to cover that.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnle_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) > b.extract(0)
        let c1 = 0u32; // a.extract(0) > c.extract(0)
        let d1 = 0u32; // a.extract(0) > d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2368
2369 #[simd_test(enable = "sse")]
2370 unsafe fn test_mm_cmpngt_ss() {
2371 // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there
2372 // must be a difference. It may have to do with behavior in the
2373 // presence of NaNs (signaling or quiet). If so, we should add tests
2374 // for those.
2375
2376 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2377 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2378 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2379 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2380
2381 let b1 = 0u32; // a.extract(0) <= b.extract(0)
2382 let c1 = !0u32; // a.extract(0) <= c.extract(0)
2383 let d1 = !0u32; // a.extract(0) <= d.extract(0)
2384
2385 let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
2386 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2387 assert_eq!(rb, eb);
2388
2389 let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
2390 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2391 assert_eq!(rc, ec);
2392
2393 let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
2394 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2395 assert_eq!(rd, ed);
2396 }
2397
2398 #[simd_test(enable = "sse")]
2399 unsafe fn test_mm_cmpnge_ss() {
2400 // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there
2401 // must be a difference. It may have to do with behavior in the
2402 // presence of NaNs (signaling or quiet). If so, we should add tests
2403 // for those.
2404
2405 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2406 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2407 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2408 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2409
2410 let b1 = 0u32; // a.extract(0) < b.extract(0)
2411 let c1 = 0u32; // a.extract(0) < c.extract(0)
2412 let d1 = !0u32; // a.extract(0) < d.extract(0)
2413
2414 let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
2415 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2416 assert_eq!(rb, eb);
2417
2418 let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
2419 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2420 assert_eq!(rc, ec);
2421
2422 let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
2423 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2424 assert_eq!(rd, ed);
2425 }
2426
2427 #[simd_test(enable = "sse")]
2428 unsafe fn test_mm_cmpord_ss() {
2429 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2430 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2431 let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2432 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2433
2434 let b1 = !0u32; // a.extract(0) ord b.extract(0)
2435 let c1 = 0u32; // a.extract(0) ord c.extract(0)
2436 let d1 = !0u32; // a.extract(0) ord d.extract(0)
2437
2438 let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
2439 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2440 assert_eq!(rb, eb);
2441
2442 let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
2443 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2444 assert_eq!(rc, ec);
2445
2446 let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
2447 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2448 assert_eq!(rd, ed);
2449 }
2450
2451 #[simd_test(enable = "sse")]
2452 unsafe fn test_mm_cmpunord_ss() {
2453 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2454 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2455 let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2456 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2457
2458 let b1 = 0u32; // a.extract(0) unord b.extract(0)
2459 let c1 = !0u32; // a.extract(0) unord c.extract(0)
2460 let d1 = 0u32; // a.extract(0) unord d.extract(0)
2461
2462 let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
2463 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2464 assert_eq!(rb, eb);
2465
2466 let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
2467 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2468 assert_eq!(rc, ec);
2469
2470 let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
2471 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2472 assert_eq!(rd, ed);
2473 }
2474
2475 #[simd_test(enable = "sse")]
2476 unsafe fn test_mm_cmpeq_ps() {
2477 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2478 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2479 let tru = !0u32;
2480 let fls = 0u32;
2481
2482 let e = u32x4::new(fls, fls, tru, fls);
2483 let r: u32x4 = transmute(_mm_cmpeq_ps(a, b));
2484 assert_eq!(r, e);
2485 }
2486
2487 #[simd_test(enable = "sse")]
2488 unsafe fn test_mm_cmplt_ps() {
2489 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2490 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2491 let tru = !0u32;
2492 let fls = 0u32;
2493
2494 let e = u32x4::new(tru, fls, fls, fls);
2495 let r: u32x4 = transmute(_mm_cmplt_ps(a, b));
2496 assert_eq!(r, e);
2497 }
2498
2499 #[simd_test(enable = "sse")]
2500 unsafe fn test_mm_cmple_ps() {
2501 let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
2502 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2503 let tru = !0u32;
2504 let fls = 0u32;
2505
2506 let e = u32x4::new(tru, fls, tru, fls);
2507 let r: u32x4 = transmute(_mm_cmple_ps(a, b));
2508 assert_eq!(r, e);
2509 }
2510
2511 #[simd_test(enable = "sse")]
2512 unsafe fn test_mm_cmpgt_ps() {
2513 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2514 let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2515 let tru = !0u32;
2516 let fls = 0u32;
2517
2518 let e = u32x4::new(fls, tru, fls, fls);
2519 let r: u32x4 = transmute(_mm_cmpgt_ps(a, b));
2520 assert_eq!(r, e);
2521 }
2522
2523 #[simd_test(enable = "sse")]
2524 unsafe fn test_mm_cmpge_ps() {
2525 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2526 let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2527 let tru = !0u32;
2528 let fls = 0u32;
2529
2530 let e = u32x4::new(fls, tru, tru, fls);
2531 let r: u32x4 = transmute(_mm_cmpge_ps(a, b));
2532 assert_eq!(r, e);
2533 }
2534
2535 #[simd_test(enable = "sse")]
2536 unsafe fn test_mm_cmpneq_ps() {
2537 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2538 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2539 let tru = !0u32;
2540 let fls = 0u32;
2541
2542 let e = u32x4::new(tru, tru, fls, tru);
2543 let r: u32x4 = transmute(_mm_cmpneq_ps(a, b));
2544 assert_eq!(r, e);
2545 }
2546
2547 #[simd_test(enable = "sse")]
2548 unsafe fn test_mm_cmpnlt_ps() {
2549 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2550 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2551 let tru = !0u32;
2552 let fls = 0u32;
2553
2554 let e = u32x4::new(fls, tru, tru, tru);
2555 let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b));
2556 assert_eq!(r, e);
2557 }
2558
2559 #[simd_test(enable = "sse")]
2560 unsafe fn test_mm_cmpnle_ps() {
2561 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2562 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2563 let tru = !0u32;
2564 let fls = 0u32;
2565
2566 let e = u32x4::new(fls, tru, fls, tru);
2567 let r: u32x4 = transmute(_mm_cmpnle_ps(a, b));
2568 assert_eq!(r, e);
2569 }
2570
2571 #[simd_test(enable = "sse")]
2572 unsafe fn test_mm_cmpngt_ps() {
2573 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2574 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2575 let tru = !0u32;
2576 let fls = 0u32;
2577
2578 let e = u32x4::new(tru, fls, tru, tru);
2579 let r: u32x4 = transmute(_mm_cmpngt_ps(a, b));
2580 assert_eq!(r, e);
2581 }
2582
2583 #[simd_test(enable = "sse")]
2584 unsafe fn test_mm_cmpnge_ps() {
2585 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2586 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2587 let tru = !0u32;
2588 let fls = 0u32;
2589
2590 let e = u32x4::new(tru, fls, fls, tru);
2591 let r: u32x4 = transmute(_mm_cmpnge_ps(a, b));
2592 assert_eq!(r, e);
2593 }
2594
2595 #[simd_test(enable = "sse")]
2596 unsafe fn test_mm_cmpord_ps() {
2597 let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2598 let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2599 let tru = !0u32;
2600 let fls = 0u32;
2601
2602 let e = u32x4::new(tru, fls, fls, fls);
2603 let r: u32x4 = transmute(_mm_cmpord_ps(a, b));
2604 assert_eq!(r, e);
2605 }
2606
2607 #[simd_test(enable = "sse")]
2608 unsafe fn test_mm_cmpunord_ps() {
2609 let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2610 let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2611 let tru = !0u32;
2612 let fls = 0u32;
2613
2614 let e = u32x4::new(fls, tru, tru, tru);
2615 let r: u32x4 = transmute(_mm_cmpunord_ps(a, b));
2616 assert_eq!(r, e);
2617 }
2618
2619 #[simd_test(enable = "sse")]
2620 unsafe fn test_mm_comieq_ss() {
2621 let aa = &[3.0f32, 12.0, 23.0, NAN];
2622 let bb = &[3.0f32, 47.5, 1.5, NAN];
2623
2624 let ee = &[1i32, 0, 0, 0];
2625
2626 for i in 0..4 {
2627 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2628 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2629
2630 let r = _mm_comieq_ss(a, b);
2631
2632 assert_eq!(
2633 ee[i], r,
2634 "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2635 a, b, r, ee[i], i
2636 );
2637 }
2638 }
2639
2640 #[simd_test(enable = "sse")]
2641 unsafe fn test_mm_comilt_ss() {
2642 let aa = &[3.0f32, 12.0, 23.0, NAN];
2643 let bb = &[3.0f32, 47.5, 1.5, NAN];
2644
2645 let ee = &[0i32, 1, 0, 0];
2646
2647 for i in 0..4 {
2648 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2649 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2650
2651 let r = _mm_comilt_ss(a, b);
2652
2653 assert_eq!(
2654 ee[i], r,
2655 "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2656 a, b, r, ee[i], i
2657 );
2658 }
2659 }
2660
2661 #[simd_test(enable = "sse")]
2662 unsafe fn test_mm_comile_ss() {
2663 let aa = &[3.0f32, 12.0, 23.0, NAN];
2664 let bb = &[3.0f32, 47.5, 1.5, NAN];
2665
2666 let ee = &[1i32, 1, 0, 0];
2667
2668 for i in 0..4 {
2669 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2670 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2671
2672 let r = _mm_comile_ss(a, b);
2673
2674 assert_eq!(
2675 ee[i], r,
2676 "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2677 a, b, r, ee[i], i
2678 );
2679 }
2680 }
2681
2682 #[simd_test(enable = "sse")]
2683 unsafe fn test_mm_comigt_ss() {
2684 let aa = &[3.0f32, 12.0, 23.0, NAN];
2685 let bb = &[3.0f32, 47.5, 1.5, NAN];
2686
2687 let ee = &[1i32, 0, 1, 0];
2688
2689 for i in 0..4 {
2690 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2691 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2692
2693 let r = _mm_comige_ss(a, b);
2694
2695 assert_eq!(
2696 ee[i], r,
2697 "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2698 a, b, r, ee[i], i
2699 );
2700 }
2701 }
2702
2703 #[simd_test(enable = "sse")]
2704 unsafe fn test_mm_comineq_ss() {
2705 let aa = &[3.0f32, 12.0, 23.0, NAN];
2706 let bb = &[3.0f32, 47.5, 1.5, NAN];
2707
2708 let ee = &[0i32, 1, 1, 1];
2709
2710 for i in 0..4 {
2711 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2712 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2713
2714 let r = _mm_comineq_ss(a, b);
2715
2716 assert_eq!(
2717 ee[i], r,
2718 "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2719 a, b, r, ee[i], i
2720 );
2721 }
2722 }
2723
2724 #[simd_test(enable = "sse")]
2725 unsafe fn test_mm_ucomieq_ss() {
2726 let aa = &[3.0f32, 12.0, 23.0, NAN];
2727 let bb = &[3.0f32, 47.5, 1.5, NAN];
2728
2729 let ee = &[1i32, 0, 0, 0];
2730
2731 for i in 0..4 {
2732 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2733 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2734
2735 let r = _mm_ucomieq_ss(a, b);
2736
2737 assert_eq!(
2738 ee[i], r,
2739 "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2740 a, b, r, ee[i], i
2741 );
2742 }
2743 }
2744
2745 #[simd_test(enable = "sse")]
2746 unsafe fn test_mm_ucomilt_ss() {
2747 let aa = &[3.0f32, 12.0, 23.0, NAN];
2748 let bb = &[3.0f32, 47.5, 1.5, NAN];
2749
2750 let ee = &[0i32, 1, 0, 0];
2751
2752 for i in 0..4 {
2753 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2754 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2755
2756 let r = _mm_ucomilt_ss(a, b);
2757
2758 assert_eq!(
2759 ee[i], r,
2760 "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2761 a, b, r, ee[i], i
2762 );
2763 }
2764 }
2765
2766 #[simd_test(enable = "sse")]
2767 unsafe fn test_mm_ucomile_ss() {
2768 let aa = &[3.0f32, 12.0, 23.0, NAN];
2769 let bb = &[3.0f32, 47.5, 1.5, NAN];
2770
2771 let ee = &[1i32, 1, 0, 0];
2772
2773 for i in 0..4 {
2774 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2775 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2776
2777 let r = _mm_ucomile_ss(a, b);
2778
2779 assert_eq!(
2780 ee[i], r,
2781 "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2782 a, b, r, ee[i], i
2783 );
2784 }
2785 }
2786
2787 #[simd_test(enable = "sse")]
2788 unsafe fn test_mm_ucomigt_ss() {
2789 let aa = &[3.0f32, 12.0, 23.0, NAN];
2790 let bb = &[3.0f32, 47.5, 1.5, NAN];
2791
2792 let ee = &[0i32, 0, 1, 0];
2793
2794 for i in 0..4 {
2795 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2796 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2797
2798 let r = _mm_ucomigt_ss(a, b);
2799
2800 assert_eq!(
2801 ee[i], r,
2802 "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2803 a, b, r, ee[i], i
2804 );
2805 }
2806 }
2807
2808 #[simd_test(enable = "sse")]
2809 unsafe fn test_mm_ucomige_ss() {
2810 let aa = &[3.0f32, 12.0, 23.0, NAN];
2811 let bb = &[3.0f32, 47.5, 1.5, NAN];
2812
2813 let ee = &[1i32, 0, 1, 0];
2814
2815 for i in 0..4 {
2816 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2817 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2818
2819 let r = _mm_ucomige_ss(a, b);
2820
2821 assert_eq!(
2822 ee[i], r,
2823 "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2824 a, b, r, ee[i], i
2825 );
2826 }
2827 }
2828
2829 #[simd_test(enable = "sse")]
2830 unsafe fn test_mm_ucomineq_ss() {
2831 let aa = &[3.0f32, 12.0, 23.0, NAN];
2832 let bb = &[3.0f32, 47.5, 1.5, NAN];
2833
2834 let ee = &[0i32, 1, 1, 1];
2835
2836 for i in 0..4 {
2837 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2838 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2839
2840 let r = _mm_ucomineq_ss(a, b);
2841
2842 assert_eq!(
2843 ee[i], r,
2844 "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2845 a, b, r, ee[i], i
2846 );
2847 }
2848 }
2849
2850 #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
2851 #[simd_test(enable = "sse")]
2852 #[cfg_attr(miri, ignore)] // Uses _mm_setcsr, which is not supported by Miri
2853 unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
2854 // If one of the arguments is a quiet NaN `comieq_ss` should signal an
2855 // Invalid Operation Exception while `ucomieq_ss` should not.
2856 let aa = &[3.0f32, NAN, 23.0, NAN];
2857 let bb = &[3.0f32, 47.5, NAN, NAN];
2858
2859 let ee = &[1i32, 0, 0, 0];
2860 let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception?
2861
2862 for i in 0..4 {
2863 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2864 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2865
2866 _MM_SET_EXCEPTION_STATE(0);
2867 let r1 = _mm_comieq_ss(*black_box(&a), b);
2868 let s1 = _MM_GET_EXCEPTION_STATE();
2869
2870 _MM_SET_EXCEPTION_STATE(0);
2871 let r2 = _mm_ucomieq_ss(*black_box(&a), b);
2872 let s2 = _MM_GET_EXCEPTION_STATE();
2873
2874 assert_eq!(
2875 ee[i], r1,
2876 "_mm_comeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2877 a, b, r1, ee[i], i
2878 );
2879 assert_eq!(
2880 ee[i], r2,
2881 "_mm_ucomeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2882 a, b, r2, ee[i], i
2883 );
2884 assert_eq!(
2885 s1,
2886 exc[i] * _MM_EXCEPT_INVALID,
2887 "_mm_comieq_ss() set exception flags: {} (i={})",
2888 s1,
2889 i
2890 );
2891 assert_eq!(
2892 s2,
2893 0, // ucomieq_ss should not signal an exception
2894 "_mm_ucomieq_ss() set exception flags: {} (i={})",
2895 s2,
2896 i
2897 );
2898 }
2899 }
2900
2901 #[simd_test(enable = "sse")]
2902 unsafe fn test_mm_cvtss_si32() {
2903 let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
2904 let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520];
2905 for i in 0..inputs.len() {
2906 let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
2907 let e = result[i];
2908 let r = _mm_cvtss_si32(x);
2909 assert_eq!(
2910 e, r,
2911 "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
2912 i, x, r, e
2913 );
2914 }
2915 }
2916
2917 #[simd_test(enable = "sse")]
2918 unsafe fn test_mm_cvttss_si32() {
2919 let inputs = &[
2920 (42.0f32, 42i32),
2921 (-31.4, -31),
2922 (-33.5, -33),
2923 (-34.5, -34),
2924 (10.999, 10),
2925 (-5.99, -5),
2926 (4.0e10, i32::MIN),
2927 (4.0e-10, 0),
2928 (NAN, i32::MIN),
2929 (2147483500.1, 2147483520),
2930 ];
2931 for i in 0..inputs.len() {
2932 let (xi, e) = inputs[i];
2933 let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
2934 let r = _mm_cvttss_si32(x);
2935 assert_eq!(
2936 e, r,
2937 "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
2938 i, x, r, e
2939 );
2940 }
2941 }
2942
2943 #[simd_test(enable = "sse")]
2944 unsafe fn test_mm_cvtsi32_ss() {
2945 let inputs = &[
2946 (4555i32, 4555.0f32),
2947 (322223333, 322223330.0),
2948 (-432, -432.0),
2949 (-322223333, -322223330.0),
2950 ];
2951
2952 for i in 0..inputs.len() {
2953 let (x, f) = inputs[i];
2954 let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2955 let r = _mm_cvtsi32_ss(a, x);
2956 let e = _mm_setr_ps(f, 6.0, 7.0, 8.0);
2957 assert_eq_m128(e, r);
2958 }
2959 }
2960
2961 #[simd_test(enable = "sse")]
2962 unsafe fn test_mm_cvtss_f32() {
2963 let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
2964 assert_eq!(_mm_cvtss_f32(a), 312.0134);
2965 }
2966
2967 #[simd_test(enable = "sse")]
2968 unsafe fn test_mm_set_ss() {
2969 let r = _mm_set_ss(black_box(4.25));
2970 assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0));
2971 }
2972
2973 #[simd_test(enable = "sse")]
2974 unsafe fn test_mm_set1_ps() {
2975 let r1 = _mm_set1_ps(black_box(4.25));
2976 let r2 = _mm_set_ps1(black_box(4.25));
2977 assert_eq!(get_m128(r1, 0), 4.25);
2978 assert_eq!(get_m128(r1, 1), 4.25);
2979 assert_eq!(get_m128(r1, 2), 4.25);
2980 assert_eq!(get_m128(r1, 3), 4.25);
2981 assert_eq!(get_m128(r2, 0), 4.25);
2982 assert_eq!(get_m128(r2, 1), 4.25);
2983 assert_eq!(get_m128(r2, 2), 4.25);
2984 assert_eq!(get_m128(r2, 3), 4.25);
2985 }
2986
2987 #[simd_test(enable = "sse")]
2988 unsafe fn test_mm_set_ps() {
2989 let r = _mm_set_ps(
2990 black_box(1.0),
2991 black_box(2.0),
2992 black_box(3.0),
2993 black_box(4.0),
2994 );
2995 assert_eq!(get_m128(r, 0), 4.0);
2996 assert_eq!(get_m128(r, 1), 3.0);
2997 assert_eq!(get_m128(r, 2), 2.0);
2998 assert_eq!(get_m128(r, 3), 1.0);
2999 }
3000
3001 #[simd_test(enable = "sse")]
3002 unsafe fn test_mm_setr_ps() {
3003 let r = _mm_setr_ps(
3004 black_box(1.0),
3005 black_box(2.0),
3006 black_box(3.0),
3007 black_box(4.0),
3008 );
3009 assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
3010 }
3011
3012 #[simd_test(enable = "sse")]
3013 unsafe fn test_mm_setzero_ps() {
3014 let r = *black_box(&_mm_setzero_ps());
3015 assert_eq_m128(r, _mm_set1_ps(0.0));
3016 }
3017
3018 #[simd_test(enable = "sse")]
3019 unsafe fn test_mm_shuffle() {
3020 assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11);
3021 assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
3022 assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
3023 }
3024
3025 #[simd_test(enable = "sse")]
3026 unsafe fn test_mm_shuffle_ps() {
3027 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3028 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3029 let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b);
3030 assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0));
3031 }
3032
3033 #[simd_test(enable = "sse")]
3034 unsafe fn test_mm_unpackhi_ps() {
3035 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3036 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3037 let r = _mm_unpackhi_ps(a, b);
3038 assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0));
3039 }
3040
3041 #[simd_test(enable = "sse")]
3042 unsafe fn test_mm_unpacklo_ps() {
3043 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3044 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3045 let r = _mm_unpacklo_ps(a, b);
3046 assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0));
3047 }
3048
3049 #[simd_test(enable = "sse")]
3050 unsafe fn test_mm_movehl_ps() {
3051 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3052 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3053 let r = _mm_movehl_ps(a, b);
3054 assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0));
3055 }
3056
3057 #[simd_test(enable = "sse")]
3058 unsafe fn test_mm_movelh_ps() {
3059 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3060 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3061 let r = _mm_movelh_ps(a, b);
3062 assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
3063 }
3064
3065 #[simd_test(enable = "sse")]
3066 unsafe fn test_mm_load_ss() {
3067 let a = 42.0f32;
3068 let r = _mm_load_ss(&a as *const f32);
3069 assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
3070 }
3071
3072 #[simd_test(enable = "sse")]
3073 unsafe fn test_mm_load1_ps() {
3074 let a = 42.0f32;
3075 let r = _mm_load1_ps(&a as *const f32);
3076 assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
3077 }
3078
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_load_ps() {
        // `_mm_load_ps` requires a 16-byte-aligned pointer; loading from an
        // unaligned one faults. The slice gives us 8 elements so that a
        // 16-byte-aligned window of 4 f32s always exists inside it.
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        // How many elements the pointer was advanced by; the loaded values
        // are offset by the same amount, which `fixup` compensates for.
        let mut fixup = 0.0f32;

        // Make sure p is aligned, otherwise we might get a
        // (signal: 11, SIGSEGV: invalid memory reference)

        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            // Elements (4 bytes each) to skip to reach the next 16-byte boundary.
            let delta = (16 - unalignment) >> 2;
            fixup = delta as f32;
            p = p.add(delta);
        }

        let r = _mm_load_ps(p);
        // vals is 1.0, 2.0, 3.0, ... so shifting the window by `delta`
        // elements adds `delta` to every loaded value.
        let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }
3100
3101 #[simd_test(enable = "sse")]
3102 unsafe fn test_mm_loadu_ps() {
3103 let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3104 let p = vals.as_ptr().add(3);
3105 let r = _mm_loadu_ps(black_box(p));
3106 assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
3107 }
3108
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_loadr_ps() {
        // `_mm_loadr_ps` is an aligned load with the four elements reversed;
        // like `_mm_load_ps` it faults on an unaligned pointer.
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        // Elements the pointer was advanced by; compensated for below.
        let mut fixup = 0.0f32;

        // Make sure p is aligned, otherwise we might get a
        // (signal: 11, SIGSEGV: invalid memory reference)

        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            // Elements (4 bytes each) to skip to reach the next 16-byte boundary.
            let delta = (16 - unalignment) >> 2;
            fixup = delta as f32;
            p = p.add(delta);
        }

        let r = _mm_loadr_ps(p);
        // Reversed window of ascending values, shifted up by `fixup`.
        let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }
3130
3131 #[simd_test(enable = "sse2")]
3132 unsafe fn test_mm_loadu_si64() {
3133 let a = _mm_setr_epi64x(5, 6);
3134 let r = _mm_loadu_si64(&a as *const _ as *const _);
3135 assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
3136 }
3137
3138 #[simd_test(enable = "sse")]
3139 unsafe fn test_mm_store_ss() {
3140 let mut vals = [0.0f32; 8];
3141 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3142 _mm_store_ss(vals.as_mut_ptr().add(1), a);
3143
3144 assert_eq!(vals[0], 0.0);
3145 assert_eq!(vals[1], 1.0);
3146 assert_eq!(vals[2], 0.0);
3147 }
3148
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store1_ps() {
        // `_mm_store1_ps` splats lane 0 into four consecutive aligned slots;
        // it requires a 16-byte-aligned pointer.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        // Elements skipped to reach a 16-byte boundary.
        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary (4 bytes per element).
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_store1_ps(p, *black_box(&a));

        // Element before the store window (if any) must be untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        // All four stored slots hold lane 0 of `a`.
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 1.0);
        assert_eq!(vals[ofs + 2], 1.0);
        assert_eq!(vals[ofs + 3], 1.0);
        // Element after the store window must be untouched.
        assert_eq!(vals[ofs + 4], 0.0);
    }
3173
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store_ps() {
        // `_mm_store_ps` writes all four lanes in order to a 16-byte-aligned
        // pointer.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        // Elements skipped to reach a 16-byte boundary.
        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_store_ps(p, *black_box(&a));

        // Element before the store window (if any) must be untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        // Lanes land in low-to-high order.
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        // Element after the store window must be untouched.
        assert_eq!(vals[ofs + 4], 0.0);
    }
3199
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storer_ps() {
        // `_mm_storer_ps` writes the four lanes in reversed order to a
        // 16-byte-aligned pointer.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        // Elements skipped to reach a 16-byte boundary.
        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_storer_ps(p, *black_box(&a));

        // Element before the store window (if any) must be untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        // Lanes land in high-to-low order.
        assert_eq!(vals[ofs + 0], 4.0);
        assert_eq!(vals[ofs + 1], 3.0);
        assert_eq!(vals[ofs + 2], 2.0);
        assert_eq!(vals[ofs + 3], 1.0);
        // Element after the store window must be untouched.
        assert_eq!(vals[ofs + 4], 0.0);
    }
3225
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storeu_ps() {
        // `_mm_storeu_ps` tolerates unaligned pointers, so the test
        // deliberately MISaligns the destination.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Make sure p is **not** aligned to 16-byte boundary
        if (p as usize) & 0xf == 0 {
            ofs = 1;
            p = p.add(1);
        }

        _mm_storeu_ps(p, *black_box(&a));

        // Element before the store window (if any) must be untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        // Lanes land in low-to-high order despite the misalignment.
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        // Element after the store window must be untouched.
        assert_eq!(vals[ofs + 4], 0.0);
    }
3251
3252 #[simd_test(enable = "sse")]
3253 unsafe fn test_mm_move_ss() {
3254 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3255 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3256
3257 let r = _mm_move_ss(a, b);
3258 let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
3259 assert_eq_m128(e, r);
3260 }
3261
3262 #[simd_test(enable = "sse")]
3263 unsafe fn test_mm_movemask_ps() {
3264 let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0));
3265 assert_eq!(r, 0b0101);
3266
3267 let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0));
3268 assert_eq!(r, 0b0111);
3269 }
3270
    #[simd_test(enable = "sse")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_sfence() {
        // Smoke test only: the store fence has no observable result to
        // assert on; we just check it executes without faulting.
        _mm_sfence();
    }
3277
    #[allow(deprecated)] // FIXME: This tests functions that are immediate UB
    #[simd_test(enable = "sse")]
    #[cfg_attr(miri, ignore)] // Miri does not support accesing the CSR
    unsafe fn test_mm_getcsr_setcsr_1() {
        // With flush-to-zero enabled, a multiply whose exact result is
        // subnormal (1.1e-36 * 0.001 ≈ 1.1e-39) must be flushed to +0.0.
        // Save the CSR up front and restore it before asserting so a failure
        // doesn't leak FTZ mode into other tests.
        let saved_csr = _mm_getcsr();

        let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
        let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);

        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
        let r = _mm_mul_ps(*black_box(&a), *black_box(&b));

        _mm_setcsr(saved_csr);

        let exp = _mm_setr_ps(0.0, 0.0, 0.0, 1.0);
        assert_eq_m128(r, exp); // the subnormal product was flushed to zero
    }
3295
    #[allow(deprecated)] // FIXME: This tests functions that are immediate UB
    #[simd_test(enable = "sse")]
    #[cfg_attr(miri, ignore)] // Miri does not support accesing the CSR
    unsafe fn test_mm_getcsr_setcsr_2() {
        // Same as _mm_setcsr_1 test, but with opposite flag value.
        // With flush-to-zero disabled, the same multiply must produce the
        // subnormal result instead of zero.

        let saved_csr = _mm_getcsr();

        let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
        let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);

        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
        let r = _mm_mul_ps(*black_box(&a), *black_box(&b));

        // Restore the caller's CSR before asserting.
        _mm_setcsr(saved_csr);

        let exp = _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0);
        assert_eq_m128(r, exp); // first component is a denormalized f32
    }
3315
    #[allow(deprecated)] // FIXME: This tests functions that are immediate UB
    #[simd_test(enable = "sse")]
    #[cfg_attr(miri, ignore)] // Miri does not support accesing the CSR
    unsafe fn test_mm_getcsr_setcsr_underflow() {
        // A multiply producing a subnormal result must raise the sticky
        // underflow flag in MXCSR.
        _MM_SET_EXCEPTION_STATE(0);

        let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
        let b = _mm_setr_ps(1e-5, 0.0, 0.0, 1.0);

        assert_eq!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure

        let r = _mm_mul_ps(*black_box(&a), *black_box(&b));

        // 1.1e-36 * 1e-5 ≈ 1.1e-41, a denormal f32.
        let exp = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0);
        assert_eq_m128(r, exp);

        let underflow = _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0;
        assert_eq!(underflow, true);
    }
3335
3336 #[simd_test(enable = "sse")]
3337 unsafe fn test_MM_TRANSPOSE4_PS() {
3338 let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3339 let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3340 let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
3341 let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
3342
3343 _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d);
3344
3345 assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0));
3346 assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0));
3347 assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0));
3348 assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0));
3349 }
3350
    // 16-byte-aligned backing storage for tests that need an aligned
    // destination (e.g. the non-temporal `_mm_stream_ps`).
    #[repr(align(16))]
    struct Memory {
        pub data: [f32; 4],
    }
3355
    #[simd_test(enable = "sse")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_ps() {
        // Non-temporal store of all four lanes to 16-byte-aligned memory;
        // afterwards the destination must hold the vector's contents.
        let a = _mm_set1_ps(7.0);
        let mut mem = Memory { data: [-1.0; 4] };

        _mm_stream_ps(&mut mem.data[0] as *mut f32, a);
        for i in 0..4 {
            assert_eq!(mem.data[i], get_m128(a, i));
        }
    }
3369}
3370