1//! Streaming SIMD Extensions (SSE)
2
3use crate::{
4 core_arch::{simd::*, x86::*},
5 intrinsics::simd::*,
6 intrinsics::sqrtf32,
7 mem, ptr,
8};
9
10#[cfg(test)]
11use stdarch_test::assert_instr;
12
13/// Adds the first component of `a` and `b`, the other components are copied
14/// from `a`.
15///
16/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss)
17#[inline]
18#[target_feature(enable = "sse")]
19#[cfg_attr(test, assert_instr(addss))]
20#[stable(feature = "simd_x86", since = "1.27.0")]
21pub fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
22 unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) + _mm_cvtss_f32(b)) }
23}
24
25/// Adds packed single-precision (32-bit) floating-point elements in `a` and
26/// `b`.
27///
28/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps)
29#[inline]
30#[target_feature(enable = "sse")]
31#[cfg_attr(test, assert_instr(addps))]
32#[stable(feature = "simd_x86", since = "1.27.0")]
33pub fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
34 unsafe { simd_add(x:a, y:b) }
35}
36
37/// Subtracts the first component of `b` from `a`, the other components are
38/// copied from `a`.
39///
40/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss)
41#[inline]
42#[target_feature(enable = "sse")]
43#[cfg_attr(test, assert_instr(subss))]
44#[stable(feature = "simd_x86", since = "1.27.0")]
45pub fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
46 unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) - _mm_cvtss_f32(b)) }
47}
48
49/// Subtracts packed single-precision (32-bit) floating-point elements in `a` and
50/// `b`.
51///
52/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps)
53#[inline]
54#[target_feature(enable = "sse")]
55#[cfg_attr(test, assert_instr(subps))]
56#[stable(feature = "simd_x86", since = "1.27.0")]
57pub fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
58 unsafe { simd_sub(lhs:a, rhs:b) }
59}
60
61/// Multiplies the first component of `a` and `b`, the other components are
62/// copied from `a`.
63///
64/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss)
65#[inline]
66#[target_feature(enable = "sse")]
67#[cfg_attr(test, assert_instr(mulss))]
68#[stable(feature = "simd_x86", since = "1.27.0")]
69pub fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
70 unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) * _mm_cvtss_f32(b)) }
71}
72
73/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
74/// `b`.
75///
76/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps)
77#[inline]
78#[target_feature(enable = "sse")]
79#[cfg_attr(test, assert_instr(mulps))]
80#[stable(feature = "simd_x86", since = "1.27.0")]
81pub fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
82 unsafe { simd_mul(x:a, y:b) }
83}
84
85/// Divides the first component of `b` by `a`, the other components are
86/// copied from `a`.
87///
88/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss)
89#[inline]
90#[target_feature(enable = "sse")]
91#[cfg_attr(test, assert_instr(divss))]
92#[stable(feature = "simd_x86", since = "1.27.0")]
93pub fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
94 unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) / _mm_cvtss_f32(b)) }
95}
96
97/// Divides packed single-precision (32-bit) floating-point elements in `a` and
98/// `b`.
99///
100/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps)
101#[inline]
102#[target_feature(enable = "sse")]
103#[cfg_attr(test, assert_instr(divps))]
104#[stable(feature = "simd_x86", since = "1.27.0")]
105pub fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
106 unsafe { simd_div(lhs:a, rhs:b) }
107}
108
109/// Returns the square root of the first single-precision (32-bit)
110/// floating-point element in `a`, the other elements are unchanged.
111///
112/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss)
113#[inline]
114#[target_feature(enable = "sse")]
115#[cfg_attr(test, assert_instr(sqrtss))]
116#[stable(feature = "simd_x86", since = "1.27.0")]
117pub fn _mm_sqrt_ss(a: __m128) -> __m128 {
118 unsafe { simd_insert!(a, 0, sqrtf32(_mm_cvtss_f32(a))) }
119}
120
/// Returns the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_ps(a: __m128) -> __m128 {
    // Lane-wise exact square root via the portable `simd_fsqrt` intrinsic,
    // which lowers to `sqrtps` (unlike `_mm_rsqrt_ps`, this is not an
    // approximation).
    unsafe { simd_fsqrt(a) }
}
132
/// Returns the approximate reciprocal of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rcp_ss(a: __m128) -> __m128 {
    // Thin wrapper over the `rcpss` hardware intrinsic; the result is an
    // approximation (see Intel's docs for the guaranteed relative-error
    // bound), not an exact division.
    unsafe { rcpss(a) }
}
144
/// Returns the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rcp_ps(a: __m128) -> __m128 {
    // Thin wrapper over the `rcpps` hardware intrinsic; approximate, not
    // an exact lane-wise division.
    unsafe { rcpps(a) }
}
156
/// Returns the approximate reciprocal square root of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rsqrt_ss(a: __m128) -> __m128 {
    // Thin wrapper over the `rsqrtss` hardware intrinsic; approximate (see
    // Intel's docs for the relative-error bound).
    unsafe { rsqrtss(a) }
}
168
/// Returns the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rsqrt_ps(a: __m128) -> __m128 {
    // Thin wrapper over the `rsqrtps` hardware intrinsic; approximate, not
    // an exact `1.0 / sqrt(x)` per lane.
    unsafe { rsqrtps(a) }
}
180
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the minimum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
    // Delegates to the `minss` hardware intrinsic directly. NOTE(review):
    // `minss` is asymmetric with respect to NaN and -0.0 (it behaves like
    // `if a < b { a } else { b }` per Intel's docs), which is why a generic
    // float-min is not used here.
    unsafe { minss(a, b) }
}
193
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`.
    // (In short: `minps` is not IEEE fmin — when an input is NaN or the
    // inputs are ±0.0, the hardware returns the second operand.)
    unsafe { minps(a, b) }
}
206
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the maximum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
    // Delegates to the `maxss` hardware intrinsic directly; like `minss`,
    // its NaN/±0.0 behavior is asymmetric (see Intel's docs), so a generic
    // float-max is not used.
    unsafe { maxss(a, b) }
}
219
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`.
    // (`maxps` returns the second operand when an input is NaN or when
    // comparing ±0.0, unlike IEEE fmax.)
    unsafe { maxps(a, b) }
}
232
233/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
234///
235/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps)
236#[inline]
237#[target_feature(enable = "sse")]
238// i586 only seems to generate plain `and` instructions, so ignore it.
239#[cfg_attr(
240 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
241 assert_instr(andps)
242)]
243#[stable(feature = "simd_x86", since = "1.27.0")]
244pub fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
245 unsafe {
246 let a: __m128i = mem::transmute(src:a);
247 let b: __m128i = mem::transmute(src:b);
248 mem::transmute(src:simd_and(x:a, y:b))
249 }
250}
251
252/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
253/// elements.
254///
255/// Computes `!a & b` for each bit in `a` and `b`.
256///
257/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps)
258#[inline]
259#[target_feature(enable = "sse")]
260// i586 only seems to generate plain `not` and `and` instructions, so ignore
261// it.
262#[cfg_attr(
263 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
264 assert_instr(andnps)
265)]
266#[stable(feature = "simd_x86", since = "1.27.0")]
267pub fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
268 unsafe {
269 let a: __m128i = mem::transmute(src:a);
270 let b: __m128i = mem::transmute(src:b);
271 let mask: __m128i = mem::transmute(src:i32x4::splat(-1));
272 mem::transmute(src:simd_and(x:simd_xor(mask, a), y:b))
273 }
274}
275
276/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
277///
278/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps)
279#[inline]
280#[target_feature(enable = "sse")]
281// i586 only seems to generate plain `or` instructions, so we ignore it.
282#[cfg_attr(
283 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
284 assert_instr(orps)
285)]
286#[stable(feature = "simd_x86", since = "1.27.0")]
287pub fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
288 unsafe {
289 let a: __m128i = mem::transmute(src:a);
290 let b: __m128i = mem::transmute(src:b);
291 mem::transmute(src:simd_or(x:a, y:b))
292 }
293}
294
295/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
296/// elements.
297///
298/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps)
299#[inline]
300#[target_feature(enable = "sse")]
301// i586 only seems to generate plain `xor` instructions, so we ignore it.
302#[cfg_attr(
303 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
304 assert_instr(xorps)
305)]
306#[stable(feature = "simd_x86", since = "1.27.0")]
307pub fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
308 unsafe {
309 let a: __m128i = mem::transmute(src:a);
310 let b: __m128i = mem::transmute(src:b);
311 mem::transmute(src:simd_xor(x:a, y:b))
312 }
313}
314
315/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
316/// the result will be `0xffffffff` if the two inputs are equal, or `0`
317/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
318///
319/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss)
320#[inline]
321#[target_feature(enable = "sse")]
322#[cfg_attr(test, assert_instr(cmpeqss))]
323#[stable(feature = "simd_x86", since = "1.27.0")]
324pub fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
325 unsafe { cmpss(a, b, imm8:0) }
326}
327
328/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
329/// of the result will be `0xffffffff` if `a.extract(0)` is less than
330/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
331/// upper 96 bits of `a`.
332///
333/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss)
334#[inline]
335#[target_feature(enable = "sse")]
336#[cfg_attr(test, assert_instr(cmpltss))]
337#[stable(feature = "simd_x86", since = "1.27.0")]
338pub fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
339 unsafe { cmpss(a, b, imm8:1) }
340}
341
342/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
343/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
344/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
345/// are the upper 96 bits of `a`.
346///
347/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss)
348#[inline]
349#[target_feature(enable = "sse")]
350#[cfg_attr(test, assert_instr(cmpless))]
351#[stable(feature = "simd_x86", since = "1.27.0")]
352pub fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
353 unsafe { cmpss(a, b, imm8:2) }
354}
355
356/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
357/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
358/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
359/// are the upper 96 bits of `a`.
360///
361/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss)
362#[inline]
363#[target_feature(enable = "sse")]
364#[cfg_attr(test, assert_instr(cmpltss))]
365#[stable(feature = "simd_x86", since = "1.27.0")]
366pub fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
367 unsafe { simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3]) }
368}
369
370/// Compares the lowest `f32` of both inputs for greater than or equal. The
371/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
372/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
373/// of the result are the upper 96 bits of `a`.
374///
375/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss)
376#[inline]
377#[target_feature(enable = "sse")]
378#[cfg_attr(test, assert_instr(cmpless))]
379#[stable(feature = "simd_x86", since = "1.27.0")]
380pub fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
381 unsafe { simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3]) }
382}
383
384/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
385/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
386/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
387/// upper 96 bits of `a`.
388///
389/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss)
390#[inline]
391#[target_feature(enable = "sse")]
392#[cfg_attr(test, assert_instr(cmpneqss))]
393#[stable(feature = "simd_x86", since = "1.27.0")]
394pub fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
395 unsafe { cmpss(a, b, imm8:4) }
396}
397
398/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
399/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
400/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
401/// upper 96 bits of `a`.
402///
403/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss)
404#[inline]
405#[target_feature(enable = "sse")]
406#[cfg_attr(test, assert_instr(cmpnltss))]
407#[stable(feature = "simd_x86", since = "1.27.0")]
408pub fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
409 unsafe { cmpss(a, b, imm8:5) }
410}
411
412/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
413/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
414/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
415/// of the result are the upper 96 bits of `a`.
416///
417/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss)
418#[inline]
419#[target_feature(enable = "sse")]
420#[cfg_attr(test, assert_instr(cmpnless))]
421#[stable(feature = "simd_x86", since = "1.27.0")]
422pub fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
423 unsafe { cmpss(a, b, imm8:6) }
424}
425
426/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
427/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
428/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
429/// the upper 96 bits of `a`.
430///
431/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss)
432#[inline]
433#[target_feature(enable = "sse")]
434#[cfg_attr(test, assert_instr(cmpnltss))]
435#[stable(feature = "simd_x86", since = "1.27.0")]
436pub fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
437 unsafe { simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3]) }
438}
439
440/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
441/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
442/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
443/// bits of the result are the upper 96 bits of `a`.
444///
445/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss)
446#[inline]
447#[target_feature(enable = "sse")]
448#[cfg_attr(test, assert_instr(cmpnless))]
449#[stable(feature = "simd_x86", since = "1.27.0")]
450pub fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
451 unsafe { simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3]) }
452}
453
454/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
455/// the result will be `0xffffffff` if neither of `a.extract(0)` or
456/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
457/// are the upper 96 bits of `a`.
458///
459/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss)
460#[inline]
461#[target_feature(enable = "sse")]
462#[cfg_attr(test, assert_instr(cmpordss))]
463#[stable(feature = "simd_x86", since = "1.27.0")]
464pub fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
465 unsafe { cmpss(a, b, imm8:7) }
466}
467
468/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
469/// of the result will be `0xffffffff` if any of `a.extract(0)` or
470/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
471/// are the upper 96 bits of `a`.
472///
473/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss)
474#[inline]
475#[target_feature(enable = "sse")]
476#[cfg_attr(test, assert_instr(cmpunordss))]
477#[stable(feature = "simd_x86", since = "1.27.0")]
478pub fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
479 unsafe { cmpss(a, b, imm8:3) }
480}
481
482/// Compares each of the four floats in `a` to the corresponding element in `b`.
483/// The result in the output vector will be `0xffffffff` if the input elements
484/// were equal, or `0` otherwise.
485///
486/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps)
487#[inline]
488#[target_feature(enable = "sse")]
489#[cfg_attr(test, assert_instr(cmpeqps))]
490#[stable(feature = "simd_x86", since = "1.27.0")]
491pub fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
492 unsafe { cmpps(a, b, imm8:0) }
493}
494
495/// Compares each of the four floats in `a` to the corresponding element in `b`.
496/// The result in the output vector will be `0xffffffff` if the input element
497/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
498///
499/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps)
500#[inline]
501#[target_feature(enable = "sse")]
502#[cfg_attr(test, assert_instr(cmpltps))]
503#[stable(feature = "simd_x86", since = "1.27.0")]
504pub fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
505 unsafe { cmpps(a, b, imm8:1) }
506}
507
508/// Compares each of the four floats in `a` to the corresponding element in `b`.
509/// The result in the output vector will be `0xffffffff` if the input element
510/// in `a` is less than or equal to the corresponding element in `b`, or `0`
511/// otherwise.
512///
513/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps)
514#[inline]
515#[target_feature(enable = "sse")]
516#[cfg_attr(test, assert_instr(cmpleps))]
517#[stable(feature = "simd_x86", since = "1.27.0")]
518pub fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
519 unsafe { cmpps(a, b, imm8:2) }
520}
521
522/// Compares each of the four floats in `a` to the corresponding element in `b`.
523/// The result in the output vector will be `0xffffffff` if the input element
524/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
525///
526/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps)
527#[inline]
528#[target_feature(enable = "sse")]
529#[cfg_attr(test, assert_instr(cmpltps))]
530#[stable(feature = "simd_x86", since = "1.27.0")]
531pub fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
532 unsafe { cmpps(a:b, b:a, imm8:1) }
533}
534
535/// Compares each of the four floats in `a` to the corresponding element in `b`.
536/// The result in the output vector will be `0xffffffff` if the input element
537/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
538/// otherwise.
539///
540/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps)
541#[inline]
542#[target_feature(enable = "sse")]
543#[cfg_attr(test, assert_instr(cmpleps))]
544#[stable(feature = "simd_x86", since = "1.27.0")]
545pub fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
546 unsafe { cmpps(a:b, b:a, imm8:2) }
547}
548
549/// Compares each of the four floats in `a` to the corresponding element in `b`.
550/// The result in the output vector will be `0xffffffff` if the input elements
551/// are **not** equal, or `0` otherwise.
552///
553/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps)
554#[inline]
555#[target_feature(enable = "sse")]
556#[cfg_attr(test, assert_instr(cmpneqps))]
557#[stable(feature = "simd_x86", since = "1.27.0")]
558pub fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
559 unsafe { cmpps(a, b, imm8:4) }
560}
561
562/// Compares each of the four floats in `a` to the corresponding element in `b`.
563/// The result in the output vector will be `0xffffffff` if the input element
564/// in `a` is **not** less than the corresponding element in `b`, or `0`
565/// otherwise.
566///
567/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps)
568#[inline]
569#[target_feature(enable = "sse")]
570#[cfg_attr(test, assert_instr(cmpnltps))]
571#[stable(feature = "simd_x86", since = "1.27.0")]
572pub fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
573 unsafe { cmpps(a, b, imm8:5) }
574}
575
576/// Compares each of the four floats in `a` to the corresponding element in `b`.
577/// The result in the output vector will be `0xffffffff` if the input element
578/// in `a` is **not** less than or equal to the corresponding element in `b`, or
579/// `0` otherwise.
580///
581/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps)
582#[inline]
583#[target_feature(enable = "sse")]
584#[cfg_attr(test, assert_instr(cmpnleps))]
585#[stable(feature = "simd_x86", since = "1.27.0")]
586pub fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
587 unsafe { cmpps(a, b, imm8:6) }
588}
589
590/// Compares each of the four floats in `a` to the corresponding element in `b`.
591/// The result in the output vector will be `0xffffffff` if the input element
592/// in `a` is **not** greater than the corresponding element in `b`, or `0`
593/// otherwise.
594///
595/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps)
596#[inline]
597#[target_feature(enable = "sse")]
598#[cfg_attr(test, assert_instr(cmpnltps))]
599#[stable(feature = "simd_x86", since = "1.27.0")]
600pub fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
601 unsafe { cmpps(a:b, b:a, imm8:5) }
602}
603
604/// Compares each of the four floats in `a` to the corresponding element in `b`.
605/// The result in the output vector will be `0xffffffff` if the input element
606/// in `a` is **not** greater than or equal to the corresponding element in `b`,
607/// or `0` otherwise.
608///
609/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
610#[inline]
611#[target_feature(enable = "sse")]
612#[cfg_attr(test, assert_instr(cmpnleps))]
613#[stable(feature = "simd_x86", since = "1.27.0")]
614pub fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
615 unsafe { cmpps(a:b, b:a, imm8:6) }
616}
617
618/// Compares each of the four floats in `a` to the corresponding element in `b`.
619/// Returns four floats that have one of two possible bit patterns. The element
620/// in the output vector will be `0xffffffff` if the input elements in `a` and
621/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
622///
623/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
624#[inline]
625#[target_feature(enable = "sse")]
626#[cfg_attr(test, assert_instr(cmpordps))]
627#[stable(feature = "simd_x86", since = "1.27.0")]
628pub fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
629 unsafe { cmpps(a:b, b:a, imm8:7) }
630}
631
632/// Compares each of the four floats in `a` to the corresponding element in `b`.
633/// Returns four floats that have one of two possible bit patterns. The element
634/// in the output vector will be `0xffffffff` if the input elements in `a` and
635/// `b` are unordered (i.e., at least on of them is a NaN), or 0 otherwise.
636///
637/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
638#[inline]
639#[target_feature(enable = "sse")]
640#[cfg_attr(test, assert_instr(cmpunordps))]
641#[stable(feature = "simd_x86", since = "1.27.0")]
642pub fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
643 unsafe { cmpps(a:b, b:a, imm8:3) }
644}
645
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `comiss`-based intrinsic. Unlike the
    // `_mm_ucomi*` variants below, this one is documented as signaling
    // on quiet-NaN inputs.
    unsafe { comieq_ss(a, b) }
}
657
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `comiss`-based intrinsic (signaling variant;
    // compare with `_mm_ucomilt_ss`).
    unsafe { comilt_ss(a, b) }
}
669
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `comiss`-based intrinsic (signaling variant;
    // compare with `_mm_ucomile_ss`).
    unsafe { comile_ss(a, b) }
}
682
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `comiss`-based intrinsic (signaling variant;
    // compare with `_mm_ucomigt_ss`).
    unsafe { comigt_ss(a, b) }
}
695
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `comiss`-based intrinsic (signaling variant).
    unsafe { comige_ss(a, b) }
}
708
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `comiss`-based intrinsic (signaling variant).
    unsafe { comineq_ss(a, b) }
}
720
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise. This instruction will not signal
/// an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `ucomiss`-based intrinsic (quiet / non-signaling
    // counterpart of `_mm_comieq_ss`).
    unsafe { ucomieq_ss(a, b) }
}
733
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
/// This instruction will not signal an exception if either argument is a quiet
/// NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `ucomiss`-based intrinsic (quiet counterpart
    // of `_mm_comilt_ss`).
    unsafe { ucomilt_ss(a, b) }
}
747
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
    // Forwards to the raw `ucomile_ss` intrinsic binding (declared elsewhere in this file).
    unsafe { ucomile_ss(a, b) }
}
761
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
    // Forwards to the raw `ucomigt_ss` intrinsic binding (declared elsewhere in this file).
    unsafe { ucomigt_ss(a, b) }
}
775
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise. This instruction will not signal an exception if either
/// argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
    // Forwards to the raw `ucomige_ss` intrinsic binding (declared elsewhere in this file).
    unsafe { ucomige_ss(a, b) }
}
789
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise. This instruction will not
/// signal an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
    // Forwards to the raw `ucomineq_ss` intrinsic binding (declared elsewhere in this file).
    unsafe { ucomineq_ss(a, b) }
}
802
/// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
///
/// The result is rounded according to the current rounding mode. If the result
/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
/// (`i32::MIN`).
///
/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtss_si32(a: __m128) -> i32 {
    // Forwards to the raw `cvtss2si` intrinsic binding (declared elsewhere in this file).
    unsafe { cvtss2si(a) }
}
819
/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvt_ss2si(a: __m128) -> i32 {
    // Identical behavior; this name is kept for the classic SSE naming scheme.
    _mm_cvtss_si32(a)
}
830
/// Converts the lowest 32 bit float in the input vector to a 32 bit integer
/// with
/// truncation.
///
/// The result is rounded always using truncation (round towards zero). If the
/// result cannot be represented as a 32 bit integer the result will be
/// `0x8000_0000` (`i32::MIN`).
///
/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttss_si32(a: __m128) -> i32 {
    // Forwards to the raw `cvttss2si` intrinsic binding (declared elsewhere in this file).
    unsafe { cvttss2si(a) }
}
849
/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtt_ss2si(a: __m128) -> i32 {
    // Identical behavior; this name is kept for the classic SSE naming scheme.
    _mm_cvttss_si32(a)
}
860
/// Extracts the lowest 32 bit float from the input vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32)
#[inline]
#[target_feature(enable = "sse")]
// No point in using assert_instrs. In Unix x86_64 calling convention this is a
// no-op, and on msvc it's just a `mov`.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtss_f32(a: __m128) -> f32 {
    // Lane 0 is the lowest 32 bits of the vector.
    unsafe { simd_extract!(a, 0) }
}
872
/// Converts a 32 bit integer to a 32 bit float. The result vector is the input
/// vector `a` with the lowest 32 bit float replaced by the converted integer.
///
/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
/// input).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
    // Forwards to the raw `cvtsi2ss` intrinsic binding (declared elsewhere in this file).
    unsafe { cvtsi2ss(a, b) }
}
887
/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
    // Identical behavior; this name is kept for the classic SSE naming scheme.
    _mm_cvtsi32_ss(a, b)
}
898
/// Construct a `__m128` with the lowest element set to `a` and the rest set to
/// zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_ss(a: f32) -> __m128 {
    // Array element 0 is the lowest lane of the vector.
    __m128([a, 0.0, 0.0, 0.0])
}
910
/// Construct a `__m128` with all element set to `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_ps(a: f32) -> __m128 {
    // Broadcast `a` into all four lanes.
    __m128([a, a, a, a])
}
921
/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_ps1(a: f32) -> __m128 {
    // Identical behavior; this name is kept for the classic SSE naming scheme.
    _mm_set1_ps(a)
}
932
/// Construct a `__m128` from four floating point values highest to lowest.
///
/// Note that `a` will be the highest 32 bits of the result, and `d` the
/// lowest. This matches the standard way of writing bit patterns on x86:
///
/// ```text
/// bit 127 .. 96 95 .. 64 63 .. 32 31 .. 0
/// +---------+---------+---------+---------+
/// | a | b | c | d | result
/// +---------+---------+---------+---------+
/// ```
///
/// Alternatively:
///
/// ```text
/// let v = _mm_set_ps(d, c, b, a);
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    // The array is in memory (lowest-lane-first) order, so the arguments
    // are reversed: `d` ends up in lane 0, `a` in lane 3.
    __m128([d, c, b, a])
}
959
/// Construct a `__m128` from four floating point values lowest to highest.
///
/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
/// bits of the result, and `d` the highest.
///
/// ```text
/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, any(target_env = "msvc", target_arch = "x86_64")),
    assert_instr(unpcklps)
)]
// On a 32-bit architecture on non-msvc it just copies the operands from the stack.
#[cfg_attr(
    all(test, all(not(target_env = "msvc"), target_arch = "x86")),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    // Arguments are already in memory (lowest-lane-first) order.
    __m128([a, b, c, d])
}
985
/// Construct a `__m128` with all elements initialized to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setzero_ps() -> __m128 {
    // The all-zero bit pattern is a valid `__m128` (four `0.0` floats), so a
    // const-evaluated `mem::zeroed()` is sound here.
    const { unsafe { mem::zeroed() } }
}
996
997/// A utility function for creating masks to use with Intel shuffle and
998/// permute intrinsics.
999#[inline]
1000#[allow(non_snake_case)]
1001#[unstable(feature = "stdarch_x86_mm_shuffle", issue = "111147")]
1002pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
1003 ((z << 6) | (y << 4) | (x << 2) | w) as i32
1004}
1005
/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
/// `b` using `MASK`.
///
/// The lower half of result takes values from `a` and the higher half from
/// `b`. Mask is split to 2 control bits each to index the element from inputs.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps)
///
/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
/// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_).
/// Performing an implicit type conversion between an unsigned integer and a signed integer
/// does not cause a problem in C, however Rust's commitment to strong typing does not allow this.
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
    // Only the low 8 bits of MASK are meaningful (four 2-bit selectors).
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        // `simd_shuffle!` indexes lanes of the concatenation [a, b]:
        // 0..=3 select from `a`, 4..=7 select from `b`. The two low result
        // lanes come from `a`, the two high ones from `b` (hence the `+ 4`).
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 4,
                ((MASK as u32 >> 6) & 0b11) + 4,
            ],
        )
    }
}
1039
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the higher half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a2, b2, a3, b3] (indices 4..=7 select from `b`).
    unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) }
}
1051
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the lower half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a0, b0, a1, b1] (indices 4..=7 select from `b`).
    unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) }
}
1063
/// Combine higher half of `a` and `b`. The higher half of `b` occupies the
/// lower half of result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movhlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO; figure why this is a different instruction on msvc?
    // Result lanes: [b2, b3, a2, a3].
    unsafe { simd_shuffle!(a, b, [6, 7, 2, 3]) }
}
1076
/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
/// higher half of result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a0, a1, b0, b1].
    unsafe { simd_shuffle!(a, b, [0, 1, 4, 5]) }
}
1088
1089/// Returns a mask of the most significant bit of each element in `a`.
1090///
1091/// The mask is stored in the 4 least significant bits of the return value.
1092/// All other bits are set to `0`.
1093///
1094/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
1095#[inline]
1096#[target_feature(enable = "sse")]
1097#[cfg_attr(test, assert_instr(movmskps))]
1098#[stable(feature = "simd_x86", since = "1.27.0")]
1099pub fn _mm_movemask_ps(a: __m128) -> i32 {
1100 // Propagate the highest bit to the rest, because simd_bitmask
1101 // requires all-1 or all-0.
1102 unsafe {
1103 let mask: i32x4 = simd_lt(x:transmute(a), y:i32x4::ZERO);
1104 simd_bitmask::<i32x4, u8>(mask).into()
1105 }
1106}
1107
/// Construct a `__m128` with the lowest element read from `p` and the other
/// elements set to zero.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
    // Caller must guarantee `p` is valid for an aligned `f32` read.
    __m128([*p, 0.0, 0.0, 0.0])
}
1121
/// Construct a `__m128` by duplicating the value read from `p` into all
/// elements.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
    // Single scalar read, then broadcast into all four lanes.
    let a: f32 = *p;
    __m128([a, a, a, a])
}
1137
/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
    // Identical behavior; this name is kept for the classic SSE naming scheme.
    _mm_load1_ps(p)
}
1148
/// Loads four `f32` values from *aligned* memory into a `__m128`. If the
/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
    // A plain dereference of a `*const __m128` requires (and here the caller
    // guarantees) 16-byte alignment.
    *(p as *const __m128)
}
1167
1168/// Loads four `f32` values from memory into a `__m128`. There are no
1169/// restrictions
1170/// on memory alignment. For aligned memory
1171/// [`_mm_load_ps`](fn._mm_load_ps.html)
1172/// may be faster.
1173///
1174/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1175///
1176/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps)
1177#[inline]
1178#[target_feature(enable = "sse")]
1179#[cfg_attr(test, assert_instr(movups))]
1180#[stable(feature = "simd_x86", since = "1.27.0")]
1181pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
1182 // Note: Using `*p` would require `f32` alignment, but `movups` has no
1183 // alignment restrictions.
1184 let mut dst: __m128 = _mm_undefined_ps();
1185 ptr::copy_nonoverlapping(
1186 src:p as *const u8,
1187 dst:ptr::addr_of_mut!(dst) as *mut u8,
1188 count:mem::size_of::<__m128>(),
1189 );
1190 dst
1191}
1192
/// Loads four `f32` values from aligned memory into a `__m128` in reverse
/// order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let a0 = *p;
/// let a1 = *p.add(1);
/// let a2 = *p.add(2);
/// let a3 = *p.add(3);
/// __m128::new(a3, a2, a1, a0)
/// ```
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
    // Aligned load, then reverse the four lanes.
    let a: __m128 = _mm_load_ps(p);
    simd_shuffle!(a, a, [3, 2, 1, 0])
}
1222
/// Stores the lowest 32 bit float of `a` into memory.
///
/// This intrinsic corresponds to the `MOVSS` instruction.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
    // Caller must guarantee `p` is valid for an aligned `f32` write.
    *p = simd_extract!(a, 0);
}
1235
/// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
/// memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let x = a.extract(0);
/// *p = x;
/// *p.add(1) = x;
/// *p.add(2) = x;
/// *p.add(3) = x;
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
    // Broadcast lane 0 into all lanes, then do one aligned 16-byte store.
    let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]);
    *(p as *mut __m128) = b;
}
1263
/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
    // Identical behavior; this name is kept for the classic SSE naming scheme.
    _mm_store1_ps(p, a);
}
1274
/// Stores four 32-bit floats into *aligned* memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
    // A plain write through a `*mut __m128` requires (and here the caller
    // guarantees) 16-byte alignment.
    *(p as *mut __m128) = a;
}
1294
1295/// Stores four 32-bit floats into memory. There are no restrictions on memory
1296/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
1297/// faster.
1298///
1299/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1300///
1301/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps)
1302#[inline]
1303#[target_feature(enable = "sse")]
1304#[cfg_attr(test, assert_instr(movups))]
1305#[stable(feature = "simd_x86", since = "1.27.0")]
1306pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
1307 ptr::copy_nonoverlapping(
1308 src:ptr::addr_of!(a) as *const u8,
1309 dst:p as *mut u8,
1310 count:mem::size_of::<__m128>(),
1311 );
1312}
1313
/// Stores four 32-bit floats into *aligned* memory in reverse order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// *p = a.extract(3);
/// *p.add(1) = a.extract(2);
/// *p.add(2) = a.extract(1);
/// *p.add(3) = a.extract(0);
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
    // Reverse the lanes first, then do one aligned 16-byte store.
    let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]);
    *(p as *mut __m128) = b;
}
1339
/// Returns a `__m128` with the first component from `b` and the remaining
/// components from `a`.
///
/// In other words for any `a` and `b`:
/// ```text
/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
    // Index 4 selects lane 0 of `b`; indices 1..=3 keep lanes of `a`.
    unsafe { simd_shuffle!(a, b, [4, 1, 2, 3]) }
}
1356
/// Performs a serializing operation on all non-temporal ("streaming") store instructions that
/// were issued by the current thread prior to this instruction.
///
/// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is
/// ordered before any load or store instruction which follows the fence in
/// synchronization order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence)
/// (but note that Intel is only documenting the hardware-level concerns related to this
/// instruction; the Intel documentation does not take into account the extra concerns that arise
/// because the Rust memory model is different from the x86 memory model.)
///
/// # Safety of non-temporal stores
///
/// After using any non-temporal store intrinsic, but before any other access to the memory that the
/// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the
/// intrinsic.
///
/// Non-temporal stores behave very different from regular stores. For the purpose of the Rust
/// memory model, these stores are happening asynchronously in a background thread. This means a
/// non-temporal store can cause data races with other accesses, even other accesses on the same
/// thread. It also means that cross-thread synchronization does not work as expected: let's say the
/// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The
/// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not
/// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize
/// with all the non-temporal stores previously started on this thread, which means in particular
/// that subsequent synchronization with other threads will then work as intended again.
///
/// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your
/// code jumps back to code outside your library. This ensures all stores inside your function
/// are synchronized-before the return, and thus transitively synchronized-before everything
/// the caller does after your function returns.
//
// The following is not a doc comment since it's not clear whether we want to put this into the
// docs, but it should be written out somewhere.
//
// Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot
// inspect, and that behave like the following functions. This explains where the docs above come
// from.
// ```
// #[thread_local]
// static mut PENDING_NONTEMP_WRITES = AtomicUsize::new(0);
//
// pub unsafe fn nontemporal_store<T>(ptr: *mut T, val: T) {
//     PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed);
//     // Spawn a thread that will eventually do our write.
//     // We need to fetch a pointer to this thread's pending-write
//     // counter, so that we can access it from the background thread.
//     let pending_writes = addr_of!(PENDING_NONTEMP_WRITES);
//     // If this was actual Rust code we'd have to do some extra work
//     // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here.
//     std::thread::spawn(move || {
//         // Do the write in the background thread.
//         ptr.write(val);
//         // Register the write as done. Crucially, this is `Release`, so it
//         // syncs-with the `Acquire in `sfence`.
//         (&*pending_writes).fetch_sub(1, Release);
//     });
// }
//
// pub fn sfence() {
//     unsafe {
//         // Wait until there are no more pending writes.
//         while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {}
//     }
// }
// ```
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sfence() {
    // Forwards to the raw `sfence` intrinsic binding (declared elsewhere in this file).
    sfence()
}
1431
/// Gets the unsigned 32-bit value of the MXCSR control and status register.
///
/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust
/// floating-point operations may or may not result in this register getting updated with exception
/// state, and the register can change between two invocations of this function even when no
/// floating-point operations appear in the source code (since floating-point operations appearing
/// earlier or later can be reordered).
///
/// If you need to perform some floating-point operations and check whether they raised an
/// exception, use an inline assembly block for the entire sequence of operations.
///
/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(stmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[deprecated(
    since = "1.75.0",
    note = "see `_mm_getcsr` documentation - use inline assembly instead"
)]
pub unsafe fn _mm_getcsr() -> u32 {
    unsafe {
        // `stmxcsr` writes the 32-bit MXCSR value through the given pointer,
        // so stage it in a local and return it as `u32`.
        let mut result: i32 = 0_i32;
        stmxcsr(ptr::addr_of_mut!(result) as *mut i8);
        result as u32
    }
}
1461
1462/// Sets the MXCSR register with the 32-bit unsigned integer value.
1463///
1464/// This register controls how SIMD instructions handle floating point
1465/// operations. Modifying this register only affects the current thread.
1466///
1467/// It contains several groups of flags:
1468///
1469/// * *Exception flags* report which exceptions occurred since last they were reset.
1470///
1471/// * *Masking flags* can be used to mask (ignore) certain exceptions. By default
1472/// these flags are all set to 1, so all exceptions are masked. When
1473/// an exception is masked, the processor simply sets the exception flag and
1474/// continues the operation. If the exception is unmasked, the flag is also set
1475/// but additionally an exception handler is invoked.
1476///
1477/// * *Rounding mode flags* control the rounding mode of floating point
1478/// instructions.
1479///
1480/// * The *denormals-are-zero mode flag* turns all numbers which would be
1481/// denormalized (exponent bits are all zeros) into zeros.
1482///
1483/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to
1484/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and
1485/// will optimize accordingly. This even applies when the register is altered and later reset to its
1486/// original value without any floating-point operations appearing in the source code between those
1487/// operations (since floating-point operations appearing earlier or later can be reordered).
1488///
1489/// If you need to perform some floating-point operations under a different masking flags, rounding
1490/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the
1491/// original MXCSR register state before the end of the block.
1492///
1493/// ## Exception Flags
1494///
1495/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
1496/// Infinity by Infinity).
1497///
1498/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
1499/// number. Mainly this can cause loss of precision.
1500///
1501/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
1502///
1503/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
1504/// result was too large to be represented (e.g., an `f32` with absolute
1505/// value greater than `2^128`).
1506///
1507/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
1508/// result was too small to be represented in a normalized way (e.g., an
1509/// `f32` with absolute value smaller than `2^-126`.)
1510///
1511/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
1512/// precision exception). This means some precision was lost due to rounding.
1513/// For example, the fraction `1/3` cannot be represented accurately in a
1514/// 32 or 64 bit float and computing it would cause this exception to be
1515/// raised. Precision exceptions are very common, so they are usually masked.
1516///
1517/// Exception flags can be read and set using the convenience functions
1518/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
1519/// check if an operation caused some overflow:
1520///
1521/// ```rust,ignore
1522/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
1523/// // perform calculations
1524/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
1525/// // handle overflow
1526/// }
1527/// ```
1528///
1529/// ## Masking Flags
1530///
1531/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
1532/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
1533/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
1534///
1535/// A single masking bit can be set via
1536///
1537/// ```rust,ignore
1538/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
1539/// ```
1540///
1541/// However, since mask bits are by default all set to 1, it is more common to
1542/// want to *disable* certain bits. For example, to unmask the underflow
1543/// exception, use:
1544///
1545/// ```rust,ignore
/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception
1548/// ```
1549///
1550/// Warning: an unmasked exception will cause an exception handler to be
1551/// called.
1552/// The standard handler will simply terminate the process. So, in this case
1553/// any underflow exception would terminate the current process with something
1554/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1555///
1556/// ## Rounding Mode
1557///
/// The rounding mode is described using two bits. It can be read and set using
1559/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
1560/// `_MM_SET_ROUNDING_MODE(mode)`.
1561///
1562/// The rounding modes are:
1563///
1564/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
1565/// value. If two values are equally close, round to even (i.e., least
1566/// significant bit will be zero).
1567///
1568/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
1569///
1570/// * `_MM_ROUND_UP`: Round toward positive Infinity.
1571///
1572/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
1573///
1574/// Example:
1575///
1576/// ```rust,ignore
1577/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
1578/// ```
1579///
1580/// ## Denormals-are-zero/Flush-to-zero Mode
1581///
1582/// If this bit is set, values that would be denormalized will be set to zero
1583/// instead. This is turned off by default.
1584///
1585/// You can read and enable/disable this mode via the helper functions
1586/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
1587///
1588/// ```rust,ignore
1589/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
1590/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
1591/// ```
1592///
1593///
1594/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr)
1595#[inline]
1596#[target_feature(enable = "sse")]
1597#[cfg_attr(test, assert_instr(ldmxcsr))]
1598#[stable(feature = "simd_x86", since = "1.27.0")]
1599#[deprecated(
1600 since = "1.75.0",
1601 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1602)]
1603pub unsafe fn _mm_setcsr(val: u32) {
1604 ldmxcsr(ptr::addr_of!(val) as *const i8);
1605}
1606
// Exception status flags: each bit records that the corresponding FP exception
// occurred since the flag was last cleared (low bits of MXCSR).

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
// Union of all six exception status bits above.
pub const _MM_EXCEPT_MASK: u32 = 0x003f;

// Exception masking flags: a set bit suppresses (masks) the corresponding
// exception; all are set by default.

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INVALID: u32 = 0x0080;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DENORM: u32 = 0x0100;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INEXACT: u32 = 0x1000;
/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
// Union of all six masking bits above.
pub const _MM_MASK_MASK: u32 = 0x1f80;

// Rounding-mode selectors: a two-bit field of MXCSR.

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_NEAREST: u32 = 0x0000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_DOWN: u32 = 0x2000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_UP: u32 = 0x4000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;

/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_MASK: u32 = 0x6000;

// Flush-to-zero mode: a single bit of MXCSR.

/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
1677
1678/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1679///
1680/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK)
1681#[inline]
1682#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1683#[allow(non_snake_case)]
1684#[target_feature(enable = "sse")]
1685#[stable(feature = "simd_x86", since = "1.27.0")]
1686#[deprecated(
1687 since = "1.75.0",
1688 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1689)]
1690pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
1691 _mm_getcsr() & _MM_MASK_MASK
1692}
1693
1694/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1695///
1696/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE)
1697#[inline]
1698#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1699#[allow(non_snake_case)]
1700#[target_feature(enable = "sse")]
1701#[stable(feature = "simd_x86", since = "1.27.0")]
1702#[deprecated(
1703 since = "1.75.0",
1704 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1705)]
1706pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
1707 _mm_getcsr() & _MM_EXCEPT_MASK
1708}
1709
1710/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1711///
1712/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE)
1713#[inline]
1714#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1715#[allow(non_snake_case)]
1716#[target_feature(enable = "sse")]
1717#[stable(feature = "simd_x86", since = "1.27.0")]
1718#[deprecated(
1719 since = "1.75.0",
1720 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1721)]
1722pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
1723 _mm_getcsr() & _MM_FLUSH_ZERO_MASK
1724}
1725
1726/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1727///
1728/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE)
1729#[inline]
1730#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1731#[allow(non_snake_case)]
1732#[target_feature(enable = "sse")]
1733#[stable(feature = "simd_x86", since = "1.27.0")]
1734#[deprecated(
1735 since = "1.75.0",
1736 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1737)]
1738pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
1739 _mm_getcsr() & _MM_ROUND_MASK
1740}
1741
1742/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1743///
1744/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK)
1745#[inline]
1746#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1747#[allow(non_snake_case)]
1748#[target_feature(enable = "sse")]
1749#[stable(feature = "simd_x86", since = "1.27.0")]
1750#[deprecated(
1751 since = "1.75.0",
1752 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1753)]
1754pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
1755 _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | (x & _MM_MASK_MASK))
1756}
1757
1758/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1759///
1760/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE)
1761#[inline]
1762#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1763#[allow(non_snake_case)]
1764#[target_feature(enable = "sse")]
1765#[stable(feature = "simd_x86", since = "1.27.0")]
1766#[deprecated(
1767 since = "1.75.0",
1768 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1769)]
1770pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
1771 _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | (x & _MM_EXCEPT_MASK))
1772}
1773
1774/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1775///
1776/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE)
1777#[inline]
1778#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1779#[allow(non_snake_case)]
1780#[target_feature(enable = "sse")]
1781#[stable(feature = "simd_x86", since = "1.27.0")]
1782#[deprecated(
1783 since = "1.75.0",
1784 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1785)]
1786pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
1787 _mm_setcsr((_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | (x & _MM_FLUSH_ZERO_MASK))
1788}
1789
1790/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1791///
1792/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE)
1793#[inline]
1794#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1795#[allow(non_snake_case)]
1796#[target_feature(enable = "sse")]
1797#[stable(feature = "simd_x86", since = "1.27.0")]
1798#[deprecated(
1799 since = "1.75.0",
1800 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1801)]
1802pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
1803 _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | (x & _MM_ROUND_MASK))
1804}
1805
// Prefetch STRATEGY encoding (see `_mm_prefetch`): bit 2 selects the `rw`
// hint (0 = read, 1 = write) and bits 0..=1 the `locality` level passed to
// `llvm.prefetch`.

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T0: i32 = 3;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T1: i32 = 2;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T2: i32 = 1;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_NTA: i32 = 0;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
// Same locality as T0 but with the write (bit 2) hint set.
pub const _MM_HINT_ET0: i32 = 7;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
// Same locality as T1 but with the write (bit 2) hint set.
pub const _MM_HINT_ET1: i32 = 6;
1829
1830/// Fetch the cache line that contains address `p` using the given `STRATEGY`.
1831///
1832/// The `STRATEGY` must be one of:
1833///
1834/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
1835/// cache hierarchy.
1836///
1837/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
1838///
1839/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
1840/// an implementation-specific choice (e.g., L2 if there is no L3).
1841///
1842/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
1843/// non-temporal access (NTA) hint. It may be a place closer than main memory
1844/// but outside of the cache hierarchy. This is used to reduce access latency
1845/// without polluting the cache.
1846///
1847/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
1848/// [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
1849/// and `_MM_HINT_T1` but indicate an anticipation to write to the address.
1850///
1851/// The actual implementation depends on the particular CPU. This instruction
1852/// is considered a hint, so the CPU is also free to simply ignore the request.
1853///
1854/// The amount of prefetched data depends on the cache line size of the
1855/// specific CPU, but it will be at least 32 bytes.
1856///
1857/// Common caveats:
1858///
1859/// * Most modern CPUs already automatically prefetch data based on predicted
1860/// access patterns.
1861///
1862/// * Data is usually not fetched if this would cause a TLB miss or a page
1863/// fault.
1864///
1865/// * Too much prefetching can cause unnecessary cache evictions.
1866///
1867/// * Prefetching may also fail if there are not enough memory-subsystem
1868/// resources (e.g., request buffers).
1869///
1870///
1871/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
1872#[inline]
1873#[target_feature(enable = "sse")]
1874#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
1875#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
1876#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
1877#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
1878#[rustc_legacy_const_generics(1)]
1879#[stable(feature = "simd_x86", since = "1.27.0")]
1880pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
1881 static_assert_uimm_bits!(STRATEGY, 3);
1882 // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
1883 // `locality` and `rw` are based on our `STRATEGY`.
1884 prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, ty:1);
1885}
1886
1887/// Returns vector of type __m128 with indeterminate elements.
1888/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
1889/// In practice, this is equivalent to [`mem::zeroed`].
1890///
1891/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
1892#[inline]
1893#[target_feature(enable = "sse")]
1894#[stable(feature = "simd_x86", since = "1.27.0")]
1895pub fn _mm_undefined_ps() -> __m128 {
1896 const { unsafe { mem::zeroed() } }
1897}
1898
1899/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
1900///
1901/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS)
1902#[inline]
1903#[allow(non_snake_case)]
1904#[target_feature(enable = "sse")]
1905#[stable(feature = "simd_x86", since = "1.27.0")]
1906pub fn _MM_TRANSPOSE4_PS(
1907 row0: &mut __m128,
1908 row1: &mut __m128,
1909 row2: &mut __m128,
1910 row3: &mut __m128,
1911) {
1912 let tmp0: __m128 = _mm_unpacklo_ps(*row0, *row1);
1913 let tmp2: __m128 = _mm_unpacklo_ps(*row2, *row3);
1914 let tmp1: __m128 = _mm_unpackhi_ps(*row0, *row1);
1915 let tmp3: __m128 = _mm_unpackhi_ps(*row2, *row3);
1916
1917 *row0 = _mm_movelh_ps(a:tmp0, b:tmp2);
1918 *row1 = _mm_movehl_ps(a:tmp2, b:tmp0);
1919 *row2 = _mm_movelh_ps(a:tmp1, b:tmp3);
1920 *row3 = _mm_movehl_ps(a:tmp3, b:tmp1);
1921}
1922
1923#[allow(improper_ctypes)]
1924unsafe extern "C" {
1925 #[link_name = "llvm.x86.sse.rcp.ss"]
1926 unsafefn rcpss(a: __m128) -> __m128;
1927 #[link_name = "llvm.x86.sse.rcp.ps"]
1928 unsafefn rcpps(a: __m128) -> __m128;
1929 #[link_name = "llvm.x86.sse.rsqrt.ss"]
1930 unsafefn rsqrtss(a: __m128) -> __m128;
1931 #[link_name = "llvm.x86.sse.rsqrt.ps"]
1932 unsafefn rsqrtps(a: __m128) -> __m128;
1933 #[link_name = "llvm.x86.sse.min.ss"]
1934 unsafefn minss(a: __m128, b: __m128) -> __m128;
1935 #[link_name = "llvm.x86.sse.min.ps"]
1936 unsafefn minps(a: __m128, b: __m128) -> __m128;
1937 #[link_name = "llvm.x86.sse.max.ss"]
1938 unsafefn maxss(a: __m128, b: __m128) -> __m128;
1939 #[link_name = "llvm.x86.sse.max.ps"]
1940 unsafefn maxps(a: __m128, b: __m128) -> __m128;
1941 #[link_name = "llvm.x86.sse.cmp.ps"]
1942 unsafefn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
1943 #[link_name = "llvm.x86.sse.comieq.ss"]
1944 unsafefn comieq_ss(a: __m128, b: __m128) -> i32;
1945 #[link_name = "llvm.x86.sse.comilt.ss"]
1946 unsafefn comilt_ss(a: __m128, b: __m128) -> i32;
1947 #[link_name = "llvm.x86.sse.comile.ss"]
1948 unsafefn comile_ss(a: __m128, b: __m128) -> i32;
1949 #[link_name = "llvm.x86.sse.comigt.ss"]
1950 unsafefn comigt_ss(a: __m128, b: __m128) -> i32;
1951 #[link_name = "llvm.x86.sse.comige.ss"]
1952 unsafefn comige_ss(a: __m128, b: __m128) -> i32;
1953 #[link_name = "llvm.x86.sse.comineq.ss"]
1954 unsafefn comineq_ss(a: __m128, b: __m128) -> i32;
1955 #[link_name = "llvm.x86.sse.ucomieq.ss"]
1956 unsafefn ucomieq_ss(a: __m128, b: __m128) -> i32;
1957 #[link_name = "llvm.x86.sse.ucomilt.ss"]
1958 unsafefn ucomilt_ss(a: __m128, b: __m128) -> i32;
1959 #[link_name = "llvm.x86.sse.ucomile.ss"]
1960 unsafefn ucomile_ss(a: __m128, b: __m128) -> i32;
1961 #[link_name = "llvm.x86.sse.ucomigt.ss"]
1962 unsafefn ucomigt_ss(a: __m128, b: __m128) -> i32;
1963 #[link_name = "llvm.x86.sse.ucomige.ss"]
1964 unsafefn ucomige_ss(a: __m128, b: __m128) -> i32;
1965 #[link_name = "llvm.x86.sse.ucomineq.ss"]
1966 unsafefn ucomineq_ss(a: __m128, b: __m128) -> i32;
1967 #[link_name = "llvm.x86.sse.cvtss2si"]
1968 unsafefn cvtss2si(a: __m128) -> i32;
1969 #[link_name = "llvm.x86.sse.cvttss2si"]
1970 unsafefn cvttss2si(a: __m128) -> i32;
1971 #[link_name = "llvm.x86.sse.cvtsi2ss"]
1972 unsafefn cvtsi2ss(a: __m128, b: i32) -> __m128;
1973 #[link_name = "llvm.x86.sse.sfence"]
1974 unsafefn sfence();
1975 #[link_name = "llvm.x86.sse.stmxcsr"]
1976 unsafefn stmxcsr(p: *mut i8);
1977 #[link_name = "llvm.x86.sse.ldmxcsr"]
1978 unsafefn ldmxcsr(p: *const i8);
1979 #[link_name = "llvm.prefetch"]
1980 unsafefn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
1981 #[link_name = "llvm.x86.sse.cmp.ss"]
1982 unsafefn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
1983}
1984
/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
///
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception _may_ be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
    // Emitted as inline asm rather than an LLVM store so the compiler cannot
    // reorder or merge the non-temporal store with ordinary memory accesses.
    // `vps!` presumably builds the `movntps [{p}], {a}` operand string with a
    // memory operand derived from `{p}` — confirm against the macro definition.
    crate::arch::asm!(
        vps!("movntps", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}
2013
2014#[cfg(test)]
2015mod tests {
2016 use crate::{hint::black_box, mem::transmute, ptr};
2017 use std::boxed;
2018 use stdarch_test::simd_test;
2019
2020 use crate::core_arch::{simd::*, x86::*};
2021
2022 const NAN: f32 = f32::NAN;
2023
2024 #[simd_test(enable = "sse")]
2025 unsafe fn test_mm_add_ps() {
2026 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2027 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2028 let r = _mm_add_ps(a, b);
2029 assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
2030 }
2031
2032 #[simd_test(enable = "sse")]
2033 unsafe fn test_mm_add_ss() {
2034 let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
2035 let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
2036 let r = _mm_add_ss(a, b);
2037 assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
2038 }
2039
2040 #[simd_test(enable = "sse")]
2041 unsafe fn test_mm_sub_ps() {
2042 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2043 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2044 let r = _mm_sub_ps(a, b);
2045 assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
2046 }
2047
2048 #[simd_test(enable = "sse")]
2049 unsafe fn test_mm_sub_ss() {
2050 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2051 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2052 let r = _mm_sub_ss(a, b);
2053 assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
2054 }
2055
2056 #[simd_test(enable = "sse")]
2057 unsafe fn test_mm_mul_ps() {
2058 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2059 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2060 let r = _mm_mul_ps(a, b);
2061 assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
2062 }
2063
2064 #[simd_test(enable = "sse")]
2065 unsafe fn test_mm_mul_ss() {
2066 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2067 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2068 let r = _mm_mul_ss(a, b);
2069 assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
2070 }
2071
2072 #[simd_test(enable = "sse")]
2073 unsafe fn test_mm_div_ps() {
2074 let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
2075 let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
2076 let r = _mm_div_ps(a, b);
2077 assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
2078 }
2079
2080 #[simd_test(enable = "sse")]
2081 unsafe fn test_mm_div_ss() {
2082 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2083 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2084 let r = _mm_div_ss(a, b);
2085 assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
2086 }
2087
2088 #[simd_test(enable = "sse")]
2089 unsafe fn test_mm_sqrt_ss() {
2090 let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2091 let r = _mm_sqrt_ss(a);
2092 let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
2093 assert_eq_m128(r, e);
2094 }
2095
2096 #[simd_test(enable = "sse")]
2097 unsafe fn test_mm_sqrt_ps() {
2098 let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2099 let r = _mm_sqrt_ps(a);
2100 let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
2101 assert_eq_m128(r, e);
2102 }
2103
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rcp_ss() {
        // Approximate reciprocal of lane 0 only; lanes 1..=3 must pass
        // through from `a` unchanged, hence the exact equality below.
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rcp_ss(a);
        let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
        // 2^-11: the approximation tolerance used throughout this suite.
        let rel_err = 0.00048828125;
        assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err);
        for i in 1..4 {
            assert_eq!(get_m128(r, i), get_m128(e, i));
        }
    }
2115
2116 #[simd_test(enable = "sse")]
2117 unsafe fn test_mm_rcp_ps() {
2118 let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2119 let r = _mm_rcp_ps(a);
2120 let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
2121 let rel_err = 0.00048828125;
2122 for i in 0..4 {
2123 assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2124 }
2125 }
2126
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rsqrt_ss() {
        // Approximate 1/sqrt of lane 0; lanes 1..=3 pass through from `a`
        // exactly, so the approximate comparison also holds for them.
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rsqrt_ss(a);
        let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
        // 2^-11: the approximation tolerance used throughout this suite.
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }
2137
2138 #[simd_test(enable = "sse")]
2139 unsafe fn test_mm_rsqrt_ps() {
2140 let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2141 let r = _mm_rsqrt_ps(a);
2142 let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
2143 let rel_err = 0.00048828125;
2144 for i in 0..4 {
2145 assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2146 }
2147 }
2148
2149 #[simd_test(enable = "sse")]
2150 unsafe fn test_mm_min_ss() {
2151 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2152 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2153 let r = _mm_min_ss(a, b);
2154 assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
2155 }
2156
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_min_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_min_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));

        // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
        // is lowered by the llvm codegen backend to the `llvm.minnum.v*` llvm intrinsic, which
        // doesn't specify how -0.0 is handled and happens to behave differently from the `minps`
        // x86 instruction. The assertions below pin the operand-order-dependent `minps` behavior
        // bitwise: with `a = -0.0` and `b = +0.0` (which compare equal), `min(a, b)` is bitwise
        // `b` and `min(b, a)` is bitwise `a` — i.e. the second operand is returned.
        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_min_ps(a, b));
        let r2: [u8; 16] = transmute(_mm_min_ps(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }
2179
2180 #[simd_test(enable = "sse")]
2181 unsafe fn test_mm_max_ss() {
2182 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2183 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2184 let r = _mm_max_ss(a, b);
2185 assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
2186 }
2187
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_max_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_max_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));

        // Check SSE-specific semantics for -0.0 handling: as with `minps`, when the operands
        // compare equal (+0.0 == -0.0) the second operand is returned, so the result is bitwise
        // the second argument in both call orders below.
        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_max_ps(a, b));
        let r2: [u8; 16] = transmute(_mm_max_ps(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }
2206
2207 #[simd_test(enable = "sse")]
2208 unsafe fn test_mm_and_ps() {
2209 let a = transmute(u32x4::splat(0b0011));
2210 let b = transmute(u32x4::splat(0b0101));
2211 let r = _mm_and_ps(*black_box(&a), *black_box(&b));
2212 let e = transmute(u32x4::splat(0b0001));
2213 assert_eq_m128(r, e);
2214 }
2215
2216 #[simd_test(enable = "sse")]
2217 unsafe fn test_mm_andnot_ps() {
2218 let a = transmute(u32x4::splat(0b0011));
2219 let b = transmute(u32x4::splat(0b0101));
2220 let r = _mm_andnot_ps(*black_box(&a), *black_box(&b));
2221 let e = transmute(u32x4::splat(0b0100));
2222 assert_eq_m128(r, e);
2223 }
2224
2225 #[simd_test(enable = "sse")]
2226 unsafe fn test_mm_or_ps() {
2227 let a = transmute(u32x4::splat(0b0011));
2228 let b = transmute(u32x4::splat(0b0101));
2229 let r = _mm_or_ps(*black_box(&a), *black_box(&b));
2230 let e = transmute(u32x4::splat(0b0111));
2231 assert_eq_m128(r, e);
2232 }
2233
2234 #[simd_test(enable = "sse")]
2235 unsafe fn test_mm_xor_ps() {
2236 let a = transmute(u32x4::splat(0b0011));
2237 let b = transmute(u32x4::splat(0b0101));
2238 let r = _mm_xor_ps(*black_box(&a), *black_box(&b));
2239 let e = transmute(u32x4::splat(0b0110));
2240 assert_eq_m128(r, e);
2241 }
2242
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpeq_ss() {
        // Lane 0 becomes an all-zeros / all-ones comparison mask; lanes 1..=3
        // are copied from `a`, so bit patterns are compared via `u32x4`.
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
        let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
        let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0));
        assert_eq!(r, e);

        // Equal lane-0 inputs must produce an all-ones mask.
        let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
        let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0));
        assert_eq!(r2, e2);
    }
2256
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmplt_ss() {
        // Lane 0 is an all-ones mask iff a[0] < rhs[0]; lanes 1..=3 are
        // copied from `a`. Bit patterns are compared via `u32x4` transmutes.
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) < b.extract(0)
        let c1 = 0u32; // a.extract(0) < c.extract(0)
        let d1 = !0u32; // a.extract(0) < d.extract(0)

        let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2280
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmple_ss() {
        // Lane 0 is an all-ones mask iff a[0] <= rhs[0]; lanes 1..=3 are
        // copied from `a`. Bit patterns are compared via `u32x4` transmutes.
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) <= b.extract(0)
        let c1 = !0u32; // a.extract(0) <= c.extract(0)
        let d1 = !0u32; // a.extract(0) <= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2304
2305 #[simd_test(enable = "sse")]
2306 unsafe fn test_mm_cmpgt_ss() {
2307 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2308 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2309 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2310 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2311
2312 let b1 = !0u32; // a.extract(0) > b.extract(0)
2313 let c1 = 0u32; // a.extract(0) > c.extract(0)
2314 let d1 = 0u32; // a.extract(0) > d.extract(0)
2315
2316 let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
2317 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2318 assert_eq!(rb, eb);
2319
2320 let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
2321 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2322 assert_eq!(rc, ec);
2323
2324 let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
2325 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2326 assert_eq!(rd, ed);
2327 }
2328
2329 #[simd_test(enable = "sse")]
2330 unsafe fn test_mm_cmpge_ss() {
2331 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2332 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2333 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2334 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2335
2336 let b1 = !0u32; // a.extract(0) >= b.extract(0)
2337 let c1 = !0u32; // a.extract(0) >= c.extract(0)
2338 let d1 = 0u32; // a.extract(0) >= d.extract(0)
2339
2340 let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
2341 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2342 assert_eq!(rb, eb);
2343
2344 let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
2345 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2346 assert_eq!(rc, ec);
2347
2348 let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
2349 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2350 assert_eq!(rd, ed);
2351 }
2352
2353 #[simd_test(enable = "sse")]
2354 unsafe fn test_mm_cmpneq_ss() {
2355 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2356 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2357 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2358 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2359
2360 let b1 = !0u32; // a.extract(0) != b.extract(0)
2361 let c1 = 0u32; // a.extract(0) != c.extract(0)
2362 let d1 = !0u32; // a.extract(0) != d.extract(0)
2363
2364 let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
2365 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2366 assert_eq!(rb, eb);
2367
2368 let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
2369 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2370 assert_eq!(rc, ec);
2371
2372 let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
2373 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2374 assert_eq!(rd, ed);
2375 }
2376
2377 #[simd_test(enable = "sse")]
2378 unsafe fn test_mm_cmpnlt_ss() {
2379 // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there
2380 // must be a difference. It may have to do with behavior in the
2381 // presence of NaNs (signaling or quiet). If so, we should add tests
2382 // for those.
2383
2384 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2385 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2386 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2387 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2388
2389 let b1 = !0u32; // a.extract(0) >= b.extract(0)
2390 let c1 = !0u32; // a.extract(0) >= c.extract(0)
2391 let d1 = 0u32; // a.extract(0) >= d.extract(0)
2392
2393 let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
2394 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2395 assert_eq!(rb, eb);
2396
2397 let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
2398 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2399 assert_eq!(rc, ec);
2400
2401 let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
2402 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2403 assert_eq!(rd, ed);
2404 }
2405
2406 #[simd_test(enable = "sse")]
2407 unsafe fn test_mm_cmpnle_ss() {
2408 // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
2409 // must be a difference. It may have to do with behavior in the
2410 // presence
2411 // of NaNs (signaling or quiet). If so, we should add tests for those.
2412
2413 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2414 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2415 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2416 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2417
2418 let b1 = !0u32; // a.extract(0) > b.extract(0)
2419 let c1 = 0u32; // a.extract(0) > c.extract(0)
2420 let d1 = 0u32; // a.extract(0) > d.extract(0)
2421
2422 let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
2423 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2424 assert_eq!(rb, eb);
2425
2426 let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
2427 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2428 assert_eq!(rc, ec);
2429
2430 let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
2431 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2432 assert_eq!(rd, ed);
2433 }
2434
2435 #[simd_test(enable = "sse")]
2436 unsafe fn test_mm_cmpngt_ss() {
2437 // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there
2438 // must be a difference. It may have to do with behavior in the
2439 // presence of NaNs (signaling or quiet). If so, we should add tests
2440 // for those.
2441
2442 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2443 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2444 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2445 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2446
2447 let b1 = 0u32; // a.extract(0) <= b.extract(0)
2448 let c1 = !0u32; // a.extract(0) <= c.extract(0)
2449 let d1 = !0u32; // a.extract(0) <= d.extract(0)
2450
2451 let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
2452 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2453 assert_eq!(rb, eb);
2454
2455 let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
2456 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2457 assert_eq!(rc, ec);
2458
2459 let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
2460 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2461 assert_eq!(rd, ed);
2462 }
2463
2464 #[simd_test(enable = "sse")]
2465 unsafe fn test_mm_cmpnge_ss() {
2466 // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there
2467 // must be a difference. It may have to do with behavior in the
2468 // presence of NaNs (signaling or quiet). If so, we should add tests
2469 // for those.
2470
2471 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2472 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2473 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2474 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2475
2476 let b1 = 0u32; // a.extract(0) < b.extract(0)
2477 let c1 = 0u32; // a.extract(0) < c.extract(0)
2478 let d1 = !0u32; // a.extract(0) < d.extract(0)
2479
2480 let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
2481 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2482 assert_eq!(rb, eb);
2483
2484 let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
2485 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2486 assert_eq!(rc, ec);
2487
2488 let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
2489 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2490 assert_eq!(rd, ed);
2491 }
2492
2493 #[simd_test(enable = "sse")]
2494 unsafe fn test_mm_cmpord_ss() {
2495 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2496 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2497 let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2498 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2499
2500 let b1 = !0u32; // a.extract(0) ord b.extract(0)
2501 let c1 = 0u32; // a.extract(0) ord c.extract(0)
2502 let d1 = !0u32; // a.extract(0) ord d.extract(0)
2503
2504 let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
2505 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2506 assert_eq!(rb, eb);
2507
2508 let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
2509 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2510 assert_eq!(rc, ec);
2511
2512 let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
2513 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2514 assert_eq!(rd, ed);
2515 }
2516
2517 #[simd_test(enable = "sse")]
2518 unsafe fn test_mm_cmpunord_ss() {
2519 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2520 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2521 let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2522 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2523
2524 let b1 = 0u32; // a.extract(0) unord b.extract(0)
2525 let c1 = !0u32; // a.extract(0) unord c.extract(0)
2526 let d1 = 0u32; // a.extract(0) unord d.extract(0)
2527
2528 let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
2529 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2530 assert_eq!(rb, eb);
2531
2532 let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
2533 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2534 assert_eq!(rc, ec);
2535
2536 let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
2537 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2538 assert_eq!(rd, ed);
2539 }
2540
2541 #[simd_test(enable = "sse")]
2542 unsafe fn test_mm_cmpeq_ps() {
2543 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2544 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2545 let tru = !0u32;
2546 let fls = 0u32;
2547
2548 let e = u32x4::new(fls, fls, tru, fls);
2549 let r: u32x4 = transmute(_mm_cmpeq_ps(a, b));
2550 assert_eq!(r, e);
2551 }
2552
2553 #[simd_test(enable = "sse")]
2554 unsafe fn test_mm_cmplt_ps() {
2555 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2556 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2557 let tru = !0u32;
2558 let fls = 0u32;
2559
2560 let e = u32x4::new(tru, fls, fls, fls);
2561 let r: u32x4 = transmute(_mm_cmplt_ps(a, b));
2562 assert_eq!(r, e);
2563 }
2564
2565 #[simd_test(enable = "sse")]
2566 unsafe fn test_mm_cmple_ps() {
2567 let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
2568 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2569 let tru = !0u32;
2570 let fls = 0u32;
2571
2572 let e = u32x4::new(tru, fls, tru, fls);
2573 let r: u32x4 = transmute(_mm_cmple_ps(a, b));
2574 assert_eq!(r, e);
2575 }
2576
2577 #[simd_test(enable = "sse")]
2578 unsafe fn test_mm_cmpgt_ps() {
2579 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2580 let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2581 let tru = !0u32;
2582 let fls = 0u32;
2583
2584 let e = u32x4::new(fls, tru, fls, fls);
2585 let r: u32x4 = transmute(_mm_cmpgt_ps(a, b));
2586 assert_eq!(r, e);
2587 }
2588
2589 #[simd_test(enable = "sse")]
2590 unsafe fn test_mm_cmpge_ps() {
2591 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2592 let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2593 let tru = !0u32;
2594 let fls = 0u32;
2595
2596 let e = u32x4::new(fls, tru, tru, fls);
2597 let r: u32x4 = transmute(_mm_cmpge_ps(a, b));
2598 assert_eq!(r, e);
2599 }
2600
2601 #[simd_test(enable = "sse")]
2602 unsafe fn test_mm_cmpneq_ps() {
2603 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2604 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2605 let tru = !0u32;
2606 let fls = 0u32;
2607
2608 let e = u32x4::new(tru, tru, fls, tru);
2609 let r: u32x4 = transmute(_mm_cmpneq_ps(a, b));
2610 assert_eq!(r, e);
2611 }
2612
2613 #[simd_test(enable = "sse")]
2614 unsafe fn test_mm_cmpnlt_ps() {
2615 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2616 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2617 let tru = !0u32;
2618 let fls = 0u32;
2619
2620 let e = u32x4::new(fls, tru, tru, tru);
2621 let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b));
2622 assert_eq!(r, e);
2623 }
2624
2625 #[simd_test(enable = "sse")]
2626 unsafe fn test_mm_cmpnle_ps() {
2627 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2628 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2629 let tru = !0u32;
2630 let fls = 0u32;
2631
2632 let e = u32x4::new(fls, tru, fls, tru);
2633 let r: u32x4 = transmute(_mm_cmpnle_ps(a, b));
2634 assert_eq!(r, e);
2635 }
2636
2637 #[simd_test(enable = "sse")]
2638 unsafe fn test_mm_cmpngt_ps() {
2639 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2640 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2641 let tru = !0u32;
2642 let fls = 0u32;
2643
2644 let e = u32x4::new(tru, fls, tru, tru);
2645 let r: u32x4 = transmute(_mm_cmpngt_ps(a, b));
2646 assert_eq!(r, e);
2647 }
2648
2649 #[simd_test(enable = "sse")]
2650 unsafe fn test_mm_cmpnge_ps() {
2651 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2652 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2653 let tru = !0u32;
2654 let fls = 0u32;
2655
2656 let e = u32x4::new(tru, fls, fls, tru);
2657 let r: u32x4 = transmute(_mm_cmpnge_ps(a, b));
2658 assert_eq!(r, e);
2659 }
2660
2661 #[simd_test(enable = "sse")]
2662 unsafe fn test_mm_cmpord_ps() {
2663 let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2664 let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2665 let tru = !0u32;
2666 let fls = 0u32;
2667
2668 let e = u32x4::new(tru, fls, fls, fls);
2669 let r: u32x4 = transmute(_mm_cmpord_ps(a, b));
2670 assert_eq!(r, e);
2671 }
2672
2673 #[simd_test(enable = "sse")]
2674 unsafe fn test_mm_cmpunord_ps() {
2675 let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2676 let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2677 let tru = !0u32;
2678 let fls = 0u32;
2679
2680 let e = u32x4::new(fls, tru, tru, tru);
2681 let r: u32x4 = transmute(_mm_cmpunord_ps(a, b));
2682 assert_eq!(r, e);
2683 }
2684
2685 #[simd_test(enable = "sse")]
2686 unsafe fn test_mm_comieq_ss() {
2687 let aa = &[3.0f32, 12.0, 23.0, NAN];
2688 let bb = &[3.0f32, 47.5, 1.5, NAN];
2689
2690 let ee = &[1i32, 0, 0, 0];
2691
2692 for i in 0..4 {
2693 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2694 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2695
2696 let r = _mm_comieq_ss(a, b);
2697
2698 assert_eq!(
2699 ee[i], r,
2700 "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2701 a, b, r, ee[i], i
2702 );
2703 }
2704 }
2705
2706 #[simd_test(enable = "sse")]
2707 unsafe fn test_mm_comilt_ss() {
2708 let aa = &[3.0f32, 12.0, 23.0, NAN];
2709 let bb = &[3.0f32, 47.5, 1.5, NAN];
2710
2711 let ee = &[0i32, 1, 0, 0];
2712
2713 for i in 0..4 {
2714 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2715 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2716
2717 let r = _mm_comilt_ss(a, b);
2718
2719 assert_eq!(
2720 ee[i], r,
2721 "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2722 a, b, r, ee[i], i
2723 );
2724 }
2725 }
2726
2727 #[simd_test(enable = "sse")]
2728 unsafe fn test_mm_comile_ss() {
2729 let aa = &[3.0f32, 12.0, 23.0, NAN];
2730 let bb = &[3.0f32, 47.5, 1.5, NAN];
2731
2732 let ee = &[1i32, 1, 0, 0];
2733
2734 for i in 0..4 {
2735 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2736 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2737
2738 let r = _mm_comile_ss(a, b);
2739
2740 assert_eq!(
2741 ee[i], r,
2742 "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2743 a, b, r, ee[i], i
2744 );
2745 }
2746 }
2747
2748 #[simd_test(enable = "sse")]
2749 unsafe fn test_mm_comigt_ss() {
2750 let aa = &[3.0f32, 12.0, 23.0, NAN];
2751 let bb = &[3.0f32, 47.5, 1.5, NAN];
2752
2753 let ee = &[1i32, 0, 1, 0];
2754
2755 for i in 0..4 {
2756 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2757 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2758
2759 let r = _mm_comige_ss(a, b);
2760
2761 assert_eq!(
2762 ee[i], r,
2763 "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2764 a, b, r, ee[i], i
2765 );
2766 }
2767 }
2768
2769 #[simd_test(enable = "sse")]
2770 unsafe fn test_mm_comineq_ss() {
2771 let aa = &[3.0f32, 12.0, 23.0, NAN];
2772 let bb = &[3.0f32, 47.5, 1.5, NAN];
2773
2774 let ee = &[0i32, 1, 1, 1];
2775
2776 for i in 0..4 {
2777 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2778 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2779
2780 let r = _mm_comineq_ss(a, b);
2781
2782 assert_eq!(
2783 ee[i], r,
2784 "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2785 a, b, r, ee[i], i
2786 );
2787 }
2788 }
2789
2790 #[simd_test(enable = "sse")]
2791 unsafe fn test_mm_ucomieq_ss() {
2792 let aa = &[3.0f32, 12.0, 23.0, NAN];
2793 let bb = &[3.0f32, 47.5, 1.5, NAN];
2794
2795 let ee = &[1i32, 0, 0, 0];
2796
2797 for i in 0..4 {
2798 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2799 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2800
2801 let r = _mm_ucomieq_ss(a, b);
2802
2803 assert_eq!(
2804 ee[i], r,
2805 "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2806 a, b, r, ee[i], i
2807 );
2808 }
2809 }
2810
2811 #[simd_test(enable = "sse")]
2812 unsafe fn test_mm_ucomilt_ss() {
2813 let aa = &[3.0f32, 12.0, 23.0, NAN];
2814 let bb = &[3.0f32, 47.5, 1.5, NAN];
2815
2816 let ee = &[0i32, 1, 0, 0];
2817
2818 for i in 0..4 {
2819 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2820 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2821
2822 let r = _mm_ucomilt_ss(a, b);
2823
2824 assert_eq!(
2825 ee[i], r,
2826 "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2827 a, b, r, ee[i], i
2828 );
2829 }
2830 }
2831
2832 #[simd_test(enable = "sse")]
2833 unsafe fn test_mm_ucomile_ss() {
2834 let aa = &[3.0f32, 12.0, 23.0, NAN];
2835 let bb = &[3.0f32, 47.5, 1.5, NAN];
2836
2837 let ee = &[1i32, 1, 0, 0];
2838
2839 for i in 0..4 {
2840 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2841 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2842
2843 let r = _mm_ucomile_ss(a, b);
2844
2845 assert_eq!(
2846 ee[i], r,
2847 "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2848 a, b, r, ee[i], i
2849 );
2850 }
2851 }
2852
2853 #[simd_test(enable = "sse")]
2854 unsafe fn test_mm_ucomigt_ss() {
2855 let aa = &[3.0f32, 12.0, 23.0, NAN];
2856 let bb = &[3.0f32, 47.5, 1.5, NAN];
2857
2858 let ee = &[0i32, 0, 1, 0];
2859
2860 for i in 0..4 {
2861 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2862 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2863
2864 let r = _mm_ucomigt_ss(a, b);
2865
2866 assert_eq!(
2867 ee[i], r,
2868 "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2869 a, b, r, ee[i], i
2870 );
2871 }
2872 }
2873
2874 #[simd_test(enable = "sse")]
2875 unsafe fn test_mm_ucomige_ss() {
2876 let aa = &[3.0f32, 12.0, 23.0, NAN];
2877 let bb = &[3.0f32, 47.5, 1.5, NAN];
2878
2879 let ee = &[1i32, 0, 1, 0];
2880
2881 for i in 0..4 {
2882 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2883 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2884
2885 let r = _mm_ucomige_ss(a, b);
2886
2887 assert_eq!(
2888 ee[i], r,
2889 "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2890 a, b, r, ee[i], i
2891 );
2892 }
2893 }
2894
2895 #[simd_test(enable = "sse")]
2896 unsafe fn test_mm_ucomineq_ss() {
2897 let aa = &[3.0f32, 12.0, 23.0, NAN];
2898 let bb = &[3.0f32, 47.5, 1.5, NAN];
2899
2900 let ee = &[0i32, 1, 1, 1];
2901
2902 for i in 0..4 {
2903 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2904 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2905
2906 let r = _mm_ucomineq_ss(a, b);
2907
2908 assert_eq!(
2909 ee[i], r,
2910 "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2911 a, b, r, ee[i], i
2912 );
2913 }
2914 }
2915
2916 #[simd_test(enable = "sse")]
2917 unsafe fn test_mm_cvtss_si32() {
2918 let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
2919 let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520];
2920 for i in 0..inputs.len() {
2921 let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
2922 let e = result[i];
2923 let r = _mm_cvtss_si32(x);
2924 assert_eq!(
2925 e, r,
2926 "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
2927 i, x, r, e
2928 );
2929 }
2930 }
2931
2932 #[simd_test(enable = "sse")]
2933 unsafe fn test_mm_cvttss_si32() {
2934 let inputs = &[
2935 (42.0f32, 42i32),
2936 (-31.4, -31),
2937 (-33.5, -33),
2938 (-34.5, -34),
2939 (10.999, 10),
2940 (-5.99, -5),
2941 (4.0e10, i32::MIN),
2942 (4.0e-10, 0),
2943 (NAN, i32::MIN),
2944 (2147483500.1, 2147483520),
2945 ];
2946 for (i, &(xi, e)) in inputs.iter().enumerate() {
2947 let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
2948 let r = _mm_cvttss_si32(x);
2949 assert_eq!(
2950 e, r,
2951 "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
2952 i, x, r, e
2953 );
2954 }
2955 }
2956
2957 #[simd_test(enable = "sse")]
2958 unsafe fn test_mm_cvtsi32_ss() {
2959 let inputs = &[
2960 (4555i32, 4555.0f32),
2961 (322223333, 322223330.0),
2962 (-432, -432.0),
2963 (-322223333, -322223330.0),
2964 ];
2965
2966 for &(x, f) in inputs.iter() {
2967 let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2968 let r = _mm_cvtsi32_ss(a, x);
2969 let e = _mm_setr_ps(f, 6.0, 7.0, 8.0);
2970 assert_eq_m128(e, r);
2971 }
2972 }
2973
2974 #[simd_test(enable = "sse")]
2975 unsafe fn test_mm_cvtss_f32() {
2976 let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
2977 assert_eq!(_mm_cvtss_f32(a), 312.0134);
2978 }
2979
2980 #[simd_test(enable = "sse")]
2981 unsafe fn test_mm_set_ss() {
2982 let r = _mm_set_ss(black_box(4.25));
2983 assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0));
2984 }
2985
2986 #[simd_test(enable = "sse")]
2987 unsafe fn test_mm_set1_ps() {
2988 let r1 = _mm_set1_ps(black_box(4.25));
2989 let r2 = _mm_set_ps1(black_box(4.25));
2990 assert_eq!(get_m128(r1, 0), 4.25);
2991 assert_eq!(get_m128(r1, 1), 4.25);
2992 assert_eq!(get_m128(r1, 2), 4.25);
2993 assert_eq!(get_m128(r1, 3), 4.25);
2994 assert_eq!(get_m128(r2, 0), 4.25);
2995 assert_eq!(get_m128(r2, 1), 4.25);
2996 assert_eq!(get_m128(r2, 2), 4.25);
2997 assert_eq!(get_m128(r2, 3), 4.25);
2998 }
2999
3000 #[simd_test(enable = "sse")]
3001 unsafe fn test_mm_set_ps() {
3002 let r = _mm_set_ps(
3003 black_box(1.0),
3004 black_box(2.0),
3005 black_box(3.0),
3006 black_box(4.0),
3007 );
3008 assert_eq!(get_m128(r, 0), 4.0);
3009 assert_eq!(get_m128(r, 1), 3.0);
3010 assert_eq!(get_m128(r, 2), 2.0);
3011 assert_eq!(get_m128(r, 3), 1.0);
3012 }
3013
3014 #[simd_test(enable = "sse")]
3015 unsafe fn test_mm_setr_ps() {
3016 let r = _mm_setr_ps(
3017 black_box(1.0),
3018 black_box(2.0),
3019 black_box(3.0),
3020 black_box(4.0),
3021 );
3022 assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
3023 }
3024
3025 #[simd_test(enable = "sse")]
3026 unsafe fn test_mm_setzero_ps() {
3027 let r = *black_box(&_mm_setzero_ps());
3028 assert_eq_m128(r, _mm_set1_ps(0.0));
3029 }
3030
3031 #[simd_test(enable = "sse")]
3032 unsafe fn test_mm_shuffle() {
3033 assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11);
3034 assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
3035 assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
3036 }
3037
3038 #[simd_test(enable = "sse")]
3039 unsafe fn test_mm_shuffle_ps() {
3040 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3041 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3042 let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b);
3043 assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0));
3044 }
3045
3046 #[simd_test(enable = "sse")]
3047 unsafe fn test_mm_unpackhi_ps() {
3048 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3049 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3050 let r = _mm_unpackhi_ps(a, b);
3051 assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0));
3052 }
3053
3054 #[simd_test(enable = "sse")]
3055 unsafe fn test_mm_unpacklo_ps() {
3056 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3057 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3058 let r = _mm_unpacklo_ps(a, b);
3059 assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0));
3060 }
3061
3062 #[simd_test(enable = "sse")]
3063 unsafe fn test_mm_movehl_ps() {
3064 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3065 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3066 let r = _mm_movehl_ps(a, b);
3067 assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0));
3068 }
3069
3070 #[simd_test(enable = "sse")]
3071 unsafe fn test_mm_movelh_ps() {
3072 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3073 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3074 let r = _mm_movelh_ps(a, b);
3075 assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
3076 }
3077
3078 #[simd_test(enable = "sse")]
3079 unsafe fn test_mm_load_ss() {
3080 let a = 42.0f32;
3081 let r = _mm_load_ss(ptr::addr_of!(a));
3082 assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
3083 }
3084
3085 #[simd_test(enable = "sse")]
3086 unsafe fn test_mm_load1_ps() {
3087 let a = 42.0f32;
3088 let r = _mm_load1_ps(ptr::addr_of!(a));
3089 assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
3090 }
3091
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_load_ps() {
        // `_mm_load_ps` requires a 16-byte-aligned pointer, so we may have to
        // skip up to three leading elements; the extra buffer length leaves
        // room for that plus the four loaded values.
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        let mut fixup = 0.0f32;

        // Make sure p is aligned, otherwise we might get a
        // (signal: 11, SIGSEGV: invalid memory reference)

        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            // Advance `p` by `delta` f32 elements (delta = bytes-to-boundary / 4);
            // `fixup` shifts the expected values by the same amount, since the
            // buffer contents are 1.0, 2.0, ... in order.
            let delta = (16 - unalignment) >> 2;
            fixup = delta as f32;
            p = p.add(delta);
        }

        let r = _mm_load_ps(p);
        let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }
3113
3114 #[simd_test(enable = "sse")]
3115 unsafe fn test_mm_loadu_ps() {
3116 let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3117 let p = vals.as_ptr().add(3);
3118 let r = _mm_loadu_ps(black_box(p));
3119 assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
3120 }
3121
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_loadr_ps() {
        // `_mm_loadr_ps` performs an aligned load with the four elements
        // reversed, hence the descending expected values below.
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        let mut fixup = 0.0f32;

        // Make sure p is aligned, otherwise we might get a
        // (signal: 11, SIGSEGV: invalid memory reference)

        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            // Advance `p` by `delta` f32 elements to reach the next 16-byte
            // boundary; `fixup` shifts the expected values accordingly.
            let delta = (16 - unalignment) >> 2;
            fixup = delta as f32;
            p = p.add(delta);
        }

        let r = _mm_loadr_ps(p);
        let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }
3143
3144 #[simd_test(enable = "sse")]
3145 unsafe fn test_mm_store_ss() {
3146 let mut vals = [0.0f32; 8];
3147 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3148 _mm_store_ss(vals.as_mut_ptr().add(1), a);
3149
3150 assert_eq!(vals[0], 0.0);
3151 assert_eq!(vals[1], 1.0);
3152 assert_eq!(vals[2], 0.0);
3153 }
3154
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store1_ps() {
        // `_mm_store1_ps` needs a 16-byte-aligned pointer and writes lane 0
        // into all four consecutive slots.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Bump `p` forward to the next 16-byte boundary; the 8-element buffer
        // leaves room for up to 3 skipped elements plus the 4 stored ones.
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_store1_ps(p, *black_box(&a));

        // Elements just outside the 16-byte store must remain zero.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 1.0);
        assert_eq!(vals[ofs + 2], 1.0);
        assert_eq!(vals[ofs + 3], 1.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3179
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store_ps() {
        // `_mm_store_ps` needs a 16-byte-aligned pointer and writes all four
        // lanes in order.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_store_ps(p, *black_box(&a));

        // Elements just outside the 16-byte store must remain zero.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3205
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storer_ps() {
        // `_mm_storer_ps` needs a 16-byte-aligned pointer and writes the four
        // lanes in reverse order (lane 3 first).
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_storer_ps(p, *black_box(&a));

        // Elements just outside the 16-byte store must remain zero.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 4.0);
        assert_eq!(vals[ofs + 1], 3.0);
        assert_eq!(vals[ofs + 2], 2.0);
        assert_eq!(vals[ofs + 3], 1.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3231
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storeu_ps() {
        // `_mm_storeu_ps` has no alignment requirement, so this test
        // deliberately uses an unaligned destination when possible.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Make sure p is **not** aligned to 16-byte boundary
        if (p as usize) & 0xf == 0 {
            ofs = 1;
            p = p.add(1);
        }

        _mm_storeu_ps(p, *black_box(&a));

        // Elements just outside the 16-byte store must remain zero.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3257
3258 #[simd_test(enable = "sse")]
3259 unsafe fn test_mm_move_ss() {
3260 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3261 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3262
3263 let r = _mm_move_ss(a, b);
3264 let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
3265 assert_eq_m128(e, r);
3266 }
3267
3268 #[simd_test(enable = "sse")]
3269 unsafe fn test_mm_movemask_ps() {
3270 let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0));
3271 assert_eq!(r, 0b0101);
3272
3273 let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0));
3274 assert_eq!(r, 0b0111);
3275 }
3276
    #[simd_test(enable = "sse")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_sfence() {
        // Smoke test only: a store fence has no directly observable result,
        // so we just check the intrinsic can be executed.
        _mm_sfence();
    }
3283
3284 #[simd_test(enable = "sse")]
3285 unsafe fn test_MM_TRANSPOSE4_PS() {
3286 let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3287 let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3288 let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
3289 let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
3290
3291 _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d);
3292
3293 assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0));
3294 assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0));
3295 assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0));
3296 assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0));
3297 }
3298
    // 16-byte-aligned backing storage; `test_mm_stream_ps` writes into it
    // through a raw pointer to the first element.
    #[repr(align(16))]
    struct Memory {
        pub data: [f32; 4],
    }
3303
3304 #[simd_test(enable = "sse")]
3305 // Miri cannot support this until it is clear how it fits in the Rust memory model
3306 // (non-temporal store)
3307 #[cfg_attr(miri, ignore)]
3308 unsafe fn test_mm_stream_ps() {
3309 let a = _mm_set1_ps(7.0);
3310 let mut mem = Memory { data: [-1.0; 4] };
3311
3312 _mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
3313 for i in 0..4 {
3314 assert_eq!(mem.data[i], get_m128(a, i));
3315 }
3316 }
3317}
3318