sse.rs source code [crates/core_arch/src/x86/sse.rs]

1	//! Streaming SIMD Extensions (SSE)
2
3	use crate::{
4	core_arch::{simd::, x86::},
5	intrinsics::simd::*,
6	intrinsics::sqrtf32,
7	mem, ptr,
8	};
9
10	#[cfg(test)]
11	use stdarch_test::assert_instr;
12
13	/// Adds the first component of `a` and `b`, the other components are copied
14	/// from `a`.
15	///
16	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss)
17	#[inline]
18	#[target_feature(enable = "sse")]
19	#[cfg_attr(test, assert_instr(addss))]
20	#[stable(feature = "simd_x86", since = "1.27.0")]
21	pub fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
22	unsafe { simd_insert!(a, `0`, _mm_cvtss_f32(a) + _mm_cvtss_f32(b)) }
23	}
24
25	/// Adds packed single-precision (32-bit) floating-point elements in `a` and
26	/// `b`.
27	///
28	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps)
29	#[inline]
30	#[target_feature(enable = "sse")]
31	#[cfg_attr(test, assert_instr(addps))]
32	#[stable(feature = "simd_x86", since = "1.27.0")]
33	pub fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
34	unsafe { simd_add(x:a, y:b) }
35	}
36
37	/// Subtracts the first component of `b` from `a`, the other components are
38	/// copied from `a`.
39	///
40	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss)
41	#[inline]
42	#[target_feature(enable = "sse")]
43	#[cfg_attr(test, assert_instr(subss))]
44	#[stable(feature = "simd_x86", since = "1.27.0")]
45	pub fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
46	unsafe { simd_insert!(a, `0`, _mm_cvtss_f32(a) - _mm_cvtss_f32(b)) }
47	}
48
49	/// Subtracts packed single-precision (32-bit) floating-point elements in `a` and
50	/// `b`.
51	///
52	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps)
53	#[inline]
54	#[target_feature(enable = "sse")]
55	#[cfg_attr(test, assert_instr(subps))]
56	#[stable(feature = "simd_x86", since = "1.27.0")]
57	pub fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
58	unsafe { simd_sub(lhs:a, rhs:b) }
59	}
60
61	/// Multiplies the first component of `a` and `b`, the other components are
62	/// copied from `a`.
63	///
64	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss)
65	#[inline]
66	#[target_feature(enable = "sse")]
67	#[cfg_attr(test, assert_instr(mulss))]
68	#[stable(feature = "simd_x86", since = "1.27.0")]
69	pub fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
70	unsafe { simd_insert!(a, `0`, _mm_cvtss_f32(a) * _mm_cvtss_f32(b)) }
71	}
72
73	/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
74	/// `b`.
75	///
76	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps)
77	#[inline]
78	#[target_feature(enable = "sse")]
79	#[cfg_attr(test, assert_instr(mulps))]
80	#[stable(feature = "simd_x86", since = "1.27.0")]
81	pub fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
82	unsafe { simd_mul(x:a, y:b) }
83	}
84
85	/// Divides the first component of `b` by `a`, the other components are
86	/// copied from `a`.
87	///
88	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss)
89	#[inline]
90	#[target_feature(enable = "sse")]
91	#[cfg_attr(test, assert_instr(divss))]
92	#[stable(feature = "simd_x86", since = "1.27.0")]
93	pub fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
94	unsafe { simd_insert!(a, `0`, _mm_cvtss_f32(a) / _mm_cvtss_f32(b)) }
95	}
96
97	/// Divides packed single-precision (32-bit) floating-point elements in `a` and
98	/// `b`.
99	///
100	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps)
101	#[inline]
102	#[target_feature(enable = "sse")]
103	#[cfg_attr(test, assert_instr(divps))]
104	#[stable(feature = "simd_x86", since = "1.27.0")]
105	pub fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
106	unsafe { simd_div(lhs:a, rhs:b) }
107	}
108
109	/// Returns the square root of the first single-precision (32-bit)
110	/// floating-point element in `a`, the other elements are unchanged.
111	///
112	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss)
113	#[inline]
114	#[target_feature(enable = "sse")]
115	#[cfg_attr(test, assert_instr(sqrtss))]
116	#[stable(feature = "simd_x86", since = "1.27.0")]
117	pub fn _mm_sqrt_ss(a: __m128) -> __m128 {
118	unsafe { simd_insert!(a, `0`, sqrtf32(_mm_cvtss_f32(a))) }
119	}
120
121	/// Returns the square root of packed single-precision (32-bit) floating-point
122	/// elements in `a`.
123	///
124	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps)
125	#[inline]
126	#[target_feature(enable = "sse")]
127	#[cfg_attr(test, assert_instr(sqrtps))]
128	#[stable(feature = "simd_x86", since = "1.27.0")]
129	pub fn _mm_sqrt_ps(a: __m128) -> __m128 {
130	unsafe { simd_fsqrt(a) }
131	}
132
133	/// Returns the approximate reciprocal of the first single-precision
134	/// (32-bit) floating-point element in `a`, the other elements are unchanged.
135	///
136	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss)
137	#[inline]
138	#[target_feature(enable = "sse")]
139	#[cfg_attr(test, assert_instr(rcpss))]
140	#[stable(feature = "simd_x86", since = "1.27.0")]
141	pub fn _mm_rcp_ss(a: __m128) -> __m128 {
142	unsafe { rcpss(a) }
143	}
144
145	/// Returns the approximate reciprocal of packed single-precision (32-bit)
146	/// floating-point elements in `a`.
147	///
148	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps)
149	#[inline]
150	#[target_feature(enable = "sse")]
151	#[cfg_attr(test, assert_instr(rcpps))]
152	#[stable(feature = "simd_x86", since = "1.27.0")]
153	pub fn _mm_rcp_ps(a: __m128) -> __m128 {
154	unsafe { rcpps(a) }
155	}
156
157	/// Returns the approximate reciprocal square root of the first single-precision
158	/// (32-bit) floating-point element in `a`, the other elements are unchanged.
159	///
160	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss)
161	#[inline]
162	#[target_feature(enable = "sse")]
163	#[cfg_attr(test, assert_instr(rsqrtss))]
164	#[stable(feature = "simd_x86", since = "1.27.0")]
165	pub fn _mm_rsqrt_ss(a: __m128) -> __m128 {
166	unsafe { rsqrtss(a) }
167	}
168
169	/// Returns the approximate reciprocal square root of packed single-precision
170	/// (32-bit) floating-point elements in `a`.
171	///
172	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps)
173	#[inline]
174	#[target_feature(enable = "sse")]
175	#[cfg_attr(test, assert_instr(rsqrtps))]
176	#[stable(feature = "simd_x86", since = "1.27.0")]
177	pub fn _mm_rsqrt_ps(a: __m128) -> __m128 {
178	unsafe { rsqrtps(a) }
179	}
180
181	/// Compares the first single-precision (32-bit) floating-point element of `a`
182	/// and `b`, and return the minimum value in the first element of the return
183	/// value, the other elements are copied from `a`.
184	///
185	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss)
186	#[inline]
187	#[target_feature(enable = "sse")]
188	#[cfg_attr(test, assert_instr(minss))]
189	#[stable(feature = "simd_x86", since = "1.27.0")]
190	pub fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
191	unsafe { minss(a, b) }
192	}
193
194	/// Compares packed single-precision (32-bit) floating-point elements in `a` and
195	/// `b`, and return the corresponding minimum values.
196	///
197	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps)
198	#[inline]
199	#[target_feature(enable = "sse")]
200	#[cfg_attr(test, assert_instr(minps))]
201	#[stable(feature = "simd_x86", since = "1.27.0")]
202	pub fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
203	// See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`.
204	unsafe { minps(a, b) }
205	}
206
207	/// Compares the first single-precision (32-bit) floating-point element of `a`
208	/// and `b`, and return the maximum value in the first element of the return
209	/// value, the other elements are copied from `a`.
210	///
211	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss)
212	#[inline]
213	#[target_feature(enable = "sse")]
214	#[cfg_attr(test, assert_instr(maxss))]
215	#[stable(feature = "simd_x86", since = "1.27.0")]
216	pub fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
217	unsafe { maxss(a, b) }
218	}
219
220	/// Compares packed single-precision (32-bit) floating-point elements in `a` and
221	/// `b`, and return the corresponding maximum values.
222	///
223	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps)
224	#[inline]
225	#[target_feature(enable = "sse")]
226	#[cfg_attr(test, assert_instr(maxps))]
227	#[stable(feature = "simd_x86", since = "1.27.0")]
228	pub fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
229	// See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`.
230	unsafe { maxps(a, b) }
231	}
232
233	/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
234	///
235	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps)
236	#[inline]
237	#[target_feature(enable = "sse")]
238	// i586 only seems to generate plain `and` instructions, so ignore it.
239	#[cfg_attr(
240	all(test, any(target_arch = "x86_64", target_feature = "sse2")),
241	assert_instr(andps)
242	)]
243	#[stable(feature = "simd_x86", since = "1.27.0")]
244	pub fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
245	unsafe {
246	let a: __m128i = mem::transmute(src:a);
247	let b: __m128i = mem::transmute(src:b);
248	mem::transmute(src:simd_and(x:a, y:b))
249	}
250	}
251
252	/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
253	/// elements.
254	///
255	/// Computes `!a & b` for each bit in `a` and `b`.
256	///
257	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps)
258	#[inline]
259	#[target_feature(enable = "sse")]
260	// i586 only seems to generate plain `not` and `and` instructions, so ignore
261	// it.
262	#[cfg_attr(
263	all(test, any(target_arch = "x86_64", target_feature = "sse2")),
264	assert_instr(andnps)
265	)]
266	#[stable(feature = "simd_x86", since = "1.27.0")]
267	pub fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
268	unsafe {
269	let a: __m128i = mem::transmute(src:a);
270	let b: __m128i = mem::transmute(src:b);
271	let mask: __m128i = mem::transmute(src:i32x4::splat(`-1`));
272	mem::transmute(src:simd_and(x:simd_xor(mask, a), y:b))
273	}
274	}
275
276	/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
277	///
278	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps)
279	#[inline]
280	#[target_feature(enable = "sse")]
281	// i586 only seems to generate plain `or` instructions, so we ignore it.
282	#[cfg_attr(
283	all(test, any(target_arch = "x86_64", target_feature = "sse2")),
284	assert_instr(orps)
285	)]
286	#[stable(feature = "simd_x86", since = "1.27.0")]
287	pub fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
288	unsafe {
289	let a: __m128i = mem::transmute(src:a);
290	let b: __m128i = mem::transmute(src:b);
291	mem::transmute(src:simd_or(x:a, y:b))
292	}
293	}
294
295	/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
296	/// elements.
297	///
298	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps)
299	#[inline]
300	#[target_feature(enable = "sse")]
301	// i586 only seems to generate plain `xor` instructions, so we ignore it.
302	#[cfg_attr(
303	all(test, any(target_arch = "x86_64", target_feature = "sse2")),
304	assert_instr(xorps)
305	)]
306	#[stable(feature = "simd_x86", since = "1.27.0")]
307	pub fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
308	unsafe {
309	let a: __m128i = mem::transmute(src:a);
310	let b: __m128i = mem::transmute(src:b);
311	mem::transmute(src:simd_xor(x:a, y:b))
312	}
313	}
314
315	/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
316	/// the result will be `0xffffffff` if the two inputs are equal, or `0`
317	/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
318	///
319	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss)
320	#[inline]
321	#[target_feature(enable = "sse")]
322	#[cfg_attr(test, assert_instr(cmpeqss))]
323	#[stable(feature = "simd_x86", since = "1.27.0")]
324	pub fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
325	unsafe { cmpss(a, b, imm8:`0`) }
326	}
327
328	/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
329	/// of the result will be `0xffffffff` if `a.extract(0)` is less than
330	/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
331	/// upper 96 bits of `a`.
332	///
333	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss)
334	#[inline]
335	#[target_feature(enable = "sse")]
336	#[cfg_attr(test, assert_instr(cmpltss))]
337	#[stable(feature = "simd_x86", since = "1.27.0")]
338	pub fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
339	unsafe { cmpss(a, b, imm8:`1`) }
340	}
341
342	/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
343	/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
344	/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
345	/// are the upper 96 bits of `a`.
346	///
347	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss)
348	#[inline]
349	#[target_feature(enable = "sse")]
350	#[cfg_attr(test, assert_instr(cmpless))]
351	#[stable(feature = "simd_x86", since = "1.27.0")]
352	pub fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
353	unsafe { cmpss(a, b, imm8:`2`) }
354	}
355
356	/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
357	/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
358	/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
359	/// are the upper 96 bits of `a`.
360	///
361	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss)
362	#[inline]
363	#[target_feature(enable = "sse")]
364	#[cfg_attr(test, assert_instr(cmpltss))]
365	#[stable(feature = "simd_x86", since = "1.27.0")]
366	pub fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
367	unsafe { simd_shuffle!(a, cmpss(b, a, `1`), [`4`, `1`, `2`, `3`]) }
368	}
369
370	/// Compares the lowest `f32` of both inputs for greater than or equal. The
371	/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
372	/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
373	/// of the result are the upper 96 bits of `a`.
374	///
375	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss)
376	#[inline]
377	#[target_feature(enable = "sse")]
378	#[cfg_attr(test, assert_instr(cmpless))]
379	#[stable(feature = "simd_x86", since = "1.27.0")]
380	pub fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
381	unsafe { simd_shuffle!(a, cmpss(b, a, `2`), [`4`, `1`, `2`, `3`]) }
382	}
383
384	/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
385	/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
386	/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
387	/// upper 96 bits of `a`.
388	///
389	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss)
390	#[inline]
391	#[target_feature(enable = "sse")]
392	#[cfg_attr(test, assert_instr(cmpneqss))]
393	#[stable(feature = "simd_x86", since = "1.27.0")]
394	pub fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
395	unsafe { cmpss(a, b, imm8:`4`) }
396	}
397
398	/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
399	/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
400	/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
401	/// upper 96 bits of `a`.
402	///
403	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss)
404	#[inline]
405	#[target_feature(enable = "sse")]
406	#[cfg_attr(test, assert_instr(cmpnltss))]
407	#[stable(feature = "simd_x86", since = "1.27.0")]
408	pub fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
409	unsafe { cmpss(a, b, imm8:`5`) }
410	}
411
412	/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
413	/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
414	/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
415	/// of the result are the upper 96 bits of `a`.
416	///
417	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss)
418	#[inline]
419	#[target_feature(enable = "sse")]
420	#[cfg_attr(test, assert_instr(cmpnless))]
421	#[stable(feature = "simd_x86", since = "1.27.0")]
422	pub fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
423	unsafe { cmpss(a, b, imm8:`6`) }
424	}
425
426	/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
427	/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
428	/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
429	/// the upper 96 bits of `a`.
430	///
431	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss)
432	#[inline]
433	#[target_feature(enable = "sse")]
434	#[cfg_attr(test, assert_instr(cmpnltss))]
435	#[stable(feature = "simd_x86", since = "1.27.0")]
436	pub fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
437	unsafe { simd_shuffle!(a, cmpss(b, a, `5`), [`4`, `1`, `2`, `3`]) }
438	}
439
440	/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
441	/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
442	/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
443	/// bits of the result are the upper 96 bits of `a`.
444	///
445	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss)
446	#[inline]
447	#[target_feature(enable = "sse")]
448	#[cfg_attr(test, assert_instr(cmpnless))]
449	#[stable(feature = "simd_x86", since = "1.27.0")]
450	pub fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
451	unsafe { simd_shuffle!(a, cmpss(b, a, `6`), [`4`, `1`, `2`, `3`]) }
452	}
453
454	/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
455	/// the result will be `0xffffffff` if neither of `a.extract(0)` or
456	/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
457	/// are the upper 96 bits of `a`.
458	///
459	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss)
460	#[inline]
461	#[target_feature(enable = "sse")]
462	#[cfg_attr(test, assert_instr(cmpordss))]
463	#[stable(feature = "simd_x86", since = "1.27.0")]
464	pub fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
465	unsafe { cmpss(a, b, imm8:`7`) }
466	}
467
468	/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
469	/// of the result will be `0xffffffff` if any of `a.extract(0)` or
470	/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
471	/// are the upper 96 bits of `a`.
472	///
473	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss)
474	#[inline]
475	#[target_feature(enable = "sse")]
476	#[cfg_attr(test, assert_instr(cmpunordss))]
477	#[stable(feature = "simd_x86", since = "1.27.0")]
478	pub fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
479	unsafe { cmpss(a, b, imm8:`3`) }
480	}
481
482	/// Compares each of the four floats in `a` to the corresponding element in `b`.
483	/// The result in the output vector will be `0xffffffff` if the input elements
484	/// were equal, or `0` otherwise.
485	///
486	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps)
487	#[inline]
488	#[target_feature(enable = "sse")]
489	#[cfg_attr(test, assert_instr(cmpeqps))]
490	#[stable(feature = "simd_x86", since = "1.27.0")]
491	pub fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
492	unsafe { cmpps(a, b, imm8:`0`) }
493	}
494
495	/// Compares each of the four floats in `a` to the corresponding element in `b`.
496	/// The result in the output vector will be `0xffffffff` if the input element
497	/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
498	///
499	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps)
500	#[inline]
501	#[target_feature(enable = "sse")]
502	#[cfg_attr(test, assert_instr(cmpltps))]
503	#[stable(feature = "simd_x86", since = "1.27.0")]
504	pub fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
505	unsafe { cmpps(a, b, imm8:`1`) }
506	}
507
508	/// Compares each of the four floats in `a` to the corresponding element in `b`.
509	/// The result in the output vector will be `0xffffffff` if the input element
510	/// in `a` is less than or equal to the corresponding element in `b`, or `0`
511	/// otherwise.
512	///
513	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps)
514	#[inline]
515	#[target_feature(enable = "sse")]
516	#[cfg_attr(test, assert_instr(cmpleps))]
517	#[stable(feature = "simd_x86", since = "1.27.0")]
518	pub fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
519	unsafe { cmpps(a, b, imm8:`2`) }
520	}
521
522	/// Compares each of the four floats in `a` to the corresponding element in `b`.
523	/// The result in the output vector will be `0xffffffff` if the input element
524	/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
525	///
526	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps)
527	#[inline]
528	#[target_feature(enable = "sse")]
529	#[cfg_attr(test, assert_instr(cmpltps))]
530	#[stable(feature = "simd_x86", since = "1.27.0")]
531	pub fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
532	unsafe { cmpps(a:b, b:a, imm8:`1`) }
533	}
534
535	/// Compares each of the four floats in `a` to the corresponding element in `b`.
536	/// The result in the output vector will be `0xffffffff` if the input element
537	/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
538	/// otherwise.
539	///
540	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps)
541	#[inline]
542	#[target_feature(enable = "sse")]
543	#[cfg_attr(test, assert_instr(cmpleps))]
544	#[stable(feature = "simd_x86", since = "1.27.0")]
545	pub fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
546	unsafe { cmpps(a:b, b:a, imm8:`2`) }
547	}
548
549	/// Compares each of the four floats in `a` to the corresponding element in `b`.
550	/// The result in the output vector will be `0xffffffff` if the input elements
551	/// are not* equal, or `0` otherwise.*
552	///
553	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps)
554	#[inline]
555	#[target_feature(enable = "sse")]
556	#[cfg_attr(test, assert_instr(cmpneqps))]
557	#[stable(feature = "simd_x86", since = "1.27.0")]
558	pub fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
559	unsafe { cmpps(a, b, imm8:`4`) }
560	}
561
562	/// Compares each of the four floats in `a` to the corresponding element in `b`.
563	/// The result in the output vector will be `0xffffffff` if the input element
564	/// in `a` is not* less than the corresponding element in `b`, or `0`*
565	/// otherwise.
566	///
567	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps)
568	#[inline]
569	#[target_feature(enable = "sse")]
570	#[cfg_attr(test, assert_instr(cmpnltps))]
571	#[stable(feature = "simd_x86", since = "1.27.0")]
572	pub fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
573	unsafe { cmpps(a, b, imm8:`5`) }
574	}
575
576	/// Compares each of the four floats in `a` to the corresponding element in `b`.
577	/// The result in the output vector will be `0xffffffff` if the input element
578	/// in `a` is not* less than or equal to the corresponding element in `b`, or*
579	/// `0` otherwise.
580	///
581	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps)
582	#[inline]
583	#[target_feature(enable = "sse")]
584	#[cfg_attr(test, assert_instr(cmpnleps))]
585	#[stable(feature = "simd_x86", since = "1.27.0")]
586	pub fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
587	unsafe { cmpps(a, b, imm8:`6`) }
588	}
589
590	/// Compares each of the four floats in `a` to the corresponding element in `b`.
591	/// The result in the output vector will be `0xffffffff` if the input element
592	/// in `a` is not* greater than the corresponding element in `b`, or `0`*
593	/// otherwise.
594	///
595	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps)
596	#[inline]
597	#[target_feature(enable = "sse")]
598	#[cfg_attr(test, assert_instr(cmpnltps))]
599	#[stable(feature = "simd_x86", since = "1.27.0")]
600	pub fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
601	unsafe { cmpps(a:b, b:a, imm8:`5`) }
602	}
603
604	/// Compares each of the four floats in `a` to the corresponding element in `b`.
605	/// The result in the output vector will be `0xffffffff` if the input element
606	/// in `a` is not* greater than or equal to the corresponding element in `b`,*
607	/// or `0` otherwise.
608	///
609	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
610	#[inline]
611	#[target_feature(enable = "sse")]
612	#[cfg_attr(test, assert_instr(cmpnleps))]
613	#[stable(feature = "simd_x86", since = "1.27.0")]
614	pub fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
615	unsafe { cmpps(a:b, b:a, imm8:`6`) }
616	}
617
618	/// Compares each of the four floats in `a` to the corresponding element in `b`.
619	/// Returns four floats that have one of two possible bit patterns. The element
620	/// in the output vector will be `0xffffffff` if the input elements in `a` and
621	/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
622	///
623	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
624	#[inline]
625	#[target_feature(enable = "sse")]
626	#[cfg_attr(test, assert_instr(cmpordps))]
627	#[stable(feature = "simd_x86", since = "1.27.0")]
628	pub fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
629	unsafe { cmpps(a:b, b:a, imm8:`7`) }
630	}
631
632	/// Compares each of the four floats in `a` to the corresponding element in `b`.
633	/// Returns four floats that have one of two possible bit patterns. The element
634	/// in the output vector will be `0xffffffff` if the input elements in `a` and
635	/// `b` are unordered (i.e., at least on of them is a NaN), or 0 otherwise.
636	///
637	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
638	#[inline]
639	#[target_feature(enable = "sse")]
640	#[cfg_attr(test, assert_instr(cmpunordps))]
641	#[stable(feature = "simd_x86", since = "1.27.0")]
642	pub fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
643	unsafe { cmpps(a:b, b:a, imm8:`3`) }
644	}
645
646	/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
647	/// `1` if they are equal, or `0` otherwise.
648	///
649	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
650	#[inline]
651	#[target_feature(enable = "sse")]
652	#[cfg_attr(test, assert_instr(comiss))]
653	#[stable(feature = "simd_x86", since = "1.27.0")]
654	pub fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
655	unsafe { comieq_ss(a, b) }
656	}
657
658	/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
659	/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
660	///
661	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
662	#[inline]
663	#[target_feature(enable = "sse")]
664	#[cfg_attr(test, assert_instr(comiss))]
665	#[stable(feature = "simd_x86", since = "1.27.0")]
666	pub fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
667	unsafe { comilt_ss(a, b) }
668	}
669
670	/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
671	/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
672	/// otherwise.
673	///
674	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
675	#[inline]
676	#[target_feature(enable = "sse")]
677	#[cfg_attr(test, assert_instr(comiss))]
678	#[stable(feature = "simd_x86", since = "1.27.0")]
679	pub fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
680	unsafe { comile_ss(a, b) }
681	}
682
683	/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
684	/// `1` if the value from `a` is greater than the one from `b`, or `0`
685	/// otherwise.
686	///
687	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss)
688	#[inline]
689	#[target_feature(enable = "sse")]
690	#[cfg_attr(test, assert_instr(comiss))]
691	#[stable(feature = "simd_x86", since = "1.27.0")]
692	pub fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
693	unsafe { comigt_ss(a, b) }
694	}
695
696	/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
697	/// `1` if the value from `a` is greater than or equal to the one from `b`, or
698	/// `0` otherwise.
699	///
700	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss)
701	#[inline]
702	#[target_feature(enable = "sse")]
703	#[cfg_attr(test, assert_instr(comiss))]
704	#[stable(feature = "simd_x86", since = "1.27.0")]
705	pub fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
706	unsafe { comige_ss(a, b) }
707	}
708
709	/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
710	/// `1` if they are not* equal, or `0` otherwise.*
711	///
712	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss)
713	#[inline]
714	#[target_feature(enable = "sse")]
715	#[cfg_attr(test, assert_instr(comiss))]
716	#[stable(feature = "simd_x86", since = "1.27.0")]
717	pub fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
718	unsafe { comineq_ss(a, b) }
719	}
720
721	/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
722	/// `1` if they are equal, or `0` otherwise. This instruction will not signal
723	/// an exception if either argument is a quiet NaN.
724	///
725	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss)
726	#[inline]
727	#[target_feature(enable = "sse")]
728	#[cfg_attr(test, assert_instr(ucomiss))]
729	#[stable(feature = "simd_x86", since = "1.27.0")]
730	pub fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
731	unsafe { ucomieq_ss(a, b) }
732	}
733
734	/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
735	/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
736	/// This instruction will not signal an exception if either argument is a quiet
737	/// NaN.
738	///
739	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss)
740	#[inline]
741	#[target_feature(enable = "sse")]
742	#[cfg_attr(test, assert_instr(ucomiss))]
743	#[stable(feature = "simd_x86", since = "1.27.0")]
744	pub fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
745	unsafe { ucomilt_ss(a, b) }
746	}
747
748	/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
749	/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
750	/// otherwise. This instruction will not signal an exception if either argument
751	/// is a quiet NaN.
752	///
753	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss)
754	#[inline]
755	#[target_feature(enable = "sse")]
756	#[cfg_attr(test, assert_instr(ucomiss))]
757	#[stable(feature = "simd_x86", since = "1.27.0")]
758	pub fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
759	unsafe { ucomile_ss(a, b) }
760	}
761
762	/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
763	/// `1` if the value from `a` is greater than the one from `b`, or `0`
764	/// otherwise. This instruction will not signal an exception if either argument
765	/// is a quiet NaN.
766	///
767	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss)
768	#[inline]
769	#[target_feature(enable = "sse")]
770	#[cfg_attr(test, assert_instr(ucomiss))]
771	#[stable(feature = "simd_x86", since = "1.27.0")]
772	pub fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
773	unsafe { ucomigt_ss(a, b) }
774	}
775
776	/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
777	/// `1` if the value from `a` is greater than or equal to the one from `b`, or
778	/// `0` otherwise. This instruction will not signal an exception if either
779	/// argument is a quiet NaN.
780	///
781	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss)
782	#[inline]
783	#[target_feature(enable = "sse")]
784	#[cfg_attr(test, assert_instr(ucomiss))]
785	#[stable(feature = "simd_x86", since = "1.27.0")]
786	pub fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
787	unsafe { ucomige_ss(a, b) }
788	}
789
790	/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
791	/// `1` if they are not* equal, or `0` otherwise. This instruction will not*
792	/// signal an exception if either argument is a quiet NaN.
793	///
794	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss)
795	#[inline]
796	#[target_feature(enable = "sse")]
797	#[cfg_attr(test, assert_instr(ucomiss))]
798	#[stable(feature = "simd_x86", since = "1.27.0")]
799	pub fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
800	unsafe { ucomineq_ss(a, b) }
801	}
802
803	/// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
804	///
805	/// The result is rounded according to the current rounding mode. If the result
806	/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
807	/// (`i32::MIN`).
808	///
809	/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
810	///
811	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32)
812	#[inline]
813	#[target_feature(enable = "sse")]
814	#[cfg_attr(test, assert_instr(cvtss2si))]
815	#[stable(feature = "simd_x86", since = "1.27.0")]
816	pub fn _mm_cvtss_si32(a: __m128) -> i32 {
817	unsafe { cvtss2si(a) }
818	}
819
820	/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
821	///
822	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si)
823	#[inline]
824	#[target_feature(enable = "sse")]
825	#[cfg_attr(test, assert_instr(cvtss2si))]
826	#[stable(feature = "simd_x86", since = "1.27.0")]
827	pub fn _mm_cvt_ss2si(a: __m128) -> i32 {
828	_mm_cvtss_si32(a)
829	}
830
831	/// Converts the lowest 32 bit float in the input vector to a 32 bit integer
832	/// with
833	/// truncation.
834	///
835	/// The result is rounded always using truncation (round towards zero). If the
836	/// result cannot be represented as a 32 bit integer the result will be
837	/// `0x8000_0000` (`i32::MIN`).
838	///
839	/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
840	///
841	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32)
842	#[inline]
843	#[target_feature(enable = "sse")]
844	#[cfg_attr(test, assert_instr(cvttss2si))]
845	#[stable(feature = "simd_x86", since = "1.27.0")]
846	pub fn _mm_cvttss_si32(a: __m128) -> i32 {
847	unsafe { cvttss2si(a) }
848	}
849
850	/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
851	///
852	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si)
853	#[inline]
854	#[target_feature(enable = "sse")]
855	#[cfg_attr(test, assert_instr(cvttss2si))]
856	#[stable(feature = "simd_x86", since = "1.27.0")]
857	pub fn _mm_cvtt_ss2si(a: __m128) -> i32 {
858	_mm_cvttss_si32(a)
859	}
860
861	/// Extracts the lowest 32 bit float from the input vector.
862	///
863	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32)
864	#[inline]
865	#[target_feature(enable = "sse")]
866	// No point in using assert_instrs. In Unix x86_64 calling convention this is a
867	// no-op, and on msvc it's just a `mov`.
868	#[stable(feature = "simd_x86", since = "1.27.0")]
869	pub fn _mm_cvtss_f32(a: __m128) -> f32 {
870	unsafe { simd_extract!(a, `0`) }
871	}
872
873	/// Converts a 32 bit integer to a 32 bit float. The result vector is the input
874	/// vector `a` with the lowest 32 bit float replaced by the converted integer.
875	///
876	/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
877	/// input).
878	///
879	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss)
880	#[inline]
881	#[target_feature(enable = "sse")]
882	#[cfg_attr(test, assert_instr(cvtsi2ss))]
883	#[stable(feature = "simd_x86", since = "1.27.0")]
884	pub fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
885	unsafe { cvtsi2ss(a, b) }
886	}
887
888	/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
889	///
890	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss)
891	#[inline]
892	#[target_feature(enable = "sse")]
893	#[cfg_attr(test, assert_instr(cvtsi2ss))]
894	#[stable(feature = "simd_x86", since = "1.27.0")]
895	pub fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
896	_mm_cvtsi32_ss(a, b)
897	}
898
899	/// Construct a `__m128` with the lowest element set to `a` and the rest set to
900	/// zero.
901	///
902	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss)
903	#[inline]
904	#[target_feature(enable = "sse")]
905	#[cfg_attr(test, assert_instr(movss))]
906	#[stable(feature = "simd_x86", since = "1.27.0")]
907	pub fn _mm_set_ss(a: f32) -> __m128 {
908	__m128([a, `0.0`, `0.0`, `0.0`])
909	}
910
911	/// Construct a `__m128` with all element set to `a`.
912	///
913	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps)
914	#[inline]
915	#[target_feature(enable = "sse")]
916	#[cfg_attr(test, assert_instr(shufps))]
917	#[stable(feature = "simd_x86", since = "1.27.0")]
918	pub fn _mm_set1_ps(a: f32) -> __m128 {
919	__m128([a, a, a, a])
920	}
921
922	/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
923	///
924	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1)
925	#[inline]
926	#[target_feature(enable = "sse")]
927	#[cfg_attr(test, assert_instr(shufps))]
928	#[stable(feature = "simd_x86", since = "1.27.0")]
929	pub fn _mm_set_ps1(a: f32) -> __m128 {
930	_mm_set1_ps(a)
931	}
932
933	/// Construct a `__m128` from four floating point values highest to lowest.
934	///
935	/// Note that `a` will be the highest 32 bits of the result, and `d` the
936	/// lowest. This matches the standard way of writing bit patterns on x86:
937	///
938	/// ```text
939	/// bit 127 .. 96 95 .. 64 63 .. 32 31 .. 0
940	/// +---------+---------+---------+---------+
941	/// \| a \| b \| c \| d \| result
942	/// +---------+---------+---------+---------+
943	/// ```
944	///
945	/// Alternatively:
946	///
947	/// ```text
948	/// let v = _mm_set_ps(d, c, b, a);
949	/// ```
950	///
951	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps)
952	#[inline]
953	#[target_feature(enable = "sse")]
954	#[cfg_attr(test, assert_instr(unpcklps))]
955	#[stable(feature = "simd_x86", since = "1.27.0")]
956	pub fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
957	__m128([d, c, b, a])
958	}
959
960	/// Construct a `__m128` from four floating point values lowest to highest.
961	///
962	/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
963	/// bits of the result, and `d` the highest.
964	///
965	/// ```text
966	/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
967	/// ```
968	///
969	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps)
970	#[inline]
971	#[target_feature(enable = "sse")]
972	#[cfg_attr(
973	all(test, any(target_env = "msvc", target_arch = "x86_64")),
974	assert_instr(unpcklps)
975	)]
976	// On a 32-bit architecture on non-msvc it just copies the operands from the stack.
977	#[cfg_attr(
978	all(test, all(not(target_env = "msvc"), target_arch = "x86")),
979	assert_instr(movaps)
980	)]
981	#[stable(feature = "simd_x86", since = "1.27.0")]
982	pub fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
983	__m128([a, b, c, d])
984	}
985
986	/// Construct a `__m128` with all elements initialized to zero.
987	///
988	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
989	#[inline]
990	#[target_feature(enable = "sse")]
991	#[cfg_attr(test, assert_instr(xorps))]
992	#[stable(feature = "simd_x86", since = "1.27.0")]
993	pub fn _mm_setzero_ps() -> __m128 {
994	const { unsafe { mem::zeroed() } }
995	}
996
997	/// A utility function for creating masks to use with Intel shuffle and
998	/// permute intrinsics.
999	#[inline]
1000	#[allow(non_snake_case)]
1001	#[unstable(feature = "stdarch_x86_mm_shuffle", issue = "111147")]
1002	pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
1003	((z << `6`) \| (y << `4`) \| (x << `2`) \| w) as i32
1004	}
1005
1006	/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
1007	/// `b` using `MASK`.
1008	///
1009	/// The lower half of result takes values from `a` and the higher half from
1010	/// `b`. Mask is split to 2 control bits each to index the element from inputs.
1011	///
1012	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps)
1013	///
1014	/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
1015	/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
1016	/// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_).
1017	/// Performing an implicit type conversion between an unsigned integer and a signed integer
1018	/// does not cause a problem in C, however Rust's commitment to strong typing does not allow this.
1019	#[inline]
1020	#[target_feature(enable = "sse")]
1021	#[cfg_attr(test, assert_instr(shufps, MASK = `3`))]
1022	#[rustc_legacy_const_generics(`2`)]
1023	#[stable(feature = "simd_x86", since = "1.27.0")]
1024	pub fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
1025	static_assert_uimm_bits!(MASK, `8`);
1026	unsafe {
1027	simd_shuffle!(
1028	a,
1029	b,
1030	[
1031	MASK as u32 & `0b11`,
1032	(MASK as u32 >> `2`) & `0b11`,
1033	((MASK as u32 >> `4`) & `0b11`) + `4`,
1034	((MASK as u32 >> `6`) & `0b11`) + `4`,
1035	],
1036	)
1037	}
1038	}
1039
1040	/// Unpacks and interleave single-precision (32-bit) floating-point elements
1041	/// from the higher half of `a` and `b`.
1042	///
1043	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
1044	#[inline]
1045	#[target_feature(enable = "sse")]
1046	#[cfg_attr(test, assert_instr(unpckhps))]
1047	#[stable(feature = "simd_x86", since = "1.27.0")]
1048	pub fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
1049	unsafe { simd_shuffle!(a, b, [`2`, `6`, `3`, `7`]) }
1050	}
1051
1052	/// Unpacks and interleave single-precision (32-bit) floating-point elements
1053	/// from the lower half of `a` and `b`.
1054	///
1055	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
1056	#[inline]
1057	#[target_feature(enable = "sse")]
1058	#[cfg_attr(test, assert_instr(unpcklps))]
1059	#[stable(feature = "simd_x86", since = "1.27.0")]
1060	pub fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
1061	unsafe { simd_shuffle!(a, b, [`0`, `4`, `1`, `5`]) }
1062	}
1063
1064	/// Combine higher half of `a` and `b`. The higher half of `b` occupies the
1065	/// lower half of result.
1066	///
1067	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
1068	#[inline]
1069	#[target_feature(enable = "sse")]
1070	#[cfg_attr(test, assert_instr(movhlps))]
1071	#[stable(feature = "simd_x86", since = "1.27.0")]
1072	pub fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
1073	// TODO; figure why this is a different instruction on msvc?
1074	unsafe { simd_shuffle!(a, b, [`6`, `7`, `2`, `3`]) }
1075	}
1076
1077	/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
1078	/// higher half of result.
1079	///
1080	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
1081	#[inline]
1082	#[target_feature(enable = "sse")]
1083	#[cfg_attr(test, assert_instr(movlhps))]
1084	#[stable(feature = "simd_x86", since = "1.27.0")]
1085	pub fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
1086	unsafe { simd_shuffle!(a, b, [`0`, `1`, `4`, `5`]) }
1087	}
1088
1089	/// Returns a mask of the most significant bit of each element in `a`.
1090	///
1091	/// The mask is stored in the 4 least significant bits of the return value.
1092	/// All other bits are set to `0`.
1093	///
1094	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
1095	#[inline]
1096	#[target_feature(enable = "sse")]
1097	#[cfg_attr(test, assert_instr(movmskps))]
1098	#[stable(feature = "simd_x86", since = "1.27.0")]
1099	pub fn _mm_movemask_ps(a: __m128) -> i32 {
1100	// Propagate the highest bit to the rest, because simd_bitmask
1101	// requires all-1 or all-0.
1102	unsafe {
1103	let mask: i32x4 = simd_lt(x:transmute(a), y:i32x4::ZERO);
1104	simd_bitmask::<i32x4, u8>(mask).into()
1105	}
1106	}
1107
1108	/// Construct a `__m128` with the lowest element read from `p` and the other
1109	/// elements set to zero.
1110	///
1111	/// This corresponds to instructions `VMOVSS` / `MOVSS`.
1112	///
1113	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss)
1114	#[inline]
1115	#[target_feature(enable = "sse")]
1116	#[cfg_attr(test, assert_instr(movss))]
1117	#[stable(feature = "simd_x86", since = "1.27.0")]
1118	pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
1119	__m128([*p, `0.0`, `0.0`, `0.0`])
1120	}
1121
1122	/// Construct a `__m128` by duplicating the value read from `p` into all
1123	/// elements.
1124	///
1125	/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
1126	/// shuffling.
1127	///
1128	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps)
1129	#[inline]
1130	#[target_feature(enable = "sse")]
1131	#[cfg_attr(test, assert_instr(movss))]
1132	#[stable(feature = "simd_x86", since = "1.27.0")]
1133	pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
1134	let a: f32 = *p;
1135	__m128([a, a, a, a])
1136	}
1137
1138	/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
1139	///
1140	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1)
1141	#[inline]
1142	#[target_feature(enable = "sse")]
1143	#[cfg_attr(test, assert_instr(movss))]
1144	#[stable(feature = "simd_x86", since = "1.27.0")]
1145	pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
1146	_mm_load1_ps(p)
1147	}
1148
1149	/// Loads four `f32` values from aligned* memory into a `__m128`. If the*
1150	/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
1151	/// protection fault will be triggered (fatal program crash).
1152	///
1153	/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
1154	/// memory.
1155	///
1156	/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
1157	///
1158	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps)
1159	#[inline]
1160	#[target_feature(enable = "sse")]
1161	// FIXME: Rust doesn't emit alignment attributes for MSVC x86-32. Ref https://github.com/rust-lang/rust/pull/139261
1162	// All aligned load/store intrinsics are affected
1163	#[cfg_attr(
1164	all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1165	assert_instr(movaps)
1166	)]
1167	#[stable(feature = "simd_x86", since = "1.27.0")]
1168	#[allow(clippy::cast_ptr_alignment)]
1169	pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
1170	(p as const __m128)
1171	}
1172
1173	/// Loads four `f32` values from memory into a `__m128`. There are no
1174	/// restrictions
1175	/// on memory alignment. For aligned memory
1176	/// [`_mm_load_ps`](fn._mm_load_ps.html)
1177	/// may be faster.
1178	///
1179	/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1180	///
1181	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps)
1182	#[inline]
1183	#[target_feature(enable = "sse")]
1184	#[cfg_attr(test, assert_instr(movups))]
1185	#[stable(feature = "simd_x86", since = "1.27.0")]
1186	pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
1187	// Note: Using `p` would require `f32` alignment, but `movups` has no*
1188	// alignment restrictions.
1189	let mut dst: __m128 = _mm_undefined_ps();
1190	ptr::copy_nonoverlapping(
1191	src:p as *const u8,
1192	dst:ptr::addr_of_mut!(dst) as *mut u8,
1193	count:mem::size_of::<__m128>(),
1194	);
1195	dst
1196	}
1197
1198	/// Loads four `f32` values from aligned memory into a `__m128` in reverse
1199	/// order.
1200	///
1201	/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1202	/// protection fault will be triggered (fatal program crash).
1203	///
1204	/// Functionally equivalent to the following code sequence (assuming `p`
1205	/// satisfies the alignment restrictions):
1206	///
1207	/// ```text
1208	/// let a0 = p;*
1209	/// let a1 = p.add(1);*
1210	/// let a2 = p.add(2);*
1211	/// let a3 = p.add(3);*
1212	/// __m128::new(a3, a2, a1, a0)
1213	/// ```
1214	///
1215	/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
1216	/// shuffling.
1217	///
1218	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps)
1219	#[inline]
1220	#[target_feature(enable = "sse")]
1221	#[cfg_attr(
1222	all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1223	assert_instr(movaps)
1224	)]
1225	#[stable(feature = "simd_x86", since = "1.27.0")]
1226	pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
1227	let a: __m128 = _mm_load_ps(p);
1228	simd_shuffle!(a, a, [`3`, `2`, `1`, `0`])
1229	}
1230
1231	/// Stores the lowest 32 bit float of `a` into memory.
1232	///
1233	/// This intrinsic corresponds to the `MOVSS` instruction.
1234	///
1235	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss)
1236	#[inline]
1237	#[target_feature(enable = "sse")]
1238	#[cfg_attr(test, assert_instr(movss))]
1239	#[stable(feature = "simd_x86", since = "1.27.0")]
1240	pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
1241	*p = simd_extract!(a, `0`);
1242	}
1243
1244	/// Stores the lowest 32 bit float of `a` repeated four times into aligned
1245	/// memory.
1246	///
1247	/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1248	/// protection fault will be triggered (fatal program crash).
1249	///
1250	/// Functionally equivalent to the following code sequence (assuming `p`
1251	/// satisfies the alignment restrictions):
1252	///
1253	/// ```text
1254	/// let x = a.extract(0);
1255	/// p = x;*
1256	/// p.add(1) = x;*
1257	/// p.add(2) = x;*
1258	/// p.add(3) = x;*
1259	/// ```
1260	///
1261	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps)
1262	#[inline]
1263	#[target_feature(enable = "sse")]
1264	#[cfg_attr(
1265	all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1266	assert_instr(movaps)
1267	)]
1268	#[stable(feature = "simd_x86", since = "1.27.0")]
1269	#[allow(clippy::cast_ptr_alignment)]
1270	pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
1271	let b: __m128 = simd_shuffle!(a, a, [`0`, `0`, `0`, `0`]);
1272	(p as mut __m128) = b;
1273	}
1274
1275	/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
1276	///
1277	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1)
1278	#[inline]
1279	#[target_feature(enable = "sse")]
1280	#[cfg_attr(
1281	all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1282	assert_instr(movaps)
1283	)]
1284	#[stable(feature = "simd_x86", since = "1.27.0")]
1285	pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
1286	_mm_store1_ps(p, a);
1287	}
1288
1289	/// Stores four 32-bit floats into aligned* memory.*
1290	///
1291	/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1292	/// protection fault will be triggered (fatal program crash).
1293	///
1294	/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
1295	/// memory.
1296	///
1297	/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
1298	///
1299	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps)
1300	#[inline]
1301	#[target_feature(enable = "sse")]
1302	#[cfg_attr(
1303	all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1304	assert_instr(movaps)
1305	)]
1306	#[stable(feature = "simd_x86", since = "1.27.0")]
1307	#[allow(clippy::cast_ptr_alignment)]
1308	pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
1309	(p as mut __m128) = a;
1310	}
1311
1312	/// Stores four 32-bit floats into memory. There are no restrictions on memory
1313	/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
1314	/// faster.
1315	///
1316	/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1317	///
1318	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps)
1319	#[inline]
1320	#[target_feature(enable = "sse")]
1321	#[cfg_attr(test, assert_instr(movups))]
1322	#[stable(feature = "simd_x86", since = "1.27.0")]
1323	pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
1324	ptr::copy_nonoverlapping(
1325	src:ptr::addr_of!(a) as *const u8,
1326	dst:p as *mut u8,
1327	count:mem::size_of::<__m128>(),
1328	);
1329	}
1330
1331	/// Stores four 32-bit floats into aligned* memory in reverse order.*
1332	///
1333	/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1334	/// protection fault will be triggered (fatal program crash).
1335	///
1336	/// Functionally equivalent to the following code sequence (assuming `p`
1337	/// satisfies the alignment restrictions):
1338	///
1339	/// ```text
1340	/// p = a.extract(3);*
1341	/// p.add(1) = a.extract(2);*
1342	/// p.add(2) = a.extract(1);*
1343	/// p.add(3) = a.extract(0);*
1344	/// ```
1345	///
1346	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps)
1347	#[inline]
1348	#[target_feature(enable = "sse")]
1349	#[cfg_attr(
1350	all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1351	assert_instr(movaps)
1352	)]
1353	#[stable(feature = "simd_x86", since = "1.27.0")]
1354	#[allow(clippy::cast_ptr_alignment)]
1355	pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
1356	let b: __m128 = simd_shuffle!(a, a, [`3`, `2`, `1`, `0`]);
1357	(p as mut __m128) = b;
1358	}
1359
1360	/// Returns a `__m128` with the first component from `b` and the remaining
1361	/// components from `a`.
1362	///
1363	/// In other words for any `a` and `b`:
1364	/// ```text
1365	/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
1366	/// ```
1367	///
1368	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss)
1369	#[inline]
1370	#[target_feature(enable = "sse")]
1371	#[cfg_attr(test, assert_instr(movss))]
1372	#[stable(feature = "simd_x86", since = "1.27.0")]
1373	pub fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
1374	unsafe { simd_shuffle!(a, b, [`4`, `1`, `2`, `3`]) }
1375	}
1376
1377	/// Performs a serializing operation on all non-temporal ("streaming") store instructions that
1378	/// were issued by the current thread prior to this instruction.
1379	///
1380	/// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is
1381	/// ordered before any load or store instruction which follows the fence in
1382	/// synchronization order.
1383	///
1384	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence)
1385	/// (but note that Intel is only documenting the hardware-level concerns related to this
1386	/// instruction; the Intel documentation does not take into account the extra concerns that arise
1387	/// because the Rust memory model is different from the x86 memory model.)
1388	///
1389	/// # Safety of non-temporal stores
1390	///
1391	/// After using any non-temporal store intrinsic, but before any other access to the memory that the
1392	/// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the
1393	/// intrinsic.
1394	///
1395	/// Non-temporal stores behave very different from regular stores. For the purpose of the Rust
1396	/// memory model, these stores are happening asynchronously in a background thread. This means a
1397	/// non-temporal store can cause data races with other accesses, even other accesses on the same
1398	/// thread. It also means that cross-thread synchronization does not work as expected: let's say the
1399	/// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The
1400	/// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not
1401	/// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize
1402	/// with all the non-temporal stores previously started on this thread, which means in particular
1403	/// that subsequent synchronization with other threads will then work as intended again.
1404	///
1405	/// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your
1406	/// code jumps back to code outside your library. This ensures all stores inside your function
1407	/// are synchronized-before the return, and thus transitively synchronized-before everything
1408	/// the caller does after your function returns.
1409	//
1410	// The following is not a doc comment since it's not clear whether we want to put this into the
1411	// docs, but it should be written out somewhere.
1412	//
1413	// Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot
1414	// inspect, and that behave like the following functions. This explains where the docs above come
1415	// from.
1416	// ```
1417	// #[thread_local]
1418	// static mut PENDING_NONTEMP_WRITES = AtomicUsize::new(0);
1419	//
1420	// pub unsafe fn nontemporal_store<T>(ptr: mut T, val: T) {*
1421	// PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed);
1422	// // Spawn a thread that will eventually do our write.
1423	// // We need to fetch a pointer to this thread's pending-write
1424	// // counter, so that we can access it from the background thread.
1425	// let pending_writes = addr_of!(PENDING_NONTEMP_WRITES);
1426	// // If this was actual Rust code we'd have to do some extra work
1427	// // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here.
1428	// std::thread::spawn(move \|\| {
1429	// // Do the write in the background thread.
1430	// ptr.write(val);
1431	// // Register the write as done. Crucially, this is `Release`, so it
1432	// // syncs-with the `Acquire in `sfence`.
1433	// (&pending_writes).fetch_sub(1, Release);*
1434	// });
1435	// }
1436	//
1437	// pub fn sfence() {
1438	// unsafe {
1439	// // Wait until there are no more pending writes.
1440	// while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {}
1441	// }
1442	// }
1443	// ```
1444	#[inline]
1445	#[target_feature(enable = "sse")]
1446	#[cfg_attr(test, assert_instr(sfence))]
1447	#[stable(feature = "simd_x86", since = "1.27.0")]
1448	pub unsafe fn _mm_sfence() {
1449	sfence()
1450	}
1451
1452	/// Gets the unsigned 32-bit value of the MXCSR control and status register.
1453	///
1454	/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust
1455	/// floating-point operations may or may not result in this register getting updated with exception
1456	/// state, and the register can change between two invocations of this function even when no
1457	/// floating-point operations appear in the source code (since floating-point operations appearing
1458	/// earlier or later can be reordered).
1459	///
1460	/// If you need to perform some floating-point operations and check whether they raised an
1461	/// exception, use an inline assembly block for the entire sequence of operations.
1462	///
1463	/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
1464	///
1465	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr)
1466	#[inline]
1467	#[target_feature(enable = "sse")]
1468	#[cfg_attr(test, assert_instr(stmxcsr))]
1469	#[stable(feature = "simd_x86", since = "1.27.0")]
1470	#[deprecated(
1471	since = "1.75.0",
1472	note = "see `_mm_getcsr` documentation - use inline assembly instead"
1473	)]
1474	pub unsafe fn _mm_getcsr() -> u32 {
1475	unsafe {
1476	let mut result: i32 = `0_i32`;
1477	stmxcsr(ptr::addr_of_mut!(result) as *mut i8);
1478	result as u32
1479	}
1480	}
1481
1482	/// Sets the MXCSR register with the 32-bit unsigned integer value.
1483	///
1484	/// This register controls how SIMD instructions handle floating point
1485	/// operations. Modifying this register only affects the current thread.
1486	///
1487	/// It contains several groups of flags:
1488	///
1489	/// Exception flags report which exceptions occurred since last they were reset.*
1490	///
1491	/// Masking flags can be used to mask (ignore) certain exceptions. By default*
1492	/// these flags are all set to 1, so all exceptions are masked. When
1493	/// an exception is masked, the processor simply sets the exception flag and
1494	/// continues the operation. If the exception is unmasked, the flag is also set
1495	/// but additionally an exception handler is invoked.
1496	///
1497	/// Rounding mode flags control the rounding mode of floating point*
1498	/// instructions.
1499	///
1500	/// The denormals-are-zero mode flag turns all numbers which would be*
1501	/// denormalized (exponent bits are all zeros) into zeros.
1502	///
1503	/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to
1504	/// immediate Undefined Behavior: Rust assumes that these are always in their default state and
1505	/// will optimize accordingly. This even applies when the register is altered and later reset to its
1506	/// original value without any floating-point operations appearing in the source code between those
1507	/// operations (since floating-point operations appearing earlier or later can be reordered).
1508	///
1509	/// If you need to perform some floating-point operations under a different masking flags, rounding
1510	/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the
1511	/// original MXCSR register state before the end of the block.
1512	///
1513	/// ## Exception Flags
1514	///
1515	/// `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing*
1516	/// Infinity by Infinity).
1517	///
1518	/// `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized*
1519	/// number. Mainly this can cause loss of precision.
1520	///
1521	/// `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.*
1522	///
1523	/// `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a*
1524	/// result was too large to be represented (e.g., an `f32` with absolute
1525	/// value greater than `2^128`).
1526	///
1527	/// `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a*
1528	/// result was too small to be represented in a normalized way (e.g., an
1529	/// `f32` with absolute value smaller than `2^-126`.)
1530	///
1531	/// `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.*
1532	/// precision exception). This means some precision was lost due to rounding.
1533	/// For example, the fraction `1/3` cannot be represented accurately in a
1534	/// 32 or 64 bit float and computing it would cause this exception to be
1535	/// raised. Precision exceptions are very common, so they are usually masked.
1536	///
1537	/// Exception flags can be read and set using the convenience functions
1538	/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
1539	/// check if an operation caused some overflow:
1540	///
1541	/// ```rust,ignore
1542	/// _MM_SET_EXCEPTION_STATE(`0`); // clear all exception flags
1543	/// // perform calculations
1544	/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != `0` {
1545	/// // handle overflow
1546	/// }
1547	/// ```
1548	///
1549	/// ## Masking Flags
1550	///
1551	/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
1552	/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
1553	/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
1554	///
1555	/// A single masking bit can be set via
1556	///
1557	/// ```rust,ignore
1558	/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
1559	/// ```
1560	///
1561	/// However, since mask bits are by default all set to 1, it is more common to
1562	/// want to disable* certain bits. For example, to unmask the underflow*
1563	/// exception, use:
1564	///
1565	/// ```rust,ignore
1566	/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow
1567	/// exception
1568	/// ```
1569	///
1570	/// Warning: an unmasked exception will cause an exception handler to be
1571	/// called.
1572	/// The standard handler will simply terminate the process. So, in this case
1573	/// any underflow exception would terminate the current process with something
1574	/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1575	///
1576	/// ## Rounding Mode
1577	///
1578	/// The rounding mode is describe using two bits. It can be read and set using
1579	/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
1580	/// `_MM_SET_ROUNDING_MODE(mode)`.
1581	///
1582	/// The rounding modes are:
1583	///
1584	/// `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision*
1585	/// value. If two values are equally close, round to even (i.e., least
1586	/// significant bit will be zero).
1587	///
1588	/// `_MM_ROUND_DOWN`: Round toward negative Infinity.*
1589	///
1590	/// `_MM_ROUND_UP`: Round toward positive Infinity.*
1591	///
1592	/// `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).*
1593	///
1594	/// Example:
1595	///
1596	/// ```rust,ignore
1597	/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
1598	/// ```
1599	///
1600	/// ## Denormals-are-zero/Flush-to-zero Mode
1601	///
1602	/// If this bit is set, values that would be denormalized will be set to zero
1603	/// instead. This is turned off by default.
1604	///
1605	/// You can read and enable/disable this mode via the helper functions
1606	/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
1607	///
1608	/// ```rust,ignore
1609	/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
1610	/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
1611	/// ```
1612	///
1613	///
1614	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr)
1615	#[inline]
1616	#[target_feature(enable = "sse")]
1617	#[cfg_attr(test, assert_instr(ldmxcsr))]
1618	#[stable(feature = "simd_x86", since = "1.27.0")]
1619	#[deprecated(
1620	since = "1.75.0",
1621	note = "see `_mm_setcsr` documentation - use inline assembly instead"
1622	)]
1623	pub unsafe fn _mm_setcsr(val: u32) {
1624	ldmxcsr(ptr::addr_of!(val) as *const i8);
1625	}
1626
1627	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1628	#[stable(feature = "simd_x86", since = "1.27.0")]
1629	pub const _MM_EXCEPT_INVALID: u32 = `0x0001`;
1630	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1631	#[stable(feature = "simd_x86", since = "1.27.0")]
1632	pub const _MM_EXCEPT_DENORM: u32 = `0x0002`;
1633	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1634	#[stable(feature = "simd_x86", since = "1.27.0")]
1635	pub const _MM_EXCEPT_DIV_ZERO: u32 = `0x0004`;
1636	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1637	#[stable(feature = "simd_x86", since = "1.27.0")]
1638	pub const _MM_EXCEPT_OVERFLOW: u32 = `0x0008`;
1639	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1640	#[stable(feature = "simd_x86", since = "1.27.0")]
1641	pub const _MM_EXCEPT_UNDERFLOW: u32 = `0x0010`;
1642	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1643	#[stable(feature = "simd_x86", since = "1.27.0")]
1644	pub const _MM_EXCEPT_INEXACT: u32 = `0x0020`;
1645	/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
1646	#[stable(feature = "simd_x86", since = "1.27.0")]
1647	pub const _MM_EXCEPT_MASK: u32 = `0x003f`;
1648
1649	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1650	#[stable(feature = "simd_x86", since = "1.27.0")]
1651	pub const _MM_MASK_INVALID: u32 = `0x0080`;
1652	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1653	#[stable(feature = "simd_x86", since = "1.27.0")]
1654	pub const _MM_MASK_DENORM: u32 = `0x0100`;
1655	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1656	#[stable(feature = "simd_x86", since = "1.27.0")]
1657	pub const _MM_MASK_DIV_ZERO: u32 = `0x0200`;
1658	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1659	#[stable(feature = "simd_x86", since = "1.27.0")]
1660	pub const _MM_MASK_OVERFLOW: u32 = `0x0400`;
1661	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1662	#[stable(feature = "simd_x86", since = "1.27.0")]
1663	pub const _MM_MASK_UNDERFLOW: u32 = `0x0800`;
1664	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1665	#[stable(feature = "simd_x86", since = "1.27.0")]
1666	pub const _MM_MASK_INEXACT: u32 = `0x1000`;
1667	/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
1668	#[stable(feature = "simd_x86", since = "1.27.0")]
1669	pub const _MM_MASK_MASK: u32 = `0x1f80`;
1670
1671	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1672	#[stable(feature = "simd_x86", since = "1.27.0")]
1673	pub const _MM_ROUND_NEAREST: u32 = `0x0000`;
1674	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1675	#[stable(feature = "simd_x86", since = "1.27.0")]
1676	pub const _MM_ROUND_DOWN: u32 = `0x2000`;
1677	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1678	#[stable(feature = "simd_x86", since = "1.27.0")]
1679	pub const _MM_ROUND_UP: u32 = `0x4000`;
1680	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1681	#[stable(feature = "simd_x86", since = "1.27.0")]
1682	pub const _MM_ROUND_TOWARD_ZERO: u32 = `0x6000`;
1683
1684	/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
1685	#[stable(feature = "simd_x86", since = "1.27.0")]
1686	pub const _MM_ROUND_MASK: u32 = `0x6000`;
1687
1688	/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
1689	#[stable(feature = "simd_x86", since = "1.27.0")]
1690	pub const _MM_FLUSH_ZERO_MASK: u32 = `0x8000`;
1691	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1692	#[stable(feature = "simd_x86", since = "1.27.0")]
1693	pub const _MM_FLUSH_ZERO_ON: u32 = `0x8000`;
1694	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1695	#[stable(feature = "simd_x86", since = "1.27.0")]
1696	pub const _MM_FLUSH_ZERO_OFF: u32 = `0x0000`;
1697
1698	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1699	///
1700	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK)
1701	#[inline]
1702	#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1703	#[allow(non_snake_case)]
1704	#[target_feature(enable = "sse")]
1705	#[stable(feature = "simd_x86", since = "1.27.0")]
1706	#[deprecated(
1707	since = "1.75.0",
1708	note = "see `_mm_getcsr` documentation - use inline assembly instead"
1709	)]
1710	pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
1711	_mm_getcsr() & _MM_MASK_MASK
1712	}
1713
1714	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1715	///
1716	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE)
1717	#[inline]
1718	#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1719	#[allow(non_snake_case)]
1720	#[target_feature(enable = "sse")]
1721	#[stable(feature = "simd_x86", since = "1.27.0")]
1722	#[deprecated(
1723	since = "1.75.0",
1724	note = "see `_mm_getcsr` documentation - use inline assembly instead"
1725	)]
1726	pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
1727	_mm_getcsr() & _MM_EXCEPT_MASK
1728	}
1729
1730	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1731	///
1732	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE)
1733	#[inline]
1734	#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1735	#[allow(non_snake_case)]
1736	#[target_feature(enable = "sse")]
1737	#[stable(feature = "simd_x86", since = "1.27.0")]
1738	#[deprecated(
1739	since = "1.75.0",
1740	note = "see `_mm_getcsr` documentation - use inline assembly instead"
1741	)]
1742	pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
1743	_mm_getcsr() & _MM_FLUSH_ZERO_MASK
1744	}
1745
1746	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1747	///
1748	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE)
1749	#[inline]
1750	#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1751	#[allow(non_snake_case)]
1752	#[target_feature(enable = "sse")]
1753	#[stable(feature = "simd_x86", since = "1.27.0")]
1754	#[deprecated(
1755	since = "1.75.0",
1756	note = "see `_mm_getcsr` documentation - use inline assembly instead"
1757	)]
1758	pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
1759	_mm_getcsr() & _MM_ROUND_MASK
1760	}
1761
1762	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1763	///
1764	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK)
1765	#[inline]
1766	#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1767	#[allow(non_snake_case)]
1768	#[target_feature(enable = "sse")]
1769	#[stable(feature = "simd_x86", since = "1.27.0")]
1770	#[deprecated(
1771	since = "1.75.0",
1772	note = "see `_mm_setcsr` documentation - use inline assembly instead"
1773	)]
1774	pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
1775	_mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) \| (x & _MM_MASK_MASK))
1776	}
1777
1778	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1779	///
1780	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE)
1781	#[inline]
1782	#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1783	#[allow(non_snake_case)]
1784	#[target_feature(enable = "sse")]
1785	#[stable(feature = "simd_x86", since = "1.27.0")]
1786	#[deprecated(
1787	since = "1.75.0",
1788	note = "see `_mm_setcsr` documentation - use inline assembly instead"
1789	)]
1790	pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
1791	_mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) \| (x & _MM_EXCEPT_MASK))
1792	}
1793
1794	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1795	///
1796	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE)
1797	#[inline]
1798	#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1799	#[allow(non_snake_case)]
1800	#[target_feature(enable = "sse")]
1801	#[stable(feature = "simd_x86", since = "1.27.0")]
1802	#[deprecated(
1803	since = "1.75.0",
1804	note = "see `_mm_setcsr` documentation - use inline assembly instead"
1805	)]
1806	pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
1807	_mm_setcsr((_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) \| (x & _MM_FLUSH_ZERO_MASK))
1808	}
1809
1810	/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1811	///
1812	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE)
1813	#[inline]
1814	#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1815	#[allow(non_snake_case)]
1816	#[target_feature(enable = "sse")]
1817	#[stable(feature = "simd_x86", since = "1.27.0")]
1818	#[deprecated(
1819	since = "1.75.0",
1820	note = "see `_mm_setcsr` documentation - use inline assembly instead"
1821	)]
1822	pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
1823	_mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) \| (x & _MM_ROUND_MASK))
1824	}
1825
1826	/// See [`_mm_prefetch`](fn._mm_prefetch.html).
1827	#[stable(feature = "simd_x86", since = "1.27.0")]
1828	pub const _MM_HINT_T0: i32 = `3`;
1829
1830	/// See [`_mm_prefetch`](fn._mm_prefetch.html).
1831	#[stable(feature = "simd_x86", since = "1.27.0")]
1832	pub const _MM_HINT_T1: i32 = `2`;
1833
1834	/// See [`_mm_prefetch`](fn._mm_prefetch.html).
1835	#[stable(feature = "simd_x86", since = "1.27.0")]
1836	pub const _MM_HINT_T2: i32 = `1`;
1837
1838	/// See [`_mm_prefetch`](fn._mm_prefetch.html).
1839	#[stable(feature = "simd_x86", since = "1.27.0")]
1840	pub const _MM_HINT_NTA: i32 = `0`;
1841
1842	/// See [`_mm_prefetch`](fn._mm_prefetch.html).
1843	#[stable(feature = "simd_x86", since = "1.27.0")]
1844	pub const _MM_HINT_ET0: i32 = `7`;
1845
1846	/// See [`_mm_prefetch`](fn._mm_prefetch.html).
1847	#[stable(feature = "simd_x86", since = "1.27.0")]
1848	pub const _MM_HINT_ET1: i32 = `6`;
1849
1850	/// Fetch the cache line that contains address `p` using the given `STRATEGY`.
1851	///
1852	/// The `STRATEGY` must be one of:
1853	///
1854	/// [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the*
1855	/// cache hierarchy.
1856	///
1857	/// [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.*
1858	///
1859	/// [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or*
1860	/// an implementation-specific choice (e.g., L2 if there is no L3).
1861	///
1862	/// [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the*
1863	/// non-temporal access (NTA) hint. It may be a place closer than main memory
1864	/// but outside of the cache hierarchy. This is used to reduce access latency
1865	/// without polluting the cache.
1866	///
1867	/// [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and*
1868	/// [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
1869	/// and `_MM_HINT_T1` but indicate an anticipation to write to the address.
1870	///
1871	/// The actual implementation depends on the particular CPU. This instruction
1872	/// is considered a hint, so the CPU is also free to simply ignore the request.
1873	///
1874	/// The amount of prefetched data depends on the cache line size of the
1875	/// specific CPU, but it will be at least 32 bytes.
1876	///
1877	/// Common caveats:
1878	///
1879	/// Most modern CPUs already automatically prefetch data based on predicted*
1880	/// access patterns.
1881	///
1882	/// Data is usually not fetched if this would cause a TLB miss or a page*
1883	/// fault.
1884	///
1885	/// Too much prefetching can cause unnecessary cache evictions.*
1886	///
1887	/// Prefetching may also fail if there are not enough memory-subsystem*
1888	/// resources (e.g., request buffers).
1889	///
1890	///
1891	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
1892	#[inline]
1893	#[target_feature(enable = "sse")]
1894	#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
1895	#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
1896	#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
1897	#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
1898	#[rustc_legacy_const_generics(`1`)]
1899	#[stable(feature = "simd_x86", since = "1.27.0")]
1900	pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
1901	static_assert_uimm_bits!(STRATEGY, `3`);
1902	// We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
1903	// `locality` and `rw` are based on our `STRATEGY`.
1904	prefetch(p, (STRATEGY >> `2`) & `1`, STRATEGY & `3`, ty:`1`);
1905	}
1906
1907	/// Returns vector of type __m128 with indeterminate elements.with indetermination elements.
1908	/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
1909	/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
1910	/// In practice, this is typically equivalent to [`mem::zeroed`].
1911	///
1912	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
1913	#[inline]
1914	#[target_feature(enable = "sse")]
1915	#[stable(feature = "simd_x86", since = "1.27.0")]
1916	pub fn _mm_undefined_ps() -> __m128 {
1917	const { unsafe { mem::zeroed() } }
1918	}
1919
1920	/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
1921	///
1922	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS)
1923	#[inline]
1924	#[allow(non_snake_case)]
1925	#[target_feature(enable = "sse")]
1926	#[stable(feature = "simd_x86", since = "1.27.0")]
1927	pub fn _MM_TRANSPOSE4_PS(
1928	row0: &mut __m128,
1929	row1: &mut __m128,
1930	row2: &mut __m128,
1931	row3: &mut __m128,
1932	) {
1933	let tmp0: __m128 = _mm_unpacklo_ps(row0, row1);
1934	let tmp2: __m128 = _mm_unpacklo_ps(row2, row3);
1935	let tmp1: __m128 = _mm_unpackhi_ps(row0, row1);
1936	let tmp3: __m128 = _mm_unpackhi_ps(row2, row3);
1937
1938	*row0 = _mm_movelh_ps(a:tmp0, b:tmp2);
1939	*row1 = _mm_movehl_ps(a:tmp2, b:tmp0);
1940	*row2 = _mm_movelh_ps(a:tmp1, b:tmp3);
1941	*row3 = _mm_movehl_ps(a:tmp3, b:tmp1);
1942	}
1943
1944	#[allow(improper_ctypes)]
1945	unsafe extern "C" {
1946	#[link_name = "llvm.x86.sse.rcp.ss"]
1947	unsafefn rcpss(a: __m128) -> __m128;
1948	#[link_name = "llvm.x86.sse.rcp.ps"]
1949	unsafefn rcpps(a: __m128) -> __m128;
1950	#[link_name = "llvm.x86.sse.rsqrt.ss"]
1951	unsafefn rsqrtss(a: __m128) -> __m128;
1952	#[link_name = "llvm.x86.sse.rsqrt.ps"]
1953	unsafefn rsqrtps(a: __m128) -> __m128;
1954	#[link_name = "llvm.x86.sse.min.ss"]
1955	unsafefn minss(a: __m128, b: __m128) -> __m128;
1956	#[link_name = "llvm.x86.sse.min.ps"]
1957	unsafefn minps(a: __m128, b: __m128) -> __m128;
1958	#[link_name = "llvm.x86.sse.max.ss"]
1959	unsafefn maxss(a: __m128, b: __m128) -> __m128;
1960	#[link_name = "llvm.x86.sse.max.ps"]
1961	unsafefn maxps(a: __m128, b: __m128) -> __m128;
1962	#[link_name = "llvm.x86.sse.cmp.ps"]
1963	unsafefn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
1964	#[link_name = "llvm.x86.sse.comieq.ss"]
1965	unsafefn comieq_ss(a: __m128, b: __m128) -> i32;
1966	#[link_name = "llvm.x86.sse.comilt.ss"]
1967	unsafefn comilt_ss(a: __m128, b: __m128) -> i32;
1968	#[link_name = "llvm.x86.sse.comile.ss"]
1969	unsafefn comile_ss(a: __m128, b: __m128) -> i32;
1970	#[link_name = "llvm.x86.sse.comigt.ss"]
1971	unsafefn comigt_ss(a: __m128, b: __m128) -> i32;
1972	#[link_name = "llvm.x86.sse.comige.ss"]
1973	unsafefn comige_ss(a: __m128, b: __m128) -> i32;
1974	#[link_name = "llvm.x86.sse.comineq.ss"]
1975	unsafefn comineq_ss(a: __m128, b: __m128) -> i32;
1976	#[link_name = "llvm.x86.sse.ucomieq.ss"]
1977	unsafefn ucomieq_ss(a: __m128, b: __m128) -> i32;
1978	#[link_name = "llvm.x86.sse.ucomilt.ss"]
1979	unsafefn ucomilt_ss(a: __m128, b: __m128) -> i32;
1980	#[link_name = "llvm.x86.sse.ucomile.ss"]
1981	unsafefn ucomile_ss(a: __m128, b: __m128) -> i32;
1982	#[link_name = "llvm.x86.sse.ucomigt.ss"]
1983	unsafefn ucomigt_ss(a: __m128, b: __m128) -> i32;
1984	#[link_name = "llvm.x86.sse.ucomige.ss"]
1985	unsafefn ucomige_ss(a: __m128, b: __m128) -> i32;
1986	#[link_name = "llvm.x86.sse.ucomineq.ss"]
1987	unsafefn ucomineq_ss(a: __m128, b: __m128) -> i32;
1988	#[link_name = "llvm.x86.sse.cvtss2si"]
1989	unsafefn cvtss2si(a: __m128) -> i32;
1990	#[link_name = "llvm.x86.sse.cvttss2si"]
1991	unsafefn cvttss2si(a: __m128) -> i32;
1992	#[link_name = "llvm.x86.sse.cvtsi2ss"]
1993	unsafefn cvtsi2ss(a: __m128, b: i32) -> __m128;
1994	#[link_name = "llvm.x86.sse.sfence"]
1995	unsafefn sfence();
1996	#[link_name = "llvm.x86.sse.stmxcsr"]
1997	unsafefn stmxcsr(p: *mut i8);
1998	#[link_name = "llvm.x86.sse.ldmxcsr"]
1999	unsafefn ldmxcsr(p: *const i8);
2000	#[link_name = "llvm.prefetch"]
2001	unsafefn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
2002	#[link_name = "llvm.x86.sse.cmp.ss"]
2003	unsafefn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
2004	}
2005
2006	/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
2007	///
2008	/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2009	/// exception _may_ be generated.
2010	///
2011	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps)
2012	///
2013	/// # Safety of non-temporal stores
2014	///
2015	/// After using this intrinsic, but before any other access to the memory that this intrinsic
2016	/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
2017	/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
2018	/// return.
2019	///
2020	/// See [`_mm_sfence`] for details.
2021	#[inline]
2022	#[target_feature(enable = "sse")]
2023	#[cfg_attr(test, assert_instr(movntps))]
2024	#[stable(feature = "simd_x86", since = "1.27.0")]
2025	#[allow(clippy::cast_ptr_alignment)]
2026	pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
2027	crate::arch::asm!(
2028	vps!("movntps", ",{a}"),
2029	p = in(reg) mem_addr,
2030	a = in(xmm_reg) a,
2031	options(nostack, preserves_flags),
2032	);
2033	}
2034
2035	#[cfg(test)]
2036	mod tests {
2037	use crate::{hint::black_box, mem::transmute, ptr};
2038	use std::boxed;
2039	use stdarch_test::simd_test;
2040
2041	use crate::core_arch::{simd::, x86::};
2042
2043	const NAN: f32 = f32::NAN;
2044
2045	#[simd_test(enable = "sse")]
2046	unsafe fn test_mm_add_ps() {
2047	let a = _mm_setr_ps(`-1.0`, `5.0`, `0.0`, `-10.0`);
2048	let b = _mm_setr_ps(`-100.0`, `20.0`, `0.0`, `-5.0`);
2049	let r = _mm_add_ps(a, b);
2050	assert_eq_m128(r, _mm_setr_ps(`-101.0`, `25.0`, `0.0`, `-15.0`));
2051	}
2052
2053	#[simd_test(enable = "sse")]
2054	unsafe fn test_mm_add_ss() {
2055	let a = _mm_set_ps(`-1.0`, `5.0`, `0.0`, `-10.0`);
2056	let b = _mm_set_ps(`-100.0`, `20.0`, `0.0`, `-5.0`);
2057	let r = _mm_add_ss(a, b);
2058	assert_eq_m128(r, _mm_set_ps(`-1.0`, `5.0`, `0.0`, `-15.0`));
2059	}
2060
2061	#[simd_test(enable = "sse")]
2062	unsafe fn test_mm_sub_ps() {
2063	let a = _mm_setr_ps(`-1.0`, `5.0`, `0.0`, `-10.0`);
2064	let b = _mm_setr_ps(`-100.0`, `20.0`, `0.0`, `-5.0`);
2065	let r = _mm_sub_ps(a, b);
2066	assert_eq_m128(r, _mm_setr_ps(`99.0`, `-15.0`, `0.0`, `-5.0`));
2067	}
2068
2069	#[simd_test(enable = "sse")]
2070	unsafe fn test_mm_sub_ss() {
2071	let a = _mm_setr_ps(`-1.0`, `5.0`, `0.0`, `-10.0`);
2072	let b = _mm_setr_ps(`-100.0`, `20.0`, `0.0`, `-5.0`);
2073	let r = _mm_sub_ss(a, b);
2074	assert_eq_m128(r, _mm_setr_ps(`99.0`, `5.0`, `0.0`, `-10.0`));
2075	}
2076
2077	#[simd_test(enable = "sse")]
2078	unsafe fn test_mm_mul_ps() {
2079	let a = _mm_setr_ps(`-1.0`, `5.0`, `0.0`, `-10.0`);
2080	let b = _mm_setr_ps(`-100.0`, `20.0`, `0.0`, `-5.0`);
2081	let r = _mm_mul_ps(a, b);
2082	assert_eq_m128(r, _mm_setr_ps(`100.0`, `100.0`, `0.0`, `50.0`));
2083	}
2084
2085	#[simd_test(enable = "sse")]
2086	unsafe fn test_mm_mul_ss() {
2087	let a = _mm_setr_ps(`-1.0`, `5.0`, `0.0`, `-10.0`);
2088	let b = _mm_setr_ps(`-100.0`, `20.0`, `0.0`, `-5.0`);
2089	let r = _mm_mul_ss(a, b);
2090	assert_eq_m128(r, _mm_setr_ps(`100.0`, `5.0`, `0.0`, `-10.0`));
2091	}
2092
2093	#[simd_test(enable = "sse")]
2094	unsafe fn test_mm_div_ps() {
2095	let a = _mm_setr_ps(`-1.0`, `5.0`, `2.0`, `-10.0`);
2096	let b = _mm_setr_ps(`-100.0`, `20.0`, `0.2`, `-5.0`);
2097	let r = _mm_div_ps(a, b);
2098	assert_eq_m128(r, _mm_setr_ps(`0.01`, `0.25`, `10.0`, `2.0`));
2099	}
2100
2101	#[simd_test(enable = "sse")]
2102	unsafe fn test_mm_div_ss() {
2103	let a = _mm_setr_ps(`-1.0`, `5.0`, `0.0`, `-10.0`);
2104	let b = _mm_setr_ps(`-100.0`, `20.0`, `0.0`, `-5.0`);
2105	let r = _mm_div_ss(a, b);
2106	assert_eq_m128(r, _mm_setr_ps(`0.01`, `5.0`, `0.0`, `-10.0`));
2107	}
2108
2109	#[simd_test(enable = "sse")]
2110	unsafe fn test_mm_sqrt_ss() {
2111	let a = _mm_setr_ps(`4.0`, `13.0`, `16.0`, `100.0`);
2112	let r = _mm_sqrt_ss(a);
2113	let e = _mm_setr_ps(`2.0`, `13.0`, `16.0`, `100.0`);
2114	assert_eq_m128(r, e);
2115	}
2116
2117	#[simd_test(enable = "sse")]
2118	unsafe fn test_mm_sqrt_ps() {
2119	let a = _mm_setr_ps(`4.0`, `13.0`, `16.0`, `100.0`);
2120	let r = _mm_sqrt_ps(a);
2121	let e = _mm_setr_ps(`2.0`, `3.6055512`, `4.0`, `10.0`);
2122	assert_eq_m128(r, e);
2123	}
2124
2125	#[simd_test(enable = "sse")]
2126	unsafe fn test_mm_rcp_ss() {
2127	let a = _mm_setr_ps(`4.0`, `13.0`, `16.0`, `100.0`);
2128	let r = _mm_rcp_ss(a);
2129	let e = _mm_setr_ps(`0.24993896`, `13.0`, `16.0`, `100.0`);
2130	let rel_err = `0.00048828125`;
2131	assert_approx_eq!(get_m128(r, `0`), get_m128(e, `0`), `2.` * rel_err);
2132	for i in `1`..`4` {
2133	assert_eq!(get_m128(r, i), get_m128(e, i));
2134	}
2135	}
2136
2137	#[simd_test(enable = "sse")]
2138	unsafe fn test_mm_rcp_ps() {
2139	let a = _mm_setr_ps(`4.0`, `13.0`, `16.0`, `100.0`);
2140	let r = _mm_rcp_ps(a);
2141	let e = _mm_setr_ps(`0.24993896`, `0.0769043`, `0.06248474`, `0.0099983215`);
2142	let rel_err = `0.00048828125`;
2143	for i in `0`..`4` {
2144	assert_approx_eq!(get_m128(r, i), get_m128(e, i), `2.` * rel_err);
2145	}
2146	}
2147
2148	#[simd_test(enable = "sse")]
2149	unsafe fn test_mm_rsqrt_ss() {
2150	let a = _mm_setr_ps(`4.0`, `13.0`, `16.0`, `100.0`);
2151	let r = _mm_rsqrt_ss(a);
2152	let e = _mm_setr_ps(`0.49987793`, `13.0`, `16.0`, `100.0`);
2153	let rel_err = `0.00048828125`;
2154	for i in `0`..`4` {
2155	assert_approx_eq!(get_m128(r, i), get_m128(e, i), `2.` * rel_err);
2156	}
2157	}
2158
2159	#[simd_test(enable = "sse")]
2160	unsafe fn test_mm_rsqrt_ps() {
2161	let a = _mm_setr_ps(`4.0`, `13.0`, `16.0`, `100.0`);
2162	let r = _mm_rsqrt_ps(a);
2163	let e = _mm_setr_ps(`0.49987793`, `0.2772827`, `0.24993896`, `0.099990845`);
2164	let rel_err = `0.00048828125`;
2165	for i in `0`..`4` {
2166	assert_approx_eq!(get_m128(r, i), get_m128(e, i), `2.` * rel_err);
2167	}
2168	}
2169
2170	#[simd_test(enable = "sse")]
2171	unsafe fn test_mm_min_ss() {
2172	let a = _mm_setr_ps(`-1.0`, `5.0`, `0.0`, `-10.0`);
2173	let b = _mm_setr_ps(`-100.0`, `20.0`, `0.0`, `-5.0`);
2174	let r = _mm_min_ss(a, b);
2175	assert_eq_m128(r, _mm_setr_ps(`-100.0`, `5.0`, `0.0`, `-10.0`));
2176	}
2177
2178	#[simd_test(enable = "sse")]
2179	unsafe fn test_mm_min_ps() {
2180	let a = _mm_setr_ps(`-1.0`, `5.0`, `0.0`, `-10.0`);
2181	let b = _mm_setr_ps(`-100.0`, `20.0`, `0.0`, `-5.0`);
2182	let r = _mm_min_ps(a, b);
2183	assert_eq_m128(r, _mm_setr_ps(`-100.0`, `5.0`, `0.0`, `-10.0`));
2184
2185	// `_mm_min_ps` can not* be implemented using the `simd_min` rust intrinsic. `simd_min`*
2186	// is lowered by the llvm codegen backend to `llvm.minnum.v` llvm intrinsic. This intrinsic*
2187	// doesn't specify how -0.0 is handled. Unfortunately it happens to behave different from
2188	// the `minps` x86 instruction on x86. The `llvm.minnum.v` llvm intrinsic equals*
2189	// `r1` to `a` and `r2` to `b`.
2190	let a = _mm_setr_ps(`-0.0`, `0.0`, `0.0`, `0.0`);
2191	let b = _mm_setr_ps(`0.0`, `0.0`, `0.0`, `0.0`);
2192	let r1: [u8; `16`] = transmute(_mm_min_ps(a, b));
2193	let r2: [u8; `16`] = transmute(_mm_min_ps(b, a));
2194	let a: [u8; `16`] = transmute(a);
2195	let b: [u8; `16`] = transmute(b);
2196	assert_eq!(r1, b);
2197	assert_eq!(r2, a);
2198	assert_ne!(a, b); // sanity check that -0.0 is actually present
2199	}
2200
2201	#[simd_test(enable = "sse")]
2202	unsafe fn test_mm_max_ss() {
2203	let a = _mm_setr_ps(`-1.0`, `5.0`, `0.0`, `-10.0`);
2204	let b = _mm_setr_ps(`-100.0`, `20.0`, `0.0`, `-5.0`);
2205	let r = _mm_max_ss(a, b);
2206	assert_eq_m128(r, _mm_setr_ps(`-1.0`, `5.0`, `0.0`, `-10.0`));
2207	}
2208
2209	#[simd_test(enable = "sse")]
2210	unsafe fn test_mm_max_ps() {
2211	let a = _mm_setr_ps(`-1.0`, `5.0`, `0.0`, `-10.0`);
2212	let b = _mm_setr_ps(`-100.0`, `20.0`, `0.0`, `-5.0`);
2213	let r = _mm_max_ps(a, b);
2214	assert_eq_m128(r, _mm_setr_ps(`-1.0`, `20.0`, `0.0`, `-5.0`));
2215
2216	// Check SSE-specific semantics for -0.0 handling.
2217	let a = _mm_setr_ps(`-0.0`, `0.0`, `0.0`, `0.0`);
2218	let b = _mm_setr_ps(`0.0`, `0.0`, `0.0`, `0.0`);
2219	let r1: [u8; `16`] = transmute(_mm_max_ps(a, b));
2220	let r2: [u8; `16`] = transmute(_mm_max_ps(b, a));
2221	let a: [u8; `16`] = transmute(a);
2222	let b: [u8; `16`] = transmute(b);
2223	assert_eq!(r1, b);
2224	assert_eq!(r2, a);
2225	assert_ne!(a, b); // sanity check that -0.0 is actually present
2226	}
2227
2228	#[simd_test(enable = "sse")]
2229	unsafe fn test_mm_and_ps() {
2230	let a = transmute(u32x4::splat(`0b0011`));
2231	let b = transmute(u32x4::splat(`0b0101`));
2232	let r = _mm_and_ps(black_box(&a), black_box(&b));
2233	let e = transmute(u32x4::splat(`0b0001`));
2234	assert_eq_m128(r, e);
2235	}
2236
2237	#[simd_test(enable = "sse")]
2238	unsafe fn test_mm_andnot_ps() {
2239	let a = transmute(u32x4::splat(`0b0011`));
2240	let b = transmute(u32x4::splat(`0b0101`));
2241	let r = _mm_andnot_ps(black_box(&a), black_box(&b));
2242	let e = transmute(u32x4::splat(`0b0100`));
2243	assert_eq_m128(r, e);
2244	}
2245
2246	#[simd_test(enable = "sse")]
2247	unsafe fn test_mm_or_ps() {
2248	let a = transmute(u32x4::splat(`0b0011`));
2249	let b = transmute(u32x4::splat(`0b0101`));
2250	let r = _mm_or_ps(black_box(&a), black_box(&b));
2251	let e = transmute(u32x4::splat(`0b0111`));
2252	assert_eq_m128(r, e);
2253	}
2254
2255	#[simd_test(enable = "sse")]
2256	unsafe fn test_mm_xor_ps() {
2257	let a = transmute(u32x4::splat(`0b0011`));
2258	let b = transmute(u32x4::splat(`0b0101`));
2259	let r = _mm_xor_ps(black_box(&a), black_box(&b));
2260	let e = transmute(u32x4::splat(`0b0110`));
2261	assert_eq_m128(r, e);
2262	}
2263
2264	#[simd_test(enable = "sse")]
2265	unsafe fn test_mm_cmpeq_ss() {
2266	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
2267	let b = _mm_setr_ps(`-1.0`, `5.0`, `6.0`, `7.0`);
2268	let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
2269	let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(`0`), `2.0`, `3.0`, `4.0`));
2270	assert_eq!(r, e);
2271
2272	let b2 = _mm_setr_ps(`1.0`, `5.0`, `6.0`, `7.0`);
2273	let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
2274	let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(`0xffffffff`), `2.0`, `3.0`, `4.0`));
2275	assert_eq!(r2, e2);
2276	}
2277
2278	#[simd_test(enable = "sse")]
2279	unsafe fn test_mm_cmplt_ss() {
2280	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
2281	let b = _mm_setr_ps(`0.0`, `5.0`, `6.0`, `7.0`);
2282	let c = _mm_setr_ps(`1.0`, `5.0`, `6.0`, `7.0`);
2283	let d = _mm_setr_ps(`2.0`, `5.0`, `6.0`, `7.0`);
2284
2285	let b1 = `0u32`; // a.extract(0) < b.extract(0)
2286	let c1 = `0u32`; // a.extract(0) < c.extract(0)
2287	let d1 = !`0u32`; // a.extract(0) < d.extract(0)
2288
2289	let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
2290	let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), `2.0`, `3.0`, `4.0`));
2291	assert_eq!(rb, eb);
2292
2293	let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
2294	let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), `2.0`, `3.0`, `4.0`));
2295	assert_eq!(rc, ec);
2296
2297	let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
2298	let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), `2.0`, `3.0`, `4.0`));
2299	assert_eq!(rd, ed);
2300	}
2301
2302	#[simd_test(enable = "sse")]
2303	unsafe fn test_mm_cmple_ss() {
2304	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
2305	let b = _mm_setr_ps(`0.0`, `5.0`, `6.0`, `7.0`);
2306	let c = _mm_setr_ps(`1.0`, `5.0`, `6.0`, `7.0`);
2307	let d = _mm_setr_ps(`2.0`, `5.0`, `6.0`, `7.0`);
2308
2309	let b1 = `0u32`; // a.extract(0) <= b.extract(0)
2310	let c1 = !`0u32`; // a.extract(0) <= c.extract(0)
2311	let d1 = !`0u32`; // a.extract(0) <= d.extract(0)
2312
2313	let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
2314	let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), `2.0`, `3.0`, `4.0`));
2315	assert_eq!(rb, eb);
2316
2317	let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
2318	let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), `2.0`, `3.0`, `4.0`));
2319	assert_eq!(rc, ec);
2320
2321	let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
2322	let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), `2.0`, `3.0`, `4.0`));
2323	assert_eq!(rd, ed);
2324	}
2325
2326	#[simd_test(enable = "sse")]
2327	unsafe fn test_mm_cmpgt_ss() {
2328	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
2329	let b = _mm_setr_ps(`0.0`, `5.0`, `6.0`, `7.0`);
2330	let c = _mm_setr_ps(`1.0`, `5.0`, `6.0`, `7.0`);
2331	let d = _mm_setr_ps(`2.0`, `5.0`, `6.0`, `7.0`);
2332
2333	let b1 = !`0u32`; // a.extract(0) > b.extract(0)
2334	let c1 = `0u32`; // a.extract(0) > c.extract(0)
2335	let d1 = `0u32`; // a.extract(0) > d.extract(0)
2336
2337	let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
2338	let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), `2.0`, `3.0`, `4.0`));
2339	assert_eq!(rb, eb);
2340
2341	let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
2342	let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), `2.0`, `3.0`, `4.0`));
2343	assert_eq!(rc, ec);
2344
2345	let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
2346	let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), `2.0`, `3.0`, `4.0`));
2347	assert_eq!(rd, ed);
2348	}
2349
2350	#[simd_test(enable = "sse")]
2351	unsafe fn test_mm_cmpge_ss() {
2352	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
2353	let b = _mm_setr_ps(`0.0`, `5.0`, `6.0`, `7.0`);
2354	let c = _mm_setr_ps(`1.0`, `5.0`, `6.0`, `7.0`);
2355	let d = _mm_setr_ps(`2.0`, `5.0`, `6.0`, `7.0`);
2356
2357	let b1 = !`0u32`; // a.extract(0) >= b.extract(0)
2358	let c1 = !`0u32`; // a.extract(0) >= c.extract(0)
2359	let d1 = `0u32`; // a.extract(0) >= d.extract(0)
2360
2361	let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
2362	let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), `2.0`, `3.0`, `4.0`));
2363	assert_eq!(rb, eb);
2364
2365	let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
2366	let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), `2.0`, `3.0`, `4.0`));
2367	assert_eq!(rc, ec);
2368
2369	let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
2370	let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), `2.0`, `3.0`, `4.0`));
2371	assert_eq!(rd, ed);
2372	}
2373
2374	#[simd_test(enable = "sse")]
2375	unsafe fn test_mm_cmpneq_ss() {
2376	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
2377	let b = _mm_setr_ps(`0.0`, `5.0`, `6.0`, `7.0`);
2378	let c = _mm_setr_ps(`1.0`, `5.0`, `6.0`, `7.0`);
2379	let d = _mm_setr_ps(`2.0`, `5.0`, `6.0`, `7.0`);
2380
2381	let b1 = !`0u32`; // a.extract(0) != b.extract(0)
2382	let c1 = `0u32`; // a.extract(0) != c.extract(0)
2383	let d1 = !`0u32`; // a.extract(0) != d.extract(0)
2384
2385	let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
2386	let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), `2.0`, `3.0`, `4.0`));
2387	assert_eq!(rb, eb);
2388
2389	let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
2390	let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), `2.0`, `3.0`, `4.0`));
2391	assert_eq!(rc, ec);
2392
2393	let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
2394	let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), `2.0`, `3.0`, `4.0`));
2395	assert_eq!(rd, ed);
2396	}
2397
2398	#[simd_test(enable = "sse")]
2399	unsafe fn test_mm_cmpnlt_ss() {
2400	// TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there
2401	// must be a difference. It may have to do with behavior in the
2402	// presence of NaNs (signaling or quiet). If so, we should add tests
2403	// for those.
2404
2405	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
2406	let b = _mm_setr_ps(`0.0`, `5.0`, `6.0`, `7.0`);
2407	let c = _mm_setr_ps(`1.0`, `5.0`, `6.0`, `7.0`);
2408	let d = _mm_setr_ps(`2.0`, `5.0`, `6.0`, `7.0`);
2409
2410	let b1 = !`0u32`; // a.extract(0) >= b.extract(0)
2411	let c1 = !`0u32`; // a.extract(0) >= c.extract(0)
2412	let d1 = `0u32`; // a.extract(0) >= d.extract(0)
2413
2414	let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
2415	let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), `2.0`, `3.0`, `4.0`));
2416	assert_eq!(rb, eb);
2417
2418	let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
2419	let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), `2.0`, `3.0`, `4.0`));
2420	assert_eq!(rc, ec);
2421
2422	let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
2423	let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), `2.0`, `3.0`, `4.0`));
2424	assert_eq!(rd, ed);
2425	}
2426
2427	#[simd_test(enable = "sse")]
2428	unsafe fn test_mm_cmpnle_ss() {
2429	// TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
2430	// must be a difference. It may have to do with behavior in the
2431	// presence
2432	// of NaNs (signaling or quiet). If so, we should add tests for those.
2433
2434	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
2435	let b = _mm_setr_ps(`0.0`, `5.0`, `6.0`, `7.0`);
2436	let c = _mm_setr_ps(`1.0`, `5.0`, `6.0`, `7.0`);
2437	let d = _mm_setr_ps(`2.0`, `5.0`, `6.0`, `7.0`);
2438
2439	let b1 = !`0u32`; // a.extract(0) > b.extract(0)
2440	let c1 = `0u32`; // a.extract(0) > c.extract(0)
2441	let d1 = `0u32`; // a.extract(0) > d.extract(0)
2442
2443	let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
2444	let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), `2.0`, `3.0`, `4.0`));
2445	assert_eq!(rb, eb);
2446
2447	let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
2448	let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), `2.0`, `3.0`, `4.0`));
2449	assert_eq!(rc, ec);
2450
2451	let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
2452	let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), `2.0`, `3.0`, `4.0`));
2453	assert_eq!(rd, ed);
2454	}
2455
2456	#[simd_test(enable = "sse")]
2457	unsafe fn test_mm_cmpngt_ss() {
2458	// TODO: this test is exactly the same as for `_mm_cmple_ss`, but there
2459	// must be a difference. It may have to do with behavior in the
2460	// presence of NaNs (signaling or quiet). If so, we should add tests
2461	// for those.
2462
2463	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
2464	let b = _mm_setr_ps(`0.0`, `5.0`, `6.0`, `7.0`);
2465	let c = _mm_setr_ps(`1.0`, `5.0`, `6.0`, `7.0`);
2466	let d = _mm_setr_ps(`2.0`, `5.0`, `6.0`, `7.0`);
2467
2468	let b1 = `0u32`; // a.extract(0) <= b.extract(0)
2469	let c1 = !`0u32`; // a.extract(0) <= c.extract(0)
2470	let d1 = !`0u32`; // a.extract(0) <= d.extract(0)
2471
2472	let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
2473	let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), `2.0`, `3.0`, `4.0`));
2474	assert_eq!(rb, eb);
2475
2476	let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
2477	let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), `2.0`, `3.0`, `4.0`));
2478	assert_eq!(rc, ec);
2479
2480	let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
2481	let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), `2.0`, `3.0`, `4.0`));
2482	assert_eq!(rd, ed);
2483	}
2484
2485	#[simd_test(enable = "sse")]
2486	unsafe fn test_mm_cmpnge_ss() {
2487	// TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there
2488	// must be a difference. It may have to do with behavior in the
2489	// presence of NaNs (signaling or quiet). If so, we should add tests
2490	// for those.
2491
2492	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
2493	let b = _mm_setr_ps(`0.0`, `5.0`, `6.0`, `7.0`);
2494	let c = _mm_setr_ps(`1.0`, `5.0`, `6.0`, `7.0`);
2495	let d = _mm_setr_ps(`2.0`, `5.0`, `6.0`, `7.0`);
2496
2497	let b1 = `0u32`; // a.extract(0) < b.extract(0)
2498	let c1 = `0u32`; // a.extract(0) < c.extract(0)
2499	let d1 = !`0u32`; // a.extract(0) < d.extract(0)
2500
2501	let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
2502	let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), `2.0`, `3.0`, `4.0`));
2503	assert_eq!(rb, eb);
2504
2505	let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
2506	let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), `2.0`, `3.0`, `4.0`));
2507	assert_eq!(rc, ec);
2508
2509	let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
2510	let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), `2.0`, `3.0`, `4.0`));
2511	assert_eq!(rd, ed);
2512	}
2513
2514	#[simd_test(enable = "sse")]
2515	unsafe fn test_mm_cmpord_ss() {
2516	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
2517	let b = _mm_setr_ps(`0.0`, `5.0`, `6.0`, `7.0`);
2518	let c = _mm_setr_ps(NAN, `5.0`, `6.0`, `7.0`);
2519	let d = _mm_setr_ps(`2.0`, `5.0`, `6.0`, `7.0`);
2520
2521	let b1 = !`0u32`; // a.extract(0) ord b.extract(0)
2522	let c1 = `0u32`; // a.extract(0) ord c.extract(0)
2523	let d1 = !`0u32`; // a.extract(0) ord d.extract(0)
2524
2525	let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
2526	let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), `2.0`, `3.0`, `4.0`));
2527	assert_eq!(rb, eb);
2528
2529	let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
2530	let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), `2.0`, `3.0`, `4.0`));
2531	assert_eq!(rc, ec);
2532
2533	let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
2534	let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), `2.0`, `3.0`, `4.0`));
2535	assert_eq!(rd, ed);
2536	}
2537
2538	#[simd_test(enable = "sse")]
2539	unsafe fn test_mm_cmpunord_ss() {
2540	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
2541	let b = _mm_setr_ps(`0.0`, `5.0`, `6.0`, `7.0`);
2542	let c = _mm_setr_ps(NAN, `5.0`, `6.0`, `7.0`);
2543	let d = _mm_setr_ps(`2.0`, `5.0`, `6.0`, `7.0`);
2544
2545	let b1 = `0u32`; // a.extract(0) unord b.extract(0)
2546	let c1 = !`0u32`; // a.extract(0) unord c.extract(0)
2547	let d1 = `0u32`; // a.extract(0) unord d.extract(0)
2548
2549	let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
2550	let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), `2.0`, `3.0`, `4.0`));
2551	assert_eq!(rb, eb);
2552
2553	let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
2554	let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), `2.0`, `3.0`, `4.0`));
2555	assert_eq!(rc, ec);
2556
2557	let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
2558	let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), `2.0`, `3.0`, `4.0`));
2559	assert_eq!(rd, ed);
2560	}
2561
2562	#[simd_test(enable = "sse")]
2563	unsafe fn test_mm_cmpeq_ps() {
2564	let a = _mm_setr_ps(`10.0`, `50.0`, `1.0`, NAN);
2565	let b = _mm_setr_ps(`15.0`, `20.0`, `1.0`, NAN);
2566	let tru = !`0u32`;
2567	let fls = `0u32`;
2568
2569	let e = u32x4::new(fls, fls, tru, fls);
2570	let r: u32x4 = transmute(_mm_cmpeq_ps(a, b));
2571	assert_eq!(r, e);
2572	}
2573
2574	#[simd_test(enable = "sse")]
2575	unsafe fn test_mm_cmplt_ps() {
2576	let a = _mm_setr_ps(`10.0`, `50.0`, `1.0`, NAN);
2577	let b = _mm_setr_ps(`15.0`, `20.0`, `1.0`, NAN);
2578	let tru = !`0u32`;
2579	let fls = `0u32`;
2580
2581	let e = u32x4::new(tru, fls, fls, fls);
2582	let r: u32x4 = transmute(_mm_cmplt_ps(a, b));
2583	assert_eq!(r, e);
2584	}
2585
2586	#[simd_test(enable = "sse")]
2587	unsafe fn test_mm_cmple_ps() {
2588	let a = _mm_setr_ps(`10.0`, `50.0`, `1.0`, `4.0`);
2589	let b = _mm_setr_ps(`15.0`, `20.0`, `1.0`, NAN);
2590	let tru = !`0u32`;
2591	let fls = `0u32`;
2592
2593	let e = u32x4::new(tru, fls, tru, fls);
2594	let r: u32x4 = transmute(_mm_cmple_ps(a, b));
2595	assert_eq!(r, e);
2596	}
2597
2598	#[simd_test(enable = "sse")]
2599	unsafe fn test_mm_cmpgt_ps() {
2600	let a = _mm_setr_ps(`10.0`, `50.0`, `1.0`, NAN);
2601	let b = _mm_setr_ps(`15.0`, `20.0`, `1.0`, `42.0`);
2602	let tru = !`0u32`;
2603	let fls = `0u32`;
2604
2605	let e = u32x4::new(fls, tru, fls, fls);
2606	let r: u32x4 = transmute(_mm_cmpgt_ps(a, b));
2607	assert_eq!(r, e);
2608	}
2609
2610	#[simd_test(enable = "sse")]
2611	unsafe fn test_mm_cmpge_ps() {
2612	let a = _mm_setr_ps(`10.0`, `50.0`, `1.0`, NAN);
2613	let b = _mm_setr_ps(`15.0`, `20.0`, `1.0`, `42.0`);
2614	let tru = !`0u32`;
2615	let fls = `0u32`;
2616
2617	let e = u32x4::new(fls, tru, tru, fls);
2618	let r: u32x4 = transmute(_mm_cmpge_ps(a, b));
2619	assert_eq!(r, e);
2620	}
2621
2622	#[simd_test(enable = "sse")]
2623	unsafe fn test_mm_cmpneq_ps() {
2624	let a = _mm_setr_ps(`10.0`, `50.0`, `1.0`, NAN);
2625	let b = _mm_setr_ps(`15.0`, `20.0`, `1.0`, NAN);
2626	let tru = !`0u32`;
2627	let fls = `0u32`;
2628
2629	let e = u32x4::new(tru, tru, fls, tru);
2630	let r: u32x4 = transmute(_mm_cmpneq_ps(a, b));
2631	assert_eq!(r, e);
2632	}
2633
2634	#[simd_test(enable = "sse")]
2635	unsafe fn test_mm_cmpnlt_ps() {
2636	let a = _mm_setr_ps(`10.0`, `50.0`, `1.0`, NAN);
2637	let b = _mm_setr_ps(`15.0`, `20.0`, `1.0`, `5.0`);
2638	let tru = !`0u32`;
2639	let fls = `0u32`;
2640
2641	let e = u32x4::new(fls, tru, tru, tru);
2642	let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b));
2643	assert_eq!(r, e);
2644	}
2645
2646	#[simd_test(enable = "sse")]
2647	unsafe fn test_mm_cmpnle_ps() {
2648	let a = _mm_setr_ps(`10.0`, `50.0`, `1.0`, NAN);
2649	let b = _mm_setr_ps(`15.0`, `20.0`, `1.0`, `5.0`);
2650	let tru = !`0u32`;
2651	let fls = `0u32`;
2652
2653	let e = u32x4::new(fls, tru, fls, tru);
2654	let r: u32x4 = transmute(_mm_cmpnle_ps(a, b));
2655	assert_eq!(r, e);
2656	}
2657
2658	#[simd_test(enable = "sse")]
2659	unsafe fn test_mm_cmpngt_ps() {
2660	let a = _mm_setr_ps(`10.0`, `50.0`, `1.0`, NAN);
2661	let b = _mm_setr_ps(`15.0`, `20.0`, `1.0`, `5.0`);
2662	let tru = !`0u32`;
2663	let fls = `0u32`;
2664
2665	let e = u32x4::new(tru, fls, tru, tru);
2666	let r: u32x4 = transmute(_mm_cmpngt_ps(a, b));
2667	assert_eq!(r, e);
2668	}
2669
2670	#[simd_test(enable = "sse")]
2671	unsafe fn test_mm_cmpnge_ps() {
2672	let a = _mm_setr_ps(`10.0`, `50.0`, `1.0`, NAN);
2673	let b = _mm_setr_ps(`15.0`, `20.0`, `1.0`, `5.0`);
2674	let tru = !`0u32`;
2675	let fls = `0u32`;
2676
2677	let e = u32x4::new(tru, fls, fls, tru);
2678	let r: u32x4 = transmute(_mm_cmpnge_ps(a, b));
2679	assert_eq!(r, e);
2680	}
2681
2682	#[simd_test(enable = "sse")]
2683	unsafe fn test_mm_cmpord_ps() {
2684	let a = _mm_setr_ps(`10.0`, `50.0`, NAN, NAN);
2685	let b = _mm_setr_ps(`15.0`, NAN, `1.0`, NAN);
2686	let tru = !`0u32`;
2687	let fls = `0u32`;
2688
2689	let e = u32x4::new(tru, fls, fls, fls);
2690	let r: u32x4 = transmute(_mm_cmpord_ps(a, b));
2691	assert_eq!(r, e);
2692	}
2693
2694	#[simd_test(enable = "sse")]
2695	unsafe fn test_mm_cmpunord_ps() {
2696	let a = _mm_setr_ps(`10.0`, `50.0`, NAN, NAN);
2697	let b = _mm_setr_ps(`15.0`, NAN, `1.0`, NAN);
2698	let tru = !`0u32`;
2699	let fls = `0u32`;
2700
2701	let e = u32x4::new(fls, tru, tru, tru);
2702	let r: u32x4 = transmute(_mm_cmpunord_ps(a, b));
2703	assert_eq!(r, e);
2704	}
2705
2706	#[simd_test(enable = "sse")]
2707	unsafe fn test_mm_comieq_ss() {
2708	let aa = &[`3.0f32`, `12.0`, `23.0`, NAN];
2709	let bb = &[`3.0f32`, `47.5`, `1.5`, NAN];
2710
2711	let ee = &[`1i32`, `0`, `0`, `0`];
2712
2713	for i in `0`..`4` {
2714	let a = _mm_setr_ps(aa[i], `1.0`, `2.0`, `3.0`);
2715	let b = _mm_setr_ps(bb[i], `0.0`, `2.0`, `4.0`);
2716
2717	let r = _mm_comieq_ss(a, b);
2718
2719	assert_eq!(
2720	ee[i], r,
2721	"_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2722	a, b, r, ee[i], i
2723	);
2724	}
2725	}
2726
2727	#[simd_test(enable = "sse")]
2728	unsafe fn test_mm_comilt_ss() {
2729	let aa = &[`3.0f32`, `12.0`, `23.0`, NAN];
2730	let bb = &[`3.0f32`, `47.5`, `1.5`, NAN];
2731
2732	let ee = &[`0i32`, `1`, `0`, `0`];
2733
2734	for i in `0`..`4` {
2735	let a = _mm_setr_ps(aa[i], `1.0`, `2.0`, `3.0`);
2736	let b = _mm_setr_ps(bb[i], `0.0`, `2.0`, `4.0`);
2737
2738	let r = _mm_comilt_ss(a, b);
2739
2740	assert_eq!(
2741	ee[i], r,
2742	"_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2743	a, b, r, ee[i], i
2744	);
2745	}
2746	}
2747
2748	#[simd_test(enable = "sse")]
2749	unsafe fn test_mm_comile_ss() {
2750	let aa = &[`3.0f32`, `12.0`, `23.0`, NAN];
2751	let bb = &[`3.0f32`, `47.5`, `1.5`, NAN];
2752
2753	let ee = &[`1i32`, `1`, `0`, `0`];
2754
2755	for i in `0`..`4` {
2756	let a = _mm_setr_ps(aa[i], `1.0`, `2.0`, `3.0`);
2757	let b = _mm_setr_ps(bb[i], `0.0`, `2.0`, `4.0`);
2758
2759	let r = _mm_comile_ss(a, b);
2760
2761	assert_eq!(
2762	ee[i], r,
2763	"_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2764	a, b, r, ee[i], i
2765	);
2766	}
2767	}
2768
2769	#[simd_test(enable = "sse")]
2770	unsafe fn test_mm_comigt_ss() {
2771	let aa = &[`3.0f32`, `12.0`, `23.0`, NAN];
2772	let bb = &[`3.0f32`, `47.5`, `1.5`, NAN];
2773
2774	let ee = &[`1i32`, `0`, `1`, `0`];
2775
2776	for i in `0`..`4` {
2777	let a = _mm_setr_ps(aa[i], `1.0`, `2.0`, `3.0`);
2778	let b = _mm_setr_ps(bb[i], `0.0`, `2.0`, `4.0`);
2779
2780	let r = _mm_comige_ss(a, b);
2781
2782	assert_eq!(
2783	ee[i], r,
2784	"_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2785	a, b, r, ee[i], i
2786	);
2787	}
2788	}
2789
2790	#[simd_test(enable = "sse")]
2791	unsafe fn test_mm_comineq_ss() {
2792	let aa = &[`3.0f32`, `12.0`, `23.0`, NAN];
2793	let bb = &[`3.0f32`, `47.5`, `1.5`, NAN];
2794
2795	let ee = &[`0i32`, `1`, `1`, `1`];
2796
2797	for i in `0`..`4` {
2798	let a = _mm_setr_ps(aa[i], `1.0`, `2.0`, `3.0`);
2799	let b = _mm_setr_ps(bb[i], `0.0`, `2.0`, `4.0`);
2800
2801	let r = _mm_comineq_ss(a, b);
2802
2803	assert_eq!(
2804	ee[i], r,
2805	"_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2806	a, b, r, ee[i], i
2807	);
2808	}
2809	}
2810
2811	#[simd_test(enable = "sse")]
2812	unsafe fn test_mm_ucomieq_ss() {
2813	let aa = &[`3.0f32`, `12.0`, `23.0`, NAN];
2814	let bb = &[`3.0f32`, `47.5`, `1.5`, NAN];
2815
2816	let ee = &[`1i32`, `0`, `0`, `0`];
2817
2818	for i in `0`..`4` {
2819	let a = _mm_setr_ps(aa[i], `1.0`, `2.0`, `3.0`);
2820	let b = _mm_setr_ps(bb[i], `0.0`, `2.0`, `4.0`);
2821
2822	let r = _mm_ucomieq_ss(a, b);
2823
2824	assert_eq!(
2825	ee[i], r,
2826	"_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2827	a, b, r, ee[i], i
2828	);
2829	}
2830	}
2831
2832	#[simd_test(enable = "sse")]
2833	unsafe fn test_mm_ucomilt_ss() {
2834	let aa = &[`3.0f32`, `12.0`, `23.0`, NAN];
2835	let bb = &[`3.0f32`, `47.5`, `1.5`, NAN];
2836
2837	let ee = &[`0i32`, `1`, `0`, `0`];
2838
2839	for i in `0`..`4` {
2840	let a = _mm_setr_ps(aa[i], `1.0`, `2.0`, `3.0`);
2841	let b = _mm_setr_ps(bb[i], `0.0`, `2.0`, `4.0`);
2842
2843	let r = _mm_ucomilt_ss(a, b);
2844
2845	assert_eq!(
2846	ee[i], r,
2847	"_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2848	a, b, r, ee[i], i
2849	);
2850	}
2851	}
2852
2853	#[simd_test(enable = "sse")]
2854	unsafe fn test_mm_ucomile_ss() {
2855	let aa = &[`3.0f32`, `12.0`, `23.0`, NAN];
2856	let bb = &[`3.0f32`, `47.5`, `1.5`, NAN];
2857
2858	let ee = &[`1i32`, `1`, `0`, `0`];
2859
2860	for i in `0`..`4` {
2861	let a = _mm_setr_ps(aa[i], `1.0`, `2.0`, `3.0`);
2862	let b = _mm_setr_ps(bb[i], `0.0`, `2.0`, `4.0`);
2863
2864	let r = _mm_ucomile_ss(a, b);
2865
2866	assert_eq!(
2867	ee[i], r,
2868	"_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2869	a, b, r, ee[i], i
2870	);
2871	}
2872	}
2873
2874	#[simd_test(enable = "sse")]
2875	unsafe fn test_mm_ucomigt_ss() {
2876	let aa = &[`3.0f32`, `12.0`, `23.0`, NAN];
2877	let bb = &[`3.0f32`, `47.5`, `1.5`, NAN];
2878
2879	let ee = &[`0i32`, `0`, `1`, `0`];
2880
2881	for i in `0`..`4` {
2882	let a = _mm_setr_ps(aa[i], `1.0`, `2.0`, `3.0`);
2883	let b = _mm_setr_ps(bb[i], `0.0`, `2.0`, `4.0`);
2884
2885	let r = _mm_ucomigt_ss(a, b);
2886
2887	assert_eq!(
2888	ee[i], r,
2889	"_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2890	a, b, r, ee[i], i
2891	);
2892	}
2893	}
2894
2895	#[simd_test(enable = "sse")]
2896	unsafe fn test_mm_ucomige_ss() {
2897	let aa = &[`3.0f32`, `12.0`, `23.0`, NAN];
2898	let bb = &[`3.0f32`, `47.5`, `1.5`, NAN];
2899
2900	let ee = &[`1i32`, `0`, `1`, `0`];
2901
2902	for i in `0`..`4` {
2903	let a = _mm_setr_ps(aa[i], `1.0`, `2.0`, `3.0`);
2904	let b = _mm_setr_ps(bb[i], `0.0`, `2.0`, `4.0`);
2905
2906	let r = _mm_ucomige_ss(a, b);
2907
2908	assert_eq!(
2909	ee[i], r,
2910	"_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2911	a, b, r, ee[i], i
2912	);
2913	}
2914	}
2915
2916	#[simd_test(enable = "sse")]
2917	unsafe fn test_mm_ucomineq_ss() {
2918	let aa = &[`3.0f32`, `12.0`, `23.0`, NAN];
2919	let bb = &[`3.0f32`, `47.5`, `1.5`, NAN];
2920
2921	let ee = &[`0i32`, `1`, `1`, `1`];
2922
2923	for i in `0`..`4` {
2924	let a = _mm_setr_ps(aa[i], `1.0`, `2.0`, `3.0`);
2925	let b = _mm_setr_ps(bb[i], `0.0`, `2.0`, `4.0`);
2926
2927	let r = _mm_ucomineq_ss(a, b);
2928
2929	assert_eq!(
2930	ee[i], r,
2931	"_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2932	a, b, r, ee[i], i
2933	);
2934	}
2935	}
2936
2937	#[simd_test(enable = "sse")]
2938	unsafe fn test_mm_cvtss_si32() {
2939	let inputs = &[`42.0f32`, `-3.1`, `4.0e10`, `4.0e-20`, NAN, `2147483500.1`];
2940	let result = &[`42i32`, `-3`, i32::MIN, `0`, i32::MIN, `2147483520`];
2941	for i in `0`..inputs.len() {
2942	let x = _mm_setr_ps(inputs[i], `1.0`, `3.0`, `4.0`);
2943	let e = result[i];
2944	let r = _mm_cvtss_si32(x);
2945	assert_eq!(
2946	e, r,
2947	"TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
2948	i, x, r, e
2949	);
2950	}
2951	}
2952
2953	#[simd_test(enable = "sse")]
2954	unsafe fn test_mm_cvttss_si32() {
2955	let inputs = &[
2956	(`42.0f32`, `42i32`),
2957	(`-31.4`, `-31`),
2958	(`-33.5`, `-33`),
2959	(`-34.5`, `-34`),
2960	(`10.999`, `10`),
2961	(`-5.99`, `-5`),
2962	(`4.0e10`, i32::MIN),
2963	(`4.0e-10`, `0`),
2964	(NAN, i32::MIN),
2965	(`2147483500.1`, `2147483520`),
2966	];
2967	for (i, &(xi, e)) in inputs.iter().enumerate() {
2968	let x = _mm_setr_ps(xi, `1.0`, `3.0`, `4.0`);
2969	let r = _mm_cvttss_si32(x);
2970	assert_eq!(
2971	e, r,
2972	"TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
2973	i, x, r, e
2974	);
2975	}
2976	}
2977
2978	#[simd_test(enable = "sse")]
2979	unsafe fn test_mm_cvtsi32_ss() {
2980	let inputs = &[
2981	(`4555i32`, `4555.0f32`),
2982	(`322223333`, `322223330.0`),
2983	(`-432`, `-432.0`),
2984	(`-322223333`, `-322223330.0`),
2985	];
2986
2987	for &(x, f) in inputs.iter() {
2988	let a = _mm_setr_ps(`5.0`, `6.0`, `7.0`, `8.0`);
2989	let r = _mm_cvtsi32_ss(a, x);
2990	let e = _mm_setr_ps(f, `6.0`, `7.0`, `8.0`);
2991	assert_eq_m128(e, r);
2992	}
2993	}
2994
2995	#[simd_test(enable = "sse")]
2996	unsafe fn test_mm_cvtss_f32() {
2997	let a = _mm_setr_ps(`312.0134`, `5.0`, `6.0`, `7.0`);
2998	assert_eq!(_mm_cvtss_f32(a), `312.0134`);
2999	}
3000
3001	#[simd_test(enable = "sse")]
3002	unsafe fn test_mm_set_ss() {
3003	let r = _mm_set_ss(black_box(`4.25`));
3004	assert_eq_m128(r, _mm_setr_ps(`4.25`, `0.0`, `0.0`, `0.0`));
3005	}
3006
3007	#[simd_test(enable = "sse")]
3008	unsafe fn test_mm_set1_ps() {
3009	let r1 = _mm_set1_ps(black_box(`4.25`));
3010	let r2 = _mm_set_ps1(black_box(`4.25`));
3011	assert_eq!(get_m128(r1, `0`), `4.25`);
3012	assert_eq!(get_m128(r1, `1`), `4.25`);
3013	assert_eq!(get_m128(r1, `2`), `4.25`);
3014	assert_eq!(get_m128(r1, `3`), `4.25`);
3015	assert_eq!(get_m128(r2, `0`), `4.25`);
3016	assert_eq!(get_m128(r2, `1`), `4.25`);
3017	assert_eq!(get_m128(r2, `2`), `4.25`);
3018	assert_eq!(get_m128(r2, `3`), `4.25`);
3019	}
3020
3021	#[simd_test(enable = "sse")]
3022	unsafe fn test_mm_set_ps() {
3023	let r = _mm_set_ps(
3024	black_box(`1.0`),
3025	black_box(`2.0`),
3026	black_box(`3.0`),
3027	black_box(`4.0`),
3028	);
3029	assert_eq!(get_m128(r, `0`), `4.0`);
3030	assert_eq!(get_m128(r, `1`), `3.0`);
3031	assert_eq!(get_m128(r, `2`), `2.0`);
3032	assert_eq!(get_m128(r, `3`), `1.0`);
3033	}
3034
3035	#[simd_test(enable = "sse")]
3036	unsafe fn test_mm_setr_ps() {
3037	let r = _mm_setr_ps(
3038	black_box(`1.0`),
3039	black_box(`2.0`),
3040	black_box(`3.0`),
3041	black_box(`4.0`),
3042	);
3043	assert_eq_m128(r, _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`));
3044	}
3045
3046	#[simd_test(enable = "sse")]
3047	unsafe fn test_mm_setzero_ps() {
3048	let r = *black_box(&_mm_setzero_ps());
3049	assert_eq_m128(r, _mm_set1_ps(`0.0`));
3050	}
3051
3052	#[simd_test(enable = "sse")]
3053	unsafe fn test_MM_SHUFFLE() {
3054	assert_eq!(_MM_SHUFFLE(`0`, `1`, `1`, `3`), `0b00_01_01_11`);
3055	assert_eq!(_MM_SHUFFLE(`3`, `1`, `1`, `0`), `0b11_01_01_00`);
3056	assert_eq!(_MM_SHUFFLE(`1`, `2`, `2`, `1`), `0b01_10_10_01`);
3057	}
3058
3059	#[simd_test(enable = "sse")]
3060	unsafe fn test_mm_shuffle_ps() {
3061	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
3062	let b = _mm_setr_ps(`5.0`, `6.0`, `7.0`, `8.0`);
3063	let r = _mm_shuffle_ps::<`0b00_01_01_11`>(a, b);
3064	assert_eq_m128(r, _mm_setr_ps(`4.0`, `2.0`, `6.0`, `5.0`));
3065	}
3066
3067	#[simd_test(enable = "sse")]
3068	unsafe fn test_mm_unpackhi_ps() {
3069	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
3070	let b = _mm_setr_ps(`5.0`, `6.0`, `7.0`, `8.0`);
3071	let r = _mm_unpackhi_ps(a, b);
3072	assert_eq_m128(r, _mm_setr_ps(`3.0`, `7.0`, `4.0`, `8.0`));
3073	}
3074
3075	#[simd_test(enable = "sse")]
3076	unsafe fn test_mm_unpacklo_ps() {
3077	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
3078	let b = _mm_setr_ps(`5.0`, `6.0`, `7.0`, `8.0`);
3079	let r = _mm_unpacklo_ps(a, b);
3080	assert_eq_m128(r, _mm_setr_ps(`1.0`, `5.0`, `2.0`, `6.0`));
3081	}
3082
3083	#[simd_test(enable = "sse")]
3084	unsafe fn test_mm_movehl_ps() {
3085	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
3086	let b = _mm_setr_ps(`5.0`, `6.0`, `7.0`, `8.0`);
3087	let r = _mm_movehl_ps(a, b);
3088	assert_eq_m128(r, _mm_setr_ps(`7.0`, `8.0`, `3.0`, `4.0`));
3089	}
3090
3091	#[simd_test(enable = "sse")]
3092	unsafe fn test_mm_movelh_ps() {
3093	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
3094	let b = _mm_setr_ps(`5.0`, `6.0`, `7.0`, `8.0`);
3095	let r = _mm_movelh_ps(a, b);
3096	assert_eq_m128(r, _mm_setr_ps(`1.0`, `2.0`, `5.0`, `6.0`));
3097	}
3098
3099	#[simd_test(enable = "sse")]
3100	unsafe fn test_mm_load_ss() {
3101	let a = `42.0f32`;
3102	let r = _mm_load_ss(ptr::addr_of!(a));
3103	assert_eq_m128(r, _mm_setr_ps(`42.0`, `0.0`, `0.0`, `0.0`));
3104	}
3105
3106	#[simd_test(enable = "sse")]
3107	unsafe fn test_mm_load1_ps() {
3108	let a = `42.0f32`;
3109	let r = _mm_load1_ps(ptr::addr_of!(a));
3110	assert_eq_m128(r, _mm_setr_ps(`42.0`, `42.0`, `42.0`, `42.0`));
3111	}
3112
3113	#[simd_test(enable = "sse")]
3114	unsafe fn test_mm_load_ps() {
3115	let vals = &[`1.0f32`, `2.0`, `3.0`, `4.0`, `5.0`, `6.0`, `7.0`, `8.0`];
3116
3117	let mut p = vals.as_ptr();
3118	let mut fixup = `0.0f32`;
3119
3120	// Make sure p is aligned, otherwise we might get a
3121	// (signal: 11, SIGSEGV: invalid memory reference)
3122
3123	let unalignment = (p as usize) & `0xf`;
3124	if unalignment != `0` {
3125	let delta = (`16` - unalignment) >> `2`;
3126	fixup = delta as f32;
3127	p = p.add(delta);
3128	}
3129
3130	let r = _mm_load_ps(p);
3131	let e = _mm_add_ps(_mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`), _mm_set1_ps(fixup));
3132	assert_eq_m128(r, e);
3133	}
3134
3135	#[simd_test(enable = "sse")]
3136	unsafe fn test_mm_loadu_ps() {
3137	let vals = &[`1.0f32`, `2.0`, `3.0`, `4.0`, `5.0`, `6.0`, `7.0`, `8.0`];
3138	let p = vals.as_ptr().add(`3`);
3139	let r = _mm_loadu_ps(black_box(p));
3140	assert_eq_m128(r, _mm_setr_ps(`4.0`, `5.0`, `6.0`, `7.0`));
3141	}
3142
3143	#[simd_test(enable = "sse")]
3144	unsafe fn test_mm_loadr_ps() {
3145	let vals = &[`1.0f32`, `2.0`, `3.0`, `4.0`, `5.0`, `6.0`, `7.0`, `8.0`];
3146
3147	let mut p = vals.as_ptr();
3148	let mut fixup = `0.0f32`;
3149
3150	// Make sure p is aligned, otherwise we might get a
3151	// (signal: 11, SIGSEGV: invalid memory reference)
3152
3153	let unalignment = (p as usize) & `0xf`;
3154	if unalignment != `0` {
3155	let delta = (`16` - unalignment) >> `2`;
3156	fixup = delta as f32;
3157	p = p.add(delta);
3158	}
3159
3160	let r = _mm_loadr_ps(p);
3161	let e = _mm_add_ps(_mm_setr_ps(`4.0`, `3.0`, `2.0`, `1.0`), _mm_set1_ps(fixup));
3162	assert_eq_m128(r, e);
3163	}
3164
3165	#[simd_test(enable = "sse")]
3166	unsafe fn test_mm_store_ss() {
3167	let mut vals = [`0.0f32`; `8`];
3168	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
3169	_mm_store_ss(vals.as_mut_ptr().add(`1`), a);
3170
3171	assert_eq!(vals[`0`], `0.0`);
3172	assert_eq!(vals[`1`], `1.0`);
3173	assert_eq!(vals[`2`], `0.0`);
3174	}
3175
3176	#[simd_test(enable = "sse")]
3177	unsafe fn test_mm_store1_ps() {
3178	let mut vals = [`0.0f32`; `8`];
3179	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
3180
3181	let mut ofs = `0`;
3182	let mut p = vals.as_mut_ptr();
3183
3184	if (p as usize) & `0xf` != `0` {
3185	ofs = (`16` - ((p as usize) & `0xf`)) >> `2`;
3186	p = p.add(ofs);
3187	}
3188
3189	_mm_store1_ps(p, *black_box(&a));
3190
3191	if ofs > `0` {
3192	assert_eq!(vals[ofs - `1`], `0.0`);
3193	}
3194	assert_eq!(vals[ofs + `0`], `1.0`);
3195	assert_eq!(vals[ofs + `1`], `1.0`);
3196	assert_eq!(vals[ofs + `2`], `1.0`);
3197	assert_eq!(vals[ofs + `3`], `1.0`);
3198	assert_eq!(vals[ofs + `4`], `0.0`);
3199	}
3200
3201	#[simd_test(enable = "sse")]
3202	unsafe fn test_mm_store_ps() {
3203	let mut vals = [`0.0f32`; `8`];
3204	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
3205
3206	let mut ofs = `0`;
3207	let mut p = vals.as_mut_ptr();
3208
3209	// Align p to 16-byte boundary
3210	if (p as usize) & `0xf` != `0` {
3211	ofs = (`16` - ((p as usize) & `0xf`)) >> `2`;
3212	p = p.add(ofs);
3213	}
3214
3215	_mm_store_ps(p, *black_box(&a));
3216
3217	if ofs > `0` {
3218	assert_eq!(vals[ofs - `1`], `0.0`);
3219	}
3220	assert_eq!(vals[ofs + `0`], `1.0`);
3221	assert_eq!(vals[ofs + `1`], `2.0`);
3222	assert_eq!(vals[ofs + `2`], `3.0`);
3223	assert_eq!(vals[ofs + `3`], `4.0`);
3224	assert_eq!(vals[ofs + `4`], `0.0`);
3225	}
3226
3227	#[simd_test(enable = "sse")]
3228	unsafe fn test_mm_storer_ps() {
3229	let mut vals = [`0.0f32`; `8`];
3230	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
3231
3232	let mut ofs = `0`;
3233	let mut p = vals.as_mut_ptr();
3234
3235	// Align p to 16-byte boundary
3236	if (p as usize) & `0xf` != `0` {
3237	ofs = (`16` - ((p as usize) & `0xf`)) >> `2`;
3238	p = p.add(ofs);
3239	}
3240
3241	_mm_storer_ps(p, *black_box(&a));
3242
3243	if ofs > `0` {
3244	assert_eq!(vals[ofs - `1`], `0.0`);
3245	}
3246	assert_eq!(vals[ofs + `0`], `4.0`);
3247	assert_eq!(vals[ofs + `1`], `3.0`);
3248	assert_eq!(vals[ofs + `2`], `2.0`);
3249	assert_eq!(vals[ofs + `3`], `1.0`);
3250	assert_eq!(vals[ofs + `4`], `0.0`);
3251	}
3252
3253	#[simd_test(enable = "sse")]
3254	unsafe fn test_mm_storeu_ps() {
3255	let mut vals = [`0.0f32`; `8`];
3256	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
3257
3258	let mut ofs = `0`;
3259	let mut p = vals.as_mut_ptr();
3260
3261	// Make sure p is not* aligned to 16-byte boundary*
3262	if (p as usize) & `0xf` == `0` {
3263	ofs = `1`;
3264	p = p.add(`1`);
3265	}
3266
3267	_mm_storeu_ps(p, *black_box(&a));
3268
3269	if ofs > `0` {
3270	assert_eq!(vals[ofs - `1`], `0.0`);
3271	}
3272	assert_eq!(vals[ofs + `0`], `1.0`);
3273	assert_eq!(vals[ofs + `1`], `2.0`);
3274	assert_eq!(vals[ofs + `2`], `3.0`);
3275	assert_eq!(vals[ofs + `3`], `4.0`);
3276	assert_eq!(vals[ofs + `4`], `0.0`);
3277	}
3278
3279	#[simd_test(enable = "sse")]
3280	unsafe fn test_mm_move_ss() {
3281	let a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
3282	let b = _mm_setr_ps(`5.0`, `6.0`, `7.0`, `8.0`);
3283
3284	let r = _mm_move_ss(a, b);
3285	let e = _mm_setr_ps(`5.0`, `2.0`, `3.0`, `4.0`);
3286	assert_eq_m128(e, r);
3287	}
3288
3289	#[simd_test(enable = "sse")]
3290	unsafe fn test_mm_movemask_ps() {
3291	let r = _mm_movemask_ps(_mm_setr_ps(`-1.0`, `5.0`, `-5.0`, `0.0`));
3292	assert_eq!(r, `0b0101`);
3293
3294	let r = _mm_movemask_ps(_mm_setr_ps(`-1.0`, `-5.0`, `-5.0`, `0.0`));
3295	assert_eq!(r, `0b0111`);
3296	}
3297
3298	#[simd_test(enable = "sse")]
3299	// Miri cannot support this until it is clear how it fits in the Rust memory model
3300	#[cfg_attr(miri, ignore)]
3301	unsafe fn test_mm_sfence() {
3302	_mm_sfence();
3303	}
3304
3305	#[simd_test(enable = "sse")]
3306	unsafe fn test_MM_TRANSPOSE4_PS() {
3307	let mut a = _mm_setr_ps(`1.0`, `2.0`, `3.0`, `4.0`);
3308	let mut b = _mm_setr_ps(`5.0`, `6.0`, `7.0`, `8.0`);
3309	let mut c = _mm_setr_ps(`9.0`, `10.0`, `11.0`, `12.0`);
3310	let mut d = _mm_setr_ps(`13.0`, `14.0`, `15.0`, `16.0`);
3311
3312	_MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d);
3313
3314	assert_eq_m128(a, _mm_setr_ps(`1.0`, `5.0`, `9.0`, `13.0`));
3315	assert_eq_m128(b, _mm_setr_ps(`2.0`, `6.0`, `10.0`, `14.0`));
3316	assert_eq_m128(c, _mm_setr_ps(`3.0`, `7.0`, `11.0`, `15.0`));
3317	assert_eq_m128(d, _mm_setr_ps(`4.0`, `8.0`, `12.0`, `16.0`));
3318	}
3319
3320	#[repr(align(`16`))]
3321	struct Memory {
3322	pub data: [f32; `4`],
3323	}
3324
3325	#[simd_test(enable = "sse")]
3326	// Miri cannot support this until it is clear how it fits in the Rust memory model
3327	// (non-temporal store)
3328	#[cfg_attr(miri, ignore)]
3329	unsafe fn test_mm_stream_ps() {
3330	let a = _mm_set1_ps(`7.0`);
3331	let mut mem = Memory { data: [`-1.0`; `4`] };
3332
3333	_mm_stream_ps(ptr::addr_of_mut!(mem.data[`0`]), a);
3334	for i in `0`..`4` {
3335	assert_eq!(mem.data[i], get_m128(a, i));
3336	}
3337	}
3338	}
3339