1//! Streaming SIMD Extensions (SSE)
2
3use crate::{
4 core_arch::{simd::*, x86::*},
5 intrinsics::simd::*,
6 intrinsics::sqrtf32,
7 mem, ptr,
8};
9
10#[cfg(test)]
11use stdarch_test::assert_instr;
12
13/// Adds the first component of `a` and `b`, the other components are copied
14/// from `a`.
15///
16/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss)
17#[inline]
18#[target_feature(enable = "sse")]
19#[cfg_attr(test, assert_instr(addss))]
20#[stable(feature = "simd_x86", since = "1.27.0")]
21pub fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
22 unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) + _mm_cvtss_f32(b)) }
23}
24
25/// Adds packed single-precision (32-bit) floating-point elements in `a` and
26/// `b`.
27///
28/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps)
29#[inline]
30#[target_feature(enable = "sse")]
31#[cfg_attr(test, assert_instr(addps))]
32#[stable(feature = "simd_x86", since = "1.27.0")]
33pub fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
34 unsafe { simd_add(x:a, y:b) }
35}
36
37/// Subtracts the first component of `b` from `a`, the other components are
38/// copied from `a`.
39///
40/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss)
41#[inline]
42#[target_feature(enable = "sse")]
43#[cfg_attr(test, assert_instr(subss))]
44#[stable(feature = "simd_x86", since = "1.27.0")]
45pub fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
46 unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) - _mm_cvtss_f32(b)) }
47}
48
49/// Subtracts packed single-precision (32-bit) floating-point elements in `a` and
50/// `b`.
51///
52/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps)
53#[inline]
54#[target_feature(enable = "sse")]
55#[cfg_attr(test, assert_instr(subps))]
56#[stable(feature = "simd_x86", since = "1.27.0")]
57pub fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
58 unsafe { simd_sub(lhs:a, rhs:b) }
59}
60
61/// Multiplies the first component of `a` and `b`, the other components are
62/// copied from `a`.
63///
64/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss)
65#[inline]
66#[target_feature(enable = "sse")]
67#[cfg_attr(test, assert_instr(mulss))]
68#[stable(feature = "simd_x86", since = "1.27.0")]
69pub fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
70 unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) * _mm_cvtss_f32(b)) }
71}
72
73/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
74/// `b`.
75///
76/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps)
77#[inline]
78#[target_feature(enable = "sse")]
79#[cfg_attr(test, assert_instr(mulps))]
80#[stable(feature = "simd_x86", since = "1.27.0")]
81pub fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
82 unsafe { simd_mul(x:a, y:b) }
83}
84
85/// Divides the first component of `b` by `a`, the other components are
86/// copied from `a`.
87///
88/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss)
89#[inline]
90#[target_feature(enable = "sse")]
91#[cfg_attr(test, assert_instr(divss))]
92#[stable(feature = "simd_x86", since = "1.27.0")]
93pub fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
94 unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) / _mm_cvtss_f32(b)) }
95}
96
97/// Divides packed single-precision (32-bit) floating-point elements in `a` and
98/// `b`.
99///
100/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps)
101#[inline]
102#[target_feature(enable = "sse")]
103#[cfg_attr(test, assert_instr(divps))]
104#[stable(feature = "simd_x86", since = "1.27.0")]
105pub fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
106 unsafe { simd_div(lhs:a, rhs:b) }
107}
108
109/// Returns the square root of the first single-precision (32-bit)
110/// floating-point element in `a`, the other elements are unchanged.
111///
112/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss)
113#[inline]
114#[target_feature(enable = "sse")]
115#[cfg_attr(test, assert_instr(sqrtss))]
116#[stable(feature = "simd_x86", since = "1.27.0")]
117pub fn _mm_sqrt_ss(a: __m128) -> __m128 {
118 unsafe { simd_insert!(a, 0, sqrtf32(_mm_cvtss_f32(a))) }
119}
120
/// Returns the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_ps(a: __m128) -> __m128 {
    // Lane-wise exact square root via the portable `simd_fsqrt` intrinsic,
    // which lowers to `sqrtps` (unlike `_mm_rsqrt_ps`, this is not an
    // approximation).
    unsafe { simd_fsqrt(a) }
}
132
/// Returns the approximate reciprocal of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rcp_ss(a: __m128) -> __m128 {
    // Thin wrapper over the `rcpss` hardware intrinsic; the result is an
    // approximation (see Intel's docs for the guaranteed relative-error
    // bound), not an exact division.
    unsafe { rcpss(a) }
}
144
/// Returns the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rcp_ps(a: __m128) -> __m128 {
    // Thin wrapper over the `rcpps` hardware intrinsic; approximate, not
    // an exact lane-wise division.
    unsafe { rcpps(a) }
}
156
/// Returns the approximate reciprocal square root of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rsqrt_ss(a: __m128) -> __m128 {
    // Thin wrapper over the `rsqrtss` hardware intrinsic; approximate (see
    // Intel's docs for the relative-error bound).
    unsafe { rsqrtss(a) }
}
168
/// Returns the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rsqrt_ps(a: __m128) -> __m128 {
    // Thin wrapper over the `rsqrtps` hardware intrinsic; approximate, not
    // an exact `1.0 / sqrt(x)` per lane.
    unsafe { rsqrtps(a) }
}
180
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the minimum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
    // Delegates to the `minss` hardware intrinsic directly. NOTE(review):
    // `minss` is asymmetric with respect to NaN and -0.0 (it behaves like
    // `if a < b { a } else { b }` per Intel's docs), which is why a generic
    // float-min is not used here.
    unsafe { minss(a, b) }
}
193
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`.
    // (In short: `minps` is not IEEE fmin — when an input is NaN or the
    // inputs are ±0.0, the hardware returns the second operand.)
    unsafe { minps(a, b) }
}
206
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the maximum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
    // Delegates to the `maxss` hardware intrinsic directly; like `minss`,
    // its NaN/±0.0 behavior is asymmetric (see Intel's docs), so a generic
    // float-max is not used.
    unsafe { maxss(a, b) }
}
219
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`.
    // (`maxps` returns the second operand when an input is NaN or when
    // comparing ±0.0, unlike IEEE fmax.)
    unsafe { maxps(a, b) }
}
232
233/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
234///
235/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps)
236#[inline]
237#[target_feature(enable = "sse")]
238// i586 only seems to generate plain `and` instructions, so ignore it.
239#[cfg_attr(
240 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
241 assert_instr(andps)
242)]
243#[stable(feature = "simd_x86", since = "1.27.0")]
244pub fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
245 unsafe {
246 let a: __m128i = mem::transmute(src:a);
247 let b: __m128i = mem::transmute(src:b);
248 mem::transmute(src:simd_and(x:a, y:b))
249 }
250}
251
252/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
253/// elements.
254///
255/// Computes `!a & b` for each bit in `a` and `b`.
256///
257/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps)
258#[inline]
259#[target_feature(enable = "sse")]
260// i586 only seems to generate plain `not` and `and` instructions, so ignore
261// it.
262#[cfg_attr(
263 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
264 assert_instr(andnps)
265)]
266#[stable(feature = "simd_x86", since = "1.27.0")]
267pub fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
268 unsafe {
269 let a: __m128i = mem::transmute(src:a);
270 let b: __m128i = mem::transmute(src:b);
271 let mask: __m128i = mem::transmute(src:i32x4::splat(-1));
272 mem::transmute(src:simd_and(x:simd_xor(mask, a), y:b))
273 }
274}
275
276/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
277///
278/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps)
279#[inline]
280#[target_feature(enable = "sse")]
281// i586 only seems to generate plain `or` instructions, so we ignore it.
282#[cfg_attr(
283 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
284 assert_instr(orps)
285)]
286#[stable(feature = "simd_x86", since = "1.27.0")]
287pub fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
288 unsafe {
289 let a: __m128i = mem::transmute(src:a);
290 let b: __m128i = mem::transmute(src:b);
291 mem::transmute(src:simd_or(x:a, y:b))
292 }
293}
294
295/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
296/// elements.
297///
298/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps)
299#[inline]
300#[target_feature(enable = "sse")]
301// i586 only seems to generate plain `xor` instructions, so we ignore it.
302#[cfg_attr(
303 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
304 assert_instr(xorps)
305)]
306#[stable(feature = "simd_x86", since = "1.27.0")]
307pub fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
308 unsafe {
309 let a: __m128i = mem::transmute(src:a);
310 let b: __m128i = mem::transmute(src:b);
311 mem::transmute(src:simd_xor(x:a, y:b))
312 }
313}
314
315/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
316/// the result will be `0xffffffff` if the two inputs are equal, or `0`
317/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
318///
319/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss)
320#[inline]
321#[target_feature(enable = "sse")]
322#[cfg_attr(test, assert_instr(cmpeqss))]
323#[stable(feature = "simd_x86", since = "1.27.0")]
324pub fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
325 unsafe { cmpss(a, b, imm8:0) }
326}
327
328/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
329/// of the result will be `0xffffffff` if `a.extract(0)` is less than
330/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
331/// upper 96 bits of `a`.
332///
333/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss)
334#[inline]
335#[target_feature(enable = "sse")]
336#[cfg_attr(test, assert_instr(cmpltss))]
337#[stable(feature = "simd_x86", since = "1.27.0")]
338pub fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
339 unsafe { cmpss(a, b, imm8:1) }
340}
341
342/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
343/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
344/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
345/// are the upper 96 bits of `a`.
346///
347/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss)
348#[inline]
349#[target_feature(enable = "sse")]
350#[cfg_attr(test, assert_instr(cmpless))]
351#[stable(feature = "simd_x86", since = "1.27.0")]
352pub fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
353 unsafe { cmpss(a, b, imm8:2) }
354}
355
356/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
357/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
358/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
359/// are the upper 96 bits of `a`.
360///
361/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss)
362#[inline]
363#[target_feature(enable = "sse")]
364#[cfg_attr(test, assert_instr(cmpltss))]
365#[stable(feature = "simd_x86", since = "1.27.0")]
366pub fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
367 unsafe { simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3]) }
368}
369
370/// Compares the lowest `f32` of both inputs for greater than or equal. The
371/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
372/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
373/// of the result are the upper 96 bits of `a`.
374///
375/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss)
376#[inline]
377#[target_feature(enable = "sse")]
378#[cfg_attr(test, assert_instr(cmpless))]
379#[stable(feature = "simd_x86", since = "1.27.0")]
380pub fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
381 unsafe { simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3]) }
382}
383
384/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
385/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
386/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
387/// upper 96 bits of `a`.
388///
389/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss)
390#[inline]
391#[target_feature(enable = "sse")]
392#[cfg_attr(test, assert_instr(cmpneqss))]
393#[stable(feature = "simd_x86", since = "1.27.0")]
394pub fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
395 unsafe { cmpss(a, b, imm8:4) }
396}
397
398/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
399/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
400/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
401/// upper 96 bits of `a`.
402///
403/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss)
404#[inline]
405#[target_feature(enable = "sse")]
406#[cfg_attr(test, assert_instr(cmpnltss))]
407#[stable(feature = "simd_x86", since = "1.27.0")]
408pub fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
409 unsafe { cmpss(a, b, imm8:5) }
410}
411
412/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
413/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
414/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
415/// of the result are the upper 96 bits of `a`.
416///
417/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss)
418#[inline]
419#[target_feature(enable = "sse")]
420#[cfg_attr(test, assert_instr(cmpnless))]
421#[stable(feature = "simd_x86", since = "1.27.0")]
422pub fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
423 unsafe { cmpss(a, b, imm8:6) }
424}
425
426/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
427/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
428/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
429/// the upper 96 bits of `a`.
430///
431/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss)
432#[inline]
433#[target_feature(enable = "sse")]
434#[cfg_attr(test, assert_instr(cmpnltss))]
435#[stable(feature = "simd_x86", since = "1.27.0")]
436pub fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
437 unsafe { simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3]) }
438}
439
440/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
441/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
442/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
443/// bits of the result are the upper 96 bits of `a`.
444///
445/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss)
446#[inline]
447#[target_feature(enable = "sse")]
448#[cfg_attr(test, assert_instr(cmpnless))]
449#[stable(feature = "simd_x86", since = "1.27.0")]
450pub fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
451 unsafe { simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3]) }
452}
453
454/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
455/// the result will be `0xffffffff` if neither of `a.extract(0)` or
456/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
457/// are the upper 96 bits of `a`.
458///
459/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss)
460#[inline]
461#[target_feature(enable = "sse")]
462#[cfg_attr(test, assert_instr(cmpordss))]
463#[stable(feature = "simd_x86", since = "1.27.0")]
464pub fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
465 unsafe { cmpss(a, b, imm8:7) }
466}
467
468/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
469/// of the result will be `0xffffffff` if any of `a.extract(0)` or
470/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
471/// are the upper 96 bits of `a`.
472///
473/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss)
474#[inline]
475#[target_feature(enable = "sse")]
476#[cfg_attr(test, assert_instr(cmpunordss))]
477#[stable(feature = "simd_x86", since = "1.27.0")]
478pub fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
479 unsafe { cmpss(a, b, imm8:3) }
480}
481
482/// Compares each of the four floats in `a` to the corresponding element in `b`.
483/// The result in the output vector will be `0xffffffff` if the input elements
484/// were equal, or `0` otherwise.
485///
486/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps)
487#[inline]
488#[target_feature(enable = "sse")]
489#[cfg_attr(test, assert_instr(cmpeqps))]
490#[stable(feature = "simd_x86", since = "1.27.0")]
491pub fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
492 unsafe { cmpps(a, b, imm8:0) }
493}
494
495/// Compares each of the four floats in `a` to the corresponding element in `b`.
496/// The result in the output vector will be `0xffffffff` if the input element
497/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
498///
499/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps)
500#[inline]
501#[target_feature(enable = "sse")]
502#[cfg_attr(test, assert_instr(cmpltps))]
503#[stable(feature = "simd_x86", since = "1.27.0")]
504pub fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
505 unsafe { cmpps(a, b, imm8:1) }
506}
507
508/// Compares each of the four floats in `a` to the corresponding element in `b`.
509/// The result in the output vector will be `0xffffffff` if the input element
510/// in `a` is less than or equal to the corresponding element in `b`, or `0`
511/// otherwise.
512///
513/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps)
514#[inline]
515#[target_feature(enable = "sse")]
516#[cfg_attr(test, assert_instr(cmpleps))]
517#[stable(feature = "simd_x86", since = "1.27.0")]
518pub fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
519 unsafe { cmpps(a, b, imm8:2) }
520}
521
522/// Compares each of the four floats in `a` to the corresponding element in `b`.
523/// The result in the output vector will be `0xffffffff` if the input element
524/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
525///
526/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps)
527#[inline]
528#[target_feature(enable = "sse")]
529#[cfg_attr(test, assert_instr(cmpltps))]
530#[stable(feature = "simd_x86", since = "1.27.0")]
531pub fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
532 unsafe { cmpps(a:b, b:a, imm8:1) }
533}
534
535/// Compares each of the four floats in `a` to the corresponding element in `b`.
536/// The result in the output vector will be `0xffffffff` if the input element
537/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
538/// otherwise.
539///
540/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps)
541#[inline]
542#[target_feature(enable = "sse")]
543#[cfg_attr(test, assert_instr(cmpleps))]
544#[stable(feature = "simd_x86", since = "1.27.0")]
545pub fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
546 unsafe { cmpps(a:b, b:a, imm8:2) }
547}
548
549/// Compares each of the four floats in `a` to the corresponding element in `b`.
550/// The result in the output vector will be `0xffffffff` if the input elements
551/// are **not** equal, or `0` otherwise.
552///
553/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps)
554#[inline]
555#[target_feature(enable = "sse")]
556#[cfg_attr(test, assert_instr(cmpneqps))]
557#[stable(feature = "simd_x86", since = "1.27.0")]
558pub fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
559 unsafe { cmpps(a, b, imm8:4) }
560}
561
562/// Compares each of the four floats in `a` to the corresponding element in `b`.
563/// The result in the output vector will be `0xffffffff` if the input element
564/// in `a` is **not** less than the corresponding element in `b`, or `0`
565/// otherwise.
566///
567/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps)
568#[inline]
569#[target_feature(enable = "sse")]
570#[cfg_attr(test, assert_instr(cmpnltps))]
571#[stable(feature = "simd_x86", since = "1.27.0")]
572pub fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
573 unsafe { cmpps(a, b, imm8:5) }
574}
575
576/// Compares each of the four floats in `a` to the corresponding element in `b`.
577/// The result in the output vector will be `0xffffffff` if the input element
578/// in `a` is **not** less than or equal to the corresponding element in `b`, or
579/// `0` otherwise.
580///
581/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps)
582#[inline]
583#[target_feature(enable = "sse")]
584#[cfg_attr(test, assert_instr(cmpnleps))]
585#[stable(feature = "simd_x86", since = "1.27.0")]
586pub fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
587 unsafe { cmpps(a, b, imm8:6) }
588}
589
590/// Compares each of the four floats in `a` to the corresponding element in `b`.
591/// The result in the output vector will be `0xffffffff` if the input element
592/// in `a` is **not** greater than the corresponding element in `b`, or `0`
593/// otherwise.
594///
595/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps)
596#[inline]
597#[target_feature(enable = "sse")]
598#[cfg_attr(test, assert_instr(cmpnltps))]
599#[stable(feature = "simd_x86", since = "1.27.0")]
600pub fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
601 unsafe { cmpps(a:b, b:a, imm8:5) }
602}
603
604/// Compares each of the four floats in `a` to the corresponding element in `b`.
605/// The result in the output vector will be `0xffffffff` if the input element
606/// in `a` is **not** greater than or equal to the corresponding element in `b`,
607/// or `0` otherwise.
608///
609/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
610#[inline]
611#[target_feature(enable = "sse")]
612#[cfg_attr(test, assert_instr(cmpnleps))]
613#[stable(feature = "simd_x86", since = "1.27.0")]
614pub fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
615 unsafe { cmpps(a:b, b:a, imm8:6) }
616}
617
618/// Compares each of the four floats in `a` to the corresponding element in `b`.
619/// Returns four floats that have one of two possible bit patterns. The element
620/// in the output vector will be `0xffffffff` if the input elements in `a` and
621/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
622///
623/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
624#[inline]
625#[target_feature(enable = "sse")]
626#[cfg_attr(test, assert_instr(cmpordps))]
627#[stable(feature = "simd_x86", since = "1.27.0")]
628pub fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
629 unsafe { cmpps(a:b, b:a, imm8:7) }
630}
631
632/// Compares each of the four floats in `a` to the corresponding element in `b`.
633/// Returns four floats that have one of two possible bit patterns. The element
634/// in the output vector will be `0xffffffff` if the input elements in `a` and
635/// `b` are unordered (i.e., at least on of them is a NaN), or 0 otherwise.
636///
637/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
638#[inline]
639#[target_feature(enable = "sse")]
640#[cfg_attr(test, assert_instr(cmpunordps))]
641#[stable(feature = "simd_x86", since = "1.27.0")]
642pub fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
643 unsafe { cmpps(a:b, b:a, imm8:3) }
644}
645
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `comiss`-based intrinsic. Unlike the
    // `_mm_ucomi*` variants below, this one is documented as signaling
    // on quiet-NaN inputs.
    unsafe { comieq_ss(a, b) }
}
657
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `comiss`-based intrinsic (signaling variant;
    // compare with `_mm_ucomilt_ss`).
    unsafe { comilt_ss(a, b) }
}
669
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `comiss`-based intrinsic (signaling variant;
    // compare with `_mm_ucomile_ss`).
    unsafe { comile_ss(a, b) }
}
682
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `comiss`-based intrinsic (signaling variant;
    // compare with `_mm_ucomigt_ss`).
    unsafe { comigt_ss(a, b) }
}
695
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `comiss`-based intrinsic (signaling variant).
    unsafe { comige_ss(a, b) }
}
708
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `comiss`-based intrinsic (signaling variant).
    unsafe { comineq_ss(a, b) }
}
720
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise. This instruction will not signal
/// an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `ucomiss`-based intrinsic (quiet / non-signaling
    // counterpart of `_mm_comieq_ss`).
    unsafe { ucomieq_ss(a, b) }
}
733
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
/// This instruction will not signal an exception if either argument is a quiet
/// NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the `ucomiss`-based intrinsic (quiet counterpart
    // of `_mm_comilt_ss`).
    unsafe { ucomilt_ss(a, b) }
}
747
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
    // Forwards to the raw `ucomile_ss` intrinsic binding (declared elsewhere in this file).
    unsafe { ucomile_ss(a, b) }
}
761
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
    // Forwards to the raw `ucomigt_ss` intrinsic binding (declared elsewhere in this file).
    unsafe { ucomigt_ss(a, b) }
}
775
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise. This instruction will not signal an exception if either
/// argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
    // Forwards to the raw `ucomige_ss` intrinsic binding (declared elsewhere in this file).
    unsafe { ucomige_ss(a, b) }
}
789
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise. This instruction will not
/// signal an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
    // Forwards to the raw `ucomineq_ss` intrinsic binding (declared elsewhere in this file).
    unsafe { ucomineq_ss(a, b) }
}
802
/// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
///
/// The result is rounded according to the current rounding mode. If the result
/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
/// (`i32::MIN`).
///
/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtss_si32(a: __m128) -> i32 {
    // Forwards to the raw `cvtss2si` intrinsic binding (declared elsewhere in this file).
    unsafe { cvtss2si(a) }
}
819
/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvt_ss2si(a: __m128) -> i32 {
    // Identical behavior; this name is kept for the classic SSE naming scheme.
    _mm_cvtss_si32(a)
}
830
/// Converts the lowest 32 bit float in the input vector to a 32 bit integer
/// with
/// truncation.
///
/// The result is rounded always using truncation (round towards zero). If the
/// result cannot be represented as a 32 bit integer the result will be
/// `0x8000_0000` (`i32::MIN`).
///
/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttss_si32(a: __m128) -> i32 {
    // Forwards to the raw `cvttss2si` intrinsic binding (declared elsewhere in this file).
    unsafe { cvttss2si(a) }
}
849
/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtt_ss2si(a: __m128) -> i32 {
    // Identical behavior; this name is kept for the classic SSE naming scheme.
    _mm_cvttss_si32(a)
}
860
/// Extracts the lowest 32 bit float from the input vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32)
#[inline]
#[target_feature(enable = "sse")]
// No point in using assert_instrs. In Unix x86_64 calling convention this is a
// no-op, and on msvc it's just a `mov`.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtss_f32(a: __m128) -> f32 {
    // Lane 0 is the lowest 32 bits of the vector.
    unsafe { simd_extract!(a, 0) }
}
872
/// Converts a 32 bit integer to a 32 bit float. The result vector is the input
/// vector `a` with the lowest 32 bit float replaced by the converted integer.
///
/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
/// input).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
    // Forwards to the raw `cvtsi2ss` intrinsic binding (declared elsewhere in this file).
    unsafe { cvtsi2ss(a, b) }
}
887
/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
    // Identical behavior; this name is kept for the classic SSE naming scheme.
    _mm_cvtsi32_ss(a, b)
}
898
/// Construct a `__m128` with the lowest element set to `a` and the rest set to
/// zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_ss(a: f32) -> __m128 {
    // Array element 0 is the lowest lane of the vector.
    __m128([a, 0.0, 0.0, 0.0])
}
910
/// Construct a `__m128` with all element set to `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_ps(a: f32) -> __m128 {
    // Broadcast `a` into all four lanes.
    __m128([a, a, a, a])
}
921
/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_ps1(a: f32) -> __m128 {
    // Identical behavior; this name is kept for the classic SSE naming scheme.
    _mm_set1_ps(a)
}
932
/// Construct a `__m128` from four floating point values highest to lowest.
///
/// Note that `a` will be the highest 32 bits of the result, and `d` the
/// lowest. This matches the standard way of writing bit patterns on x86:
///
/// ```text
/// bit 127 .. 96 95 .. 64 63 .. 32 31 .. 0
/// +---------+---------+---------+---------+
/// | a | b | c | d | result
/// +---------+---------+---------+---------+
/// ```
///
/// Alternatively:
///
/// ```text
/// let v = _mm_set_ps(d, c, b, a);
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    // The array is in memory (lowest-lane-first) order, so the arguments
    // are reversed: `d` ends up in lane 0, `a` in lane 3.
    __m128([d, c, b, a])
}
959
/// Construct a `__m128` from four floating point values lowest to highest.
///
/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
/// bits of the result, and `d` the highest.
///
/// ```text
/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, any(target_env = "msvc", target_arch = "x86_64")),
    assert_instr(unpcklps)
)]
// On a 32-bit architecture on non-msvc it just copies the operands from the stack.
#[cfg_attr(
    all(test, all(not(target_env = "msvc"), target_arch = "x86")),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    // Arguments are already in memory (lowest-lane-first) order.
    __m128([a, b, c, d])
}
985
/// Construct a `__m128` with all elements initialized to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setzero_ps() -> __m128 {
    // The all-zero bit pattern is a valid `__m128` (four `0.0` floats), so a
    // const-evaluated `mem::zeroed()` is sound here.
    const { unsafe { mem::zeroed() } }
}
996
997/// A utility function for creating masks to use with Intel shuffle and
998/// permute intrinsics.
999#[inline]
1000#[allow(non_snake_case)]
1001#[unstable(feature = "stdarch_x86_mm_shuffle", issue = "111147")]
1002pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
1003 ((z << 6) | (y << 4) | (x << 2) | w) as i32
1004}
1005
/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
/// `b` using `MASK`.
///
/// The lower half of result takes values from `a` and the higher half from
/// `b`. Mask is split to 2 control bits each to index the element from inputs.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps)
///
/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
/// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_).
/// Performing an implicit type conversion between an unsigned integer and a signed integer
/// does not cause a problem in C, however Rust's commitment to strong typing does not allow this.
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
    // Only the low 8 bits of MASK are meaningful (four 2-bit selectors).
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        // `simd_shuffle!` indexes lanes of the concatenation [a, b]:
        // 0..=3 select from `a`, 4..=7 select from `b`. The two low result
        // lanes come from `a`, the two high ones from `b` (hence the `+ 4`).
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 4,
                ((MASK as u32 >> 6) & 0b11) + 4,
            ],
        )
    }
}
1039
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the higher half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a2, b2, a3, b3] (indices 4..=7 select from `b`).
    unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) }
}
1051
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the lower half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a0, b0, a1, b1] (indices 4..=7 select from `b`).
    unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) }
}
1063
/// Combine higher half of `a` and `b`. The higher half of `b` occupies the
/// lower half of result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movhlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO; figure why this is a different instruction on msvc?
    // Result lanes: [b2, b3, a2, a3].
    unsafe { simd_shuffle!(a, b, [6, 7, 2, 3]) }
}
1076
/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
/// higher half of result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a0, a1, b0, b1].
    unsafe { simd_shuffle!(a, b, [0, 1, 4, 5]) }
}
1088
1089/// Returns a mask of the most significant bit of each element in `a`.
1090///
1091/// The mask is stored in the 4 least significant bits of the return value.
1092/// All other bits are set to `0`.
1093///
1094/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
1095#[inline]
1096#[target_feature(enable = "sse")]
1097#[cfg_attr(test, assert_instr(movmskps))]
1098#[stable(feature = "simd_x86", since = "1.27.0")]
1099pub fn _mm_movemask_ps(a: __m128) -> i32 {
1100 // Propagate the highest bit to the rest, because simd_bitmask
1101 // requires all-1 or all-0.
1102 unsafe {
1103 let mask: i32x4 = simd_lt(x:transmute(a), y:i32x4::ZERO);
1104 simd_bitmask::<i32x4, u8>(mask).into()
1105 }
1106}
1107
/// Construct a `__m128` with the lowest element read from `p` and the other
/// elements set to zero.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
    // Caller must guarantee `p` is valid for an aligned `f32` read.
    __m128([*p, 0.0, 0.0, 0.0])
}
1121
/// Construct a `__m128` by duplicating the value read from `p` into all
/// elements.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
    // Single scalar read, then broadcast into all four lanes.
    let a: f32 = *p;
    __m128([a, a, a, a])
}
1137
/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
    // Identical behavior; this name is kept for the classic SSE naming scheme.
    _mm_load1_ps(p)
}
1148
/// Loads four `f32` values from *aligned* memory into a `__m128`. If the
/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
    // A plain dereference of a `*const __m128` requires (and here the caller
    // guarantees) 16-byte alignment.
    *(p as *const __m128)
}
1167
1168/// Loads four `f32` values from memory into a `__m128`. There are no
1169/// restrictions
1170/// on memory alignment. For aligned memory
1171/// [`_mm_load_ps`](fn._mm_load_ps.html)
1172/// may be faster.
1173///
1174/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1175///
1176/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps)
1177#[inline]
1178#[target_feature(enable = "sse")]
1179#[cfg_attr(test, assert_instr(movups))]
1180#[stable(feature = "simd_x86", since = "1.27.0")]
1181pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
1182 // Note: Using `*p` would require `f32` alignment, but `movups` has no
1183 // alignment restrictions.
1184 let mut dst: __m128 = _mm_undefined_ps();
1185 ptr::copy_nonoverlapping(
1186 src:p as *const u8,
1187 dst:ptr::addr_of_mut!(dst) as *mut u8,
1188 count:mem::size_of::<__m128>(),
1189 );
1190 dst
1191}
1192
/// Loads four `f32` values from aligned memory into a `__m128` in reverse
/// order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let a0 = *p;
/// let a1 = *p.add(1);
/// let a2 = *p.add(2);
/// let a3 = *p.add(3);
/// __m128::new(a3, a2, a1, a0)
/// ```
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
    // Aligned load, then reverse the four lanes.
    let a: __m128 = _mm_load_ps(p);
    simd_shuffle!(a, a, [3, 2, 1, 0])
}
1222
/// Stores the lowest 32 bit float of `a` into memory.
///
/// This intrinsic corresponds to the `MOVSS` instruction.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
    // Caller must guarantee `p` is valid for an aligned `f32` write.
    *p = simd_extract!(a, 0);
}
1235
/// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
/// memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let x = a.extract(0);
/// *p = x;
/// *p.add(1) = x;
/// *p.add(2) = x;
/// *p.add(3) = x;
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
    // Broadcast lane 0 into all lanes, then do one aligned 16-byte store.
    let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]);
    *(p as *mut __m128) = b;
}
1263
/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
    // Identical behavior; this name is kept for the classic SSE naming scheme.
    _mm_store1_ps(p, a);
}
1274
/// Stores four 32-bit floats into *aligned* memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
    // A plain write through a `*mut __m128` requires (and here the caller
    // guarantees) 16-byte alignment.
    *(p as *mut __m128) = a;
}
1294
1295/// Stores four 32-bit floats into memory. There are no restrictions on memory
1296/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
1297/// faster.
1298///
1299/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1300///
1301/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps)
1302#[inline]
1303#[target_feature(enable = "sse")]
1304#[cfg_attr(test, assert_instr(movups))]
1305#[stable(feature = "simd_x86", since = "1.27.0")]
1306pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
1307 ptr::copy_nonoverlapping(
1308 src:ptr::addr_of!(a) as *const u8,
1309 dst:p as *mut u8,
1310 count:mem::size_of::<__m128>(),
1311 );
1312}
1313
/// Stores four 32-bit floats into *aligned* memory in reverse order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// *p = a.extract(3);
/// *p.add(1) = a.extract(2);
/// *p.add(2) = a.extract(1);
/// *p.add(3) = a.extract(0);
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
    // Reverse the lanes first, then do one aligned 16-byte store.
    let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]);
    *(p as *mut __m128) = b;
}
1339
/// Returns a `__m128` with the first component from `b` and the remaining
/// components from `a`.
///
/// In other words for any `a` and `b`:
/// ```text
/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
    // Index 4 selects lane 0 of `b`; indices 1..=3 keep lanes of `a`.
    unsafe { simd_shuffle!(a, b, [4, 1, 2, 3]) }
}
1356
/// Performs a serializing operation on all non-temporal ("streaming") store instructions that
/// were issued by the current thread prior to this instruction.
///
/// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is
/// ordered before any load or store instruction which follows the fence in
/// synchronization order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence)
/// (but note that Intel is only documenting the hardware-level concerns related to this
/// instruction; the Intel documentation does not take into account the extra concerns that arise
/// because the Rust memory model is different from the x86 memory model.)
///
/// # Safety of non-temporal stores
///
/// After using any non-temporal store intrinsic, but before any other access to the memory that the
/// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the
/// intrinsic.
///
/// Non-temporal stores behave very different from regular stores. For the purpose of the Rust
/// memory model, these stores are happening asynchronously in a background thread. This means a
/// non-temporal store can cause data races with other accesses, even other accesses on the same
/// thread. It also means that cross-thread synchronization does not work as expected: let's say the
/// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The
/// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not
/// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize
/// with all the non-temporal stores previously started on this thread, which means in particular
/// that subsequent synchronization with other threads will then work as intended again.
///
/// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your
/// code jumps back to code outside your library. This ensures all stores inside your function
/// are synchronized-before the return, and thus transitively synchronized-before everything
/// the caller does after your function returns.
//
// The following is not a doc comment since it's not clear whether we want to put this into the
// docs, but it should be written out somewhere.
//
// Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot
// inspect, and that behave like the following functions. This explains where the docs above come
// from.
// ```
// #[thread_local]
// static mut PENDING_NONTEMP_WRITES = AtomicUsize::new(0);
//
// pub unsafe fn nontemporal_store<T>(ptr: *mut T, val: T) {
//     PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed);
//     // Spawn a thread that will eventually do our write.
//     // We need to fetch a pointer to this thread's pending-write
//     // counter, so that we can access it from the background thread.
//     let pending_writes = addr_of!(PENDING_NONTEMP_WRITES);
//     // If this was actual Rust code we'd have to do some extra work
//     // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here.
//     std::thread::spawn(move || {
//         // Do the write in the background thread.
//         ptr.write(val);
//         // Register the write as done. Crucially, this is `Release`, so it
//         // syncs-with the `Acquire in `sfence`.
//         (&*pending_writes).fetch_sub(1, Release);
//     });
// }
//
// pub fn sfence() {
//     unsafe {
//         // Wait until there are no more pending writes.
//         while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {}
//     }
// }
// ```
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sfence() {
    // Forwards to the raw `sfence` intrinsic binding (declared elsewhere in this file).
    sfence()
}
1431
/// Gets the unsigned 32-bit value of the MXCSR control and status register.
///
/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust
/// floating-point operations may or may not result in this register getting updated with exception
/// state, and the register can change between two invocations of this function even when no
/// floating-point operations appear in the source code (since floating-point operations appearing
/// earlier or later can be reordered).
///
/// If you need to perform some floating-point operations and check whether they raised an
/// exception, use an inline assembly block for the entire sequence of operations.
///
/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(stmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[deprecated(
    since = "1.75.0",
    note = "see `_mm_getcsr` documentation - use inline assembly instead"
)]
pub unsafe fn _mm_getcsr() -> u32 {
    unsafe {
        // `stmxcsr` writes the 32-bit MXCSR value through the given pointer,
        // so stage it in a local and return it as `u32`.
        let mut result: i32 = 0_i32;
        stmxcsr(ptr::addr_of_mut!(result) as *mut i8);
        result as u32
    }
}
1461
1462/// Sets the MXCSR register with the 32-bit unsigned integer value.
1463///
1464/// This register controls how SIMD instructions handle floating point
1465/// operations. Modifying this register only affects the current thread.
1466///
1467/// It contains several groups of flags:
1468///
1469/// * *Exception flags* report which exceptions occurred since last they were reset.
1470///
1471/// * *Masking flags* can be used to mask (ignore) certain exceptions. By default
1472/// these flags are all set to 1, so all exceptions are masked. When
1473/// an exception is masked, the processor simply sets the exception flag and
1474/// continues the operation. If the exception is unmasked, the flag is also set
1475/// but additionally an exception handler is invoked.
1476///
1477/// * *Rounding mode flags* control the rounding mode of floating point
1478/// instructions.
1479///
1480/// * The *denormals-are-zero mode flag* turns all numbers which would be
1481/// denormalized (exponent bits are all zeros) into zeros.
1482///
1483/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to
1484/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and
1485/// will optimize accordingly. This even applies when the register is altered and later reset to its
1486/// original value without any floating-point operations appearing in the source code between those
1487/// operations (since floating-point operations appearing earlier or later can be reordered).
1488///
1489/// If you need to perform some floating-point operations under a different masking flags, rounding
1490/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the
1491/// original MXCSR register state before the end of the block.
1492///
1493/// ## Exception Flags
1494///
1495/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
1496/// Infinity by Infinity).
1497///
1498/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
1499/// number. Mainly this can cause loss of precision.
1500///
1501/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
1502///
1503/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
1504/// result was too large to be represented (e.g., an `f32` with absolute
1505/// value greater than `2^128`).
1506///
1507/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
1508/// result was too small to be represented in a normalized way (e.g., an
1509/// `f32` with absolute value smaller than `2^-126`.)
1510///
1511/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
1512/// precision exception). This means some precision was lost due to rounding.
1513/// For example, the fraction `1/3` cannot be represented accurately in a
1514/// 32 or 64 bit float and computing it would cause this exception to be
1515/// raised. Precision exceptions are very common, so they are usually masked.
1516///
1517/// Exception flags can be read and set using the convenience functions
1518/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
1519/// check if an operation caused some overflow:
1520///
1521/// ```rust,ignore
1522/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
1523/// // perform calculations
1524/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
1525/// // handle overflow
1526/// }
1527/// ```
1528///
1529/// ## Masking Flags
1530///
1531/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
1532/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
1533/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
1534///
1535/// A single masking bit can be set via
1536///
1537/// ```rust,ignore
1538/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
1539/// ```
1540///
1541/// However, since mask bits are by default all set to 1, it is more common to
1542/// want to *disable* certain bits. For example, to unmask the underflow
1543/// exception, use:
1544///
1545/// ```rust,ignore
/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception
1548/// ```
1549///
1550/// Warning: an unmasked exception will cause an exception handler to be
1551/// called.
1552/// The standard handler will simply terminate the process. So, in this case
1553/// any underflow exception would terminate the current process with something
1554/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1555///
1556/// ## Rounding Mode
1557///
/// The rounding mode is described using two bits. It can be read and set using
1559/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
1560/// `_MM_SET_ROUNDING_MODE(mode)`.
1561///
1562/// The rounding modes are:
1563///
1564/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
1565/// value. If two values are equally close, round to even (i.e., least
1566/// significant bit will be zero).
1567///
1568/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
1569///
1570/// * `_MM_ROUND_UP`: Round toward positive Infinity.
1571///
1572/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
1573///
1574/// Example:
1575///
1576/// ```rust,ignore
1577/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
1578/// ```
1579///
1580/// ## Denormals-are-zero/Flush-to-zero Mode
1581///
1582/// If this bit is set, values that would be denormalized will be set to zero
1583/// instead. This is turned off by default.
1584///
1585/// You can read and enable/disable this mode via the helper functions
1586/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
1587///
1588/// ```rust,ignore
1589/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
1590/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
1591/// ```
1592///
1593///
1594/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr)
1595#[inline]
1596#[target_feature(enable = "sse")]
1597#[cfg_attr(test, assert_instr(ldmxcsr))]
1598#[stable(feature = "simd_x86", since = "1.27.0")]
1599#[deprecated(
1600 since = "1.75.0",
1601 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1602)]
1603pub unsafe fn _mm_setcsr(val: u32) {
1604 ldmxcsr(ptr::addr_of!(val) as *const i8);
1605}
1606
// Exception status flags: each bit records that the corresponding FP exception
// occurred since the flag was last cleared (low bits of MXCSR).

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
// Union of all six exception status bits above.
pub const _MM_EXCEPT_MASK: u32 = 0x003f;

// Exception masking flags: a set bit suppresses (masks) the corresponding
// exception; all are set by default.

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INVALID: u32 = 0x0080;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DENORM: u32 = 0x0100;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INEXACT: u32 = 0x1000;
/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
// Union of all six masking bits above.
pub const _MM_MASK_MASK: u32 = 0x1f80;

// Rounding-mode selectors: a two-bit field of MXCSR.

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_NEAREST: u32 = 0x0000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_DOWN: u32 = 0x2000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_UP: u32 = 0x4000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;

/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_MASK: u32 = 0x6000;

// Flush-to-zero mode: a single bit of MXCSR.

/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
1677
1678/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1679///
1680/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK)
1681#[inline]
1682#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1683#[allow(non_snake_case)]
1684#[target_feature(enable = "sse")]
1685#[stable(feature = "simd_x86", since = "1.27.0")]
1686#[deprecated(
1687 since = "1.75.0",
1688 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1689)]
1690pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
1691 _mm_getcsr() & _MM_MASK_MASK
1692}
1693
1694/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1695///
1696/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE)
1697#[inline]
1698#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1699#[allow(non_snake_case)]
1700#[target_feature(enable = "sse")]
1701#[stable(feature = "simd_x86", since = "1.27.0")]
1702#[deprecated(
1703 since = "1.75.0",
1704 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1705)]
1706pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
1707 _mm_getcsr() & _MM_EXCEPT_MASK
1708}
1709
1710/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1711///
1712/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE)
1713#[inline]
1714#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1715#[allow(non_snake_case)]
1716#[target_feature(enable = "sse")]
1717#[stable(feature = "simd_x86", since = "1.27.0")]
1718#[deprecated(
1719 since = "1.75.0",
1720 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1721)]
1722pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
1723 _mm_getcsr() & _MM_FLUSH_ZERO_MASK
1724}
1725
1726/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1727///
1728/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE)
1729#[inline]
1730#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1731#[allow(non_snake_case)]
1732#[target_feature(enable = "sse")]
1733#[stable(feature = "simd_x86", since = "1.27.0")]
1734#[deprecated(
1735 since = "1.75.0",
1736 note = "see `_mm_getcsr` documentation - use inline assembly instead"
1737)]
1738pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
1739 _mm_getcsr() & _MM_ROUND_MASK
1740}
1741
1742/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1743///
1744/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK)
1745#[inline]
1746#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1747#[allow(non_snake_case)]
1748#[target_feature(enable = "sse")]
1749#[stable(feature = "simd_x86", since = "1.27.0")]
1750#[deprecated(
1751 since = "1.75.0",
1752 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1753)]
1754pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
1755 _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | (x & _MM_MASK_MASK))
1756}
1757
1758/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1759///
1760/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE)
1761#[inline]
1762#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1763#[allow(non_snake_case)]
1764#[target_feature(enable = "sse")]
1765#[stable(feature = "simd_x86", since = "1.27.0")]
1766#[deprecated(
1767 since = "1.75.0",
1768 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1769)]
1770pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
1771 _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | (x & _MM_EXCEPT_MASK))
1772}
1773
1774/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1775///
1776/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE)
1777#[inline]
1778#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1779#[allow(non_snake_case)]
1780#[target_feature(enable = "sse")]
1781#[stable(feature = "simd_x86", since = "1.27.0")]
1782#[deprecated(
1783 since = "1.75.0",
1784 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1785)]
1786pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
1787 _mm_setcsr((_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | (x & _MM_FLUSH_ZERO_MASK))
1788}
1789
1790/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1791///
1792/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE)
1793#[inline]
1794#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1795#[allow(non_snake_case)]
1796#[target_feature(enable = "sse")]
1797#[stable(feature = "simd_x86", since = "1.27.0")]
1798#[deprecated(
1799 since = "1.75.0",
1800 note = "see `_mm_setcsr` documentation - use inline assembly instead"
1801)]
1802pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
1803 _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | (x & _MM_ROUND_MASK))
1804}
1805
// Prefetch STRATEGY encoding (see `_mm_prefetch`): bit 2 selects the `rw`
// hint (0 = read, 1 = write) and bits 0..=1 the `locality` level passed to
// `llvm.prefetch`.

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T0: i32 = 3;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T1: i32 = 2;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T2: i32 = 1;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_NTA: i32 = 0;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
// Same locality as T0 but with the write (bit 2) hint set.
pub const _MM_HINT_ET0: i32 = 7;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
// Same locality as T1 but with the write (bit 2) hint set.
pub const _MM_HINT_ET1: i32 = 6;
1829
1830/// Fetch the cache line that contains address `p` using the given `STRATEGY`.
1831///
1832/// The `STRATEGY` must be one of:
1833///
1834/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
1835/// cache hierarchy.
1836///
1837/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
1838///
1839/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
1840/// an implementation-specific choice (e.g., L2 if there is no L3).
1841///
1842/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
1843/// non-temporal access (NTA) hint. It may be a place closer than main memory
1844/// but outside of the cache hierarchy. This is used to reduce access latency
1845/// without polluting the cache.
1846///
1847/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
1848/// [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
1849/// and `_MM_HINT_T1` but indicate an anticipation to write to the address.
1850///
1851/// The actual implementation depends on the particular CPU. This instruction
1852/// is considered a hint, so the CPU is also free to simply ignore the request.
1853///
1854/// The amount of prefetched data depends on the cache line size of the
1855/// specific CPU, but it will be at least 32 bytes.
1856///
1857/// Common caveats:
1858///
1859/// * Most modern CPUs already automatically prefetch data based on predicted
1860/// access patterns.
1861///
1862/// * Data is usually not fetched if this would cause a TLB miss or a page
1863/// fault.
1864///
1865/// * Too much prefetching can cause unnecessary cache evictions.
1866///
1867/// * Prefetching may also fail if there are not enough memory-subsystem
1868/// resources (e.g., request buffers).
1869///
1870///
1871/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
1872#[inline]
1873#[target_feature(enable = "sse")]
1874#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
1875#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
1876#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
1877#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
1878#[rustc_legacy_const_generics(1)]
1879#[stable(feature = "simd_x86", since = "1.27.0")]
1880pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
1881 static_assert_uimm_bits!(STRATEGY, 3);
1882 // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
1883 // `locality` and `rw` are based on our `STRATEGY`.
1884 prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, ty:1);
1885}
1886
1887/// Returns vector of type __m128 with indeterminate elements.
1888/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
1889/// In practice, this is equivalent to [`mem::zeroed`].
1890///
1891/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
1892#[inline]
1893#[target_feature(enable = "sse")]
1894#[stable(feature = "simd_x86", since = "1.27.0")]
1895pub fn _mm_undefined_ps() -> __m128 {
1896 const { unsafe { mem::zeroed() } }
1897}
1898
1899/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
1900///
1901/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS)
1902#[inline]
1903#[allow(non_snake_case)]
1904#[target_feature(enable = "sse")]
1905#[stable(feature = "simd_x86", since = "1.27.0")]
1906pub fn _MM_TRANSPOSE4_PS(
1907 row0: &mut __m128,
1908 row1: &mut __m128,
1909 row2: &mut __m128,
1910 row3: &mut __m128,
1911) {
1912 let tmp0: __m128 = _mm_unpacklo_ps(*row0, *row1);
1913 let tmp2: __m128 = _mm_unpacklo_ps(*row2, *row3);
1914 let tmp1: __m128 = _mm_unpackhi_ps(*row0, *row1);
1915 let tmp3: __m128 = _mm_unpackhi_ps(*row2, *row3);
1916
1917 *row0 = _mm_movelh_ps(a:tmp0, b:tmp2);
1918 *row1 = _mm_movehl_ps(a:tmp2, b:tmp0);
1919 *row2 = _mm_movelh_ps(a:tmp1, b:tmp3);
1920 *row3 = _mm_movehl_ps(a:tmp3, b:tmp1);
1921}
1922
1923#[allow(improper_ctypes)]
1924unsafe extern "C" {
1925 #[link_name = "llvm.x86.sse.rcp.ss"]
1926 unsafefn rcpss(a: __m128) -> __m128;
1927 #[link_name = "llvm.x86.sse.rcp.ps"]
1928 unsafefn rcpps(a: __m128) -> __m128;
1929 #[link_name = "llvm.x86.sse.rsqrt.ss"]
1930 unsafefn rsqrtss(a: __m128) -> __m128;
1931 #[link_name = "llvm.x86.sse.rsqrt.ps"]
1932 unsafefn rsqrtps(a: __m128) -> __m128;
1933 #[link_name = "llvm.x86.sse.min.ss"]
1934 unsafefn minss(a: __m128, b: __m128) -> __m128;
1935 #[link_name = "llvm.x86.sse.min.ps"]
1936 unsafefn minps(a: __m128, b: __m128) -> __m128;
1937 #[link_name = "llvm.x86.sse.max.ss"]
1938 unsafefn maxss(a: __m128, b: __m128) -> __m128;
1939 #[link_name = "llvm.x86.sse.max.ps"]
1940 unsafefn maxps(a: __m128, b: __m128) -> __m128;
1941 #[link_name = "llvm.x86.sse.cmp.ps"]
1942 unsafefn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
1943 #[link_name = "llvm.x86.sse.comieq.ss"]
1944 unsafefn comieq_ss(a: __m128, b: __m128) -> i32;
1945 #[link_name = "llvm.x86.sse.comilt.ss"]
1946 unsafefn comilt_ss(a: __m128, b: __m128) -> i32;
1947 #[link_name = "llvm.x86.sse.comile.ss"]
1948 unsafefn comile_ss(a: __m128, b: __m128) -> i32;
1949 #[link_name = "llvm.x86.sse.comigt.ss"]
1950 unsafefn comigt_ss(a: __m128, b: __m128) -> i32;
1951 #[link_name = "llvm.x86.sse.comige.ss"]
1952 unsafefn comige_ss(a: __m128, b: __m128) -> i32;
1953 #[link_name = "llvm.x86.sse.comineq.ss"]
1954 unsafefn comineq_ss(a: __m128, b: __m128) -> i32;
1955 #[link_name = "llvm.x86.sse.ucomieq.ss"]
1956 unsafefn ucomieq_ss(a: __m128, b: __m128) -> i32;
1957 #[link_name = "llvm.x86.sse.ucomilt.ss"]
1958 unsafefn ucomilt_ss(a: __m128, b: __m128) -> i32;
1959 #[link_name = "llvm.x86.sse.ucomile.ss"]
1960 unsafefn ucomile_ss(a: __m128, b: __m128) -> i32;
1961 #[link_name = "llvm.x86.sse.ucomigt.ss"]
1962 unsafefn ucomigt_ss(a: __m128, b: __m128) -> i32;
1963 #[link_name = "llvm.x86.sse.ucomige.ss"]
1964 unsafefn ucomige_ss(a: __m128, b: __m128) -> i32;
1965 #[link_name = "llvm.x86.sse.ucomineq.ss"]
1966 unsafefn ucomineq_ss(a: __m128, b: __m128) -> i32;
1967 #[link_name = "llvm.x86.sse.cvtss2si"]
1968 unsafefn cvtss2si(a: __m128) -> i32;
1969 #[link_name = "llvm.x86.sse.cvttss2si"]
1970 unsafefn cvttss2si(a: __m128) -> i32;
1971 #[link_name = "llvm.x86.sse.cvtsi2ss"]
1972 unsafefn cvtsi2ss(a: __m128, b: i32) -> __m128;
1973 #[link_name = "llvm.x86.sse.sfence"]
1974 unsafefn sfence();
1975 #[link_name = "llvm.x86.sse.stmxcsr"]
1976 unsafefn stmxcsr(p: *mut i8);
1977 #[link_name = "llvm.x86.sse.ldmxcsr"]
1978 unsafefn ldmxcsr(p: *const i8);
1979 #[link_name = "llvm.prefetch"]
1980 unsafefn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
1981 #[link_name = "llvm.x86.sse.cmp.ss"]
1982 unsafefn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
1983}
1984
/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
///
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception _may_ be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
    // Emitted as inline asm rather than an LLVM store so the compiler cannot
    // reorder or merge the non-temporal store with ordinary memory accesses.
    // `vps!` presumably builds the `movntps [{p}], {a}` operand string with a
    // memory operand derived from `{p}` — confirm against the macro definition.
    crate::arch::asm!(
        vps!("movntps", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}
2013
2014#[cfg(test)]
2015mod tests {
2016 use crate::{hint::black_box, mem::transmute, ptr};
2017 use std::boxed;
2018 use stdarch_test::simd_test;
2019
2020 use crate::core_arch::{simd::*, x86::*};
2021
2022 const NAN: f32 = f32::NAN;
2023
2024 #[simd_test(enable = "sse")]
2025 unsafe fn test_mm_add_ps() {
2026 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2027 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2028 let r = _mm_add_ps(a, b);
2029 assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
2030 }
2031
2032 #[simd_test(enable = "sse")]
2033 unsafe fn test_mm_add_ss() {
2034 let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
2035 let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
2036 let r = _mm_add_ss(a, b);
2037 assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
2038 }
2039
2040 #[simd_test(enable = "sse")]
2041 unsafe fn test_mm_sub_ps() {
2042 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2043 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2044 let r = _mm_sub_ps(a, b);
2045 assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
2046 }
2047
2048 #[simd_test(enable = "sse")]
2049 unsafe fn test_mm_sub_ss() {
2050 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2051 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2052 let r = _mm_sub_ss(a, b);
2053 assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
2054 }
2055
2056 #[simd_test(enable = "sse")]
2057 unsafe fn test_mm_mul_ps() {
2058 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2059 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2060 let r = _mm_mul_ps(a, b);
2061 assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
2062 }
2063
2064 #[simd_test(enable = "sse")]
2065 unsafe fn test_mm_mul_ss() {
2066 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2067 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2068 let r = _mm_mul_ss(a, b);
2069 assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
2070 }
2071
2072 #[simd_test(enable = "sse")]
2073 unsafe fn test_mm_div_ps() {
2074 let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
2075 let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
2076 let r = _mm_div_ps(a, b);
2077 assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
2078 }
2079
2080 #[simd_test(enable = "sse")]
2081 unsafe fn test_mm_div_ss() {
2082 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2083 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2084 let r = _mm_div_ss(a, b);
2085 assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
2086 }
2087
2088 #[simd_test(enable = "sse")]
2089 unsafe fn test_mm_sqrt_ss() {
2090 let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2091 let r = _mm_sqrt_ss(a);
2092 let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
2093 assert_eq_m128(r, e);
2094 }
2095
2096 #[simd_test(enable = "sse")]
2097 unsafe fn test_mm_sqrt_ps() {
2098 let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2099 let r = _mm_sqrt_ps(a);
2100 let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
2101 assert_eq_m128(r, e);
2102 }
2103
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rcp_ss() {
        // Approximate reciprocal of lane 0 only; lanes 1..=3 must pass
        // through from `a` unchanged, hence the exact equality below.
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rcp_ss(a);
        let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
        // 2^-11: the approximation tolerance used throughout this suite.
        let rel_err = 0.00048828125;
        assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err);
        for i in 1..4 {
            assert_eq!(get_m128(r, i), get_m128(e, i));
        }
    }
2115
2116 #[simd_test(enable = "sse")]
2117 unsafe fn test_mm_rcp_ps() {
2118 let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2119 let r = _mm_rcp_ps(a);
2120 let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
2121 let rel_err = 0.00048828125;
2122 for i in 0..4 {
2123 assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2124 }
2125 }
2126
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rsqrt_ss() {
        // Approximate 1/sqrt of lane 0; lanes 1..=3 pass through from `a`
        // exactly, so the approximate comparison also holds for them.
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rsqrt_ss(a);
        let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
        // 2^-11: the approximation tolerance used throughout this suite.
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }
2137
2138 #[simd_test(enable = "sse")]
2139 unsafe fn test_mm_rsqrt_ps() {
2140 let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2141 let r = _mm_rsqrt_ps(a);
2142 let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
2143 let rel_err = 0.00048828125;
2144 for i in 0..4 {
2145 assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2146 }
2147 }
2148
2149 #[simd_test(enable = "sse")]
2150 unsafe fn test_mm_min_ss() {
2151 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2152 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2153 let r = _mm_min_ss(a, b);
2154 assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
2155 }
2156
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_min_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_min_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));

        // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
        // is lowered by the llvm codegen backend to the `llvm.minnum.v*` llvm intrinsic, which
        // doesn't specify how -0.0 is handled and happens to behave differently from the `minps`
        // x86 instruction. The assertions below pin the operand-order-dependent `minps` behavior
        // bitwise: with `a = -0.0` and `b = +0.0` (which compare equal), `min(a, b)` is bitwise
        // `b` and `min(b, a)` is bitwise `a` — i.e. the second operand is returned.
        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_min_ps(a, b));
        let r2: [u8; 16] = transmute(_mm_min_ps(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }
2179
2180 #[simd_test(enable = "sse")]
2181 unsafe fn test_mm_max_ss() {
2182 let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2183 let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2184 let r = _mm_max_ss(a, b);
2185 assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
2186 }
2187
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_max_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_max_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));

        // Check SSE-specific semantics for -0.0 handling: as with `minps`, when the operands
        // compare equal (+0.0 == -0.0) the second operand is returned, so the result is bitwise
        // the second argument in both call orders below.
        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_max_ps(a, b));
        let r2: [u8; 16] = transmute(_mm_max_ps(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }
2206
2207 #[simd_test(enable = "sse")]
2208 unsafe fn test_mm_and_ps() {
2209 let a = transmute(u32x4::splat(0b0011));
2210 let b = transmute(u32x4::splat(0b0101));
2211 let r = _mm_and_ps(*black_box(&a), *black_box(&b));
2212 let e = transmute(u32x4::splat(0b0001));
2213 assert_eq_m128(r, e);
2214 }
2215
2216 #[simd_test(enable = "sse")]
2217 unsafe fn test_mm_andnot_ps() {
2218 let a = transmute(u32x4::splat(0b0011));
2219 let b = transmute(u32x4::splat(0b0101));
2220 let r = _mm_andnot_ps(*black_box(&a), *black_box(&b));
2221 let e = transmute(u32x4::splat(0b0100));
2222 assert_eq_m128(r, e);
2223 }
2224
2225 #[simd_test(enable = "sse")]
2226 unsafe fn test_mm_or_ps() {
2227 let a = transmute(u32x4::splat(0b0011));
2228 let b = transmute(u32x4::splat(0b0101));
2229 let r = _mm_or_ps(*black_box(&a), *black_box(&b));
2230 let e = transmute(u32x4::splat(0b0111));
2231 assert_eq_m128(r, e);
2232 }
2233
2234 #[simd_test(enable = "sse")]
2235 unsafe fn test_mm_xor_ps() {
2236 let a = transmute(u32x4::splat(0b0011));
2237 let b = transmute(u32x4::splat(0b0101));
2238 let r = _mm_xor_ps(*black_box(&a), *black_box(&b));
2239 let e = transmute(u32x4::splat(0b0110));
2240 assert_eq_m128(r, e);
2241 }
2242
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpeq_ss() {
        // Lane 0 becomes an all-zeros / all-ones comparison mask; lanes 1..=3
        // are copied from `a`, so bit patterns are compared via `u32x4`.
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
        let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
        let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0));
        assert_eq!(r, e);

        // Equal lane-0 inputs must produce an all-ones mask.
        let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
        let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0));
        assert_eq!(r2, e2);
    }
2256
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmplt_ss() {
        // Lane 0 is an all-ones mask iff a[0] < rhs[0]; lanes 1..=3 are
        // copied from `a`. Bit patterns are compared via `u32x4` transmutes.
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) < b.extract(0)
        let c1 = 0u32; // a.extract(0) < c.extract(0)
        let d1 = !0u32; // a.extract(0) < d.extract(0)

        let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2280
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmple_ss() {
        // Lane 0 is an all-ones mask iff a[0] <= rhs[0]; lanes 1..=3 are
        // copied from `a`. Bit patterns are compared via `u32x4` transmutes.
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) <= b.extract(0)
        let c1 = !0u32; // a.extract(0) <= c.extract(0)
        let d1 = !0u32; // a.extract(0) <= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2304
2305 #[simd_test(enable = "sse")]
2306 unsafe fn test_mm_cmpgt_ss() {
2307 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2308 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2309 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2310 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2311
2312 let b1 = !0u32; // a.extract(0) > b.extract(0)
2313 let c1 = 0u32; // a.extract(0) > c.extract(0)
2314 let d1 = 0u32; // a.extract(0) > d.extract(0)
2315
2316 let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
2317 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2318 assert_eq!(rb, eb);
2319
2320 let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
2321 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2322 assert_eq!(rc, ec);
2323
2324 let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
2325 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2326 assert_eq!(rd, ed);
2327 }
2328
2329 #[simd_test(enable = "sse")]
2330 unsafe fn test_mm_cmpge_ss() {
2331 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2332 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2333 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2334 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2335
2336 let b1 = !0u32; // a.extract(0) >= b.extract(0)
2337 let c1 = !0u32; // a.extract(0) >= c.extract(0)
2338 let d1 = 0u32; // a.extract(0) >= d.extract(0)
2339
2340 let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
2341 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2342 assert_eq!(rb, eb);
2343
2344 let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
2345 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2346 assert_eq!(rc, ec);
2347
2348 let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
2349 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2350 assert_eq!(rd, ed);
2351 }
2352
2353 #[simd_test(enable = "sse")]
2354 unsafe fn test_mm_cmpneq_ss() {
2355 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2356 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2357 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2358 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2359
2360 let b1 = !0u32; // a.extract(0) != b.extract(0)
2361 let c1 = 0u32; // a.extract(0) != c.extract(0)
2362 let d1 = !0u32; // a.extract(0) != d.extract(0)
2363
2364 let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
2365 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2366 assert_eq!(rb, eb);
2367
2368 let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
2369 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2370 assert_eq!(rc, ec);
2371
2372 let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
2373 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2374 assert_eq!(rd, ed);
2375 }
2376
2377 #[simd_test(enable = "sse")]
2378 unsafe fn test_mm_cmpnlt_ss() {
2379 // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there
2380 // must be a difference. It may have to do with behavior in the
2381 // presence of NaNs (signaling or quiet). If so, we should add tests
2382 // for those.
2383
2384 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2385 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2386 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2387 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2388
2389 let b1 = !0u32; // a.extract(0) >= b.extract(0)
2390 let c1 = !0u32; // a.extract(0) >= c.extract(0)
2391 let d1 = 0u32; // a.extract(0) >= d.extract(0)
2392
2393 let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
2394 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2395 assert_eq!(rb, eb);
2396
2397 let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
2398 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2399 assert_eq!(rc, ec);
2400
2401 let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
2402 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2403 assert_eq!(rd, ed);
2404 }
2405
2406 #[simd_test(enable = "sse")]
2407 unsafe fn test_mm_cmpnle_ss() {
2408 // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
2409 // must be a difference. It may have to do with behavior in the
2410 // presence
2411 // of NaNs (signaling or quiet). If so, we should add tests for those.
2412
2413 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2414 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2415 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2416 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2417
2418 let b1 = !0u32; // a.extract(0) > b.extract(0)
2419 let c1 = 0u32; // a.extract(0) > c.extract(0)
2420 let d1 = 0u32; // a.extract(0) > d.extract(0)
2421
2422 let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
2423 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2424 assert_eq!(rb, eb);
2425
2426 let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
2427 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2428 assert_eq!(rc, ec);
2429
2430 let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
2431 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2432 assert_eq!(rd, ed);
2433 }
2434
2435 #[simd_test(enable = "sse")]
2436 unsafe fn test_mm_cmpngt_ss() {
2437 // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there
2438 // must be a difference. It may have to do with behavior in the
2439 // presence of NaNs (signaling or quiet). If so, we should add tests
2440 // for those.
2441
2442 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2443 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2444 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2445 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2446
2447 let b1 = 0u32; // a.extract(0) <= b.extract(0)
2448 let c1 = !0u32; // a.extract(0) <= c.extract(0)
2449 let d1 = !0u32; // a.extract(0) <= d.extract(0)
2450
2451 let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
2452 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2453 assert_eq!(rb, eb);
2454
2455 let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
2456 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2457 assert_eq!(rc, ec);
2458
2459 let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
2460 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2461 assert_eq!(rd, ed);
2462 }
2463
2464 #[simd_test(enable = "sse")]
2465 unsafe fn test_mm_cmpnge_ss() {
2466 // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there
2467 // must be a difference. It may have to do with behavior in the
2468 // presence of NaNs (signaling or quiet). If so, we should add tests
2469 // for those.
2470
2471 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2472 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2473 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2474 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2475
2476 let b1 = 0u32; // a.extract(0) < b.extract(0)
2477 let c1 = 0u32; // a.extract(0) < c.extract(0)
2478 let d1 = !0u32; // a.extract(0) < d.extract(0)
2479
2480 let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
2481 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2482 assert_eq!(rb, eb);
2483
2484 let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
2485 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2486 assert_eq!(rc, ec);
2487
2488 let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
2489 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2490 assert_eq!(rd, ed);
2491 }
2492
2493 #[simd_test(enable = "sse")]
2494 unsafe fn test_mm_cmpord_ss() {
2495 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2496 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2497 let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2498 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2499
2500 let b1 = !0u32; // a.extract(0) ord b.extract(0)
2501 let c1 = 0u32; // a.extract(0) ord c.extract(0)
2502 let d1 = !0u32; // a.extract(0) ord d.extract(0)
2503
2504 let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
2505 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2506 assert_eq!(rb, eb);
2507
2508 let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
2509 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2510 assert_eq!(rc, ec);
2511
2512 let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
2513 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2514 assert_eq!(rd, ed);
2515 }
2516
2517 #[simd_test(enable = "sse")]
2518 unsafe fn test_mm_cmpunord_ss() {
2519 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2520 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2521 let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2522 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2523
2524 let b1 = 0u32; // a.extract(0) unord b.extract(0)
2525 let c1 = !0u32; // a.extract(0) unord c.extract(0)
2526 let d1 = 0u32; // a.extract(0) unord d.extract(0)
2527
2528 let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
2529 let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2530 assert_eq!(rb, eb);
2531
2532 let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
2533 let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2534 assert_eq!(rc, ec);
2535
2536 let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
2537 let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2538 assert_eq!(rd, ed);
2539 }
2540
2541 #[simd_test(enable = "sse")]
2542 unsafe fn test_mm_cmpeq_ps() {
2543 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2544 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2545 let tru = !0u32;
2546 let fls = 0u32;
2547
2548 let e = u32x4::new(fls, fls, tru, fls);
2549 let r: u32x4 = transmute(_mm_cmpeq_ps(a, b));
2550 assert_eq!(r, e);
2551 }
2552
2553 #[simd_test(enable = "sse")]
2554 unsafe fn test_mm_cmplt_ps() {
2555 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2556 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2557 let tru = !0u32;
2558 let fls = 0u32;
2559
2560 let e = u32x4::new(tru, fls, fls, fls);
2561 let r: u32x4 = transmute(_mm_cmplt_ps(a, b));
2562 assert_eq!(r, e);
2563 }
2564
2565 #[simd_test(enable = "sse")]
2566 unsafe fn test_mm_cmple_ps() {
2567 let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
2568 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2569 let tru = !0u32;
2570 let fls = 0u32;
2571
2572 let e = u32x4::new(tru, fls, tru, fls);
2573 let r: u32x4 = transmute(_mm_cmple_ps(a, b));
2574 assert_eq!(r, e);
2575 }
2576
2577 #[simd_test(enable = "sse")]
2578 unsafe fn test_mm_cmpgt_ps() {
2579 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2580 let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2581 let tru = !0u32;
2582 let fls = 0u32;
2583
2584 let e = u32x4::new(fls, tru, fls, fls);
2585 let r: u32x4 = transmute(_mm_cmpgt_ps(a, b));
2586 assert_eq!(r, e);
2587 }
2588
2589 #[simd_test(enable = "sse")]
2590 unsafe fn test_mm_cmpge_ps() {
2591 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2592 let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2593 let tru = !0u32;
2594 let fls = 0u32;
2595
2596 let e = u32x4::new(fls, tru, tru, fls);
2597 let r: u32x4 = transmute(_mm_cmpge_ps(a, b));
2598 assert_eq!(r, e);
2599 }
2600
2601 #[simd_test(enable = "sse")]
2602 unsafe fn test_mm_cmpneq_ps() {
2603 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2604 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2605 let tru = !0u32;
2606 let fls = 0u32;
2607
2608 let e = u32x4::new(tru, tru, fls, tru);
2609 let r: u32x4 = transmute(_mm_cmpneq_ps(a, b));
2610 assert_eq!(r, e);
2611 }
2612
2613 #[simd_test(enable = "sse")]
2614 unsafe fn test_mm_cmpnlt_ps() {
2615 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2616 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2617 let tru = !0u32;
2618 let fls = 0u32;
2619
2620 let e = u32x4::new(fls, tru, tru, tru);
2621 let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b));
2622 assert_eq!(r, e);
2623 }
2624
2625 #[simd_test(enable = "sse")]
2626 unsafe fn test_mm_cmpnle_ps() {
2627 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2628 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2629 let tru = !0u32;
2630 let fls = 0u32;
2631
2632 let e = u32x4::new(fls, tru, fls, tru);
2633 let r: u32x4 = transmute(_mm_cmpnle_ps(a, b));
2634 assert_eq!(r, e);
2635 }
2636
2637 #[simd_test(enable = "sse")]
2638 unsafe fn test_mm_cmpngt_ps() {
2639 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2640 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2641 let tru = !0u32;
2642 let fls = 0u32;
2643
2644 let e = u32x4::new(tru, fls, tru, tru);
2645 let r: u32x4 = transmute(_mm_cmpngt_ps(a, b));
2646 assert_eq!(r, e);
2647 }
2648
2649 #[simd_test(enable = "sse")]
2650 unsafe fn test_mm_cmpnge_ps() {
2651 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2652 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2653 let tru = !0u32;
2654 let fls = 0u32;
2655
2656 let e = u32x4::new(tru, fls, fls, tru);
2657 let r: u32x4 = transmute(_mm_cmpnge_ps(a, b));
2658 assert_eq!(r, e);
2659 }
2660
2661 #[simd_test(enable = "sse")]
2662 unsafe fn test_mm_cmpord_ps() {
2663 let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2664 let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2665 let tru = !0u32;
2666 let fls = 0u32;
2667
2668 let e = u32x4::new(tru, fls, fls, fls);
2669 let r: u32x4 = transmute(_mm_cmpord_ps(a, b));
2670 assert_eq!(r, e);
2671 }
2672
2673 #[simd_test(enable = "sse")]
2674 unsafe fn test_mm_cmpunord_ps() {
2675 let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2676 let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2677 let tru = !0u32;
2678 let fls = 0u32;
2679
2680 let e = u32x4::new(fls, tru, tru, tru);
2681 let r: u32x4 = transmute(_mm_cmpunord_ps(a, b));
2682 assert_eq!(r, e);
2683 }
2684
2685 #[simd_test(enable = "sse")]
2686 unsafe fn test_mm_comieq_ss() {
2687 let aa = &[3.0f32, 12.0, 23.0, NAN];
2688 let bb = &[3.0f32, 47.5, 1.5, NAN];
2689
2690 let ee = &[1i32, 0, 0, 0];
2691
2692 for i in 0..4 {
2693 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2694 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2695
2696 let r = _mm_comieq_ss(a, b);
2697
2698 assert_eq!(
2699 ee[i], r,
2700 "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2701 a, b, r, ee[i], i
2702 );
2703 }
2704 }
2705
2706 #[simd_test(enable = "sse")]
2707 unsafe fn test_mm_comilt_ss() {
2708 let aa = &[3.0f32, 12.0, 23.0, NAN];
2709 let bb = &[3.0f32, 47.5, 1.5, NAN];
2710
2711 let ee = &[0i32, 1, 0, 0];
2712
2713 for i in 0..4 {
2714 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2715 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2716
2717 let r = _mm_comilt_ss(a, b);
2718
2719 assert_eq!(
2720 ee[i], r,
2721 "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2722 a, b, r, ee[i], i
2723 );
2724 }
2725 }
2726
2727 #[simd_test(enable = "sse")]
2728 unsafe fn test_mm_comile_ss() {
2729 let aa = &[3.0f32, 12.0, 23.0, NAN];
2730 let bb = &[3.0f32, 47.5, 1.5, NAN];
2731
2732 let ee = &[1i32, 1, 0, 0];
2733
2734 for i in 0..4 {
2735 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2736 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2737
2738 let r = _mm_comile_ss(a, b);
2739
2740 assert_eq!(
2741 ee[i], r,
2742 "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2743 a, b, r, ee[i], i
2744 );
2745 }
2746 }
2747
2748 #[simd_test(enable = "sse")]
2749 unsafe fn test_mm_comigt_ss() {
2750 let aa = &[3.0f32, 12.0, 23.0, NAN];
2751 let bb = &[3.0f32, 47.5, 1.5, NAN];
2752
2753 let ee = &[1i32, 0, 1, 0];
2754
2755 for i in 0..4 {
2756 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2757 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2758
2759 let r = _mm_comige_ss(a, b);
2760
2761 assert_eq!(
2762 ee[i], r,
2763 "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2764 a, b, r, ee[i], i
2765 );
2766 }
2767 }
2768
2769 #[simd_test(enable = "sse")]
2770 unsafe fn test_mm_comineq_ss() {
2771 let aa = &[3.0f32, 12.0, 23.0, NAN];
2772 let bb = &[3.0f32, 47.5, 1.5, NAN];
2773
2774 let ee = &[0i32, 1, 1, 1];
2775
2776 for i in 0..4 {
2777 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2778 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2779
2780 let r = _mm_comineq_ss(a, b);
2781
2782 assert_eq!(
2783 ee[i], r,
2784 "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2785 a, b, r, ee[i], i
2786 );
2787 }
2788 }
2789
2790 #[simd_test(enable = "sse")]
2791 unsafe fn test_mm_ucomieq_ss() {
2792 let aa = &[3.0f32, 12.0, 23.0, NAN];
2793 let bb = &[3.0f32, 47.5, 1.5, NAN];
2794
2795 let ee = &[1i32, 0, 0, 0];
2796
2797 for i in 0..4 {
2798 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2799 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2800
2801 let r = _mm_ucomieq_ss(a, b);
2802
2803 assert_eq!(
2804 ee[i], r,
2805 "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2806 a, b, r, ee[i], i
2807 );
2808 }
2809 }
2810
2811 #[simd_test(enable = "sse")]
2812 unsafe fn test_mm_ucomilt_ss() {
2813 let aa = &[3.0f32, 12.0, 23.0, NAN];
2814 let bb = &[3.0f32, 47.5, 1.5, NAN];
2815
2816 let ee = &[0i32, 1, 0, 0];
2817
2818 for i in 0..4 {
2819 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2820 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2821
2822 let r = _mm_ucomilt_ss(a, b);
2823
2824 assert_eq!(
2825 ee[i], r,
2826 "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2827 a, b, r, ee[i], i
2828 );
2829 }
2830 }
2831
2832 #[simd_test(enable = "sse")]
2833 unsafe fn test_mm_ucomile_ss() {
2834 let aa = &[3.0f32, 12.0, 23.0, NAN];
2835 let bb = &[3.0f32, 47.5, 1.5, NAN];
2836
2837 let ee = &[1i32, 1, 0, 0];
2838
2839 for i in 0..4 {
2840 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2841 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2842
2843 let r = _mm_ucomile_ss(a, b);
2844
2845 assert_eq!(
2846 ee[i], r,
2847 "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2848 a, b, r, ee[i], i
2849 );
2850 }
2851 }
2852
2853 #[simd_test(enable = "sse")]
2854 unsafe fn test_mm_ucomigt_ss() {
2855 let aa = &[3.0f32, 12.0, 23.0, NAN];
2856 let bb = &[3.0f32, 47.5, 1.5, NAN];
2857
2858 let ee = &[0i32, 0, 1, 0];
2859
2860 for i in 0..4 {
2861 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2862 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2863
2864 let r = _mm_ucomigt_ss(a, b);
2865
2866 assert_eq!(
2867 ee[i], r,
2868 "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2869 a, b, r, ee[i], i
2870 );
2871 }
2872 }
2873
2874 #[simd_test(enable = "sse")]
2875 unsafe fn test_mm_ucomige_ss() {
2876 let aa = &[3.0f32, 12.0, 23.0, NAN];
2877 let bb = &[3.0f32, 47.5, 1.5, NAN];
2878
2879 let ee = &[1i32, 0, 1, 0];
2880
2881 for i in 0..4 {
2882 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2883 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2884
2885 let r = _mm_ucomige_ss(a, b);
2886
2887 assert_eq!(
2888 ee[i], r,
2889 "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2890 a, b, r, ee[i], i
2891 );
2892 }
2893 }
2894
2895 #[simd_test(enable = "sse")]
2896 unsafe fn test_mm_ucomineq_ss() {
2897 let aa = &[3.0f32, 12.0, 23.0, NAN];
2898 let bb = &[3.0f32, 47.5, 1.5, NAN];
2899
2900 let ee = &[0i32, 1, 1, 1];
2901
2902 for i in 0..4 {
2903 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2904 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2905
2906 let r = _mm_ucomineq_ss(a, b);
2907
2908 assert_eq!(
2909 ee[i], r,
2910 "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2911 a, b, r, ee[i], i
2912 );
2913 }
2914 }
2915
2916 #[simd_test(enable = "sse")]
2917 unsafe fn test_mm_cvtss_si32() {
2918 let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
2919 let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520];
2920 for i in 0..inputs.len() {
2921 let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
2922 let e = result[i];
2923 let r = _mm_cvtss_si32(x);
2924 assert_eq!(
2925 e, r,
2926 "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
2927 i, x, r, e
2928 );
2929 }
2930 }
2931
2932 #[simd_test(enable = "sse")]
2933 unsafe fn test_mm_cvttss_si32() {
2934 let inputs = &[
2935 (42.0f32, 42i32),
2936 (-31.4, -31),
2937 (-33.5, -33),
2938 (-34.5, -34),
2939 (10.999, 10),
2940 (-5.99, -5),
2941 (4.0e10, i32::MIN),
2942 (4.0e-10, 0),
2943 (NAN, i32::MIN),
2944 (2147483500.1, 2147483520),
2945 ];
2946 for (i, &(xi, e)) in inputs.iter().enumerate() {
2947 let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
2948 let r = _mm_cvttss_si32(x);
2949 assert_eq!(
2950 e, r,
2951 "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
2952 i, x, r, e
2953 );
2954 }
2955 }
2956
2957 #[simd_test(enable = "sse")]
2958 unsafe fn test_mm_cvtsi32_ss() {
2959 let inputs = &[
2960 (4555i32, 4555.0f32),
2961 (322223333, 322223330.0),
2962 (-432, -432.0),
2963 (-322223333, -322223330.0),
2964 ];
2965
2966 for &(x, f) in inputs.iter() {
2967 let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2968 let r = _mm_cvtsi32_ss(a, x);
2969 let e = _mm_setr_ps(f, 6.0, 7.0, 8.0);
2970 assert_eq_m128(e, r);
2971 }
2972 }
2973
2974 #[simd_test(enable = "sse")]
2975 unsafe fn test_mm_cvtss_f32() {
2976 let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
2977 assert_eq!(_mm_cvtss_f32(a), 312.0134);
2978 }
2979
2980 #[simd_test(enable = "sse")]
2981 unsafe fn test_mm_set_ss() {
2982 let r = _mm_set_ss(black_box(4.25));
2983 assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0));
2984 }
2985
2986 #[simd_test(enable = "sse")]
2987 unsafe fn test_mm_set1_ps() {
2988 let r1 = _mm_set1_ps(black_box(4.25));
2989 let r2 = _mm_set_ps1(black_box(4.25));
2990 assert_eq!(get_m128(r1, 0), 4.25);
2991 assert_eq!(get_m128(r1, 1), 4.25);
2992 assert_eq!(get_m128(r1, 2), 4.25);
2993 assert_eq!(get_m128(r1, 3), 4.25);
2994 assert_eq!(get_m128(r2, 0), 4.25);
2995 assert_eq!(get_m128(r2, 1), 4.25);
2996 assert_eq!(get_m128(r2, 2), 4.25);
2997 assert_eq!(get_m128(r2, 3), 4.25);
2998 }
2999
3000 #[simd_test(enable = "sse")]
3001 unsafe fn test_mm_set_ps() {
3002 let r = _mm_set_ps(
3003 black_box(1.0),
3004 black_box(2.0),
3005 black_box(3.0),
3006 black_box(4.0),
3007 );
3008 assert_eq!(get_m128(r, 0), 4.0);
3009 assert_eq!(get_m128(r, 1), 3.0);
3010 assert_eq!(get_m128(r, 2), 2.0);
3011 assert_eq!(get_m128(r, 3), 1.0);
3012 }
3013
3014 #[simd_test(enable = "sse")]
3015 unsafe fn test_mm_setr_ps() {
3016 let r = _mm_setr_ps(
3017 black_box(1.0),
3018 black_box(2.0),
3019 black_box(3.0),
3020 black_box(4.0),
3021 );
3022 assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
3023 }
3024
3025 #[simd_test(enable = "sse")]
3026 unsafe fn test_mm_setzero_ps() {
3027 let r = *black_box(&_mm_setzero_ps());
3028 assert_eq_m128(r, _mm_set1_ps(0.0));
3029 }
3030
3031 #[simd_test(enable = "sse")]
3032 unsafe fn test_mm_shuffle() {
3033 assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11);
3034 assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
3035 assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
3036 }
3037
3038 #[simd_test(enable = "sse")]
3039 unsafe fn test_mm_shuffle_ps() {
3040 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3041 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3042 let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b);
3043 assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0));
3044 }
3045
3046 #[simd_test(enable = "sse")]
3047 unsafe fn test_mm_unpackhi_ps() {
3048 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3049 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3050 let r = _mm_unpackhi_ps(a, b);
3051 assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0));
3052 }
3053
3054 #[simd_test(enable = "sse")]
3055 unsafe fn test_mm_unpacklo_ps() {
3056 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3057 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3058 let r = _mm_unpacklo_ps(a, b);
3059 assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0));
3060 }
3061
3062 #[simd_test(enable = "sse")]
3063 unsafe fn test_mm_movehl_ps() {
3064 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3065 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3066 let r = _mm_movehl_ps(a, b);
3067 assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0));
3068 }
3069
3070 #[simd_test(enable = "sse")]
3071 unsafe fn test_mm_movelh_ps() {
3072 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3073 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3074 let r = _mm_movelh_ps(a, b);
3075 assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
3076 }
3077
3078 #[simd_test(enable = "sse")]
3079 unsafe fn test_mm_load_ss() {
3080 let a = 42.0f32;
3081 let r = _mm_load_ss(ptr::addr_of!(a));
3082 assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
3083 }
3084
3085 #[simd_test(enable = "sse")]
3086 unsafe fn test_mm_load1_ps() {
3087 let a = 42.0f32;
3088 let r = _mm_load1_ps(ptr::addr_of!(a));
3089 assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
3090 }
3091
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_load_ps() {
        // `_mm_load_ps` requires a 16-byte-aligned pointer, so we may have to
        // skip up to three leading elements; the extra buffer length leaves
        // room for that plus the four loaded values.
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        let mut fixup = 0.0f32;

        // Make sure p is aligned, otherwise we might get a
        // (signal: 11, SIGSEGV: invalid memory reference)

        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            // Advance `p` by `delta` f32 elements (delta = bytes-to-boundary / 4);
            // `fixup` shifts the expected values by the same amount, since the
            // buffer contents are 1.0, 2.0, ... in order.
            let delta = (16 - unalignment) >> 2;
            fixup = delta as f32;
            p = p.add(delta);
        }

        let r = _mm_load_ps(p);
        let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }
3113
3114 #[simd_test(enable = "sse")]
3115 unsafe fn test_mm_loadu_ps() {
3116 let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3117 let p = vals.as_ptr().add(3);
3118 let r = _mm_loadu_ps(black_box(p));
3119 assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
3120 }
3121
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_loadr_ps() {
        // `_mm_loadr_ps` performs an aligned load with the four elements
        // reversed, hence the descending expected values below.
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        let mut fixup = 0.0f32;

        // Make sure p is aligned, otherwise we might get a
        // (signal: 11, SIGSEGV: invalid memory reference)

        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            // Advance `p` by `delta` f32 elements to reach the next 16-byte
            // boundary; `fixup` shifts the expected values accordingly.
            let delta = (16 - unalignment) >> 2;
            fixup = delta as f32;
            p = p.add(delta);
        }

        let r = _mm_loadr_ps(p);
        let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }
3143
3144 #[simd_test(enable = "sse")]
3145 unsafe fn test_mm_store_ss() {
3146 let mut vals = [0.0f32; 8];
3147 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3148 _mm_store_ss(vals.as_mut_ptr().add(1), a);
3149
3150 assert_eq!(vals[0], 0.0);
3151 assert_eq!(vals[1], 1.0);
3152 assert_eq!(vals[2], 0.0);
3153 }
3154
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store1_ps() {
        // `_mm_store1_ps` needs a 16-byte-aligned pointer and writes lane 0
        // into all four consecutive slots.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Bump `p` forward to the next 16-byte boundary; the 8-element buffer
        // leaves room for up to 3 skipped elements plus the 4 stored ones.
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_store1_ps(p, *black_box(&a));

        // Elements just outside the 16-byte store must remain zero.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 1.0);
        assert_eq!(vals[ofs + 2], 1.0);
        assert_eq!(vals[ofs + 3], 1.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3179
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store_ps() {
        // `_mm_store_ps` needs a 16-byte-aligned pointer and writes all four
        // lanes in order.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_store_ps(p, *black_box(&a));

        // Elements just outside the 16-byte store must remain zero.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3205
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storer_ps() {
        // `_mm_storer_ps` needs a 16-byte-aligned pointer and writes the four
        // lanes in reverse order (lane 3 first).
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_storer_ps(p, *black_box(&a));

        // Elements just outside the 16-byte store must remain zero.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 4.0);
        assert_eq!(vals[ofs + 1], 3.0);
        assert_eq!(vals[ofs + 2], 2.0);
        assert_eq!(vals[ofs + 3], 1.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3231
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storeu_ps() {
        // `_mm_storeu_ps` has no alignment requirement, so this test
        // deliberately uses an unaligned destination when possible.
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Make sure p is **not** aligned to 16-byte boundary
        if (p as usize) & 0xf == 0 {
            ofs = 1;
            p = p.add(1);
        }

        _mm_storeu_ps(p, *black_box(&a));

        // Elements just outside the 16-byte store must remain zero.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3257
3258 #[simd_test(enable = "sse")]
3259 unsafe fn test_mm_move_ss() {
3260 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3261 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3262
3263 let r = _mm_move_ss(a, b);
3264 let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
3265 assert_eq_m128(e, r);
3266 }
3267
3268 #[simd_test(enable = "sse")]
3269 unsafe fn test_mm_movemask_ps() {
3270 let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0));
3271 assert_eq!(r, 0b0101);
3272
3273 let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0));
3274 assert_eq!(r, 0b0111);
3275 }
3276
    #[simd_test(enable = "sse")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_sfence() {
        // Smoke test only: a store fence has no directly observable result,
        // so we just check the intrinsic can be executed.
        _mm_sfence();
    }
3283
3284 #[simd_test(enable = "sse")]
3285 unsafe fn test_MM_TRANSPOSE4_PS() {
3286 let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3287 let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3288 let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
3289 let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
3290
3291 _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d);
3292
3293 assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0));
3294 assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0));
3295 assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0));
3296 assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0));
3297 }
3298
    // 16-byte-aligned backing storage; `test_mm_stream_ps` writes into it
    // through a raw pointer to the first element.
    #[repr(align(16))]
    struct Memory {
        pub data: [f32; 4],
    }
3303
3304 #[simd_test(enable = "sse")]
3305 // Miri cannot support this until it is clear how it fits in the Rust memory model
3306 // (non-temporal store)
3307 #[cfg_attr(miri, ignore)]
3308 unsafe fn test_mm_stream_ps() {
3309 let a = _mm_set1_ps(7.0);
3310 let mut mem = Memory { data: [-1.0; 4] };
3311
3312 _mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
3313 for i in 0..4 {
3314 assert_eq!(mem.data[i], get_m128(a, i));
3315 }
3316 }
3317}
3318