//! Advanced Vector Extensions (AVX)
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! [Wikipedia][wiki] provides a quick overview of the instructions available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    mem, ptr,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Adds packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_add(a, b) }
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_add(a, b) }
}

/// Computes the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vandp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(a, b))
    }
}

/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(a, b))
    }
}

/// Computes the bitwise OR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
#[inline]
#[target_feature(enable = "avx")]
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_or(a, b))
    }
}

/// Computes the bitwise OR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_or(a, b))
    }
}

/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b1,
                ((MASK as u32 >> 1) & 0b1) + 4,
                ((MASK as u32 >> 2) & 0b1) + 2,
                ((MASK as u32 >> 3) & 0b1) + 6,
            ],
        )
    }
}
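
// Illustrative sketch, not part of the stdarch API: it shows how the four
// low bits of `MASK` pick elements for `_mm256_shuffle_pd` (bits 0-1 control
// the low 128-bit lane, bits 2-3 the high lane; even result positions come
// from `a`, odd ones from `b`). The function name and constants are chosen
// here for illustration; it is compiled only for test builds and never run
// automatically.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn _mm256_shuffle_pd_mask_sketch() {
    let a = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
    let b = _mm256_setr_pd(4.0, 5.0, 6.0, 7.0);
    // MASK = 0b0101 selects [a[1], b[0], a[3], b[2]].
    let r = _mm256_shuffle_pd::<0b0101>(a, b);
    let e = _mm256_setr_pd(1.0, 4.0, 3.0, 6.0);
    // All four lanes compare equal, so the sign mask is 0b1111.
    assert_eq!(_mm256_movemask_pd(_mm256_cmp_pd::<_CMP_EQ_OQ>(r, e)), 0b1111);
}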

/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 8,
                ((MASK as u32 >> 6) & 0b11) + 8,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 12,
                ((MASK as u32 >> 6) & 0b11) + 12,
            ],
        )
    }
}
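
// Illustrative sketch, not part of the stdarch API: the four 2-bit fields of
// `MASK` are applied to each 128-bit lane of `_mm256_shuffle_ps`; the first
// two result elements of a lane come from `a`, the last two from `b`. Name
// and values are chosen for illustration only; compiled only under
// `cfg(test)` and never invoked automatically.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn _mm256_shuffle_ps_mask_sketch() {
    let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
    let b = _mm256_setr_ps(8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0);
    // MASK fields (low to high) are 3, 2, 1, 0, so the low lane becomes
    // [a[3], a[2], b[1], b[0]] and the high lane repeats the pattern.
    let r = _mm256_shuffle_ps::<0b00_01_10_11>(a, b);
    let e = _mm256_setr_ps(3.0, 2.0, 9.0, 8.0, 7.0, 6.0, 13.0, 12.0);
    assert_eq!(_mm256_movemask_ps(_mm256_cmp_ps::<_CMP_EQ_OQ>(r, e)), 0xff);
}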

/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
    }
}

/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
    }
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vmaxpd(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vmaxps(a, b) }
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vminpd(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vminps(a, b) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_mul(a, b) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_mul(a, b) }
}

/// Alternatively adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: f64x4 = a.as_f64x4();
        let b: f64x4 = b.as_f64x4();
        let add: f64x4 = simd_add(a, b);
        let sub: f64x4 = simd_sub(a, b);
        simd_shuffle!(add, sub, [4, 1, 6, 3])
    }
}

/// Alternatively adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: f32x8 = a.as_f32x8();
        let b: f32x8 = b.as_f32x8();
        let add: f32x8 = simd_add(a, b);
        let sub: f32x8 = simd_sub(a, b);
        simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
    }
}
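
// Illustrative sketch, not part of the stdarch API: `_mm256_addsub_ps`
// subtracts in even-indexed lanes and adds in odd-indexed lanes, which is
// what the shuffle of `add` and `sub` above encodes. Hypothetical helper
// name and values; compiled only in test builds and never run automatically.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn _mm256_addsub_ps_sketch() {
    let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
    let b = _mm256_set1_ps(10.0);
    // Even lanes: a - b, odd lanes: a + b.
    let r = _mm256_addsub_ps(a, b);
    let e = _mm256_setr_ps(-9.0, 12.0, -7.0, 14.0, -5.0, 16.0, -3.0, 18.0);
    assert_eq!(_mm256_movemask_ps(_mm256_cmp_ps::<_CMP_EQ_OQ>(r, e)), 0xff);
}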

/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_sub(a, b) }
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_sub(a, b) }
}

/// Computes the division of each of the 8 packed 32-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_div(a, b) }
}

/// Computes the division of each of the 4 packed 64-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_div(a, b) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundpd256(a, ROUNDING) }
}
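
// Illustrative sketch, not part of the stdarch API: it exercises two of the
// `ROUNDING` values documented above (0x03 truncate, 0x01 round down).
// Hypothetical helper name and sample values; compiled only in test builds
// and never executed automatically.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn _mm256_round_pd_flags_sketch() {
    let a = _mm256_setr_pd(1.5, -2.5, 3.3, -4.7);
    // 0x03: truncate toward zero.
    let trunc = _mm256_round_pd::<0x03>(a);
    let e_trunc = _mm256_setr_pd(1.0, -2.0, 3.0, -4.0);
    assert_eq!(
        _mm256_movemask_pd(_mm256_cmp_pd::<_CMP_EQ_OQ>(trunc, e_trunc)),
        0b1111
    );
    // 0x01: round down, toward negative infinity.
    let floor = _mm256_round_pd::<0x01>(a);
    let e_floor = _mm256_setr_pd(1.0, -3.0, 3.0, -5.0);
    assert_eq!(
        _mm256_movemask_pd(_mm256_cmp_pd::<_CMP_EQ_OQ>(floor, e_floor)),
        0b1111
    );
}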

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    unsafe { simd_ceil(a) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_floor_pd(a: __m256d) -> __m256d {
    unsafe { simd_floor(a) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundps256(a, ROUNDING) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_ceil_ps(a: __m256) -> __m256 {
    unsafe { simd_ceil(a) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_floor_ps(a: __m256) -> __m256 {
    unsafe { simd_floor(a) }
}

/// Returns the square root of packed single-precision (32-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    unsafe { simd_fsqrt(a) }
}

/// Returns the square root of packed double-precision (64-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    unsafe { simd_fsqrt(a) }
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM4 as u32 >> 0) & 1) * 4 + 0,
                ((IMM4 as u32 >> 1) & 1) * 4 + 1,
                ((IMM4 as u32 >> 2) & 1) * 4 + 2,
                ((IMM4 as u32 >> 3) & 1) * 4 + 3,
            ],
        )
    }
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM8 as u32 >> 0) & 1) * 8 + 0,
                ((IMM8 as u32 >> 1) & 1) * 8 + 1,
                ((IMM8 as u32 >> 2) & 1) * 8 + 2,
                ((IMM8 as u32 >> 3) & 1) * 8 + 3,
                ((IMM8 as u32 >> 4) & 1) * 8 + 4,
                ((IMM8 as u32 >> 5) & 1) * 8 + 5,
                ((IMM8 as u32 >> 6) & 1) * 8 + 6,
                ((IMM8 as u32 >> 7) & 1) * 8 + 7,
            ],
        )
    }
}
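
// Illustrative sketch, not part of the stdarch API: bit `i` of the immediate
// mask of `_mm256_blend_ps` chooses element `i` from `b` when set and from
// `a` when clear. Hypothetical helper name and values; compiled only in test
// builds and never run automatically.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn _mm256_blend_ps_mask_sketch() {
    let a = _mm256_set1_ps(1.0);
    let b = _mm256_set1_ps(2.0);
    // Bits 1, 2, 5 and 7 are set, so those lanes take the value from `b`.
    let r = _mm256_blend_ps::<0b1010_0110>(a, b);
    let e = _mm256_setr_ps(1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 1.0, 2.0);
    assert_eq!(_mm256_movemask_ps(_mm256_cmp_ps::<_CMP_EQ_OQ>(r, e)), 0xff);
}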

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    unsafe {
        let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
        transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
    }
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    unsafe {
        let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
        transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
    }
}
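
// Illustrative sketch, not part of the stdarch API: only the sign bit of
// each mask element of `_mm256_blendv_ps` is consulted, which is what the
// `simd_lt(..., ZERO)` above implements; note that -0.0 counts as "sign
// set". Hypothetical helper name and values; compiled only in test builds
// and never run automatically.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn _mm256_blendv_ps_sketch() {
    let a = _mm256_set1_ps(1.0);
    let b = _mm256_set1_ps(2.0);
    // Sign bits of the mask: [1, 0, 1, 0, 1, 0, 1, 0].
    let mask = _mm256_setr_ps(-1.0, 1.0, -0.0, 0.0, -3.0, 3.0, -5.0, 5.0);
    let r = _mm256_blendv_ps(a, b, mask);
    let e = _mm256_setr_ps(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
    assert_eq!(_mm256_movemask_ps(_mm256_cmp_ps::<_CMP_EQ_OQ>(r, e)), 0xff);
}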

/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`,
/// sums the four products, and conditionally returns the sum
/// using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { vdpps(a, b, IMM8 as i8) }
}
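
// Illustrative sketch, not part of the stdarch API: with IMM8 = 0xF1 the
// high nibble multiplies all four element pairs of each 128-bit lane and the
// low nibble writes the per-lane sum only to element 0, zeroing the rest.
// Hypothetical helper name and values; compiled only in test builds and
// never run automatically.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn _mm256_dp_ps_sketch() {
    let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
    let b = _mm256_set1_ps(1.0);
    // Low lane dot product: 1+2+3+4 = 10, high lane: 5+6+7+8 = 26.
    let r = _mm256_dp_ps::<0xF1>(a, b);
    let e = _mm256_setr_ps(10.0, 0.0, 0.0, 0.0, 26.0, 0.0, 0.0, 0.0);
    assert_eq!(_mm256_movemask_ps(_mm256_cmp_ps::<_CMP_EQ_OQ>(r, e)), 0xff);
}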

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vhaddpd(a, b) }
}

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; while sums of elements from `b` are in locations
/// 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vhaddps(a, b) }
}
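
// Illustrative sketch, not part of the stdarch API: it spells out the result
// layout described above, with pairwise sums of `a` at indices 0, 1, 4, 5
// and pairwise sums of `b` at indices 2, 3, 6, 7 (each 128-bit lane is
// handled independently). Hypothetical helper name and values; compiled only
// in test builds and never run automatically.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn _mm256_hadd_ps_layout_sketch() {
    let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
    let b = _mm256_setr_ps(10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0);
    let r = _mm256_hadd_ps(a, b);
    let e = _mm256_setr_ps(3.0, 7.0, 30.0, 70.0, 11.0, 15.0, 110.0, 150.0);
    assert_eq!(_mm256_movemask_ps(_mm256_cmp_ps::<_CMP_EQ_OQ>(r, e)), 0xff);
}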

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in even
/// locations, while differences of elements from `b` are returned in odd
/// locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vhsubpd(a, b) }
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in locations
/// of indices 0, 1, 4, 5; while differences of elements from `b` are in
/// locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vhsubps(a, b) }
}

/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_xor(a, b))
    }
}

/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_xor(a, b))
    }
}

/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
/// Not-greater-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_UQ: i32 = 0x1a;
/// False (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OS: i32 = 0x1b;
/// Not-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OS: i32 = 0x1c;
/// Greater-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OQ: i32 = 0x1d;
/// Greater-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OQ: i32 = 0x1e;
/// True (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_US: i32 = 0x1f;

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmppd(a, b, const { IMM5 as i8 }) }
}

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmppd256(a, b, IMM5 as u8) }
}

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpps(a, b, const { IMM5 as i8 }) }
}

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpps256(a, b, const { IMM5 as u8 }) }
}
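
// Illustrative sketch, not part of the stdarch API: the `_CMP_*` predicate
// constants defined above are passed as the const argument of the `cmp`
// intrinsics; lanes that satisfy the predicate become all-ones bit patterns
// and the rest become zeros, so the result can be read with a movemask.
// Hypothetical helper name and values; compiled only in test builds and
// never run automatically.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn _mm256_cmp_ps_predicate_sketch() {
    let a = _mm256_setr_ps(1.0, 5.0, 3.0, 7.0, 2.0, 8.0, 4.0, 6.0);
    let b = _mm256_set1_ps(4.0);
    // Elements 0, 2 and 4 are strictly less than 4.0.
    let lt = _mm256_cmp_ps::<_CMP_LT_OS>(a, b);
    assert_eq!(_mm256_movemask_ps(lt), 0b0001_0101);
}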

/// Compares the lower double-precision (64-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper element from `a` to the upper element of the
/// returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpsd(a, b, IMM5 as i8) }
}

/// Compares the lower single-precision (32-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper 3 packed elements from `a` to the upper elements of
/// the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpss(a, b, IMM5 as i8) }
}

/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
    unsafe { simd_cast(a.as_i32x4()) }
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
    unsafe { simd_cast(a.as_i32x8()) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
    unsafe { simd_cast(a) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
    unsafe { transmute(vcvtps2dq(a)) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_pd(a: __m128) -> __m256d {
    unsafe { simd_cast(a) }
}

/// Returns the first element of the input vector of `[4 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movsd))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
    unsafe { simd_extract!(a, 0) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
    unsafe { transmute(vcvttpd2dq(a)) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
    unsafe { transmute(vcvtpd2dq(a)) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
    unsafe { transmute(vcvttps2dq(a)) }
}

/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
        )
    }
}
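
// Illustrative sketch, not part of the stdarch API: `IMM1 = 0` extracts the
// low 128 bits of the vector and `IMM1 = 1` the high 128 bits. The helper
// name, values, and the extra `sse` feature requirement (for the `_mm_*`
// helpers used to check the result) are assumptions for this sketch;
// compiled only in test builds and never run automatically.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx,sse")]
fn _mm256_extractf128_ps_sketch() {
    let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
    // Take the high half of `a`.
    let hi = _mm256_extractf128_ps::<1>(a);
    let e = _mm_setr_ps(4.0, 5.0, 6.0, 7.0);
    assert_eq!(_mm_movemask_ps(_mm_cmp_ps::<_CMP_EQ_OQ>(hi, e)), 0b1111);
}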

/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) }
}

/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize]);
        transmute(dst)
    }
}

/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
    static_assert_uimm_bits!(INDEX, 3);
    unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) }
}

/// Returns the first element of the input vector of `[8 x i32]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
#[inline]
#[target_feature(enable = "avx")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
    unsafe { simd_extract!(a.as_i32x8(), 0) }
}

/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroall() {
    unsafe { vzeroall() }
}

/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128 bits of the registers are unmodified.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroupper() {
    unsafe { vzeroupper() }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
    unsafe { vpermilps256(a, b.as_i32x8()) }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
    unsafe { vpermilps(a, b.as_i32x4()) }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                ((IMM8 as u32 >> 0) & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        )
    }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        )
    }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
    unsafe { vpermilpd256(a, b.as_i64x4()) }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
    unsafe { vpermilpd(a, b.as_i64x2()) }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_pd(),
            [
                ((IMM4 as u32 >> 0) & 1),
                ((IMM4 as u32 >> 1) & 1),
                ((IMM4 as u32 >> 2) & 1) + 2,
                ((IMM4 as u32 >> 3) & 1) + 2,
            ],
        )
    }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_pd(),
            [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
        )
    }
}

/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { vperm2f128ps256(a, b, IMM8 as i8) }
}

/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { vperm2f128pd256(a, b, IMM8 as i8) }
}

/// Shuffles 128-bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) }
}
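
// Illustrative sketch, not part of the stdarch API: for the `permute2f128`
// family, bits 1:0 of `imm8` pick the source of the low 128 bits of the
// result (0 = low half of `a`, 1 = high half of `a`, 2 = low half of `b`,
// 3 = high half of `b`), bits 5:4 pick the source of the high 128 bits, and
// bits 3 and 7 zero the corresponding half instead. Hypothetical helper name
// and values; compiled only in test builds and never run automatically.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn _mm256_permute2f128_ps_sketch() {
    let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
    let b = _mm256_setr_ps(8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0);
    // 0x21: low half <- high half of `a`, high half <- low half of `b`.
    let r = _mm256_permute2f128_ps::<0x21>(a, b);
    let e = _mm256_setr_ps(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0);
    assert_eq!(_mm256_movemask_ps(_mm256_cmp_ps::<_CMP_EQ_OQ>(r, e)), 0xff);
}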

/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 {
    _mm256_set1_ps(*f)
}

/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 {
    _mm_set1_ps(*f)
}

/// Broadcasts a double-precision (64-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d {
    _mm256_set1_pd(*f)
}

/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
/// (32-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
    simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3])
}

/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
/// (64-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
    simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1])
}

/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_castps128_ps256(b),
            [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
        )
    }
}
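
// Illustrative sketch, not part of the stdarch API: `IMM1` selects which
// 128-bit half of `a` is replaced by `b` (0 = low, 1 = high); the other half
// is copied through unchanged. The helper name, values, and the extra `sse`
// feature requirement (for `_mm_set1_ps`) are assumptions for this sketch;
// compiled only in test builds and never run automatically.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx,sse")]
fn _mm256_insertf128_ps_sketch() {
    let a = _mm256_set1_ps(1.0);
    let b = _mm_set1_ps(2.0);
    // Replace the high 128 bits of `a` with `b`.
    let r = _mm256_insertf128_ps::<1>(a, b);
    let e = _mm256_setr_ps(1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0);
    assert_eq!(_mm256_movemask_ps(_mm256_cmp_ps::<_CMP_EQ_OQ>(r, e)), 0xff);
}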
1335
1336/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
1337/// double-precision (64-bit) floating-point elements) from `b` into result
1338/// at the location specified by `imm8`.
1339///
1340/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
1341#[inline]
1342#[target_feature(enable = "avx")]
1343#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1344#[rustc_legacy_const_generics(2)]
1345#[stable(feature = "simd_x86", since = "1.27.0")]
1346pub fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
1347 static_assert_uimm_bits!(IMM1, 1);
1348 unsafe {
1349 simd_shuffle!(
1350 a,
1351 _mm256_castpd128_pd256(b),
1352 [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1353 )
1354 }
1355}
1356
1357/// Copies `a` to result, then inserts 128 bits from `b` into result
1358/// at the location specified by `imm8`.
1359///
1360/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
1361#[inline]
1362#[target_feature(enable = "avx")]
1363#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1364#[rustc_legacy_const_generics(2)]
1365#[stable(feature = "simd_x86", since = "1.27.0")]
1366pub fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1367 static_assert_uimm_bits!(IMM1, 1);
1368 unsafe {
1369 let dst: i64x4 = simd_shuffle!(
1370 a.as_i64x4(),
1371 _mm256_castsi128_si256(b).as_i64x4(),
1372 [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1373 );
1374 transmute(src:dst)
1375 }
1376}
1377
1378/// Copies `a` to result, and inserts the 8-bit integer `i` into result
1379/// at the location specified by `index`.
1380///
1381/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
1382#[inline]
1383#[target_feature(enable = "avx")]
1384// This intrinsic has no corresponding instruction.
1385#[rustc_legacy_const_generics(2)]
1386#[stable(feature = "simd_x86", since = "1.27.0")]
1387pub fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
1388 static_assert_uimm_bits!(INDEX, 5);
    unsafe { transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) }
1390}
1391
1392/// Copies `a` to result, and inserts the 16-bit integer `i` into result
1393/// at the location specified by `index`.
1394///
1395/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
1396#[inline]
1397#[target_feature(enable = "avx")]
1398// This intrinsic has no corresponding instruction.
1399#[rustc_legacy_const_generics(2)]
1400#[stable(feature = "simd_x86", since = "1.27.0")]
1401pub fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
1402 static_assert_uimm_bits!(INDEX, 4);
    unsafe { transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) }
1404}
1405
1406/// Copies `a` to result, and inserts the 32-bit integer `i` into result
1407/// at the location specified by `index`.
1408///
1409/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
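///
/// An illustrative sketch (hypothetical helper, AVX assumed to be available):
///
/// ```
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx")]
/// fn set_lane_5(v: __m256i) -> __m256i {
///     // Writes 42 into the 32-bit element at index 5; other lanes are unchanged.
///     _mm256_insert_epi32::<5>(v, 42)
/// }
/// ```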
1410#[inline]
1411#[target_feature(enable = "avx")]
1412// This intrinsic has no corresponding instruction.
1413#[rustc_legacy_const_generics(2)]
1414#[stable(feature = "simd_x86", since = "1.27.0")]
1415pub fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
1416 static_assert_uimm_bits!(INDEX, 3);
    unsafe { transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
1418}
1419
1420/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1421/// floating-point elements) from memory into result.
1422/// `mem_addr` must be aligned on a 32-byte boundary or a
1423/// general-protection exception may be generated.
1424///
1425/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
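///
/// One way to satisfy the alignment requirement, as a sketch; the `Aligned`
/// wrapper type and helper are hypothetical, not part of this module:
///
/// ```
/// use core::arch::x86_64::*;
///
/// #[repr(align(32))]
/// struct Aligned([f64; 4]);
///
/// #[target_feature(enable = "avx")]
/// fn load_aligned(data: &Aligned) -> __m256d {
///     // `data` is 32-byte aligned by construction, so the aligned load is valid.
///     unsafe { _mm256_load_pd(data.0.as_ptr()) }
/// }
/// ```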
1426#[inline]
1427#[target_feature(enable = "avx")]
1428#[cfg_attr(
1429 all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1430 assert_instr(vmovap)
1431)]
1432#[stable(feature = "simd_x86", since = "1.27.0")]
1433#[allow(clippy::cast_ptr_alignment)]
1434pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
1435 *(mem_addr as *const __m256d)
1436}
1437
1438/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1439/// floating-point elements) from `a` into memory.
1440/// `mem_addr` must be aligned on a 32-byte boundary or a
1441/// general-protection exception may be generated.
1442///
1443/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
1444#[inline]
1445#[target_feature(enable = "avx")]
1446#[cfg_attr(
1447 all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1448 assert_instr(vmovap)
1449)]
1450#[stable(feature = "simd_x86", since = "1.27.0")]
1451#[allow(clippy::cast_ptr_alignment)]
1452pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
1453 *(mem_addr as *mut __m256d) = a;
1454}
1455
1456/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1457/// floating-point elements) from memory into result.
1458/// `mem_addr` must be aligned on a 32-byte boundary or a
1459/// general-protection exception may be generated.
1460///
1461/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
1462#[inline]
1463#[target_feature(enable = "avx")]
1464#[cfg_attr(
1465 all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1466 assert_instr(vmovaps)
1467)]
1468#[stable(feature = "simd_x86", since = "1.27.0")]
1469#[allow(clippy::cast_ptr_alignment)]
1470pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
1471 *(mem_addr as *const __m256)
1472}
1473
1474/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1475/// floating-point elements) from `a` into memory.
1476/// `mem_addr` must be aligned on a 32-byte boundary or a
1477/// general-protection exception may be generated.
1478///
1479/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
1480#[inline]
1481#[target_feature(enable = "avx")]
1482#[cfg_attr(
1483 all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1484 assert_instr(vmovaps)
1485)]
1486#[stable(feature = "simd_x86", since = "1.27.0")]
1487#[allow(clippy::cast_ptr_alignment)]
1488pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
1489 *(mem_addr as *mut __m256) = a;
1490}
1491
1492/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1493/// floating-point elements) from memory into result.
1494/// `mem_addr` does not need to be aligned on any particular boundary.
1495///
1496/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
1497#[inline]
1498#[target_feature(enable = "avx")]
1499#[cfg_attr(test, assert_instr(vmovup))]
1500#[stable(feature = "simd_x86", since = "1.27.0")]
1501pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
    let mut dst: __m256d = _mm256_undefined_pd();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256d>(),
    );
    dst
1509}
1510
1511/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1512/// floating-point elements) from `a` into memory.
1513/// `mem_addr` does not need to be aligned on any particular boundary.
1514///
1515/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
1516#[inline]
1517#[target_feature(enable = "avx")]
1518#[cfg_attr(test, assert_instr(vmovup))]
1519#[stable(feature = "simd_x86", since = "1.27.0")]
1520pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
    mem_addr.cast::<__m256d>().write_unaligned(a);
1522}
1523
1524/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1525/// floating-point elements) from memory into result.
1526/// `mem_addr` does not need to be aligned on any particular boundary.
1527///
1528/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
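///
/// A small sketch of loading from a slice (hypothetical helper, AVX assumed to
/// be available):
///
/// ```
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx")]
/// fn load_first_eight(data: &[f32]) -> __m256 {
///     assert!(data.len() >= 8);
///     // No alignment requirement: any valid `f32` pointer works here.
///     unsafe { _mm256_loadu_ps(data.as_ptr()) }
/// }
/// ```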
1529#[inline]
1530#[target_feature(enable = "avx")]
1531#[cfg_attr(test, assert_instr(vmovups))]
1532#[stable(feature = "simd_x86", since = "1.27.0")]
1533pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
    let mut dst: __m256 = _mm256_undefined_ps();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256>(),
    );
    dst
1541}
1542
1543/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1544/// floating-point elements) from `a` into memory.
1545/// `mem_addr` does not need to be aligned on any particular boundary.
1546///
1547/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
1548#[inline]
1549#[target_feature(enable = "avx")]
1550#[cfg_attr(test, assert_instr(vmovups))]
1551#[stable(feature = "simd_x86", since = "1.27.0")]
1552pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
    mem_addr.cast::<__m256>().write_unaligned(a);
1554}
1555
1556/// Loads 256-bits of integer data from memory into result.
1557/// `mem_addr` must be aligned on a 32-byte boundary or a
1558/// general-protection exception may be generated.
1559///
1560/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
1561#[inline]
1562#[target_feature(enable = "avx")]
1563#[cfg_attr(
1564 all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1565 assert_instr(vmovaps)
1566)] // FIXME vmovdqa expected
1567#[stable(feature = "simd_x86", since = "1.27.0")]
1568pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
1569 *mem_addr
1570}
1571
1572/// Stores 256-bits of integer data from `a` into memory.
1573/// `mem_addr` must be aligned on a 32-byte boundary or a
1574/// general-protection exception may be generated.
1575///
1576/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
1577#[inline]
1578#[target_feature(enable = "avx")]
1579#[cfg_attr(
1580 all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1581 assert_instr(vmovaps)
1582)] // FIXME vmovdqa expected
1583#[stable(feature = "simd_x86", since = "1.27.0")]
1584pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
1585 *mem_addr = a;
1586}
1587
1588/// Loads 256-bits of integer data from memory into result.
1589/// `mem_addr` does not need to be aligned on any particular boundary.
1590///
1591/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
1592#[inline]
1593#[target_feature(enable = "avx")]
1594#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1595#[stable(feature = "simd_x86", since = "1.27.0")]
1596pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
    let mut dst: __m256i = _mm256_undefined_si256();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256i>(),
    );
    dst
1604}
1605
1606/// Stores 256-bits of integer data from `a` into memory.
1607/// `mem_addr` does not need to be aligned on any particular boundary.
1608///
1609/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
1610#[inline]
1611#[target_feature(enable = "avx")]
1612#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1613#[stable(feature = "simd_x86", since = "1.27.0")]
1614pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
    mem_addr.write_unaligned(a);
1616}
1617
1618/// Loads packed double-precision (64-bit) floating-point elements from memory
1619/// into result using `mask` (elements are zeroed out when the high bit of the
1620/// corresponding element is not set).
1621///
1622/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
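///
/// A sketch of building a mask that loads only the first three lanes (the
/// helper is illustrative only, AVX assumed to be available):
///
/// ```
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx")]
/// fn load_first_three(data: &[f64; 4]) -> __m256d {
///     // The high bit of each 64-bit mask element selects that lane:
///     // lanes 0, 1 and 2 are loaded, lane 3 is zeroed.
///     let mask = _mm256_setr_epi64x(-1, -1, -1, 0);
///     unsafe { _mm256_maskload_pd(data.as_ptr(), mask) }
/// }
/// ```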
1623#[inline]
1624#[target_feature(enable = "avx")]
1625#[cfg_attr(test, assert_instr(vmaskmovpd))]
1626#[stable(feature = "simd_x86", since = "1.27.0")]
1627pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
1628 maskloadpd256(mem_addr as *const i8, mask.as_i64x4())
1629}
1630
1631/// Stores packed double-precision (64-bit) floating-point elements from `a`
1632/// into memory using `mask`.
1633///
1634/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
1635#[inline]
1636#[target_feature(enable = "avx")]
1637#[cfg_attr(test, assert_instr(vmaskmovpd))]
1638#[stable(feature = "simd_x86", since = "1.27.0")]
1639pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
1640 maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a);
1641}
1642
1643/// Loads packed double-precision (64-bit) floating-point elements from memory
1644/// into result using `mask` (elements are zeroed out when the high bit of the
1645/// corresponding element is not set).
1646///
1647/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
1648#[inline]
1649#[target_feature(enable = "avx")]
1650#[cfg_attr(test, assert_instr(vmaskmovpd))]
1651#[stable(feature = "simd_x86", since = "1.27.0")]
1652pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
1653 maskloadpd(mem_addr as *const i8, mask.as_i64x2())
1654}
1655
1656/// Stores packed double-precision (64-bit) floating-point elements from `a`
1657/// into memory using `mask`.
1658///
1659/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
1660#[inline]
1661#[target_feature(enable = "avx")]
1662#[cfg_attr(test, assert_instr(vmaskmovpd))]
1663#[stable(feature = "simd_x86", since = "1.27.0")]
1664pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
1665 maskstorepd(mem_addr as *mut i8, mask.as_i64x2(), a);
1666}
1667
1668/// Loads packed single-precision (32-bit) floating-point elements from memory
1669/// into result using `mask` (elements are zeroed out when the high bit of the
1670/// corresponding element is not set).
1671///
1672/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
1673#[inline]
1674#[target_feature(enable = "avx")]
1675#[cfg_attr(test, assert_instr(vmaskmovps))]
1676#[stable(feature = "simd_x86", since = "1.27.0")]
1677pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
1678 maskloadps256(mem_addr as *const i8, mask.as_i32x8())
1679}
1680
1681/// Stores packed single-precision (32-bit) floating-point elements from `a`
1682/// into memory using `mask`.
1683///
1684/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
1685#[inline]
1686#[target_feature(enable = "avx")]
1687#[cfg_attr(test, assert_instr(vmaskmovps))]
1688#[stable(feature = "simd_x86", since = "1.27.0")]
1689pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
1690 maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a);
1691}
1692
1693/// Loads packed single-precision (32-bit) floating-point elements from memory
1694/// into result using `mask` (elements are zeroed out when the high bit of the
1695/// corresponding element is not set).
1696///
1697/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
1698#[inline]
1699#[target_feature(enable = "avx")]
1700#[cfg_attr(test, assert_instr(vmaskmovps))]
1701#[stable(feature = "simd_x86", since = "1.27.0")]
1702pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
1703 maskloadps(mem_addr as *const i8, mask.as_i32x4())
1704}
1705
1706/// Stores packed single-precision (32-bit) floating-point elements from `a`
1707/// into memory using `mask`.
1708///
1709/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
1710#[inline]
1711#[target_feature(enable = "avx")]
1712#[cfg_attr(test, assert_instr(vmaskmovps))]
1713#[stable(feature = "simd_x86", since = "1.27.0")]
1714pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
1715 maskstoreps(mem_addr as *mut i8, mask.as_i32x4(), a);
1716}
1717
/// Duplicates odd-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
1720///
1721/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
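///
/// An illustrative sketch of the lane duplication (hypothetical helper, AVX
/// assumed to be available):
///
/// ```
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx")]
/// fn duplicate_odd_lanes() -> __m256 {
///     let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
///     // The result is [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0].
///     _mm256_movehdup_ps(a)
/// }
/// ```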
1722#[inline]
1723#[target_feature(enable = "avx")]
1724#[cfg_attr(test, assert_instr(vmovshdup))]
1725#[stable(feature = "simd_x86", since = "1.27.0")]
1726pub fn _mm256_movehdup_ps(a: __m256) -> __m256 {
1727 unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) }
1728}
1729
/// Duplicates even-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
1732///
1733/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
1734#[inline]
1735#[target_feature(enable = "avx")]
1736#[cfg_attr(test, assert_instr(vmovsldup))]
1737#[stable(feature = "simd_x86", since = "1.27.0")]
1738pub fn _mm256_moveldup_ps(a: __m256) -> __m256 {
1739 unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) }
1740}
1741
/// Duplicates even-indexed double-precision (64-bit) floating-point elements
/// from `a`, and returns the results.
1744///
1745/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
1746#[inline]
1747#[target_feature(enable = "avx")]
1748#[cfg_attr(test, assert_instr(vmovddup))]
1749#[stable(feature = "simd_x86", since = "1.27.0")]
1750pub fn _mm256_movedup_pd(a: __m256d) -> __m256d {
1751 unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
1752}
1753
1754/// Loads 256-bits of integer data from unaligned memory into result.
1755/// This intrinsic may perform better than `_mm256_loadu_si256` when the
1756/// data crosses a cache line boundary.
1757///
1758/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
1759#[inline]
1760#[target_feature(enable = "avx")]
1761#[cfg_attr(test, assert_instr(vlddqu))]
1762#[stable(feature = "simd_x86", since = "1.27.0")]
1763pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
    transmute(vlddqu(mem_addr as *const i8))
1765}
1766
/// Moves integer data from a 256-bit integer vector to a 32-byte
/// aligned memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon).
1770///
1771/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
1772///
1773/// # Safety of non-temporal stores
1774///
1775/// After using this intrinsic, but before any other access to the memory that this intrinsic
1776/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1777/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1778/// return.
1779///
1780/// See [`_mm_sfence`] for details.
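///
/// A sketch of the store-then-fence discipline described above (the wrapper
/// function is illustrative only):
///
/// ```
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx")]
/// unsafe fn stream_out(dst: *mut __m256i, a: __m256i) {
///     // Safety: the caller must pass a 32-byte aligned, writable `dst`.
///     unsafe {
///         _mm256_stream_si256(dst, a);
///         // Fence before returning so that later accesses observe the store.
///         _mm_sfence();
///     }
/// }
/// ```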
1781#[inline]
1782#[target_feature(enable = "avx")]
1783#[cfg_attr(test, assert_instr(vmovntdq))]
1784#[stable(feature = "simd_x86", since = "1.27.0")]
1785pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
1786 crate::arch::asm!(
1787 vps!("vmovntdq", ",{a}"),
1788 p = in(reg) mem_addr,
1789 a = in(ymm_reg) a,
1790 options(nostack, preserves_flags),
1791 );
1792}
1793
1794/// Moves double-precision values from a 256-bit vector of `[4 x double]`
1795/// to a 32-byte aligned memory location. To minimize caching, the data is
1796/// flagged as non-temporal (unlikely to be used again soon).
1797///
1798/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
1799///
1800/// # Safety of non-temporal stores
1801///
1802/// After using this intrinsic, but before any other access to the memory that this intrinsic
1803/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1804/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1805/// return.
1806///
1807/// See [`_mm_sfence`] for details.
1808#[inline]
1809#[target_feature(enable = "avx")]
1810#[cfg_attr(test, assert_instr(vmovntpd))]
1811#[stable(feature = "simd_x86", since = "1.27.0")]
1812#[allow(clippy::cast_ptr_alignment)]
1813pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
1814 crate::arch::asm!(
1815 vps!("vmovntpd", ",{a}"),
1816 p = in(reg) mem_addr,
1817 a = in(ymm_reg) a,
1818 options(nostack, preserves_flags),
1819 );
1820}
1821
1822/// Moves single-precision floating point values from a 256-bit vector
1823/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
1824/// caching, the data is flagged as non-temporal (unlikely to be used again
1825/// soon).
1826///
1827/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
1828///
1829/// # Safety of non-temporal stores
1830///
1831/// After using this intrinsic, but before any other access to the memory that this intrinsic
1832/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1833/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1834/// return.
1835///
1836/// See [`_mm_sfence`] for details.
1837#[inline]
1838#[target_feature(enable = "avx")]
1839#[cfg_attr(test, assert_instr(vmovntps))]
1840#[stable(feature = "simd_x86", since = "1.27.0")]
1841#[allow(clippy::cast_ptr_alignment)]
1842pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
1843 crate::arch::asm!(
1844 vps!("vmovntps", ",{a}"),
1845 p = in(reg) mem_addr,
1846 a = in(ymm_reg) a,
1847 options(nostack, preserves_flags),
1848 );
1849}
1850
1851/// Computes the approximate reciprocal of packed single-precision (32-bit)
1852/// floating-point elements in `a`, and returns the results. The maximum
1853/// relative error for this approximation is less than 1.5*2^-12.
1854///
1855/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
1856#[inline]
1857#[target_feature(enable = "avx")]
1858#[cfg_attr(test, assert_instr(vrcpps))]
1859#[stable(feature = "simd_x86", since = "1.27.0")]
1860pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
1861 unsafe { vrcpps(a) }
1862}
1863
1864/// Computes the approximate reciprocal square root of packed single-precision
1865/// (32-bit) floating-point elements in `a`, and returns the results.
1866/// The maximum relative error for this approximation is less than 1.5*2^-12.
1867///
1868/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
1869#[inline]
1870#[target_feature(enable = "avx")]
1871#[cfg_attr(test, assert_instr(vrsqrtps))]
1872#[stable(feature = "simd_x86", since = "1.27.0")]
1873pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
1874 unsafe { vrsqrtps(a) }
1875}
1876
/// Unpacks and interleaves double-precision (64-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
1879///
1880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
1881#[inline]
1882#[target_feature(enable = "avx")]
1883#[cfg_attr(test, assert_instr(vunpckhpd))]
1884#[stable(feature = "simd_x86", since = "1.27.0")]
1885pub fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
1886 unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) }
1887}
1888
/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
1891///
1892/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
1893#[inline]
1894#[target_feature(enable = "avx")]
1895#[cfg_attr(test, assert_instr(vunpckhps))]
1896#[stable(feature = "simd_x86", since = "1.27.0")]
1897pub fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
1898 unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) }
1899}
1900
/// Unpacks and interleaves double-precision (64-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
1903///
1904/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
1905#[inline]
1906#[target_feature(enable = "avx")]
1907#[cfg_attr(test, assert_instr(vunpcklpd))]
1908#[stable(feature = "simd_x86", since = "1.27.0")]
1909pub fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
1910 unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) }
1911}
1912
/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
1915///
1916/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
1917#[inline]
1918#[target_feature(enable = "avx")]
1919#[cfg_attr(test, assert_instr(vunpcklps))]
1920#[stable(feature = "simd_x86", since = "1.27.0")]
1921pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
1922 unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) }
1923}
1924
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
/// Computes the bitwise NOT of `a` and then ANDs it with `b`, and sets `CF` to
/// 1 if the result is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
1929///
1930/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
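///
/// A common use is an "all bits zero" check; a minimal sketch (hypothetical
/// helper, AVX assumed to be available):
///
/// ```
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx")]
/// fn is_all_zero(v: __m256i) -> bool {
///     // `v AND v` is zero exactly when every bit of `v` is zero, so `ZF` is set.
///     _mm256_testz_si256(v, v) == 1
/// }
/// ```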
1931#[inline]
1932#[target_feature(enable = "avx")]
1933#[cfg_attr(test, assert_instr(vptest))]
1934#[stable(feature = "simd_x86", since = "1.27.0")]
1935pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
1936 unsafe { ptestz256(a.as_i64x4(), b.as_i64x4()) }
1937}
1938
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
/// Computes the bitwise NOT of `a` and then ANDs it with `b`, and sets `CF` to
/// 1 if the result is zero, otherwise sets `CF` to 0. Returns the `CF` value.
1943///
1944/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
1945#[inline]
1946#[target_feature(enable = "avx")]
1947#[cfg_attr(test, assert_instr(vptest))]
1948#[stable(feature = "simd_x86", since = "1.27.0")]
1949pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
1950 unsafe { ptestc256(a.as_i64x4(), b.as_i64x4()) }
1951}
1952
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
/// Computes the bitwise NOT of `a` and then ANDs it with `b`, and sets `CF` to
/// 1 if the result is zero, otherwise sets `CF` to 0. Returns 1 if both the
/// `ZF` and `CF` values are zero, otherwise returns 0.
1958///
1959/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
1960#[inline]
1961#[target_feature(enable = "avx")]
1962#[cfg_attr(test, assert_instr(vptest))]
1963#[stable(feature = "simd_x86", since = "1.27.0")]
1964pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
1965 unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) }
1966}
1967
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and
/// sets `CF` to 1 if the sign bit of each 64-bit element in the intermediate
/// value is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
1975///
1976/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
1977#[inline]
1978#[target_feature(enable = "avx")]
1979#[cfg_attr(test, assert_instr(vtestpd))]
1980#[stable(feature = "simd_x86", since = "1.27.0")]
1981pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
1982 unsafe { vtestzpd256(a, b) }
1983}
1984
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and
/// sets `CF` to 1 if the sign bit of each 64-bit element in the intermediate
/// value is zero, otherwise sets `CF` to 0. Returns the `CF` value.
1992///
1993/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
1994#[inline]
1995#[target_feature(enable = "avx")]
1996#[cfg_attr(test, assert_instr(vtestpd))]
1997#[stable(feature = "simd_x86", since = "1.27.0")]
1998pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
1999 unsafe { vtestcpd256(a, b) }
2000}
2001
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and
/// sets `CF` to 1 if the sign bit of each 64-bit element in the intermediate
/// value is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and
/// `CF` values are zero, otherwise returns 0.
2010///
2011/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
2012#[inline]
2013#[target_feature(enable = "avx")]
2014#[cfg_attr(test, assert_instr(vtestpd))]
2015#[stable(feature = "simd_x86", since = "1.27.0")]
2016pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
2017 unsafe { vtestnzcpd256(a, b) }
2018}
2019
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and
/// sets `CF` to 1 if the sign bit of each 64-bit element in the intermediate
/// value is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2027///
2028/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
2029#[inline]
2030#[target_feature(enable = "avx")]
2031#[cfg_attr(test, assert_instr(vtestpd))]
2032#[stable(feature = "simd_x86", since = "1.27.0")]
2033pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
2034 unsafe { vtestzpd(a, b) }
2035}
2036
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and
/// sets `CF` to 1 if the sign bit of each 64-bit element in the intermediate
/// value is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2044///
2045/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
2046#[inline]
2047#[target_feature(enable = "avx")]
2048#[cfg_attr(test, assert_instr(vtestpd))]
2049#[stable(feature = "simd_x86", since = "1.27.0")]
2050pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
2051 unsafe { vtestcpd(a, b) }
2052}
2053
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and
/// sets `CF` to 1 if the sign bit of each 64-bit element in the intermediate
/// value is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and
/// `CF` values are zero, otherwise returns 0.
2062///
2063/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
2064#[inline]
2065#[target_feature(enable = "avx")]
2066#[cfg_attr(test, assert_instr(vtestpd))]
2067#[stable(feature = "simd_x86", since = "1.27.0")]
2068pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
2069 unsafe { vtestnzcpd(a, b) }
2070}
2071
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and
/// sets `CF` to 1 if the sign bit of each 32-bit element in the intermediate
/// value is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2079///
2080/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
2081#[inline]
2082#[target_feature(enable = "avx")]
2083#[cfg_attr(test, assert_instr(vtestps))]
2084#[stable(feature = "simd_x86", since = "1.27.0")]
2085pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
2086 unsafe { vtestzps256(a, b) }
2087}
2088
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and
/// sets `CF` to 1 if the sign bit of each 32-bit element in the intermediate
/// value is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2096///
2097/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
2098#[inline]
2099#[target_feature(enable = "avx")]
2100#[cfg_attr(test, assert_instr(vtestps))]
2101#[stable(feature = "simd_x86", since = "1.27.0")]
2102pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
2103 unsafe { vtestcps256(a, b) }
2104}
2105
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and
/// sets `CF` to 1 if the sign bit of each 32-bit element in the intermediate
/// value is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and
/// `CF` values are zero, otherwise returns 0.
2114///
2115/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
2116#[inline]
2117#[target_feature(enable = "avx")]
2118#[cfg_attr(test, assert_instr(vtestps))]
2119#[stable(feature = "simd_x86", since = "1.27.0")]
2120pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
2121 unsafe { vtestnzcps256(a, b) }
2122}
2123
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and
/// sets `CF` to 1 if the sign bit of each 32-bit element in the intermediate
/// value is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2131///
2132/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
2133#[inline]
2134#[target_feature(enable = "avx")]
2135#[cfg_attr(test, assert_instr(vtestps))]
2136#[stable(feature = "simd_x86", since = "1.27.0")]
2137pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
2138 unsafe { vtestzps(a, b) }
2139}
2140
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and
/// sets `CF` to 1 if the sign bit of each 32-bit element in the intermediate
/// value is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2148///
2149/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
2150#[inline]
2151#[target_feature(enable = "avx")]
2152#[cfg_attr(test, assert_instr(vtestps))]
2153#[stable(feature = "simd_x86", since = "1.27.0")]
2154pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
2155 unsafe { vtestcps(a, b) }
2156}
2157
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and
/// sets `CF` to 1 if the sign bit of each 32-bit element in the intermediate
/// value is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and
/// `CF` values are zero, otherwise returns 0.
2166///
2167/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
2168#[inline]
2169#[target_feature(enable = "avx")]
2170#[cfg_attr(test, assert_instr(vtestps))]
2171#[stable(feature = "simd_x86", since = "1.27.0")]
2172pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
2173 unsafe { vtestnzcps(a, b) }
2174}
2175
2176/// Sets each bit of the returned mask based on the most significant bit of the
2177/// corresponding packed double-precision (64-bit) floating-point element in
2178/// `a`.
2179///
2180/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
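///
/// A sketch of using the mask to count lanes whose sign bit is set
/// (hypothetical helper, AVX assumed to be available):
///
/// ```
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx")]
/// fn count_sign_bits(v: __m256d) -> u32 {
///     // Each of the low 4 bits of the returned mask is the sign bit of one lane.
///     _mm256_movemask_pd(v).count_ones()
/// }
/// ```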
2181#[inline]
2182#[target_feature(enable = "avx")]
2183#[cfg_attr(test, assert_instr(vmovmskpd))]
2184#[stable(feature = "simd_x86", since = "1.27.0")]
2185pub fn _mm256_movemask_pd(a: __m256d) -> i32 {
2186 // Propagate the highest bit to the rest, because simd_bitmask
2187 // requires all-1 or all-0.
2188 unsafe {
        let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
2190 simd_bitmask::<i64x4, u8>(mask).into()
2191 }
2192}
2193
2194/// Sets each bit of the returned mask based on the most significant bit of the
2195/// corresponding packed single-precision (32-bit) floating-point element in
2196/// `a`.
2197///
2198/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
2199#[inline]
2200#[target_feature(enable = "avx")]
2201#[cfg_attr(test, assert_instr(vmovmskps))]
2202#[stable(feature = "simd_x86", since = "1.27.0")]
2203pub fn _mm256_movemask_ps(a: __m256) -> i32 {
2204 // Propagate the highest bit to the rest, because simd_bitmask
2205 // requires all-1 or all-0.
2206 unsafe {
        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
2208 simd_bitmask::<i32x8, u8>(mask).into()
2209 }
2210}
2211
2212/// Returns vector of type __m256d with all elements set to zero.
2213///
2214/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
2215#[inline]
2216#[target_feature(enable = "avx")]
2217#[cfg_attr(test, assert_instr(vxorp))]
2218#[stable(feature = "simd_x86", since = "1.27.0")]
2219pub fn _mm256_setzero_pd() -> __m256d {
2220 const { unsafe { mem::zeroed() } }
2221}
2222
2223/// Returns vector of type __m256 with all elements set to zero.
2224///
2225/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
2226#[inline]
2227#[target_feature(enable = "avx")]
2228#[cfg_attr(test, assert_instr(vxorps))]
2229#[stable(feature = "simd_x86", since = "1.27.0")]
2230pub fn _mm256_setzero_ps() -> __m256 {
2231 const { unsafe { mem::zeroed() } }
2232}
2233
2234/// Returns vector of type __m256i with all elements set to zero.
2235///
2236/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
2237#[inline]
2238#[target_feature(enable = "avx")]
2239#[cfg_attr(test, assert_instr(vxor))]
2240#[stable(feature = "simd_x86", since = "1.27.0")]
2241pub fn _mm256_setzero_si256() -> __m256i {
2242 const { unsafe { mem::zeroed() } }
2243}
2244
2245/// Sets packed double-precision (64-bit) floating-point elements in returned
2246/// vector with the supplied values.
2247///
2248/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
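///
/// A sketch contrasting the argument order with `_mm256_setr_pd` (hypothetical
/// helper, AVX assumed to be available):
///
/// ```
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx")]
/// fn same_vector() -> bool {
///     // `_mm256_set_pd` takes arguments from the highest lane down to lane 0,
///     // so both calls build the vector [1.0, 2.0, 3.0, 4.0] in memory order.
///     let a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
///     let b = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
///     let (mut x, mut y) = ([0.0f64; 4], [0.0f64; 4]);
///     unsafe {
///         _mm256_storeu_pd(x.as_mut_ptr(), a);
///         _mm256_storeu_pd(y.as_mut_ptr(), b);
///     }
///     x == y
/// }
/// ```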
2249#[inline]
2250#[target_feature(enable = "avx")]
2251// This intrinsic has no corresponding instruction.
2252#[cfg_attr(test, assert_instr(vinsertf128))]
2253#[stable(feature = "simd_x86", since = "1.27.0")]
2254pub fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
    _mm256_setr_pd(d, c, b, a)
2256}
2257
2258/// Sets packed single-precision (32-bit) floating-point elements in returned
2259/// vector with the supplied values.
2260///
2261/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
2262#[inline]
2263#[target_feature(enable = "avx")]
2264// This intrinsic has no corresponding instruction.
2265#[stable(feature = "simd_x86", since = "1.27.0")]
2266pub fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
    _mm256_setr_ps(h, g, f, e, d, c, b, a)
2268}
2269
2270/// Sets packed 8-bit integers in returned vector with the supplied values.
2271///
2272/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
2273#[inline]
2274#[target_feature(enable = "avx")]
2275// This intrinsic has no corresponding instruction.
2276#[stable(feature = "simd_x86", since = "1.27.0")]
2277pub fn _mm256_set_epi8(
2278 e00: i8,
2279 e01: i8,
2280 e02: i8,
2281 e03: i8,
2282 e04: i8,
2283 e05: i8,
2284 e06: i8,
2285 e07: i8,
2286 e08: i8,
2287 e09: i8,
2288 e10: i8,
2289 e11: i8,
2290 e12: i8,
2291 e13: i8,
2292 e14: i8,
2293 e15: i8,
2294 e16: i8,
2295 e17: i8,
2296 e18: i8,
2297 e19: i8,
2298 e20: i8,
2299 e21: i8,
2300 e22: i8,
2301 e23: i8,
2302 e24: i8,
2303 e25: i8,
2304 e26: i8,
2305 e27: i8,
2306 e28: i8,
2307 e29: i8,
2308 e30: i8,
2309 e31: i8,
2310) -> __m256i {
2311 #[rustfmt::skip]
    _mm256_setr_epi8(
        e31, e30, e29, e28, e27, e26, e25, e24,
        e23, e22, e21, e20, e19, e18, e17, e16,
        e15, e14, e13, e12, e11, e10, e09, e08,
        e07, e06, e05, e04, e03, e02, e01, e00,
    )
2318}
2319
2320/// Sets packed 16-bit integers in returned vector with the supplied values.
2321///
2322/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
2323#[inline]
2324#[target_feature(enable = "avx")]
2325// This intrinsic has no corresponding instruction.
2326#[stable(feature = "simd_x86", since = "1.27.0")]
2327pub fn _mm256_set_epi16(
2328 e00: i16,
2329 e01: i16,
2330 e02: i16,
2331 e03: i16,
2332 e04: i16,
2333 e05: i16,
2334 e06: i16,
2335 e07: i16,
2336 e08: i16,
2337 e09: i16,
2338 e10: i16,
2339 e11: i16,
2340 e12: i16,
2341 e13: i16,
2342 e14: i16,
2343 e15: i16,
2344) -> __m256i {
2345 #[rustfmt::skip]
    _mm256_setr_epi16(
        e15, e14, e13, e12,
        e11, e10, e09, e08,
        e07, e06, e05, e04,
        e03, e02, e01, e00,
    )
2352}
2353
2354/// Sets packed 32-bit integers in returned vector with the supplied values.
2355///
2356/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
2357#[inline]
2358#[target_feature(enable = "avx")]
2359// This intrinsic has no corresponding instruction.
2360#[stable(feature = "simd_x86", since = "1.27.0")]
2361pub fn _mm256_set_epi32(
2362 e0: i32,
2363 e1: i32,
2364 e2: i32,
2365 e3: i32,
2366 e4: i32,
2367 e5: i32,
2368 e6: i32,
2369 e7: i32,
2370) -> __m256i {
    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
2372}
2373
2374/// Sets packed 64-bit integers in returned vector with the supplied values.
2375///
2376/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
2377#[inline]
2378#[target_feature(enable = "avx")]
2379// This intrinsic has no corresponding instruction.
2380#[stable(feature = "simd_x86", since = "1.27.0")]
2381pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
    _mm256_setr_epi64x(d, c, b, a)
2383}
2384
2385/// Sets packed double-precision (64-bit) floating-point elements in returned
2386/// vector with the supplied values in reverse order.
2387///
2388/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
2389#[inline]
2390#[target_feature(enable = "avx")]
2391// This intrinsic has no corresponding instruction.
2392#[stable(feature = "simd_x86", since = "1.27.0")]
2393pub fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2394 __m256d([a, b, c, d])
2395}
2396
2397/// Sets packed single-precision (32-bit) floating-point elements in returned
2398/// vector with the supplied values in reverse order.
2399///
2400/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
2401#[inline]
2402#[target_feature(enable = "avx")]
2403// This intrinsic has no corresponding instruction.
2404#[stable(feature = "simd_x86", since = "1.27.0")]
2405pub fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
2406 __m256([a, b, c, d, e, f, g, h])
2407}
2408
2409/// Sets packed 8-bit integers in returned vector with the supplied values in
2410/// reverse order.
2411///
2412/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
2413#[inline]
2414#[target_feature(enable = "avx")]
2415// This intrinsic has no corresponding instruction.
2416#[stable(feature = "simd_x86", since = "1.27.0")]
2417pub fn _mm256_setr_epi8(
2418 e00: i8,
2419 e01: i8,
2420 e02: i8,
2421 e03: i8,
2422 e04: i8,
2423 e05: i8,
2424 e06: i8,
2425 e07: i8,
2426 e08: i8,
2427 e09: i8,
2428 e10: i8,
2429 e11: i8,
2430 e12: i8,
2431 e13: i8,
2432 e14: i8,
2433 e15: i8,
2434 e16: i8,
2435 e17: i8,
2436 e18: i8,
2437 e19: i8,
2438 e20: i8,
2439 e21: i8,
2440 e22: i8,
2441 e23: i8,
2442 e24: i8,
2443 e25: i8,
2444 e26: i8,
2445 e27: i8,
2446 e28: i8,
2447 e29: i8,
2448 e30: i8,
2449 e31: i8,
2450) -> __m256i {
2451 unsafe {
2452 #[rustfmt::skip]
        transmute(i8x32::new(
            e00, e01, e02, e03, e04, e05, e06, e07,
            e08, e09, e10, e11, e12, e13, e14, e15,
            e16, e17, e18, e19, e20, e21, e22, e23,
            e24, e25, e26, e27, e28, e29, e30, e31,
        ))
2459 }
2460}
2461
2462/// Sets packed 16-bit integers in returned vector with the supplied values in
2463/// reverse order.
2464///
2465/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
2466#[inline]
2467#[target_feature(enable = "avx")]
2468// This intrinsic has no corresponding instruction.
2469#[stable(feature = "simd_x86", since = "1.27.0")]
2470pub fn _mm256_setr_epi16(
2471 e00: i16,
2472 e01: i16,
2473 e02: i16,
2474 e03: i16,
2475 e04: i16,
2476 e05: i16,
2477 e06: i16,
2478 e07: i16,
2479 e08: i16,
2480 e09: i16,
2481 e10: i16,
2482 e11: i16,
2483 e12: i16,
2484 e13: i16,
2485 e14: i16,
2486 e15: i16,
2487) -> __m256i {
2488 unsafe {
2489 #[rustfmt::skip]
        transmute(i16x16::new(
            e00, e01, e02, e03,
            e04, e05, e06, e07,
            e08, e09, e10, e11,
            e12, e13, e14, e15,
        ))
2496 }
2497}
2498
2499/// Sets packed 32-bit integers in returned vector with the supplied values in
2500/// reverse order.
2501///
2502/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
2503#[inline]
2504#[target_feature(enable = "avx")]
2505// This intrinsic has no corresponding instruction.
2506#[stable(feature = "simd_x86", since = "1.27.0")]
2507pub fn _mm256_setr_epi32(
2508 e0: i32,
2509 e1: i32,
2510 e2: i32,
2511 e3: i32,
2512 e4: i32,
2513 e5: i32,
2514 e6: i32,
2515 e7: i32,
2516) -> __m256i {
    unsafe { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
2518}
2519
2520/// Sets packed 64-bit integers in returned vector with the supplied values in
2521/// reverse order.
2522///
2523/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
2524#[inline]
2525#[target_feature(enable = "avx")]
2526// This intrinsic has no corresponding instruction.
2527#[stable(feature = "simd_x86", since = "1.27.0")]
2528pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
    unsafe { transmute(i64x4::new(a, b, c, d)) }
2530}
2531
2532/// Broadcasts double-precision (64-bit) floating-point value `a` to all
2533/// elements of returned vector.
2534///
2535/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
2536#[inline]
2537#[target_feature(enable = "avx")]
2538// This intrinsic has no corresponding instruction.
2539#[stable(feature = "simd_x86", since = "1.27.0")]
2540pub fn _mm256_set1_pd(a: f64) -> __m256d {
    _mm256_setr_pd(a, a, a, a)
2542}
2543
2544/// Broadcasts single-precision (32-bit) floating-point value `a` to all
2545/// elements of returned vector.
2546///
2547/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
2548#[inline]
2549#[target_feature(enable = "avx")]
2550// This intrinsic has no corresponding instruction.
2551#[stable(feature = "simd_x86", since = "1.27.0")]
2552pub fn _mm256_set1_ps(a: f32) -> __m256 {
    _mm256_setr_ps(a, a, a, a, a, a, a, a)
2554}
2555
2556/// Broadcasts 8-bit integer `a` to all elements of returned vector.
/// This intrinsic may generate the `vpbroadcastb` instruction.
2558///
2559/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
2560#[inline]
2561#[target_feature(enable = "avx")]
2562// This intrinsic has no corresponding instruction.
2563#[stable(feature = "simd_x86", since = "1.27.0")]
2564pub fn _mm256_set1_epi8(a: i8) -> __m256i {
2565 #[rustfmt::skip]
    _mm256_setr_epi8(
        a, a, a, a, a, a, a, a,
        a, a, a, a, a, a, a, a,
        a, a, a, a, a, a, a, a,
        a, a, a, a, a, a, a, a,
    )
2572}
2573
2574/// Broadcasts 16-bit integer `a` to all elements of returned vector.
/// This intrinsic may generate the `vpbroadcastw` instruction.
2576///
2577/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
2578#[inline]
2579#[target_feature(enable = "avx")]
2580//#[cfg_attr(test, assert_instr(vpshufb))]
2581#[cfg_attr(test, assert_instr(vinsertf128))]
2582// This intrinsic has no corresponding instruction.
2583#[stable(feature = "simd_x86", since = "1.27.0")]
2584pub fn _mm256_set1_epi16(a: i16) -> __m256i {
    _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
2586}
2587
2588/// Broadcasts 32-bit integer `a` to all elements of returned vector.
/// This intrinsic may generate the `vpbroadcastd` instruction.
2590///
2591/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
2592#[inline]
2593#[target_feature(enable = "avx")]
2594// This intrinsic has no corresponding instruction.
2595#[stable(feature = "simd_x86", since = "1.27.0")]
2596pub fn _mm256_set1_epi32(a: i32) -> __m256i {
    _mm256_setr_epi32(a, a, a, a, a, a, a, a)
2598}
2599
2600/// Broadcasts 64-bit integer `a` to all elements of returned vector.
/// This intrinsic may generate the `vpbroadcastq` instruction.
2602///
2603/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
2604#[inline]
2605#[target_feature(enable = "avx")]
2606#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
2607#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
2608// This intrinsic has no corresponding instruction.
2609#[stable(feature = "simd_x86", since = "1.27.0")]
2610pub fn _mm256_set1_epi64x(a: i64) -> __m256i {
    _mm256_setr_epi64x(a, a, a, a)
2612}
2613
/// Casts vector of type __m256d to type __m256.
2615///
2616/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
2617#[inline]
2618#[target_feature(enable = "avx")]
2619// This intrinsic is only used for compilation and does not generate any
2620// instructions, thus it has zero latency.
2621#[stable(feature = "simd_x86", since = "1.27.0")]
2622pub fn _mm256_castpd_ps(a: __m256d) -> __m256 {
    unsafe { transmute(a) }
2624}
2625
/// Casts vector of type __m256 to type __m256d.
2627///
2628/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
2629#[inline]
2630#[target_feature(enable = "avx")]
2631// This intrinsic is only used for compilation and does not generate any
2632// instructions, thus it has zero latency.
2633#[stable(feature = "simd_x86", since = "1.27.0")]
2634pub fn _mm256_castps_pd(a: __m256) -> __m256d {
    unsafe { transmute(a) }
2636}
2637
2638/// Casts vector of type __m256 to type __m256i.
2639///
2640/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
2641#[inline]
2642#[target_feature(enable = "avx")]
2643// This intrinsic is only used for compilation and does not generate any
2644// instructions, thus it has zero latency.
2645#[stable(feature = "simd_x86", since = "1.27.0")]
2646pub fn _mm256_castps_si256(a: __m256) -> __m256i {
    unsafe { transmute(a) }
2648}
2649
2650/// Casts vector of type __m256i to type __m256.
2651///
2652/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
2653#[inline]
2654#[target_feature(enable = "avx")]
2655// This intrinsic is only used for compilation and does not generate any
2656// instructions, thus it has zero latency.
2657#[stable(feature = "simd_x86", since = "1.27.0")]
2658pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
    unsafe { transmute(a) }
2660}
2661
2662/// Casts vector of type __m256d to type __m256i.
2663///
2664/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
2665#[inline]
2666#[target_feature(enable = "avx")]
2667// This intrinsic is only used for compilation and does not generate any
2668// instructions, thus it has zero latency.
2669#[stable(feature = "simd_x86", since = "1.27.0")]
2670pub fn _mm256_castpd_si256(a: __m256d) -> __m256i {
    unsafe { transmute(a) }
2672}
2673
2674/// Casts vector of type __m256i to type __m256d.
2675///
2676/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
2677#[inline]
2678#[target_feature(enable = "avx")]
2679// This intrinsic is only used for compilation and does not generate any
2680// instructions, thus it has zero latency.
2681#[stable(feature = "simd_x86", since = "1.27.0")]
2682pub fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
    unsafe { transmute(a) }
2684}
2685
2686/// Casts vector of type __m256 to type __m128.
2687///
2688/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
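///
/// A short sketch (assuming runtime feature detection is available) of pulling
/// out the lower 128 bits:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let v = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
///             // Only the lower four lanes survive the cast.
///             let lo = _mm256_castps256_ps128(v);
///             assert_eq!(_mm_cvtss_f32(lo), 1.0);
///         }
///     }
/// }
/// ```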
2689#[inline]
2690#[target_feature(enable = "avx")]
2691// This intrinsic is only used for compilation and does not generate any
2692// instructions, thus it has zero latency.
2693#[stable(feature = "simd_x86", since = "1.27.0")]
2694pub fn _mm256_castps256_ps128(a: __m256) -> __m128 {
2695 unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }
2696}
2697
2698/// Casts vector of type __m256d to type __m128d.
2699///
2700/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
2701#[inline]
2702#[target_feature(enable = "avx")]
2703// This intrinsic is only used for compilation and does not generate any
2704// instructions, thus it has zero latency.
2705#[stable(feature = "simd_x86", since = "1.27.0")]
2706pub fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
2707 unsafe { simd_shuffle!(a, a, [0, 1]) }
2708}
2709
2710/// Casts vector of type __m256i to type __m128i.
2711///
2712/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
2713#[inline]
2714#[target_feature(enable = "avx")]
2715// This intrinsic is only used for compilation and does not generate any
2716// instructions, thus it has zero latency.
2717#[stable(feature = "simd_x86", since = "1.27.0")]
2718pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
2719 unsafe {
2720 let a: i64x4 = a.as_i64x4();
2721 let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(dst)
2723 }
2724}
2725
2726/// Casts vector of type __m128 to type __m256;
2727/// the upper 128 bits of the result are undefined.
2728///
2729/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
2730#[inline]
2731#[target_feature(enable = "avx")]
2732// This intrinsic is only used for compilation and does not generate any
2733// instructions, thus it has zero latency.
2734#[stable(feature = "simd_x86", since = "1.27.0")]
2735pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
2736 unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
2737}
2738
2739/// Casts vector of type __m128d to type __m256d;
2740/// the upper 128 bits of the result are undefined.
2741///
2742/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
2743#[inline]
2744#[target_feature(enable = "avx")]
2745// This intrinsic is only used for compilation and does not generate any
2746// instructions, thus it has zero latency.
2747#[stable(feature = "simd_x86", since = "1.27.0")]
2748pub fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
2749 unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) }
2750}
2751
2752/// Casts vector of type __m128i to type __m256i;
2753/// the upper 128 bits of the result are undefined.
2754///
2755/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
2756#[inline]
2757#[target_feature(enable = "avx")]
2758// This intrinsic is only used for compilation and does not generate any
2759// instructions, thus it has zero latency.
2760#[stable(feature = "simd_x86", since = "1.27.0")]
2761pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
2762 unsafe {
2763 let a: i64x2 = a.as_i64x2();
2764 let undefined: i64x2 = i64x2::ZERO;
2765 let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
        transmute(dst)
2767 }
2768}
2769
2770/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
2771/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
2772/// the value of the source vector. The upper 128 bits are set to zero.
2773///
2774/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
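///
/// A minimal sketch (assuming runtime feature detection) contrasting this with
/// `_mm256_castps128_ps256`, whose upper half is left undefined:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let lo = _mm_set1_ps(3.0);
///             let v = _mm256_zextps128_ps256(lo);
///             assert_eq!(_mm256_cvtss_f32(v), 3.0);
///             // The upper 128 bits are guaranteed to be zero.
///             assert_eq!(_mm_cvtss_f32(_mm256_extractf128_ps::<1>(v)), 0.0);
///         }
///     }
/// }
/// ```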
2775#[inline]
2776#[target_feature(enable = "avx")]
2777// This intrinsic is only used for compilation and does not generate any
2778// instructions, thus it has zero latency.
2779#[stable(feature = "simd_x86", since = "1.27.0")]
2780pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
2781 unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
2782}
2783
2784/// Constructs a 256-bit integer vector from a 128-bit integer vector.
2785/// The lower 128 bits contain the value of the source vector. The upper
2786/// 128 bits are set to zero.
2787///
2788/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
2789#[inline]
2790#[target_feature(enable = "avx")]
2791// This intrinsic is only used for compilation and does not generate any
2792// instructions, thus it has zero latency.
2793#[stable(feature = "simd_x86", since = "1.27.0")]
2794pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
2795 unsafe {
2796 let b: i64x2 = i64x2::ZERO;
2797 let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
        transmute(dst)
2799 }
2800}
2801
2802/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
2803/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
2804/// contain the value of the source vector. The upper 128 bits are set
2805/// to zero.
2806///
2807/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
2808#[inline]
2809#[target_feature(enable = "avx")]
2810// This intrinsic is only used for compilation and does not generate any
2811// instructions, thus it has zero latency.
2812#[stable(feature = "simd_x86", since = "1.27.0")]
2813pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
2814 unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
2815}
2816
2817/// Returns vector of type `__m256` with indeterminate elements.
2818/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2819/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2820/// In practice, this is typically equivalent to [`mem::zeroed`].
2821///
2822/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
2823#[inline]
2824#[target_feature(enable = "avx")]
2825// This intrinsic has no corresponding instruction.
2826#[stable(feature = "simd_x86", since = "1.27.0")]
2827pub fn _mm256_undefined_ps() -> __m256 {
2828 const { unsafe { mem::zeroed() } }
2829}
2830
2831/// Returns vector of type `__m256d` with indeterminate elements.
2832/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2833/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2834/// In practice, this is typically equivalent to [`mem::zeroed`].
2835///
2836/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
2837#[inline]
2838#[target_feature(enable = "avx")]
2839// This intrinsic has no corresponding instruction.
2840#[stable(feature = "simd_x86", since = "1.27.0")]
2841pub fn _mm256_undefined_pd() -> __m256d {
2842 const { unsafe { mem::zeroed() } }
2843}
2844
/// Returns vector of type __m256i with indeterminate elements.
2846/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2847/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2848/// In practice, this is typically equivalent to [`mem::zeroed`].
2849///
2850/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
2851#[inline]
2852#[target_feature(enable = "avx")]
2853// This intrinsic has no corresponding instruction.
2854#[stable(feature = "simd_x86", since = "1.27.0")]
2855pub fn _mm256_undefined_si256() -> __m256i {
2856 const { unsafe { mem::zeroed() } }
2857}
2858
2859/// Sets packed __m256 returned vector with the supplied values.
2860///
2861/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
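///
/// A minimal sketch (assuming runtime feature detection); note that `lo` ends
/// up in the lower 128 bits and `hi` in the upper 128 bits:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let lo = _mm_set1_ps(1.0);
///             let hi = _mm_set1_ps(2.0);
///             let v = _mm256_set_m128(hi, lo);
///             assert_eq!(_mm256_cvtss_f32(v), 1.0);
///             assert_eq!(_mm_cvtss_f32(_mm256_extractf128_ps::<1>(v)), 2.0);
///         }
///     }
/// }
/// ```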
2862#[inline]
2863#[target_feature(enable = "avx")]
2864#[cfg_attr(test, assert_instr(vinsertf128))]
2865#[stable(feature = "simd_x86", since = "1.27.0")]
2866pub fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
2867 unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
2868}
2869
2870/// Sets packed __m256d returned vector with the supplied values.
2871///
2872/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
2873#[inline]
2874#[target_feature(enable = "avx")]
2875#[cfg_attr(test, assert_instr(vinsertf128))]
2876#[stable(feature = "simd_x86", since = "1.27.0")]
2877pub fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
2878 unsafe {
        let hi: __m128 = transmute(hi);
        let lo: __m128 = transmute(lo);
        transmute(_mm256_set_m128(hi, lo))
2882 }
2883}
2884
2885/// Sets packed __m256i returned vector with the supplied values.
2886///
2887/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
2888#[inline]
2889#[target_feature(enable = "avx")]
2890#[cfg_attr(test, assert_instr(vinsertf128))]
2891#[stable(feature = "simd_x86", since = "1.27.0")]
2892pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
2893 unsafe {
        let hi: __m128 = transmute(hi);
        let lo: __m128 = transmute(lo);
        transmute(_mm256_set_m128(hi, lo))
2897 }
2898}
2899
2900/// Sets packed __m256 returned vector with the supplied values.
2901///
2902/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
2903#[inline]
2904#[target_feature(enable = "avx")]
2905#[cfg_attr(test, assert_instr(vinsertf128))]
2906#[stable(feature = "simd_x86", since = "1.27.0")]
2907pub fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
2908 _mm256_set_m128(hi, lo)
2909}
2910
2911/// Sets packed __m256d returned vector with the supplied values.
2912///
2913/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
2914#[inline]
2915#[target_feature(enable = "avx")]
2916#[cfg_attr(test, assert_instr(vinsertf128))]
2917#[stable(feature = "simd_x86", since = "1.27.0")]
2918pub fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
2919 _mm256_set_m128d(hi, lo)
2920}
2921
2922/// Sets packed __m256i returned vector with the supplied values.
2923///
2924/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
2925#[inline]
2926#[target_feature(enable = "avx")]
2927#[cfg_attr(test, assert_instr(vinsertf128))]
2928#[stable(feature = "simd_x86", since = "1.27.0")]
2929pub fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
2930 _mm256_set_m128i(hi, lo)
2931}
2932
/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from memory and combines them into a 256-bit
/// value.
2936/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2937///
2938/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
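///
/// A usage sketch (assuming runtime feature detection) loading the two halves
/// from separate, possibly unaligned arrays:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let lo = [1.0f32, 2., 3., 4.];
///             let hi = [5.0f32, 6., 7., 8.];
///             let v = _mm256_loadu2_m128(hi.as_ptr(), lo.as_ptr());
///             let expected = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
///             // All eight lanes compare equal, so every mask bit is set.
///             let eq = _mm256_cmp_ps::<_CMP_EQ_OQ>(v, expected);
///             assert_eq!(_mm256_movemask_ps(eq), 0xFF);
///         }
///     }
/// }
/// ```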
2939#[inline]
2940#[target_feature(enable = "avx")]
2941// This intrinsic has no corresponding instruction.
2942#[stable(feature = "simd_x86", since = "1.27.0")]
2943pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
    let a: __m256 = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
    _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
2946}
2947
/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory and combines them into a 256-bit
/// value.
2951/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2952///
2953/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
2954#[inline]
2955#[target_feature(enable = "avx")]
2956// This intrinsic has no corresponding instruction.
2957#[stable(feature = "simd_x86", since = "1.27.0")]
2958pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
    let a: __m256d = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
    _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
2961}
2962
/// Loads two 128-bit values (composed of integer data) from memory and
/// combines them into a 256-bit value.
2965/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2966///
2967/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
2968#[inline]
2969#[target_feature(enable = "avx")]
2970// This intrinsic has no corresponding instruction.
2971#[stable(feature = "simd_x86", since = "1.27.0")]
2972pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
    let a: __m256i = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
    _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
2975}
2976
/// Stores the high and low 128-bit halves (each composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `a` into two
/// different 128-bit locations in memory.
2980/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2981///
2982/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
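///
/// A usage sketch (assuming runtime feature detection) splitting a 256-bit
/// vector into two unaligned 128-bit stores:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let v = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
///             let mut lo = [0.0f32; 4];
///             let mut hi = [0.0f32; 4];
///             _mm256_storeu2_m128(hi.as_mut_ptr(), lo.as_mut_ptr(), v);
///             assert_eq!(lo, [1., 2., 3., 4.]);
///             assert_eq!(hi, [5., 6., 7., 8.]);
///         }
///     }
/// }
/// ```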
2983#[inline]
2984#[target_feature(enable = "avx")]
2985// This intrinsic has no corresponding instruction.
2986#[stable(feature = "simd_x86", since = "1.27.0")]
2987pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
2988 let lo: __m128 = _mm256_castps256_ps128(a);
    _mm_storeu_ps(loaddr, lo);
    let hi: __m128 = _mm256_extractf128_ps::<1>(a);
    _mm_storeu_ps(hiaddr, hi);
2992}
2993
/// Stores the high and low 128-bit halves (each composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `a` into two
/// different 128-bit locations in memory.
2997/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2998///
2999/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
3000#[inline]
3001#[target_feature(enable = "avx")]
3002// This intrinsic has no corresponding instruction.
3003#[stable(feature = "simd_x86", since = "1.27.0")]
3004pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
3005 let lo: __m128d = _mm256_castpd256_pd128(a);
    _mm_storeu_pd(loaddr, lo);
    let hi: __m128d = _mm256_extractf128_pd::<1>(a);
    _mm_storeu_pd(hiaddr, hi);
3009}
3010
/// Stores the high and low 128-bit halves (each composed of integer data) from
/// `a` into two different 128-bit locations in memory.
3013/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3014///
3015/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
3016#[inline]
3017#[target_feature(enable = "avx")]
3018// This intrinsic has no corresponding instruction.
3019#[stable(feature = "simd_x86", since = "1.27.0")]
3020pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
3021 let lo: __m128i = _mm256_castsi256_si128(a);
    _mm_storeu_si128(loaddr, lo);
    let hi: __m128i = _mm256_extractf128_si256::<1>(a);
    _mm_storeu_si128(hiaddr, hi);
3025}
3026
3027/// Returns the first element of the input vector of `[8 x float]`.
3028///
3029/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
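///
/// A minimal sketch (assuming runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let v = _mm256_setr_ps(9., 8., 7., 6., 5., 4., 3., 2.);
///             // The lowest lane is returned as a scalar.
///             assert_eq!(_mm256_cvtss_f32(v), 9.0);
///         }
///     }
/// }
/// ```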
3030#[inline]
3031#[target_feature(enable = "avx")]
3032//#[cfg_attr(test, assert_instr(movss))] FIXME
3033#[stable(feature = "simd_x86", since = "1.27.0")]
3034pub fn _mm256_cvtss_f32(a: __m256) -> f32 {
3035 unsafe { simd_extract!(a, 0) }
3036}
3037
3038// LLVM intrinsics used in the above functions
3039#[allow(improper_ctypes)]
3040unsafe extern "C" {
    #[link_name = "llvm.x86.avx.round.pd.256"]
    unsafe fn roundpd256(a: __m256d, b: i32) -> __m256d;
    #[link_name = "llvm.x86.avx.round.ps.256"]
    unsafe fn roundps256(a: __m256, b: i32) -> __m256;
    #[link_name = "llvm.x86.avx.dp.ps.256"]
    unsafe fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
    #[link_name = "llvm.x86.avx.hadd.pd.256"]
    unsafe fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
    #[link_name = "llvm.x86.avx.hadd.ps.256"]
    unsafe fn vhaddps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.hsub.pd.256"]
    unsafe fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
    #[link_name = "llvm.x86.avx.hsub.ps.256"]
    unsafe fn vhsubps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.sse2.cmp.pd"]
    unsafe fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.avx.cmp.pd.256"]
    unsafe fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    unsafe fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.avx.cmp.ps.256"]
    unsafe fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
    #[link_name = "llvm.x86.sse2.cmp.sd"]
    unsafe fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse.cmp.ss"]
    unsafe fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
    unsafe fn vcvtps2dq(a: __m256) -> i32x8;
    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
    unsafe fn vcvttpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
    unsafe fn vcvtpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
    unsafe fn vcvttps2dq(a: __m256) -> i32x8;
    #[link_name = "llvm.x86.avx.vzeroall"]
    unsafe fn vzeroall();
    #[link_name = "llvm.x86.avx.vzeroupper"]
    unsafe fn vzeroupper();
    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
    unsafe fn vpermilps256(a: __m256, b: i32x8) -> __m256;
    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
    unsafe fn vpermilps(a: __m128, b: i32x4) -> __m128;
    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
    unsafe fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
    unsafe fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
    #[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
    unsafe fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256;
    #[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
    unsafe fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d;
    #[link_name = "llvm.x86.avx.vperm2f128.si.256"]
    unsafe fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
    #[link_name = "llvm.x86.avx.maskload.pd.256"]
    unsafe fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
    #[link_name = "llvm.x86.avx.maskstore.pd.256"]
    unsafe fn maskstorepd256(mem_addr: *mut i8, mask: i64x4, a: __m256d);
    #[link_name = "llvm.x86.avx.maskload.pd"]
    unsafe fn maskloadpd(mem_addr: *const i8, mask: i64x2) -> __m128d;
    #[link_name = "llvm.x86.avx.maskstore.pd"]
    unsafe fn maskstorepd(mem_addr: *mut i8, mask: i64x2, a: __m128d);
    #[link_name = "llvm.x86.avx.maskload.ps.256"]
    unsafe fn maskloadps256(mem_addr: *const i8, mask: i32x8) -> __m256;
    #[link_name = "llvm.x86.avx.maskstore.ps.256"]
    unsafe fn maskstoreps256(mem_addr: *mut i8, mask: i32x8, a: __m256);
    #[link_name = "llvm.x86.avx.maskload.ps"]
    unsafe fn maskloadps(mem_addr: *const i8, mask: i32x4) -> __m128;
    #[link_name = "llvm.x86.avx.maskstore.ps"]
    unsafe fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: __m128);
    #[link_name = "llvm.x86.avx.ldu.dq.256"]
    unsafe fn vlddqu(mem_addr: *const i8) -> i8x32;
    #[link_name = "llvm.x86.avx.rcp.ps.256"]
    unsafe fn vrcpps(a: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
    unsafe fn vrsqrtps(a: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.ptestz.256"]
    unsafe fn ptestz256(a: i64x4, b: i64x4) -> i32;
    #[link_name = "llvm.x86.avx.ptestc.256"]
    unsafe fn ptestc256(a: i64x4, b: i64x4) -> i32;
    #[link_name = "llvm.x86.avx.ptestnzc.256"]
    unsafe fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.pd.256"]
    unsafe fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.pd.256"]
    unsafe fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
    unsafe fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.pd"]
    unsafe fn vtestzpd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.pd"]
    unsafe fn vtestcpd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd"]
    unsafe fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.ps.256"]
    unsafe fn vtestzps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.ps.256"]
    unsafe fn vtestcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
    unsafe fn vtestnzcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.ps"]
    unsafe fn vtestzps(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.ps"]
    unsafe fn vtestcps(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps"]
    unsafe fn vtestnzcps(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.avx.min.ps.256"]
    unsafe fn vminps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.max.ps.256"]
    unsafe fn vmaxps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.min.pd.256"]
    unsafe fn vminpd(a: __m256d, b: __m256d) -> __m256d;
    #[link_name = "llvm.x86.avx.max.pd.256"]
    unsafe fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
3153}
3154
3155#[cfg(test)]
3156mod tests {
3157 use crate::hint::black_box;
3158 use crate::ptr;
3159 use stdarch_test::simd_test;
3160
3161 use crate::core_arch::x86::*;
3162
3163 #[simd_test(enable = "avx")]
3164 unsafe fn test_mm256_add_pd() {
3165 let a = _mm256_setr_pd(1., 2., 3., 4.);
3166 let b = _mm256_setr_pd(5., 6., 7., 8.);
3167 let r = _mm256_add_pd(a, b);
3168 let e = _mm256_setr_pd(6., 8., 10., 12.);
3169 assert_eq_m256d(r, e);
3170 }
3171
3172 #[simd_test(enable = "avx")]
3173 unsafe fn test_mm256_add_ps() {
3174 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3175 let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3176 let r = _mm256_add_ps(a, b);
3177 let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
3178 assert_eq_m256(r, e);
3179 }
3180
3181 #[simd_test(enable = "avx")]
3182 unsafe fn test_mm256_and_pd() {
3183 let a = _mm256_set1_pd(1.);
3184 let b = _mm256_set1_pd(0.6);
3185 let r = _mm256_and_pd(a, b);
3186 let e = _mm256_set1_pd(0.5);
3187 assert_eq_m256d(r, e);
3188 }
3189
3190 #[simd_test(enable = "avx")]
3191 unsafe fn test_mm256_and_ps() {
3192 let a = _mm256_set1_ps(1.);
3193 let b = _mm256_set1_ps(0.6);
3194 let r = _mm256_and_ps(a, b);
3195 let e = _mm256_set1_ps(0.5);
3196 assert_eq_m256(r, e);
3197 }
3198
3199 #[simd_test(enable = "avx")]
3200 unsafe fn test_mm256_or_pd() {
3201 let a = _mm256_set1_pd(1.);
3202 let b = _mm256_set1_pd(0.6);
3203 let r = _mm256_or_pd(a, b);
3204 let e = _mm256_set1_pd(1.2);
3205 assert_eq_m256d(r, e);
3206 }
3207
3208 #[simd_test(enable = "avx")]
3209 unsafe fn test_mm256_or_ps() {
3210 let a = _mm256_set1_ps(1.);
3211 let b = _mm256_set1_ps(0.6);
3212 let r = _mm256_or_ps(a, b);
3213 let e = _mm256_set1_ps(1.2);
3214 assert_eq_m256(r, e);
3215 }
3216
3217 #[simd_test(enable = "avx")]
3218 unsafe fn test_mm256_shuffle_pd() {
3219 let a = _mm256_setr_pd(1., 4., 5., 8.);
3220 let b = _mm256_setr_pd(2., 3., 6., 7.);
3221 let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
3222 let e = _mm256_setr_pd(4., 3., 8., 7.);
3223 assert_eq_m256d(r, e);
3224 }
3225
3226 #[simd_test(enable = "avx")]
3227 unsafe fn test_mm256_shuffle_ps() {
3228 let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3229 let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3230 let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
3231 let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
3232 assert_eq_m256(r, e);
3233 }
3234
3235 #[simd_test(enable = "avx")]
3236 unsafe fn test_mm256_andnot_pd() {
3237 let a = _mm256_set1_pd(0.);
3238 let b = _mm256_set1_pd(0.6);
3239 let r = _mm256_andnot_pd(a, b);
3240 assert_eq_m256d(r, b);
3241 }
3242
3243 #[simd_test(enable = "avx")]
3244 unsafe fn test_mm256_andnot_ps() {
3245 let a = _mm256_set1_ps(0.);
3246 let b = _mm256_set1_ps(0.6);
3247 let r = _mm256_andnot_ps(a, b);
3248 assert_eq_m256(r, b);
3249 }
3250
3251 #[simd_test(enable = "avx")]
3252 unsafe fn test_mm256_max_pd() {
3253 let a = _mm256_setr_pd(1., 4., 5., 8.);
3254 let b = _mm256_setr_pd(2., 3., 6., 7.);
3255 let r = _mm256_max_pd(a, b);
3256 let e = _mm256_setr_pd(2., 4., 6., 8.);
3257 assert_eq_m256d(r, e);
3258 // > If the values being compared are both 0.0s (of either sign), the
3259 // > value in the second operand (source operand) is returned.
3260 let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3261 let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3262 let wu: [u64; 4] = transmute(w);
3263 let xu: [u64; 4] = transmute(x);
3264 assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3265 assert_eq!(xu, [0u64; 4]);
3266 // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3267 // > second operand (source operand), either a NaN or a valid
3268 // > floating-point value, is written to the result.
3269 let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3270 let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3271 let yf: [f64; 4] = transmute(y);
3272 let zf: [f64; 4] = transmute(z);
3273 assert_eq!(yf, [0.0; 4]);
3274 assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3275 }
3276
3277 #[simd_test(enable = "avx")]
3278 unsafe fn test_mm256_max_ps() {
3279 let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3280 let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3281 let r = _mm256_max_ps(a, b);
3282 let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
3283 assert_eq_m256(r, e);
3284 // > If the values being compared are both 0.0s (of either sign), the
3285 // > value in the second operand (source operand) is returned.
3286 let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3287 let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3288 let wu: [u32; 8] = transmute(w);
3289 let xu: [u32; 8] = transmute(x);
3290 assert_eq!(wu, [0x8000_0000u32; 8]);
3291 assert_eq!(xu, [0u32; 8]);
3292 // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3293 // > second operand (source operand), either a NaN or a valid
3294 // > floating-point value, is written to the result.
3295 let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3296 let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3297 let yf: [f32; 8] = transmute(y);
3298 let zf: [f32; 8] = transmute(z);
3299 assert_eq!(yf, [0.0; 8]);
3300 assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3301 }
3302
3303 #[simd_test(enable = "avx")]
3304 unsafe fn test_mm256_min_pd() {
3305 let a = _mm256_setr_pd(1., 4., 5., 8.);
3306 let b = _mm256_setr_pd(2., 3., 6., 7.);
3307 let r = _mm256_min_pd(a, b);
3308 let e = _mm256_setr_pd(1., 3., 5., 7.);
3309 assert_eq_m256d(r, e);
3310 // > If the values being compared are both 0.0s (of either sign), the
3311 // > value in the second operand (source operand) is returned.
3312 let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3313 let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3314 let wu: [u64; 4] = transmute(w);
3315 let xu: [u64; 4] = transmute(x);
3316 assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3317 assert_eq!(xu, [0u64; 4]);
3318 // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3319 // > second operand (source operand), either a NaN or a valid
3320 // > floating-point value, is written to the result.
3321 let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3322 let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3323 let yf: [f64; 4] = transmute(y);
3324 let zf: [f64; 4] = transmute(z);
3325 assert_eq!(yf, [0.0; 4]);
3326 assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3327 }
3328
3329 #[simd_test(enable = "avx")]
3330 unsafe fn test_mm256_min_ps() {
3331 let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3332 let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3333 let r = _mm256_min_ps(a, b);
3334 let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
3335 assert_eq_m256(r, e);
3336 // > If the values being compared are both 0.0s (of either sign), the
3337 // > value in the second operand (source operand) is returned.
3338 let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3339 let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3340 let wu: [u32; 8] = transmute(w);
3341 let xu: [u32; 8] = transmute(x);
3342 assert_eq!(wu, [0x8000_0000u32; 8]);
3343 assert_eq!(xu, [0u32; 8]);
3344 // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3345 // > second operand (source operand), either a NaN or a valid
3346 // > floating-point value, is written to the result.
3347 let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3348 let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3349 let yf: [f32; 8] = transmute(y);
3350 let zf: [f32; 8] = transmute(z);
3351 assert_eq!(yf, [0.0; 8]);
3352 assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3353 }
3354
3355 #[simd_test(enable = "avx")]
3356 unsafe fn test_mm256_mul_pd() {
3357 let a = _mm256_setr_pd(1., 2., 3., 4.);
3358 let b = _mm256_setr_pd(5., 6., 7., 8.);
3359 let r = _mm256_mul_pd(a, b);
3360 let e = _mm256_setr_pd(5., 12., 21., 32.);
3361 assert_eq_m256d(r, e);
3362 }
3363
3364 #[simd_test(enable = "avx")]
3365 unsafe fn test_mm256_mul_ps() {
3366 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3367 let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3368 let r = _mm256_mul_ps(a, b);
3369 let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
3370 assert_eq_m256(r, e);
3371 }
3372
3373 #[simd_test(enable = "avx")]
3374 unsafe fn test_mm256_addsub_pd() {
3375 let a = _mm256_setr_pd(1., 2., 3., 4.);
3376 let b = _mm256_setr_pd(5., 6., 7., 8.);
3377 let r = _mm256_addsub_pd(a, b);
3378 let e = _mm256_setr_pd(-4., 8., -4., 12.);
3379 assert_eq_m256d(r, e);
3380 }
3381
3382 #[simd_test(enable = "avx")]
3383 unsafe fn test_mm256_addsub_ps() {
3384 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3385 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3386 let r = _mm256_addsub_ps(a, b);
3387 let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
3388 assert_eq_m256(r, e);
3389 }
3390
3391 #[simd_test(enable = "avx")]
3392 unsafe fn test_mm256_sub_pd() {
3393 let a = _mm256_setr_pd(1., 2., 3., 4.);
3394 let b = _mm256_setr_pd(5., 6., 7., 8.);
3395 let r = _mm256_sub_pd(a, b);
3396 let e = _mm256_setr_pd(-4., -4., -4., -4.);
3397 assert_eq_m256d(r, e);
3398 }
3399
3400 #[simd_test(enable = "avx")]
3401 unsafe fn test_mm256_sub_ps() {
3402 let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3403 let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3404 let r = _mm256_sub_ps(a, b);
3405 let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3406 assert_eq_m256(r, e);
3407 }
3408
3409 #[simd_test(enable = "avx")]
3410 unsafe fn test_mm256_round_pd() {
3411 let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3412 let result_closest = _mm256_round_pd::<0b0000>(a);
3413 let result_down = _mm256_round_pd::<0b0001>(a);
3414 let result_up = _mm256_round_pd::<0b0010>(a);
3415 let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
3416 let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3417 let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3418 assert_eq_m256d(result_closest, expected_closest);
3419 assert_eq_m256d(result_down, expected_down);
3420 assert_eq_m256d(result_up, expected_up);
3421 }
3422
3423 #[simd_test(enable = "avx")]
3424 unsafe fn test_mm256_floor_pd() {
3425 let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3426 let result_down = _mm256_floor_pd(a);
3427 let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3428 assert_eq_m256d(result_down, expected_down);
3429 }
3430
3431 #[simd_test(enable = "avx")]
3432 unsafe fn test_mm256_ceil_pd() {
3433 let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3434 let result_up = _mm256_ceil_pd(a);
3435 let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3436 assert_eq_m256d(result_up, expected_up);
3437 }
3438
3439 #[simd_test(enable = "avx")]
3440 unsafe fn test_mm256_round_ps() {
3441 let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3442 let result_closest = _mm256_round_ps::<0b0000>(a);
3443 let result_down = _mm256_round_ps::<0b0001>(a);
3444 let result_up = _mm256_round_ps::<0b0010>(a);
3445 let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
3446 let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3447 let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3448 assert_eq_m256(result_closest, expected_closest);
3449 assert_eq_m256(result_down, expected_down);
3450 assert_eq_m256(result_up, expected_up);
3451 }
3452
3453 #[simd_test(enable = "avx")]
3454 unsafe fn test_mm256_floor_ps() {
3455 let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3456 let result_down = _mm256_floor_ps(a);
3457 let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3458 assert_eq_m256(result_down, expected_down);
3459 }
3460
3461 #[simd_test(enable = "avx")]
3462 unsafe fn test_mm256_ceil_ps() {
3463 let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3464 let result_up = _mm256_ceil_ps(a);
3465 let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3466 assert_eq_m256(result_up, expected_up);
3467 }
3468
3469 #[simd_test(enable = "avx")]
3470 unsafe fn test_mm256_sqrt_pd() {
3471 let a = _mm256_setr_pd(4., 9., 16., 25.);
3472 let r = _mm256_sqrt_pd(a);
3473 let e = _mm256_setr_pd(2., 3., 4., 5.);
3474 assert_eq_m256d(r, e);
3475 }
3476
3477 #[simd_test(enable = "avx")]
3478 unsafe fn test_mm256_sqrt_ps() {
3479 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3480 let r = _mm256_sqrt_ps(a);
3481 let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
3482 assert_eq_m256(r, e);
3483 }
3484
3485 #[simd_test(enable = "avx")]
3486 unsafe fn test_mm256_div_ps() {
3487 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3488 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3489 let r = _mm256_div_ps(a, b);
3490 let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
3491 assert_eq_m256(r, e);
3492 }
3493
3494 #[simd_test(enable = "avx")]
3495 unsafe fn test_mm256_div_pd() {
3496 let a = _mm256_setr_pd(4., 9., 16., 25.);
3497 let b = _mm256_setr_pd(4., 3., 2., 5.);
3498 let r = _mm256_div_pd(a, b);
3499 let e = _mm256_setr_pd(1., 3., 8., 5.);
3500 assert_eq_m256d(r, e);
3501 }
3502
3503 #[simd_test(enable = "avx")]
3504 unsafe fn test_mm256_blend_pd() {
3505 let a = _mm256_setr_pd(4., 9., 16., 25.);
3506 let b = _mm256_setr_pd(4., 3., 2., 5.);
3507 let r = _mm256_blend_pd::<0x0>(a, b);
3508 assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
3509 let r = _mm256_blend_pd::<0x3>(a, b);
3510 assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
3511 let r = _mm256_blend_pd::<0xF>(a, b);
3512 assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
3513 }
3514
3515 #[simd_test(enable = "avx")]
3516 unsafe fn test_mm256_blend_ps() {
3517 let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3518 let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3519 let r = _mm256_blend_ps::<0x0>(a, b);
3520 assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
3521 let r = _mm256_blend_ps::<0x3>(a, b);
3522 assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
3523 let r = _mm256_blend_ps::<0xF>(a, b);
3524 assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
3525 }
3526
3527 #[simd_test(enable = "avx")]
3528 unsafe fn test_mm256_blendv_pd() {
3529 let a = _mm256_setr_pd(4., 9., 16., 25.);
3530 let b = _mm256_setr_pd(4., 3., 2., 5.);
3531 let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
3532 let r = _mm256_blendv_pd(a, b, c);
3533 let e = _mm256_setr_pd(4., 9., 2., 5.);
3534 assert_eq_m256d(r, e);
3535 }
3536
3537 #[simd_test(enable = "avx")]
3538 unsafe fn test_mm256_blendv_ps() {
3539 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3540 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3541 #[rustfmt::skip]
3542 let c = _mm256_setr_ps(
3543 0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
3544 );
3545 let r = _mm256_blendv_ps(a, b, c);
3546 let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3547 assert_eq_m256(r, e);
3548 }
3549
3550 #[simd_test(enable = "avx")]
3551 unsafe fn test_mm256_dp_ps() {
3552 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3553 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3554 let r = _mm256_dp_ps::<0xFF>(a, b);
3555 let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
3556 assert_eq_m256(r, e);
3557 }
3558
3559 #[simd_test(enable = "avx")]
3560 unsafe fn test_mm256_hadd_pd() {
3561 let a = _mm256_setr_pd(4., 9., 16., 25.);
3562 let b = _mm256_setr_pd(4., 3., 2., 5.);
3563 let r = _mm256_hadd_pd(a, b);
3564 let e = _mm256_setr_pd(13., 7., 41., 7.);
3565 assert_eq_m256d(r, e);
3566
3567 let a = _mm256_setr_pd(1., 2., 3., 4.);
3568 let b = _mm256_setr_pd(5., 6., 7., 8.);
3569 let r = _mm256_hadd_pd(a, b);
3570 let e = _mm256_setr_pd(3., 11., 7., 15.);
3571 assert_eq_m256d(r, e);
3572 }
3573
3574 #[simd_test(enable = "avx")]
3575 unsafe fn test_mm256_hadd_ps() {
3576 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3577 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3578 let r = _mm256_hadd_ps(a, b);
3579 let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
3580 assert_eq_m256(r, e);
3581
3582 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3583 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3584 let r = _mm256_hadd_ps(a, b);
3585 let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
3586 assert_eq_m256(r, e);
3587 }
3588
3589 #[simd_test(enable = "avx")]
3590 unsafe fn test_mm256_hsub_pd() {
3591 let a = _mm256_setr_pd(4., 9., 16., 25.);
3592 let b = _mm256_setr_pd(4., 3., 2., 5.);
3593 let r = _mm256_hsub_pd(a, b);
3594 let e = _mm256_setr_pd(-5., 1., -9., -3.);
3595 assert_eq_m256d(r, e);
3596
3597 let a = _mm256_setr_pd(1., 2., 3., 4.);
3598 let b = _mm256_setr_pd(5., 6., 7., 8.);
3599 let r = _mm256_hsub_pd(a, b);
3600 let e = _mm256_setr_pd(-1., -1., -1., -1.);
3601 assert_eq_m256d(r, e);
3602 }
3603
3604 #[simd_test(enable = "avx")]
3605 unsafe fn test_mm256_hsub_ps() {
3606 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3607 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3608 let r = _mm256_hsub_ps(a, b);
3609 let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
3610 assert_eq_m256(r, e);
3611
3612 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3613 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3614 let r = _mm256_hsub_ps(a, b);
3615 let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
3616 assert_eq_m256(r, e);
3617 }
3618
3619 #[simd_test(enable = "avx")]
3620 unsafe fn test_mm256_xor_pd() {
3621 let a = _mm256_setr_pd(4., 9., 16., 25.);
3622 let b = _mm256_set1_pd(0.);
3623 let r = _mm256_xor_pd(a, b);
3624 assert_eq_m256d(r, a);
3625 }
3626
3627 #[simd_test(enable = "avx")]
3628 unsafe fn test_mm256_xor_ps() {
3629 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3630 let b = _mm256_set1_ps(0.);
3631 let r = _mm256_xor_ps(a, b);
3632 assert_eq_m256(r, a);
3633 }
3634
3635 #[simd_test(enable = "avx")]
3636 unsafe fn test_mm_cmp_pd() {
3637 let a = _mm_setr_pd(4., 9.);
3638 let b = _mm_setr_pd(4., 3.);
3639 let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
3640 assert!(get_m128d(r, 0).is_nan());
3641 assert!(get_m128d(r, 1).is_nan());
3642 }
3643
3644 #[simd_test(enable = "avx")]
3645 unsafe fn test_mm256_cmp_pd() {
3646 let a = _mm256_setr_pd(1., 2., 3., 4.);
3647 let b = _mm256_setr_pd(5., 6., 7., 8.);
3648 let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
3649 let e = _mm256_set1_pd(0.);
3650 assert_eq_m256d(r, e);
3651 }
3652
3653 #[simd_test(enable = "avx")]
3654 unsafe fn test_mm_cmp_ps() {
3655 let a = _mm_setr_ps(4., 3., 2., 5.);
3656 let b = _mm_setr_ps(4., 9., 16., 25.);
3657 let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
3658 assert!(get_m128(r, 0).is_nan());
3659 assert_eq!(get_m128(r, 1), 0.);
3660 assert_eq!(get_m128(r, 2), 0.);
3661 assert_eq!(get_m128(r, 3), 0.);
3662 }
3663
3664 #[simd_test(enable = "avx")]
3665 unsafe fn test_mm256_cmp_ps() {
3666 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3667 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3668 let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
3669 let e = _mm256_set1_ps(0.);
3670 assert_eq_m256(r, e);
3671 }
3672
3673 #[simd_test(enable = "avx")]
3674 unsafe fn test_mm_cmp_sd() {
3675 let a = _mm_setr_pd(4., 9.);
3676 let b = _mm_setr_pd(4., 3.);
3677 let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
3678 assert!(get_m128d(r, 0).is_nan());
3679 assert_eq!(get_m128d(r, 1), 9.);
3680 }
3681
3682 #[simd_test(enable = "avx")]
3683 unsafe fn test_mm_cmp_ss() {
3684 let a = _mm_setr_ps(4., 3., 2., 5.);
3685 let b = _mm_setr_ps(4., 9., 16., 25.);
3686 let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
3687 assert!(get_m128(r, 0).is_nan());
3688 assert_eq!(get_m128(r, 1), 3.);
3689 assert_eq!(get_m128(r, 2), 2.);
3690 assert_eq!(get_m128(r, 3), 5.);
3691 }
3692
3693 #[simd_test(enable = "avx")]
3694 unsafe fn test_mm256_cvtepi32_pd() {
3695 let a = _mm_setr_epi32(4, 9, 16, 25);
3696 let r = _mm256_cvtepi32_pd(a);
3697 let e = _mm256_setr_pd(4., 9., 16., 25.);
3698 assert_eq_m256d(r, e);
3699 }
3700
3701 #[simd_test(enable = "avx")]
3702 unsafe fn test_mm256_cvtepi32_ps() {
3703 let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3704 let r = _mm256_cvtepi32_ps(a);
3705 let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3706 assert_eq_m256(r, e);
3707 }
3708
3709 #[simd_test(enable = "avx")]
3710 unsafe fn test_mm256_cvtpd_ps() {
3711 let a = _mm256_setr_pd(4., 9., 16., 25.);
3712 let r = _mm256_cvtpd_ps(a);
3713 let e = _mm_setr_ps(4., 9., 16., 25.);
3714 assert_eq_m128(r, e);
3715 }
3716
3717 #[simd_test(enable = "avx")]
3718 unsafe fn test_mm256_cvtps_epi32() {
3719 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3720 let r = _mm256_cvtps_epi32(a);
3721 let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3722 assert_eq_m256i(r, e);
3723 }
3724
3725 #[simd_test(enable = "avx")]
3726 unsafe fn test_mm256_cvtps_pd() {
3727 let a = _mm_setr_ps(4., 9., 16., 25.);
3728 let r = _mm256_cvtps_pd(a);
3729 let e = _mm256_setr_pd(4., 9., 16., 25.);
3730 assert_eq_m256d(r, e);
3731 }
3732
3733 #[simd_test(enable = "avx")]
3734 unsafe fn test_mm256_cvtsd_f64() {
3735 let a = _mm256_setr_pd(1., 2., 3., 4.);
3736 let r = _mm256_cvtsd_f64(a);
3737 assert_eq!(r, 1.);
3738 }
3739
3740 #[simd_test(enable = "avx")]
3741 unsafe fn test_mm256_cvttpd_epi32() {
3742 let a = _mm256_setr_pd(4., 9., 16., 25.);
3743 let r = _mm256_cvttpd_epi32(a);
3744 let e = _mm_setr_epi32(4, 9, 16, 25);
3745 assert_eq_m128i(r, e);
3746 }
3747
3748 #[simd_test(enable = "avx")]
3749 unsafe fn test_mm256_cvtpd_epi32() {
3750 let a = _mm256_setr_pd(4., 9., 16., 25.);
3751 let r = _mm256_cvtpd_epi32(a);
3752 let e = _mm_setr_epi32(4, 9, 16, 25);
3753 assert_eq_m128i(r, e);
3754 }
3755
3756 #[simd_test(enable = "avx")]
3757 unsafe fn test_mm256_cvttps_epi32() {
3758 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3759 let r = _mm256_cvttps_epi32(a);
3760 let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3761 assert_eq_m256i(r, e);
3762 }
3763
3764 #[simd_test(enable = "avx")]
3765 unsafe fn test_mm256_extractf128_ps() {
3766 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3767 let r = _mm256_extractf128_ps::<0>(a);
3768 let e = _mm_setr_ps(4., 3., 2., 5.);
3769 assert_eq_m128(r, e);
3770 }
3771
3772 #[simd_test(enable = "avx")]
3773 unsafe fn test_mm256_extractf128_pd() {
3774 let a = _mm256_setr_pd(4., 3., 2., 5.);
3775 let r = _mm256_extractf128_pd::<0>(a);
3776 let e = _mm_setr_pd(4., 3.);
3777 assert_eq_m128d(r, e);
3778 }
3779
3780 #[simd_test(enable = "avx")]
3781 unsafe fn test_mm256_extractf128_si256() {
3782 let a = _mm256_setr_epi64x(4, 3, 2, 5);
3783 let r = _mm256_extractf128_si256::<0>(a);
3784 let e = _mm_setr_epi64x(4, 3);
3785 assert_eq_m128i(r, e);
3786 }
3787
3788 #[simd_test(enable = "avx")]
3789 unsafe fn test_mm256_extract_epi32() {
3790 let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
3791 let r1 = _mm256_extract_epi32::<0>(a);
3792 let r2 = _mm256_extract_epi32::<3>(a);
3793 assert_eq!(r1, -1);
3794 assert_eq!(r2, 3);
3795 }
3796
3797 #[simd_test(enable = "avx")]
3798 unsafe fn test_mm256_cvtsi256_si32() {
3799 let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3800 let r = _mm256_cvtsi256_si32(a);
3801 assert_eq!(r, 1);
3802 }
3803
3804 #[simd_test(enable = "avx")]
3805 #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
3806 unsafe fn test_mm256_zeroall() {
3807 _mm256_zeroall();
3808 }
3809
3810 #[simd_test(enable = "avx")]
3811 #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
3812 unsafe fn test_mm256_zeroupper() {
3813 _mm256_zeroupper();
3814 }
3815
3816 #[simd_test(enable = "avx")]
3817 unsafe fn test_mm256_permutevar_ps() {
3818 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3819 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3820 let r = _mm256_permutevar_ps(a, b);
3821 let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
3822 assert_eq_m256(r, e);
3823 }
3824
3825 #[simd_test(enable = "avx")]
3826 unsafe fn test_mm_permutevar_ps() {
3827 let a = _mm_setr_ps(4., 3., 2., 5.);
3828 let b = _mm_setr_epi32(1, 2, 3, 4);
3829 let r = _mm_permutevar_ps(a, b);
3830 let e = _mm_setr_ps(3., 2., 5., 4.);
3831 assert_eq_m128(r, e);
3832 }
3833
3834 #[simd_test(enable = "avx")]
3835 unsafe fn test_mm256_permute_ps() {
3836 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3837 let r = _mm256_permute_ps::<0x1b>(a);
3838 let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
3839 assert_eq_m256(r, e);
3840 }
3841
3842 #[simd_test(enable = "avx")]
3843 unsafe fn test_mm_permute_ps() {
3844 let a = _mm_setr_ps(4., 3., 2., 5.);
3845 let r = _mm_permute_ps::<0x1b>(a);
3846 let e = _mm_setr_ps(5., 2., 3., 4.);
3847 assert_eq_m128(r, e);
3848 }
3849
3850 #[simd_test(enable = "avx")]
3851 unsafe fn test_mm256_permutevar_pd() {
3852 let a = _mm256_setr_pd(4., 3., 2., 5.);
3853 let b = _mm256_setr_epi64x(1, 2, 3, 4);
3854 let r = _mm256_permutevar_pd(a, b);
3855 let e = _mm256_setr_pd(4., 3., 5., 2.);
3856 assert_eq_m256d(r, e);
3857 }
3858
3859 #[simd_test(enable = "avx")]
3860 unsafe fn test_mm_permutevar_pd() {
3861 let a = _mm_setr_pd(4., 3.);
3862 let b = _mm_setr_epi64x(3, 0);
3863 let r = _mm_permutevar_pd(a, b);
3864 let e = _mm_setr_pd(3., 4.);
3865 assert_eq_m128d(r, e);
3866 }
3867
3868 #[simd_test(enable = "avx")]
3869 unsafe fn test_mm256_permute_pd() {
3870 let a = _mm256_setr_pd(4., 3., 2., 5.);
3871 let r = _mm256_permute_pd::<5>(a);
3872 let e = _mm256_setr_pd(3., 4., 5., 2.);
3873 assert_eq_m256d(r, e);
3874 }
3875
3876 #[simd_test(enable = "avx")]
3877 unsafe fn test_mm_permute_pd() {
3878 let a = _mm_setr_pd(4., 3.);
3879 let r = _mm_permute_pd::<1>(a);
3880 let e = _mm_setr_pd(3., 4.);
3881 assert_eq_m128d(r, e);
3882 }
3883
3884 #[simd_test(enable = "avx")]
3885 unsafe fn test_mm256_permute2f128_ps() {
3886 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3887 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3888 let r = _mm256_permute2f128_ps::<0x13>(a, b);
3889 let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.);
3890 assert_eq_m256(r, e);
3891 }
3892
3893 #[simd_test(enable = "avx")]
3894 unsafe fn test_mm256_permute2f128_pd() {
3895 let a = _mm256_setr_pd(1., 2., 3., 4.);
3896 let b = _mm256_setr_pd(5., 6., 7., 8.);
3897 let r = _mm256_permute2f128_pd::<0x31>(a, b);
3898 let e = _mm256_setr_pd(3., 4., 7., 8.);
3899 assert_eq_m256d(r, e);
3900 }
3901
3902 #[simd_test(enable = "avx")]
3903 unsafe fn test_mm256_permute2f128_si256() {
3904 let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4);
3905 let b = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8);
3906 let r = _mm256_permute2f128_si256::<0x20>(a, b);
3907 let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3908 assert_eq_m256i(r, e);
3909 }
3910
3911 #[simd_test(enable = "avx")]
3912 unsafe fn test_mm256_broadcast_ss() {
3913 let r = _mm256_broadcast_ss(&3.);
3914 let e = _mm256_set1_ps(3.);
3915 assert_eq_m256(r, e);
3916 }
3917
3918 #[simd_test(enable = "avx")]
3919 unsafe fn test_mm_broadcast_ss() {
3920 let r = _mm_broadcast_ss(&3.);
3921 let e = _mm_set1_ps(3.);
3922 assert_eq_m128(r, e);
3923 }
3924
3925 #[simd_test(enable = "avx")]
3926 unsafe fn test_mm256_broadcast_sd() {
3927 let r = _mm256_broadcast_sd(&3.);
3928 let e = _mm256_set1_pd(3.);
3929 assert_eq_m256d(r, e);
3930 }
3931
3932 #[simd_test(enable = "avx")]
3933 unsafe fn test_mm256_broadcast_ps() {
3934 let a = _mm_setr_ps(4., 3., 2., 5.);
3935 let r = _mm256_broadcast_ps(&a);
3936 let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
3937 assert_eq_m256(r, e);
3938 }
3939
3940 #[simd_test(enable = "avx")]
3941 unsafe fn test_mm256_broadcast_pd() {
3942 let a = _mm_setr_pd(4., 3.);
3943 let r = _mm256_broadcast_pd(&a);
3944 let e = _mm256_setr_pd(4., 3., 4., 3.);
3945 assert_eq_m256d(r, e);
3946 }
3947
3948 #[simd_test(enable = "avx")]
3949 unsafe fn test_mm256_insertf128_ps() {
3950 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3951 let b = _mm_setr_ps(4., 9., 16., 25.);
3952 let r = _mm256_insertf128_ps::<0>(a, b);
3953 let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3954 assert_eq_m256(r, e);
3955 }
3956
3957 #[simd_test(enable = "avx")]
3958 unsafe fn test_mm256_insertf128_pd() {
3959 let a = _mm256_setr_pd(1., 2., 3., 4.);
3960 let b = _mm_setr_pd(5., 6.);
3961 let r = _mm256_insertf128_pd::<0>(a, b);
3962 let e = _mm256_setr_pd(5., 6., 3., 4.);
3963 assert_eq_m256d(r, e);
3964 }
3965
3966 #[simd_test(enable = "avx")]
3967 unsafe fn test_mm256_insertf128_si256() {
3968 let a = _mm256_setr_epi64x(1, 2, 3, 4);
3969 let b = _mm_setr_epi64x(5, 6);
3970 let r = _mm256_insertf128_si256::<0>(a, b);
3971 let e = _mm256_setr_epi64x(5, 6, 3, 4);
3972 assert_eq_m256i(r, e);
3973 }
3974
3975 #[simd_test(enable = "avx")]
3976 unsafe fn test_mm256_insert_epi8() {
3977 #[rustfmt::skip]
3978 let a = _mm256_setr_epi8(
3979 1, 2, 3, 4, 5, 6, 7, 8,
3980 9, 10, 11, 12, 13, 14, 15, 16,
3981 17, 18, 19, 20, 21, 22, 23, 24,
3982 25, 26, 27, 28, 29, 30, 31, 32,
3983 );
3984 let r = _mm256_insert_epi8::<31>(a, 0);
3985 #[rustfmt::skip]
3986 let e = _mm256_setr_epi8(
3987 1, 2, 3, 4, 5, 6, 7, 8,
3988 9, 10, 11, 12, 13, 14, 15, 16,
3989 17, 18, 19, 20, 21, 22, 23, 24,
3990 25, 26, 27, 28, 29, 30, 31, 0,
3991 );
3992 assert_eq_m256i(r, e);
3993 }
3994
3995 #[simd_test(enable = "avx")]
3996 unsafe fn test_mm256_insert_epi16() {
3997 #[rustfmt::skip]
3998 let a = _mm256_setr_epi16(
3999 0, 1, 2, 3, 4, 5, 6, 7,
4000 8, 9, 10, 11, 12, 13, 14, 15,
4001 );
4002 let r = _mm256_insert_epi16::<15>(a, 0);
4003 #[rustfmt::skip]
4004 let e = _mm256_setr_epi16(
4005 0, 1, 2, 3, 4, 5, 6, 7,
4006 8, 9, 10, 11, 12, 13, 14, 0,
4007 );
4008 assert_eq_m256i(r, e);
4009 }
4010
4011 #[simd_test(enable = "avx")]
4012 unsafe fn test_mm256_insert_epi32() {
4013 let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4014 let r = _mm256_insert_epi32::<7>(a, 0);
4015 let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
4016 assert_eq_m256i(r, e);
4017 }
4018
4019 #[simd_test(enable = "avx")]
4020 unsafe fn test_mm256_load_pd() {
4021 let a = _mm256_setr_pd(1., 2., 3., 4.);
4022 let p = ptr::addr_of!(a) as *const f64;
4023 let r = _mm256_load_pd(p);
4024 let e = _mm256_setr_pd(1., 2., 3., 4.);
4025 assert_eq_m256d(r, e);
4026 }
4027
4028 #[simd_test(enable = "avx")]
4029 unsafe fn test_mm256_store_pd() {
4030 let a = _mm256_setr_pd(1., 2., 3., 4.);
4031 let mut r = _mm256_undefined_pd();
4032 _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4033 assert_eq_m256d(r, a);
4034 }
4035
4036 #[simd_test(enable = "avx")]
4037 unsafe fn test_mm256_load_ps() {
4038 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4039 let p = ptr::addr_of!(a) as *const f32;
4040 let r = _mm256_load_ps(p);
4041 let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4042 assert_eq_m256(r, e);
4043 }
4044
4045 #[simd_test(enable = "avx")]
4046 unsafe fn test_mm256_store_ps() {
4047 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4048 let mut r = _mm256_undefined_ps();
4049 _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4050 assert_eq_m256(r, a);
4051 }
4052
4053 #[simd_test(enable = "avx")]
4054 unsafe fn test_mm256_loadu_pd() {
4055 let a = &[1.0f64, 2., 3., 4.];
4056 let p = a.as_ptr();
4057 let r = _mm256_loadu_pd(black_box(p));
4058 let e = _mm256_setr_pd(1., 2., 3., 4.);
4059 assert_eq_m256d(r, e);
4060 }
4061
4062 #[simd_test(enable = "avx")]
4063 unsafe fn test_mm256_storeu_pd() {
4064 let a = _mm256_set1_pd(9.);
4065 let mut r = _mm256_undefined_pd();
4066 _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4067 assert_eq_m256d(r, a);
4068 }
4069
4070 #[simd_test(enable = "avx")]
4071 unsafe fn test_mm256_loadu_ps() {
4072 let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
4073 let p = a.as_ptr();
4074 let r = _mm256_loadu_ps(black_box(p));
4075 let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4076 assert_eq_m256(r, e);
4077 }
4078
4079 #[simd_test(enable = "avx")]
4080 unsafe fn test_mm256_storeu_ps() {
4081 let a = _mm256_set1_ps(9.);
4082 let mut r = _mm256_undefined_ps();
4083 _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4084 assert_eq_m256(r, a);
4085 }
4086
4087 #[simd_test(enable = "avx")]
4088 unsafe fn test_mm256_load_si256() {
4089 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4090 let p = ptr::addr_of!(a);
4091 let r = _mm256_load_si256(p);
4092 let e = _mm256_setr_epi64x(1, 2, 3, 4);
4093 assert_eq_m256i(r, e);
4094 }
4095
4096 #[simd_test(enable = "avx")]
4097 unsafe fn test_mm256_store_si256() {
4098 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4099 let mut r = _mm256_undefined_si256();
4100 _mm256_store_si256(ptr::addr_of_mut!(r), a);
4101 assert_eq_m256i(r, a);
4102 }
4103
4104 #[simd_test(enable = "avx")]
4105 unsafe fn test_mm256_loadu_si256() {
4106 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4107 let p = ptr::addr_of!(a);
4108 let r = _mm256_loadu_si256(black_box(p));
4109 let e = _mm256_setr_epi64x(1, 2, 3, 4);
4110 assert_eq_m256i(r, e);
4111 }
4112
4113 #[simd_test(enable = "avx")]
4114 unsafe fn test_mm256_storeu_si256() {
4115 let a = _mm256_set1_epi8(9);
4116 let mut r = _mm256_undefined_si256();
4117 _mm256_storeu_si256(ptr::addr_of_mut!(r), a);
4118 assert_eq_m256i(r, a);
4119 }
4120
4121 #[simd_test(enable = "avx")]
4122 unsafe fn test_mm256_maskload_pd() {
4123 let a = &[1.0f64, 2., 3., 4.];
4124 let p = a.as_ptr();
4125 let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4126 let r = _mm256_maskload_pd(black_box(p), mask);
4127 let e = _mm256_setr_pd(0., 2., 0., 4.);
4128 assert_eq_m256d(r, e);
4129 }
4130
4131 #[simd_test(enable = "avx")]
4132 unsafe fn test_mm256_maskstore_pd() {
4133 let mut r = _mm256_set1_pd(0.);
4134 let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4135 let a = _mm256_setr_pd(1., 2., 3., 4.);
4136 _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4137 let e = _mm256_setr_pd(0., 2., 0., 4.);
4138 assert_eq_m256d(r, e);
4139 }
4140
4141 #[simd_test(enable = "avx")]
4142 unsafe fn test_mm_maskload_pd() {
4143 let a = &[1.0f64, 2.];
4144 let p = a.as_ptr();
4145 let mask = _mm_setr_epi64x(0, !0);
4146 let r = _mm_maskload_pd(black_box(p), mask);
4147 let e = _mm_setr_pd(0., 2.);
4148 assert_eq_m128d(r, e);
4149 }
4150
4151 #[simd_test(enable = "avx")]
4152 unsafe fn test_mm_maskstore_pd() {
4153 let mut r = _mm_set1_pd(0.);
4154 let mask = _mm_setr_epi64x(0, !0);
4155 let a = _mm_setr_pd(1., 2.);
4156 _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4157 let e = _mm_setr_pd(0., 2.);
4158 assert_eq_m128d(r, e);
4159 }
4160
4161 #[simd_test(enable = "avx")]
4162 unsafe fn test_mm256_maskload_ps() {
4163 let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
4164 let p = a.as_ptr();
4165 let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4166 let r = _mm256_maskload_ps(black_box(p), mask);
4167 let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4168 assert_eq_m256(r, e);
4169 }
4170
4171 #[simd_test(enable = "avx")]
4172 unsafe fn test_mm256_maskstore_ps() {
4173 let mut r = _mm256_set1_ps(0.);
4174 let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4175 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4176 _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4177 let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4178 assert_eq_m256(r, e);
4179 }
4180
4181 #[simd_test(enable = "avx")]
4182 unsafe fn test_mm_maskload_ps() {
4183 let a = &[1.0f32, 2., 3., 4.];
4184 let p = a.as_ptr();
4185 let mask = _mm_setr_epi32(0, !0, 0, !0);
4186 let r = _mm_maskload_ps(black_box(p), mask);
4187 let e = _mm_setr_ps(0., 2., 0., 4.);
4188 assert_eq_m128(r, e);
4189 }
4190
4191 #[simd_test(enable = "avx")]
4192 unsafe fn test_mm_maskstore_ps() {
4193 let mut r = _mm_set1_ps(0.);
4194 let mask = _mm_setr_epi32(0, !0, 0, !0);
4195 let a = _mm_setr_ps(1., 2., 3., 4.);
4196 _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4197 let e = _mm_setr_ps(0., 2., 0., 4.);
4198 assert_eq_m128(r, e);
4199 }
4200
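    // `movehdup` duplicates the odd-indexed single-precision elements, `moveldup` the
    // even-indexed ones, and `movedup_pd` the even-indexed doubles, which is why the
    // expected vectors repeat every other input element.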
4201 #[simd_test(enable = "avx")]
4202 unsafe fn test_mm256_movehdup_ps() {
4203 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4204 let r = _mm256_movehdup_ps(a);
4205 let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
4206 assert_eq_m256(r, e);
4207 }
4208
4209 #[simd_test(enable = "avx")]
4210 unsafe fn test_mm256_moveldup_ps() {
4211 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4212 let r = _mm256_moveldup_ps(a);
4213 let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
4214 assert_eq_m256(r, e);
4215 }
4216
4217 #[simd_test(enable = "avx")]
4218 unsafe fn test_mm256_movedup_pd() {
4219 let a = _mm256_setr_pd(1., 2., 3., 4.);
4220 let r = _mm256_movedup_pd(a);
4221 let e = _mm256_setr_pd(1., 1., 3., 3.);
4222 assert_eq_m256d(r, e);
4223 }
4224
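    // `lddqu_si256` is an unaligned 256-bit integer load; it returns the same data as
    // `loadu_si256` and only differs in how the hardware may fetch loads that split
    // cache lines.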
4225 #[simd_test(enable = "avx")]
4226 unsafe fn test_mm256_lddqu_si256() {
4227 #[rustfmt::skip]
4228 let a = _mm256_setr_epi8(
4229 1, 2, 3, 4, 5, 6, 7, 8,
4230 9, 10, 11, 12, 13, 14, 15, 16,
4231 17, 18, 19, 20, 21, 22, 23, 24,
4232 25, 26, 27, 28, 29, 30, 31, 32,
4233 );
4234 let p = ptr::addr_of!(a);
4235 let r = _mm256_lddqu_si256(black_box(p));
4236 #[rustfmt::skip]
4237 let e = _mm256_setr_epi8(
4238 1, 2, 3, 4, 5, 6, 7, 8,
4239 9, 10, 11, 12, 13, 14, 15, 16,
4240 17, 18, 19, 20, 21, 22, 23, 24,
4241 25, 26, 27, 28, 29, 30, 31, 32,
4242 );
4243 assert_eq_m256i(r, e);
4244 }
4245
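    // The `stream` intrinsics are non-temporal stores: they bypass the cache and require
    // a 32-byte aligned destination, hence the `#[repr(align(32))]` wrappers below.
    // Miri does not model non-temporal stores, so these tests are ignored under Miri.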
4246 #[simd_test(enable = "avx")]
4247 #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4248 unsafe fn test_mm256_stream_si256() {
4249 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4250 let mut r = _mm256_undefined_si256();
4251 _mm256_stream_si256(ptr::addr_of_mut!(r), a);
4252 assert_eq_m256i(r, a);
4253 }
4254
4255 #[simd_test(enable = "avx")]
4256 #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4257 unsafe fn test_mm256_stream_pd() {
4258 #[repr(align(32))]
4259 struct Memory {
4260 pub data: [f64; 4],
4261 }
4262 let a = _mm256_set1_pd(7.0);
4263 let mut mem = Memory { data: [-1.0; 4] };
4264
4265 _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4266 for i in 0..4 {
4267 assert_eq!(mem.data[i], get_m256d(a, i));
4268 }
4269 }
4270
4271 #[simd_test(enable = "avx")]
4272 #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4273 unsafe fn test_mm256_stream_ps() {
4274 #[repr(align(32))]
4275 struct Memory {
4276 pub data: [f32; 8],
4277 }
4278 let a = _mm256_set1_ps(7.0);
4279 let mut mem = Memory { data: [-1.0; 8] };
4280
4281 _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
4282 for i in 0..8 {
4283 assert_eq!(mem.data[i], get_m256(a, i));
4284 }
4285 }
4286
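    // `rcp_ps` and `rsqrt_ps` are hardware approximations (maximum relative error of
    // 1.5 * 2^-12 per Intel's documentation), so the results are compared with
    // `assert_approx_eq!` against a tolerance rather than for exact equality.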
4287 #[simd_test(enable = "avx")]
4288 unsafe fn test_mm256_rcp_ps() {
4289 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4290 let r = _mm256_rcp_ps(a);
4291 #[rustfmt::skip]
4292 let e = _mm256_setr_ps(
4293 0.99975586, 0.49987793, 0.33325195, 0.24993896,
4294 0.19995117, 0.16662598, 0.14282227, 0.12496948,
4295 );
4296 let rel_err = 0.00048828125;
4297 for i in 0..8 {
4298 assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4299 }
4300 }
4301
4302 #[simd_test(enable = "avx")]
4303 unsafe fn test_mm256_rsqrt_ps() {
4304 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4305 let r = _mm256_rsqrt_ps(a);
4306 #[rustfmt::skip]
4307 let e = _mm256_setr_ps(
4308 0.99975586, 0.7069092, 0.5772705, 0.49987793,
4309 0.44714355, 0.40820313, 0.3779297, 0.3534546,
4310 );
4311 let rel_err = 0.00048828125;
4312 for i in 0..8 {
4313 assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4314 }
4315 }
4316
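    // The 256-bit unpack instructions interleave high (or low) elements within each
    // 128-bit lane independently, not across the full vector, which is why
    // `unpackhi_pd` yields (a1, b1, a3, b3) rather than (a2, b2, a3, b3).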
4317 #[simd_test(enable = "avx")]
4318 unsafe fn test_mm256_unpackhi_pd() {
4319 let a = _mm256_setr_pd(1., 2., 3., 4.);
4320 let b = _mm256_setr_pd(5., 6., 7., 8.);
4321 let r = _mm256_unpackhi_pd(a, b);
4322 let e = _mm256_setr_pd(2., 6., 4., 8.);
4323 assert_eq_m256d(r, e);
4324 }
4325
4326 #[simd_test(enable = "avx")]
4327 unsafe fn test_mm256_unpackhi_ps() {
4328 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4329 let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4330 let r = _mm256_unpackhi_ps(a, b);
4331 let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
4332 assert_eq_m256(r, e);
4333 }
4334
4335 #[simd_test(enable = "avx")]
4336 unsafe fn test_mm256_unpacklo_pd() {
4337 let a = _mm256_setr_pd(1., 2., 3., 4.);
4338 let b = _mm256_setr_pd(5., 6., 7., 8.);
4339 let r = _mm256_unpacklo_pd(a, b);
4340 let e = _mm256_setr_pd(1., 5., 3., 7.);
4341 assert_eq_m256d(r, e);
4342 }
4343
4344 #[simd_test(enable = "avx")]
4345 unsafe fn test_mm256_unpacklo_ps() {
4346 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4347 let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4348 let r = _mm256_unpacklo_ps(a, b);
4349 let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
4350 assert_eq_m256(r, e);
4351 }
4352
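    // The test family maps to `vptest`/`vtestpd`/`vtestps`: `testz` returns 1 when
    // `a AND b` is all zeros (ZF), `testc` returns 1 when `NOT a AND b` is all zeros
    // (CF), and `testnzc` returns 1 only when both of those flags would be 0. The
    // `_pd`/`_ps` variants consider only the sign bits of the elements.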
4353 #[simd_test(enable = "avx")]
4354 unsafe fn test_mm256_testz_si256() {
4355 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4356 let b = _mm256_setr_epi64x(5, 6, 7, 8);
4357 let r = _mm256_testz_si256(a, b);
4358 assert_eq!(r, 0);
4359 let b = _mm256_set1_epi64x(0);
4360 let r = _mm256_testz_si256(a, b);
4361 assert_eq!(r, 1);
4362 }
4363
4364 #[simd_test(enable = "avx")]
4365 unsafe fn test_mm256_testc_si256() {
4366 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4367 let b = _mm256_setr_epi64x(5, 6, 7, 8);
4368 let r = _mm256_testc_si256(a, b);
4369 assert_eq!(r, 0);
4370 let b = _mm256_set1_epi64x(0);
4371 let r = _mm256_testc_si256(a, b);
4372 assert_eq!(r, 1);
4373 }
4374
4375 #[simd_test(enable = "avx")]
4376 unsafe fn test_mm256_testnzc_si256() {
4377 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4378 let b = _mm256_setr_epi64x(5, 6, 7, 8);
4379 let r = _mm256_testnzc_si256(a, b);
4380 assert_eq!(r, 1);
4381 let a = _mm256_setr_epi64x(0, 0, 0, 0);
4382 let b = _mm256_setr_epi64x(0, 0, 0, 0);
4383 let r = _mm256_testnzc_si256(a, b);
4384 assert_eq!(r, 0);
4385 }
4386
4387 #[simd_test(enable = "avx")]
4388 unsafe fn test_mm256_testz_pd() {
4389 let a = _mm256_setr_pd(1., 2., 3., 4.);
4390 let b = _mm256_setr_pd(5., 6., 7., 8.);
4391 let r = _mm256_testz_pd(a, b);
4392 assert_eq!(r, 1);
4393 let a = _mm256_set1_pd(-1.);
4394 let r = _mm256_testz_pd(a, a);
4395 assert_eq!(r, 0);
4396 }
4397
4398 #[simd_test(enable = "avx")]
4399 unsafe fn test_mm256_testc_pd() {
4400 let a = _mm256_setr_pd(1., 2., 3., 4.);
4401 let b = _mm256_setr_pd(5., 6., 7., 8.);
4402 let r = _mm256_testc_pd(a, b);
4403 assert_eq!(r, 1);
4404 let a = _mm256_set1_pd(1.);
4405 let b = _mm256_set1_pd(-1.);
4406 let r = _mm256_testc_pd(a, b);
4407 assert_eq!(r, 0);
4408 }
4409
4410 #[simd_test(enable = "avx")]
4411 unsafe fn test_mm256_testnzc_pd() {
4412 let a = _mm256_setr_pd(1., 2., 3., 4.);
4413 let b = _mm256_setr_pd(5., 6., 7., 8.);
4414 let r = _mm256_testnzc_pd(a, b);
4415 assert_eq!(r, 0);
4416 let a = _mm256_setr_pd(1., -1., -1., -1.);
4417 let b = _mm256_setr_pd(-1., -1., 1., 1.);
4418 let r = _mm256_testnzc_pd(a, b);
4419 assert_eq!(r, 1);
4420 }
4421
4422 #[simd_test(enable = "avx")]
4423 unsafe fn test_mm_testz_pd() {
4424 let a = _mm_setr_pd(1., 2.);
4425 let b = _mm_setr_pd(5., 6.);
4426 let r = _mm_testz_pd(a, b);
4427 assert_eq!(r, 1);
4428 let a = _mm_set1_pd(-1.);
4429 let r = _mm_testz_pd(a, a);
4430 assert_eq!(r, 0);
4431 }
4432
4433 #[simd_test(enable = "avx")]
4434 unsafe fn test_mm_testc_pd() {
4435 let a = _mm_setr_pd(1., 2.);
4436 let b = _mm_setr_pd(5., 6.);
4437 let r = _mm_testc_pd(a, b);
4438 assert_eq!(r, 1);
4439 let a = _mm_set1_pd(1.);
4440 let b = _mm_set1_pd(-1.);
4441 let r = _mm_testc_pd(a, b);
4442 assert_eq!(r, 0);
4443 }
4444
4445 #[simd_test(enable = "avx")]
4446 unsafe fn test_mm_testnzc_pd() {
4447 let a = _mm_setr_pd(1., 2.);
4448 let b = _mm_setr_pd(5., 6.);
4449 let r = _mm_testnzc_pd(a, b);
4450 assert_eq!(r, 0);
4451 let a = _mm_setr_pd(1., -1.);
4452 let b = _mm_setr_pd(-1., -1.);
4453 let r = _mm_testnzc_pd(a, b);
4454 assert_eq!(r, 1);
4455 }
4456
4457 #[simd_test(enable = "avx")]
4458 unsafe fn test_mm256_testz_ps() {
4459 let a = _mm256_set1_ps(1.);
4460 let r = _mm256_testz_ps(a, a);
4461 assert_eq!(r, 1);
4462 let a = _mm256_set1_ps(-1.);
4463 let r = _mm256_testz_ps(a, a);
4464 assert_eq!(r, 0);
4465 }
4466
4467 #[simd_test(enable = "avx")]
4468 unsafe fn test_mm256_testc_ps() {
4469 let a = _mm256_set1_ps(1.);
4470 let r = _mm256_testc_ps(a, a);
4471 assert_eq!(r, 1);
4472 let b = _mm256_set1_ps(-1.);
4473 let r = _mm256_testc_ps(a, b);
4474 assert_eq!(r, 0);
4475 }
4476
4477 #[simd_test(enable = "avx")]
4478 unsafe fn test_mm256_testnzc_ps() {
4479 let a = _mm256_set1_ps(1.);
4480 let r = _mm256_testnzc_ps(a, a);
4481 assert_eq!(r, 0);
4482 let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
4483 let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
4484 let r = _mm256_testnzc_ps(a, b);
4485 assert_eq!(r, 1);
4486 }
4487
4488 #[simd_test(enable = "avx")]
4489 unsafe fn test_mm_testz_ps() {
4490 let a = _mm_set1_ps(1.);
4491 let r = _mm_testz_ps(a, a);
4492 assert_eq!(r, 1);
4493 let a = _mm_set1_ps(-1.);
4494 let r = _mm_testz_ps(a, a);
4495 assert_eq!(r, 0);
4496 }
4497
4498 #[simd_test(enable = "avx")]
4499 unsafe fn test_mm_testc_ps() {
4500 let a = _mm_set1_ps(1.);
4501 let r = _mm_testc_ps(a, a);
4502 assert_eq!(r, 1);
4503 let b = _mm_set1_ps(-1.);
4504 let r = _mm_testc_ps(a, b);
4505 assert_eq!(r, 0);
4506 }
4507
4508 #[simd_test(enable = "avx")]
4509 unsafe fn test_mm_testnzc_ps() {
4510 let a = _mm_set1_ps(1.);
4511 let r = _mm_testnzc_ps(a, a);
4512 assert_eq!(r, 0);
4513 let a = _mm_setr_ps(1., -1., -1., -1.);
4514 let b = _mm_setr_ps(-1., -1., 1., 1.);
4515 let r = _mm_testnzc_ps(a, b);
4516 assert_eq!(r, 1);
4517 }
4518
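    // `movemask` gathers the sign bit of every element into the low bits of the result,
    // so the alternating signs in `a` produce 0b1010 (0xA) for four doubles and
    // 0b10101010 (0xAA) for eight floats.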
4519 #[simd_test(enable = "avx")]
4520 unsafe fn test_mm256_movemask_pd() {
4521 let a = _mm256_setr_pd(1., -2., 3., -4.);
4522 let r = _mm256_movemask_pd(a);
4523 assert_eq!(r, 0xA);
4524 }
4525
4526 #[simd_test(enable = "avx")]
4527 unsafe fn test_mm256_movemask_ps() {
4528 let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
4529 let r = _mm256_movemask_ps(a);
4530 assert_eq!(r, 0xAA);
4531 }
4532
4533 #[simd_test(enable = "avx")]
4534 unsafe fn test_mm256_setzero_pd() {
4535 let r = _mm256_setzero_pd();
4536 assert_eq_m256d(r, _mm256_set1_pd(0.));
4537 }
4538
4539 #[simd_test(enable = "avx")]
4540 unsafe fn test_mm256_setzero_ps() {
4541 let r = _mm256_setzero_ps();
4542 assert_eq_m256(r, _mm256_set1_ps(0.));
4543 }
4544
4545 #[simd_test(enable = "avx")]
4546 unsafe fn test_mm256_setzero_si256() {
4547 let r = _mm256_setzero_si256();
4548 assert_eq_m256i(r, _mm256_set1_epi8(0));
4549 }
4550
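    // `_mm256_set_*` takes its arguments from the highest element down to the lowest,
    // while `_mm256_setr_*` ("reversed") takes them in memory order; the tests below
    // therefore compare each `set` result against its mirrored `setr` counterpart.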
4551 #[simd_test(enable = "avx")]
4552 unsafe fn test_mm256_set_pd() {
4553 let r = _mm256_set_pd(1., 2., 3., 4.);
4554 assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
4555 }
4556
4557 #[simd_test(enable = "avx")]
4558 unsafe fn test_mm256_set_ps() {
4559 let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4560 assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
4561 }
4562
4563 #[simd_test(enable = "avx")]
4564 unsafe fn test_mm256_set_epi8() {
4565 #[rustfmt::skip]
4566 let r = _mm256_set_epi8(
4567 1, 2, 3, 4, 5, 6, 7, 8,
4568 9, 10, 11, 12, 13, 14, 15, 16,
4569 17, 18, 19, 20, 21, 22, 23, 24,
4570 25, 26, 27, 28, 29, 30, 31, 32,
4571 );
4572 #[rustfmt::skip]
4573 let e = _mm256_setr_epi8(
4574 32, 31, 30, 29, 28, 27, 26, 25,
4575 24, 23, 22, 21, 20, 19, 18, 17,
4576 16, 15, 14, 13, 12, 11, 10, 9,
4577 8, 7, 6, 5, 4, 3, 2, 1
4578 );
4579 assert_eq_m256i(r, e);
4580 }
4581
4582 #[simd_test(enable = "avx")]
4583 unsafe fn test_mm256_set_epi16() {
4584 #[rustfmt::skip]
4585 let r = _mm256_set_epi16(
4586 1, 2, 3, 4, 5, 6, 7, 8,
4587 9, 10, 11, 12, 13, 14, 15, 16,
4588 );
4589 #[rustfmt::skip]
4590 let e = _mm256_setr_epi16(
            16, 15, 14, 13, 12, 11, 10, 9,
            8, 7, 6, 5, 4, 3, 2, 1,
4593 );
4594 assert_eq_m256i(r, e);
4595 }
4596
4597 #[simd_test(enable = "avx")]
4598 unsafe fn test_mm256_set_epi32() {
4599 let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4600 assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
4601 }
4602
4603 #[simd_test(enable = "avx")]
4604 unsafe fn test_mm256_set_epi64x() {
4605 let r = _mm256_set_epi64x(1, 2, 3, 4);
4606 assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
4607 }
4608
4609 #[simd_test(enable = "avx")]
4610 unsafe fn test_mm256_setr_pd() {
4611 let r = _mm256_setr_pd(1., 2., 3., 4.);
4612 assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
4613 }
4614
4615 #[simd_test(enable = "avx")]
4616 unsafe fn test_mm256_setr_ps() {
4617 let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4618 assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
4619 }
4620
4621 #[simd_test(enable = "avx")]
4622 unsafe fn test_mm256_setr_epi8() {
4623 #[rustfmt::skip]
4624 let r = _mm256_setr_epi8(
4625 1, 2, 3, 4, 5, 6, 7, 8,
4626 9, 10, 11, 12, 13, 14, 15, 16,
4627 17, 18, 19, 20, 21, 22, 23, 24,
4628 25, 26, 27, 28, 29, 30, 31, 32,
4629 );
4630 #[rustfmt::skip]
4631 let e = _mm256_setr_epi8(
4632 1, 2, 3, 4, 5, 6, 7, 8,
4633 9, 10, 11, 12, 13, 14, 15, 16,
4634 17, 18, 19, 20, 21, 22, 23, 24,
4635 25, 26, 27, 28, 29, 30, 31, 32
4636 );
4637
4638 assert_eq_m256i(r, e);
4639 }
4640
4641 #[simd_test(enable = "avx")]
4642 unsafe fn test_mm256_setr_epi16() {
4643 #[rustfmt::skip]
4644 let r = _mm256_setr_epi16(
4645 1, 2, 3, 4, 5, 6, 7, 8,
4646 9, 10, 11, 12, 13, 14, 15, 16,
4647 );
4648 #[rustfmt::skip]
4649 let e = _mm256_setr_epi16(
4650 1, 2, 3, 4, 5, 6, 7, 8,
4651 9, 10, 11, 12, 13, 14, 15, 16,
4652 );
4653 assert_eq_m256i(r, e);
4654 }
4655
4656 #[simd_test(enable = "avx")]
4657 unsafe fn test_mm256_setr_epi32() {
4658 let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4659 assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
4660 }
4661
4662 #[simd_test(enable = "avx")]
4663 unsafe fn test_mm256_setr_epi64x() {
4664 let r = _mm256_setr_epi64x(1, 2, 3, 4);
4665 assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
4666 }
4667
4668 #[simd_test(enable = "avx")]
4669 unsafe fn test_mm256_set1_pd() {
4670 let r = _mm256_set1_pd(1.);
4671 assert_eq_m256d(r, _mm256_set1_pd(1.));
4672 }
4673
4674 #[simd_test(enable = "avx")]
4675 unsafe fn test_mm256_set1_ps() {
4676 let r = _mm256_set1_ps(1.);
4677 assert_eq_m256(r, _mm256_set1_ps(1.));
4678 }
4679
4680 #[simd_test(enable = "avx")]
4681 unsafe fn test_mm256_set1_epi8() {
4682 let r = _mm256_set1_epi8(1);
4683 assert_eq_m256i(r, _mm256_set1_epi8(1));
4684 }
4685
4686 #[simd_test(enable = "avx")]
4687 unsafe fn test_mm256_set1_epi16() {
4688 let r = _mm256_set1_epi16(1);
4689 assert_eq_m256i(r, _mm256_set1_epi16(1));
4690 }
4691
4692 #[simd_test(enable = "avx")]
4693 unsafe fn test_mm256_set1_epi32() {
4694 let r = _mm256_set1_epi32(1);
4695 assert_eq_m256i(r, _mm256_set1_epi32(1));
4696 }
4697
4698 #[simd_test(enable = "avx")]
4699 unsafe fn test_mm256_set1_epi64x() {
4700 let r = _mm256_set1_epi64x(1);
4701 assert_eq_m256i(r, _mm256_set1_epi64x(1));
4702 }
4703
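    // The cast intrinsics are free bit reinterpretations, so the expected values are
    // simply the IEEE-754 bit patterns viewed as the other type: for example 1.0f64 is
    // 0x3FF0_0000_0000_0000, whose low 32 bits read as 0.0f32 and whose high 32 bits
    // (0x3FF0_0000) read as 1.875f32.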
4704 #[simd_test(enable = "avx")]
4705 unsafe fn test_mm256_castpd_ps() {
4706 let a = _mm256_setr_pd(1., 2., 3., 4.);
4707 let r = _mm256_castpd_ps(a);
4708 let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4709 assert_eq_m256(r, e);
4710 }
4711
4712 #[simd_test(enable = "avx")]
4713 unsafe fn test_mm256_castps_pd() {
4714 let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4715 let r = _mm256_castps_pd(a);
4716 let e = _mm256_setr_pd(1., 2., 3., 4.);
4717 assert_eq_m256d(r, e);
4718 }
4719
4720 #[simd_test(enable = "avx")]
4721 unsafe fn test_mm256_castps_si256() {
4722 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4723 let r = _mm256_castps_si256(a);
4724 #[rustfmt::skip]
4725 let e = _mm256_setr_epi8(
4726 0, 0, -128, 63, 0, 0, 0, 64,
4727 0, 0, 64, 64, 0, 0, -128, 64,
4728 0, 0, -96, 64, 0, 0, -64, 64,
4729 0, 0, -32, 64, 0, 0, 0, 65,
4730 );
4731 assert_eq_m256i(r, e);
4732 }
4733
4734 #[simd_test(enable = "avx")]
4735 unsafe fn test_mm256_castsi256_ps() {
4736 #[rustfmt::skip]
4737 let a = _mm256_setr_epi8(
4738 0, 0, -128, 63, 0, 0, 0, 64,
4739 0, 0, 64, 64, 0, 0, -128, 64,
4740 0, 0, -96, 64, 0, 0, -64, 64,
4741 0, 0, -32, 64, 0, 0, 0, 65,
4742 );
4743 let r = _mm256_castsi256_ps(a);
4744 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4745 assert_eq_m256(r, e);
4746 }
4747
4748 #[simd_test(enable = "avx")]
4749 unsafe fn test_mm256_castpd_si256() {
4750 let a = _mm256_setr_pd(1., 2., 3., 4.);
4751 let r = _mm256_castpd_si256(a);
4752 assert_eq_m256d(transmute(r), a);
4753 }
4754
4755 #[simd_test(enable = "avx")]
4756 unsafe fn test_mm256_castsi256_pd() {
4757 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4758 let r = _mm256_castsi256_pd(a);
4759 assert_eq_m256d(r, transmute(a));
4760 }
4761
4762 #[simd_test(enable = "avx")]
4763 unsafe fn test_mm256_castps256_ps128() {
4764 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4765 let r = _mm256_castps256_ps128(a);
4766 assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
4767 }
4768
4769 #[simd_test(enable = "avx")]
4770 unsafe fn test_mm256_castpd256_pd128() {
4771 let a = _mm256_setr_pd(1., 2., 3., 4.);
4772 let r = _mm256_castpd256_pd128(a);
4773 assert_eq_m128d(r, _mm_setr_pd(1., 2.));
4774 }
4775
4776 #[simd_test(enable = "avx")]
4777 unsafe fn test_mm256_castsi256_si128() {
4778 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4779 let r = _mm256_castsi256_si128(a);
4780 assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
4781 }
4782
4783 #[simd_test(enable = "avx")]
4784 unsafe fn test_mm256_castps128_ps256() {
4785 let a = _mm_setr_ps(1., 2., 3., 4.);
4786 let r = _mm256_castps128_ps256(a);
4787 assert_eq_m128(_mm256_castps256_ps128(r), a);
4788 }
4789
4790 #[simd_test(enable = "avx")]
4791 unsafe fn test_mm256_castpd128_pd256() {
4792 let a = _mm_setr_pd(1., 2.);
4793 let r = _mm256_castpd128_pd256(a);
4794 assert_eq_m128d(_mm256_castpd256_pd128(r), a);
4795 }
4796
4797 #[simd_test(enable = "avx")]
4798 unsafe fn test_mm256_castsi128_si256() {
4799 let a = _mm_setr_epi32(1, 2, 3, 4);
4800 let r = _mm256_castsi128_si256(a);
4801 assert_eq_m128i(_mm256_castsi256_si128(r), a);
4802 }
4803
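    // Unlike the 128-to-256 cast intrinsics above, which leave the upper half of the
    // result undefined, the `zext` intrinsics zero the upper 128 bits, so the full
    // 256-bit result can be compared exactly.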
4804 #[simd_test(enable = "avx")]
4805 unsafe fn test_mm256_zextps128_ps256() {
4806 let a = _mm_setr_ps(1., 2., 3., 4.);
4807 let r = _mm256_zextps128_ps256(a);
4808 let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
4809 assert_eq_m256(r, e);
4810 }
4811
4812 #[simd_test(enable = "avx")]
4813 unsafe fn test_mm256_zextsi128_si256() {
4814 let a = _mm_setr_epi64x(1, 2);
4815 let r = _mm256_zextsi128_si256(a);
4816 let e = _mm256_setr_epi64x(1, 2, 0, 0);
4817 assert_eq_m256i(r, e);
4818 }
4819
4820 #[simd_test(enable = "avx")]
4821 unsafe fn test_mm256_zextpd128_pd256() {
4822 let a = _mm_setr_pd(1., 2.);
4823 let r = _mm256_zextpd128_pd256(a);
4824 let e = _mm256_setr_pd(1., 2., 0., 0.);
4825 assert_eq_m256d(r, e);
4826 }
4827
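    // `_mm256_set_m128*(hi, lo)` places `lo` in the lower 128 bits and `hi` in the upper
    // 128 bits, whereas `_mm256_setr_m128*` takes `(lo, hi)`; both therefore produce the
    // same ascending sequence in memory order here.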
4828 #[simd_test(enable = "avx")]
4829 unsafe fn test_mm256_set_m128() {
4830 let hi = _mm_setr_ps(5., 6., 7., 8.);
4831 let lo = _mm_setr_ps(1., 2., 3., 4.);
4832 let r = _mm256_set_m128(hi, lo);
4833 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4834 assert_eq_m256(r, e);
4835 }
4836
4837 #[simd_test(enable = "avx")]
4838 unsafe fn test_mm256_set_m128d() {
4839 let hi = _mm_setr_pd(3., 4.);
4840 let lo = _mm_setr_pd(1., 2.);
4841 let r = _mm256_set_m128d(hi, lo);
4842 let e = _mm256_setr_pd(1., 2., 3., 4.);
4843 assert_eq_m256d(r, e);
4844 }
4845
4846 #[simd_test(enable = "avx")]
4847 unsafe fn test_mm256_set_m128i() {
4848 #[rustfmt::skip]
4849 let hi = _mm_setr_epi8(
4850 17, 18, 19, 20,
4851 21, 22, 23, 24,
4852 25, 26, 27, 28,
4853 29, 30, 31, 32,
4854 );
4855 #[rustfmt::skip]
4856 let lo = _mm_setr_epi8(
4857 1, 2, 3, 4,
4858 5, 6, 7, 8,
4859 9, 10, 11, 12,
4860 13, 14, 15, 16,
4861 );
4862 let r = _mm256_set_m128i(hi, lo);
4863 #[rustfmt::skip]
4864 let e = _mm256_setr_epi8(
4865 1, 2, 3, 4, 5, 6, 7, 8,
4866 9, 10, 11, 12, 13, 14, 15, 16,
4867 17, 18, 19, 20, 21, 22, 23, 24,
4868 25, 26, 27, 28, 29, 30, 31, 32,
4869 );
4870 assert_eq_m256i(r, e);
4871 }
4872
4873 #[simd_test(enable = "avx")]
4874 unsafe fn test_mm256_setr_m128() {
4875 let lo = _mm_setr_ps(1., 2., 3., 4.);
4876 let hi = _mm_setr_ps(5., 6., 7., 8.);
4877 let r = _mm256_setr_m128(lo, hi);
4878 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4879 assert_eq_m256(r, e);
4880 }
4881
4882 #[simd_test(enable = "avx")]
4883 unsafe fn test_mm256_setr_m128d() {
4884 let lo = _mm_setr_pd(1., 2.);
4885 let hi = _mm_setr_pd(3., 4.);
4886 let r = _mm256_setr_m128d(lo, hi);
4887 let e = _mm256_setr_pd(1., 2., 3., 4.);
4888 assert_eq_m256d(r, e);
4889 }
4890
4891 #[simd_test(enable = "avx")]
4892 unsafe fn test_mm256_setr_m128i() {
4893 #[rustfmt::skip]
4894 let lo = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
4899 );
4900 #[rustfmt::skip]
4901 let hi = _mm_setr_epi8(
4902 17, 18, 19, 20, 21, 22, 23, 24,
4903 25, 26, 27, 28, 29, 30, 31, 32,
4904 );
4905 let r = _mm256_setr_m128i(lo, hi);
4906 #[rustfmt::skip]
4907 let e = _mm256_setr_epi8(
4908 1, 2, 3, 4, 5, 6, 7, 8,
4909 9, 10, 11, 12, 13, 14, 15, 16,
4910 17, 18, 19, 20, 21, 22, 23, 24,
4911 25, 26, 27, 28, 29, 30, 31, 32,
4912 );
4913 assert_eq_m256i(r, e);
4914 }
4915
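    // `loadu2_m128*` builds a 256-bit value from two independent unaligned 128-bit loads
    // (the first pointer fills the upper half, the second the lower half), and
    // `storeu2_m128*` performs the matching pair of unaligned 128-bit stores.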
4916 #[simd_test(enable = "avx")]
4917 unsafe fn test_mm256_loadu2_m128() {
4918 let hi = &[5., 6., 7., 8.];
4919 let hiaddr = hi.as_ptr();
4920 let lo = &[1., 2., 3., 4.];
4921 let loaddr = lo.as_ptr();
4922 let r = _mm256_loadu2_m128(hiaddr, loaddr);
4923 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4924 assert_eq_m256(r, e);
4925 }
4926
4927 #[simd_test(enable = "avx")]
4928 unsafe fn test_mm256_loadu2_m128d() {
4929 let hi = &[3., 4.];
4930 let hiaddr = hi.as_ptr();
4931 let lo = &[1., 2.];
4932 let loaddr = lo.as_ptr();
4933 let r = _mm256_loadu2_m128d(hiaddr, loaddr);
4934 let e = _mm256_setr_pd(1., 2., 3., 4.);
4935 assert_eq_m256d(r, e);
4936 }
4937
4938 #[simd_test(enable = "avx")]
4939 unsafe fn test_mm256_loadu2_m128i() {
4940 #[rustfmt::skip]
4941 let hi = _mm_setr_epi8(
4942 17, 18, 19, 20, 21, 22, 23, 24,
4943 25, 26, 27, 28, 29, 30, 31, 32,
4944 );
4945 #[rustfmt::skip]
4946 let lo = _mm_setr_epi8(
4947 1, 2, 3, 4, 5, 6, 7, 8,
4948 9, 10, 11, 12, 13, 14, 15, 16,
4949 );
4950 let r = _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _);
4951 #[rustfmt::skip]
4952 let e = _mm256_setr_epi8(
4953 1, 2, 3, 4, 5, 6, 7, 8,
4954 9, 10, 11, 12, 13, 14, 15, 16,
4955 17, 18, 19, 20, 21, 22, 23, 24,
4956 25, 26, 27, 28, 29, 30, 31, 32,
4957 );
4958 assert_eq_m256i(r, e);
4959 }
4960
4961 #[simd_test(enable = "avx")]
4962 unsafe fn test_mm256_storeu2_m128() {
4963 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4964 let mut hi = _mm_undefined_ps();
4965 let mut lo = _mm_undefined_ps();
4966 _mm256_storeu2_m128(
4967 ptr::addr_of_mut!(hi) as *mut f32,
4968 ptr::addr_of_mut!(lo) as *mut f32,
4969 a,
4970 );
4971 assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
4972 assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
4973 }
4974
4975 #[simd_test(enable = "avx")]
4976 unsafe fn test_mm256_storeu2_m128d() {
4977 let a = _mm256_setr_pd(1., 2., 3., 4.);
4978 let mut hi = _mm_undefined_pd();
4979 let mut lo = _mm_undefined_pd();
4980 _mm256_storeu2_m128d(
4981 ptr::addr_of_mut!(hi) as *mut f64,
4982 ptr::addr_of_mut!(lo) as *mut f64,
4983 a,
4984 );
4985 assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
4986 assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
4987 }
4988
4989 #[simd_test(enable = "avx")]
4990 unsafe fn test_mm256_storeu2_m128i() {
4991 #[rustfmt::skip]
4992 let a = _mm256_setr_epi8(
4993 1, 2, 3, 4, 5, 6, 7, 8,
4994 9, 10, 11, 12, 13, 14, 15, 16,
4995 17, 18, 19, 20, 21, 22, 23, 24,
4996 25, 26, 27, 28, 29, 30, 31, 32,
4997 );
4998 let mut hi = _mm_undefined_si128();
4999 let mut lo = _mm_undefined_si128();
5000 _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a);
5001 #[rustfmt::skip]
5002 let e_hi = _mm_setr_epi8(
5003 17, 18, 19, 20, 21, 22, 23, 24,
5004 25, 26, 27, 28, 29, 30, 31, 32
5005 );
5006 #[rustfmt::skip]
5007 let e_lo = _mm_setr_epi8(
5008 1, 2, 3, 4, 5, 6, 7, 8,
5009 9, 10, 11, 12, 13, 14, 15, 16
5010 );
5011
5012 assert_eq_m128i(hi, e_hi);
5013 assert_eq_m128i(lo, e_lo);
5014 }
5015
5016 #[simd_test(enable = "avx")]
5017 unsafe fn test_mm256_cvtss_f32() {
5018 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5019 let r = _mm256_cvtss_f32(a);
5020 assert_eq!(r, 1.);
5021 }
5022}
5023