1//! Advanced Vector Extensions (AVX)
2//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
9//!
10//! [Wikipedia][wiki] provides a quick overview of the instructions available.
11//!
12//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
13//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
14//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
15
16use crate::{
17 core_arch::{simd::*, x86::*},
18 intrinsics::simd::*,
19 mem, ptr,
20};
21
22#[cfg(test)]
23use stdarch_test::assert_instr;
24
25/// Adds packed double-precision (64-bit) floating-point elements
26/// in `a` and `b`.
27///
28/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
29#[inline]
30#[target_feature(enable = "avx")]
31#[cfg_attr(test, assert_instr(vaddpd))]
32#[stable(feature = "simd_x86", since = "1.27.0")]
33pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
34 unsafe { simd_add(x:a, y:b) }
35}
36
37/// Adds packed single-precision (32-bit) floating-point elements in `a` and
38/// `b`.
39///
40/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
41#[inline]
42#[target_feature(enable = "avx")]
43#[cfg_attr(test, assert_instr(vaddps))]
44#[stable(feature = "simd_x86", since = "1.27.0")]
45pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
46 unsafe { simd_add(x:a, y:b) }
47}
48
49/// Computes the bitwise AND of a packed double-precision (64-bit)
50/// floating-point elements in `a` and `b`.
51///
52/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
53#[inline]
54#[target_feature(enable = "avx")]
55// See https://github.com/rust-lang/stdarch/issues/71
56#[cfg_attr(test, assert_instr(vandp))]
57#[stable(feature = "simd_x86", since = "1.27.0")]
58pub fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
59 unsafe {
60 let a: u64x4 = transmute(src:a);
61 let b: u64x4 = transmute(src:b);
62 transmute(src:simd_and(x:a, y:b))
63 }
64}
65
66/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
67/// elements in `a` and `b`.
68///
69/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
70#[inline]
71#[target_feature(enable = "avx")]
72#[cfg_attr(test, assert_instr(vandps))]
73#[stable(feature = "simd_x86", since = "1.27.0")]
74pub fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
75 unsafe {
76 let a: u32x8 = transmute(src:a);
77 let b: u32x8 = transmute(src:b);
78 transmute(src:simd_and(x:a, y:b))
79 }
80}
81
82/// Computes the bitwise OR packed double-precision (64-bit) floating-point
83/// elements in `a` and `b`.
84///
85/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
86#[inline]
87#[target_feature(enable = "avx")]
88// See <https://github.com/rust-lang/stdarch/issues/71>.
89#[cfg_attr(test, assert_instr(vorp))]
90#[stable(feature = "simd_x86", since = "1.27.0")]
91pub fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
92 unsafe {
93 let a: u64x4 = transmute(src:a);
94 let b: u64x4 = transmute(src:b);
95 transmute(src:simd_or(x:a, y:b))
96 }
97}
98
99/// Computes the bitwise OR packed single-precision (32-bit) floating-point
100/// elements in `a` and `b`.
101///
102/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
103#[inline]
104#[target_feature(enable = "avx")]
105#[cfg_attr(test, assert_instr(vorps))]
106#[stable(feature = "simd_x86", since = "1.27.0")]
107pub fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
108 unsafe {
109 let a: u32x8 = transmute(src:a);
110 let b: u32x8 = transmute(src:b);
111 transmute(src:simd_or(x:a, y:b))
112 }
113}
114
/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        // Shuffle indices 0..=3 select lanes of `a`, 4..=7 select lanes of `b`.
        // Bit k of MASK picks the low or high f64 of the relevant 128-bit half:
        // result = [a.low half, b.low half, a.high half, b.high half] choices.
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b1,
                ((MASK as u32 >> 1) & 0b1) + 4,
                ((MASK as u32 >> 2) & 0b1) + 2,
                ((MASK as u32 >> 3) & 0b1) + 6,
            ],
        )
    }
}
139
/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        // Shuffle indices 0..=7 select lanes of `a`, 8..=15 lanes of `b`.
        // Each 2-bit field of MASK picks one of the four f32s in a 128-bit
        // half; the same control is applied to the low half (first four
        // entries) and the high half (last four entries, offset by +4/+12).
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 8,
                ((MASK as u32 >> 6) & 0b11) + 8,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 12,
                ((MASK as u32 >> 6) & 0b11) + 12,
            ],
        )
    }
}
168
169/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
170/// elements in `a`, and then AND with `b`.
171///
172/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
173#[inline]
174#[target_feature(enable = "avx")]
175#[cfg_attr(test, assert_instr(vandnp))]
176#[stable(feature = "simd_x86", since = "1.27.0")]
177pub fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
178 unsafe {
179 let a: u64x4 = transmute(src:a);
180 let b: u64x4 = transmute(src:b);
181 transmute(src:simd_and(x:simd_xor(u64x4::splat(!(0_u64)), a), y:b))
182 }
183}
184
185/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
186/// elements in `a`
187/// and then AND with `b`.
188///
189/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
190#[inline]
191#[target_feature(enable = "avx")]
192#[cfg_attr(test, assert_instr(vandnps))]
193#[stable(feature = "simd_x86", since = "1.27.0")]
194pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
195 unsafe {
196 let a: u32x8 = transmute(src:a);
197 let b: u32x8 = transmute(src:b);
198 transmute(src:simd_and(x:simd_xor(u32x8::splat(!(0_u32)), a), y:b))
199 }
200}
201
/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    // Delegates to `vmaxpd` (declared elsewhere in this module, not visible in
    // this chunk) — presumably the raw instruction, preserving its exact
    // NaN/signed-zero semantics; confirm against the extern declarations.
    unsafe { vmaxpd(a, b) }
}
213
/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    // Delegates to `vmaxps` (extern declaration elsewhere in this module) so
    // the instruction's own NaN handling is preserved.
    unsafe { vmaxps(a, b) }
}
225
/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    // Delegates to `vminpd` (extern declaration elsewhere in this module) so
    // the instruction's own NaN handling is preserved.
    unsafe { vminpd(a, b) }
}
237
/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    // Delegates to `vminps` (extern declaration elsewhere in this module) so
    // the instruction's own NaN handling is preserved.
    unsafe { vminps(a, b) }
}
249
250/// Multiplies packed double-precision (64-bit) floating-point elements
251/// in `a` and `b`.
252///
253/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
254#[inline]
255#[target_feature(enable = "avx")]
256#[cfg_attr(test, assert_instr(vmulpd))]
257#[stable(feature = "simd_x86", since = "1.27.0")]
258pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
259 unsafe { simd_mul(x:a, y:b) }
260}
261
262/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
263/// `b`.
264///
265/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
266#[inline]
267#[target_feature(enable = "avx")]
268#[cfg_attr(test, assert_instr(vmulps))]
269#[stable(feature = "simd_x86", since = "1.27.0")]
270pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
271 unsafe { simd_mul(x:a, y:b) }
272}
273
274/// Alternatively adds and subtracts packed double-precision (64-bit)
275/// floating-point elements in `a` to/from packed elements in `b`.
276///
277/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
278#[inline]
279#[target_feature(enable = "avx")]
280#[cfg_attr(test, assert_instr(vaddsubpd))]
281#[stable(feature = "simd_x86", since = "1.27.0")]
282pub fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
283 unsafe {
284 let a: f64x4 = a.as_f64x4();
285 let b: f64x4 = b.as_f64x4();
286 let add: f64x4 = simd_add(x:a, y:b);
287 let sub: f64x4 = simd_sub(lhs:a, rhs:b);
288 simd_shuffle!(add, sub, [4, 1, 6, 3])
289 }
290}
291
292/// Alternatively adds and subtracts packed single-precision (32-bit)
293/// floating-point elements in `a` to/from packed elements in `b`.
294///
295/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
296#[inline]
297#[target_feature(enable = "avx")]
298#[cfg_attr(test, assert_instr(vaddsubps))]
299#[stable(feature = "simd_x86", since = "1.27.0")]
300pub fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
301 unsafe {
302 let a: f32x8 = a.as_f32x8();
303 let b: f32x8 = b.as_f32x8();
304 let add: f32x8 = simd_add(x:a, y:b);
305 let sub: f32x8 = simd_sub(lhs:a, rhs:b);
306 simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
307 }
308}
309
310/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
311/// from packed elements in `a`.
312///
313/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
314#[inline]
315#[target_feature(enable = "avx")]
316#[cfg_attr(test, assert_instr(vsubpd))]
317#[stable(feature = "simd_x86", since = "1.27.0")]
318pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
319 unsafe { simd_sub(lhs:a, rhs:b) }
320}
321
322/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
323/// from packed elements in `a`.
324///
325/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
326#[inline]
327#[target_feature(enable = "avx")]
328#[cfg_attr(test, assert_instr(vsubps))]
329#[stable(feature = "simd_x86", since = "1.27.0")]
330pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
331 unsafe { simd_sub(lhs:a, rhs:b) }
332}
333
334/// Computes the division of each of the 8 packed 32-bit floating-point elements
335/// in `a` by the corresponding packed elements in `b`.
336///
337/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
338#[inline]
339#[target_feature(enable = "avx")]
340#[cfg_attr(test, assert_instr(vdivps))]
341#[stable(feature = "simd_x86", since = "1.27.0")]
342pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
343 unsafe { simd_div(lhs:a, rhs:b) }
344}
345
346/// Computes the division of each of the 4 packed 64-bit floating-point elements
347/// in `a` by the corresponding packed elements in `b`.
348///
349/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
350#[inline]
351#[target_feature(enable = "avx")]
352#[cfg_attr(test, assert_instr(vdivpd))]
353#[stable(feature = "simd_x86", since = "1.27.0")]
354pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
355 unsafe { simd_div(lhs:a, rhs:b) }
356}
357
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    // The immediate is validated to 4 bits, then forwarded to `roundpd256`
    // (declared elsewhere in this module).
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundpd256(a, ROUNDING) }
}
380
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    // Lane-wise ceil via the portable SIMD intrinsic; lowers to `vroundpd`.
    unsafe { simd_ceil(a) }
}
392
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_floor_pd(a: __m256d) -> __m256d {
    // Lane-wise floor via the portable SIMD intrinsic; lowers to `vroundpd`.
    unsafe { simd_floor(a) }
}
404
/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    // The immediate is validated to 4 bits, then forwarded to `roundps256`
    // (declared elsewhere in this module).
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundps256(a, ROUNDING) }
}
427
/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_ceil_ps(a: __m256) -> __m256 {
    // Lane-wise ceil via the portable SIMD intrinsic; lowers to `vroundps`.
    unsafe { simd_ceil(a) }
}
439
/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_floor_ps(a: __m256) -> __m256 {
    // Lane-wise floor via the portable SIMD intrinsic; lowers to `vroundps`.
    unsafe { simd_floor(a) }
}
451
/// Returns the square root of packed single-precision (32-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    // Lane-wise square root via the portable SIMD intrinsic (`vsqrtps`).
    unsafe { simd_fsqrt(a) }
}
463
/// Returns the square root of packed double-precision (64-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    // Lane-wise square root via the portable SIMD intrinsic (`vsqrtpd`).
    unsafe { simd_fsqrt(a) }
}
475
/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        // Bit i of IMM4 selects result lane i: set -> lane i of `b`
        // (shuffle index 4 + i), clear -> lane i of `a` (index i).
        simd_shuffle!(
            a,
            b,
            [
                ((IMM4 as u32 >> 0) & 1) * 4 + 0,
                ((IMM4 as u32 >> 1) & 1) * 4 + 1,
                ((IMM4 as u32 >> 2) & 1) * 4 + 2,
                ((IMM4 as u32 >> 3) & 1) * 4 + 3,
            ],
        )
    }
}
503
/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Bit i of IMM8 selects result lane i: set -> lane i of `b`
        // (shuffle index 8 + i), clear -> lane i of `a` (index i).
        simd_shuffle!(
            a,
            b,
            [
                ((IMM8 as u32 >> 0) & 1) * 8 + 0,
                ((IMM8 as u32 >> 1) & 1) * 8 + 1,
                ((IMM8 as u32 >> 2) & 1) * 8 + 2,
                ((IMM8 as u32 >> 3) & 1) * 8 + 3,
                ((IMM8 as u32 >> 4) & 1) * 8 + 4,
                ((IMM8 as u32 >> 5) & 1) * 8 + 5,
                ((IMM8 as u32 >> 6) & 1) * 8 + 6,
                ((IMM8 as u32 >> 7) & 1) * 8 + 7,
            ],
        )
    }
}
532
533/// Blends packed double-precision (64-bit) floating-point elements from
534/// `a` and `b` using `c` as a mask.
535///
536/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
537#[inline]
538#[target_feature(enable = "avx")]
539#[cfg_attr(test, assert_instr(vblendvpd))]
540#[stable(feature = "simd_x86", since = "1.27.0")]
541pub fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
542 unsafe {
543 let mask: i64x4 = simd_lt(x:transmute::<_, i64x4>(c), y:i64x4::ZERO);
544 transmute(src:simd_select(mask, if_true:b.as_f64x4(), if_false:a.as_f64x4()))
545 }
546}
547
548/// Blends packed single-precision (32-bit) floating-point elements from
549/// `a` and `b` using `c` as a mask.
550///
551/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
552#[inline]
553#[target_feature(enable = "avx")]
554#[cfg_attr(test, assert_instr(vblendvps))]
555#[stable(feature = "simd_x86", since = "1.27.0")]
556pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
557 unsafe {
558 let mask: i32x8 = simd_lt(x:transmute::<_, i32x8>(c), y:i32x8::ZERO);
559 transmute(src:simd_select(mask, if_true:b.as_f32x8(), if_false:a.as_f32x8()))
560 }
561}
562
/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`,
/// sum the four products, and conditionally return the sum
/// using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    // The validated 8-bit immediate is forwarded to the `vdpps` intrinsic
    // (declared elsewhere in this module) as an i8.
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { vdpps(a, b, IMM8 as i8) }
}
578
579/// Horizontal addition of adjacent pairs in the two packed vectors
580/// of 4 64-bit floating points `a` and `b`.
581/// In the result, sums of elements from `a` are returned in even locations,
582/// while sums of elements from `b` are returned in odd locations.
583///
584/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
585#[inline]
586#[target_feature(enable = "avx")]
587#[cfg_attr(test, assert_instr(vhaddpd))]
588#[stable(feature = "simd_x86", since = "1.27.0")]
589pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
590 unsafe {
591 let even: __m256d = simd_shuffle!(a, b, [0, 4, 2, 6]);
592 let odd: __m256d = simd_shuffle!(a, b, [1, 5, 3, 7]);
593 simd_add(x:even, y:odd)
594 }
595}
596
597/// Horizontal addition of adjacent pairs in the two packed vectors
598/// of 8 32-bit floating points `a` and `b`.
599/// In the result, sums of elements from `a` are returned in locations of
600/// indices 0, 1, 4, 5; while sums of elements from `b` are locations
601/// 2, 3, 6, 7.
602///
603/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
604#[inline]
605#[target_feature(enable = "avx")]
606#[cfg_attr(test, assert_instr(vhaddps))]
607#[stable(feature = "simd_x86", since = "1.27.0")]
608pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
609 unsafe {
610 let even: __m256 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
611 let odd: __m256 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
612 simd_add(x:even, y:odd)
613 }
614}
615
616/// Horizontal subtraction of adjacent pairs in the two packed vectors
617/// of 4 64-bit floating points `a` and `b`.
618/// In the result, sums of elements from `a` are returned in even locations,
619/// while sums of elements from `b` are returned in odd locations.
620///
621/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
622#[inline]
623#[target_feature(enable = "avx")]
624#[cfg_attr(test, assert_instr(vhsubpd))]
625#[stable(feature = "simd_x86", since = "1.27.0")]
626pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
627 unsafe {
628 let even: __m256d = simd_shuffle!(a, b, [0, 4, 2, 6]);
629 let odd: __m256d = simd_shuffle!(a, b, [1, 5, 3, 7]);
630 simd_sub(lhs:even, rhs:odd)
631 }
632}
633
634/// Horizontal subtraction of adjacent pairs in the two packed vectors
635/// of 8 32-bit floating points `a` and `b`.
636/// In the result, sums of elements from `a` are returned in locations of
637/// indices 0, 1, 4, 5; while sums of elements from `b` are locations
638/// 2, 3, 6, 7.
639///
640/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
641#[inline]
642#[target_feature(enable = "avx")]
643#[cfg_attr(test, assert_instr(vhsubps))]
644#[stable(feature = "simd_x86", since = "1.27.0")]
645pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
646 unsafe {
647 let even: __m256 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
648 let odd: __m256 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
649 simd_sub(lhs:even, rhs:odd)
650 }
651}
652
653/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
654/// elements in `a` and `b`.
655///
656/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
657#[inline]
658#[target_feature(enable = "avx")]
659#[cfg_attr(test, assert_instr(vxorp))]
660#[stable(feature = "simd_x86", since = "1.27.0")]
661pub fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
662 unsafe {
663 let a: u64x4 = transmute(src:a);
664 let b: u64x4 = transmute(src:b);
665 transmute(src:simd_xor(x:a, y:b))
666 }
667}
668
669/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
670/// elements in `a` and `b`.
671///
672/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
673#[inline]
674#[target_feature(enable = "avx")]
675#[cfg_attr(test, assert_instr(vxorps))]
676#[stable(feature = "simd_x86", since = "1.27.0")]
677pub fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
678 unsafe {
679 let a: u32x8 = transmute(src:a);
680 let b: u32x8 = transmute(src:b);
681 transmute(src:simd_xor(x:a, y:b))
682 }
683}
684
// Comparison predicate constants for the `_mm_cmp_*` / `_mm256_cmp_*`
// intrinsics below. Each is a 5-bit immediate (0x00..=0x1f); per the names,
// "ordered"/"unordered" describes the outcome when an operand is NaN and
// "signaling"/"non-signaling" whether quiet NaNs raise an exception —
// semantics per Intel's documentation.

/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
/// Not-greater-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_UQ: i32 = 0x1a;
/// False (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OS: i32 = 0x1b;
/// Not-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OS: i32 = 0x1c;
/// Greater-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OQ: i32 = 0x1d;
/// Greater-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OQ: i32 = 0x1e;
/// True (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_US: i32 = 0x1f;
781
782/// Compares packed double-precision (64-bit) floating-point
783/// elements in `a` and `b` based on the comparison operand
784/// specified by `IMM5`.
785///
786/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
787#[inline]
788#[target_feature(enable = "avx")]
789#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
790#[rustc_legacy_const_generics(2)]
791#[stable(feature = "simd_x86", since = "1.27.0")]
792pub fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
793 static_assert_uimm_bits!(IMM5, 5);
794 unsafe { vcmppd(a, b, imm8:const { IMM5 as i8 }) }
795}
796
797/// Compares packed double-precision (64-bit) floating-point
798/// elements in `a` and `b` based on the comparison operand
799/// specified by `IMM5`.
800///
801/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
802#[inline]
803#[target_feature(enable = "avx")]
804#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
805#[rustc_legacy_const_generics(2)]
806#[stable(feature = "simd_x86", since = "1.27.0")]
807pub fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
808 static_assert_uimm_bits!(IMM5, 5);
809 unsafe { vcmppd256(a, b, IMM5 as u8) }
810}
811
812/// Compares packed single-precision (32-bit) floating-point
813/// elements in `a` and `b` based on the comparison operand
814/// specified by `IMM5`.
815///
816/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
817#[inline]
818#[target_feature(enable = "avx")]
819#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
820#[rustc_legacy_const_generics(2)]
821#[stable(feature = "simd_x86", since = "1.27.0")]
822pub fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
823 static_assert_uimm_bits!(IMM5, 5);
824 unsafe { vcmpps(a, b, imm8:const { IMM5 as i8 }) }
825}
826
827/// Compares packed single-precision (32-bit) floating-point
828/// elements in `a` and `b` based on the comparison operand
829/// specified by `IMM5`.
830///
831/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
832#[inline]
833#[target_feature(enable = "avx")]
834#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
835#[rustc_legacy_const_generics(2)]
836#[stable(feature = "simd_x86", since = "1.27.0")]
837pub fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
838 static_assert_uimm_bits!(IMM5, 5);
839 unsafe { vcmpps256(a, b, imm8:const { IMM5 as u8 }) }
840}
841
842/// Compares the lower double-precision (64-bit) floating-point element in
843/// `a` and `b` based on the comparison operand specified by `IMM5`,
844/// store the result in the lower element of returned vector,
845/// and copies the upper element from `a` to the upper element of returned
846/// vector.
847///
848/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
849#[inline]
850#[target_feature(enable = "avx")]
851#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
852#[rustc_legacy_const_generics(2)]
853#[stable(feature = "simd_x86", since = "1.27.0")]
854pub fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
855 static_assert_uimm_bits!(IMM5, 5);
856 unsafe { vcmpsd(a, b, IMM5 as i8) }
857}
858
859/// Compares the lower single-precision (32-bit) floating-point element in
860/// `a` and `b` based on the comparison operand specified by `IMM5`,
861/// store the result in the lower element of returned vector,
862/// and copies the upper 3 packed elements from `a` to the upper elements of
863/// returned vector.
864///
865/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
866#[inline]
867#[target_feature(enable = "avx")]
868#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
869#[rustc_legacy_const_generics(2)]
870#[stable(feature = "simd_x86", since = "1.27.0")]
871pub fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
872 static_assert_uimm_bits!(IMM5, 5);
873 unsafe { vcmpss(a, b, IMM5 as i8) }
874}
875
876/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
877/// floating-point elements.
878///
879/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
880#[inline]
881#[target_feature(enable = "avx")]
882#[cfg_attr(test, assert_instr(vcvtdq2pd))]
883#[stable(feature = "simd_x86", since = "1.27.0")]
884pub fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
885 unsafe { simd_cast(a.as_i32x4()) }
886}
887
888/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
889/// floating-point elements.
890///
891/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
892#[inline]
893#[target_feature(enable = "avx")]
894#[cfg_attr(test, assert_instr(vcvtdq2ps))]
895#[stable(feature = "simd_x86", since = "1.27.0")]
896pub fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
897 unsafe { simd_cast(a.as_i32x8()) }
898}
899
900/// Converts packed double-precision (64-bit) floating-point elements in `a`
901/// to packed single-precision (32-bit) floating-point elements.
902///
903/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
904#[inline]
905#[target_feature(enable = "avx")]
906#[cfg_attr(test, assert_instr(vcvtpd2ps))]
907#[stable(feature = "simd_x86", since = "1.27.0")]
908pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
909 unsafe { simd_cast(a) }
910}
911
912/// Converts packed single-precision (32-bit) floating-point elements in `a`
913/// to packed 32-bit integers.
914///
915/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
916#[inline]
917#[target_feature(enable = "avx")]
918#[cfg_attr(test, assert_instr(vcvtps2dq))]
919#[stable(feature = "simd_x86", since = "1.27.0")]
920pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
921 unsafe { transmute(src:vcvtps2dq(a)) }
922}
923
924/// Converts packed single-precision (32-bit) floating-point elements in `a`
925/// to packed double-precision (64-bit) floating-point elements.
926///
927/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
928#[inline]
929#[target_feature(enable = "avx")]
930#[cfg_attr(test, assert_instr(vcvtps2pd))]
931#[stable(feature = "simd_x86", since = "1.27.0")]
932pub fn _mm256_cvtps_pd(a: __m128) -> __m256d {
933 unsafe { simd_cast(a) }
934}
935
936/// Returns the first element of the input vector of `[4 x double]`.
937///
938/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64)
939#[inline]
940#[target_feature(enable = "avx")]
941//#[cfg_attr(test, assert_instr(movsd))] FIXME
942#[stable(feature = "simd_x86", since = "1.27.0")]
943pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
944 unsafe { simd_extract!(a, 0) }
945}
946
947/// Converts packed double-precision (64-bit) floating-point elements in `a`
948/// to packed 32-bit integers with truncation.
949///
950/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
951#[inline]
952#[target_feature(enable = "avx")]
953#[cfg_attr(test, assert_instr(vcvttpd2dq))]
954#[stable(feature = "simd_x86", since = "1.27.0")]
955pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
956 unsafe { transmute(src:vcvttpd2dq(a)) }
957}
958
959/// Converts packed double-precision (64-bit) floating-point elements in `a`
960/// to packed 32-bit integers.
961///
962/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
963#[inline]
964#[target_feature(enable = "avx")]
965#[cfg_attr(test, assert_instr(vcvtpd2dq))]
966#[stable(feature = "simd_x86", since = "1.27.0")]
967pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
968 unsafe { transmute(src:vcvtpd2dq(a)) }
969}
970
971/// Converts packed single-precision (32-bit) floating-point elements in `a`
972/// to packed 32-bit integers with truncation.
973///
974/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
975#[inline]
976#[target_feature(enable = "avx")]
977#[cfg_attr(test, assert_instr(vcvttps2dq))]
978#[stable(feature = "simd_x86", since = "1.27.0")]
979pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
980 unsafe { transmute(src:vcvttps2dq(a)) }
981}
982
983/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
984/// floating-point elements) from `a`, selected with `imm8`.
985///
986/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
987#[inline]
988#[target_feature(enable = "avx")]
989#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
990#[rustc_legacy_const_generics(1)]
991#[stable(feature = "simd_x86", since = "1.27.0")]
992pub fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
993 static_assert_uimm_bits!(IMM1, 1);
994 unsafe {
995 simd_shuffle!(
996 a,
997 _mm256_undefined_ps(),
998 [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
999 )
1000 }
1001}
1002
1003/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
1004/// floating-point elements) from `a`, selected with `imm8`.
1005///
1006/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
1007#[inline]
1008#[target_feature(enable = "avx")]
1009#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
1010#[rustc_legacy_const_generics(1)]
1011#[stable(feature = "simd_x86", since = "1.27.0")]
1012pub fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
1013 static_assert_uimm_bits!(IMM1, 1);
1014 unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) }
1015}
1016
1017/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
1018///
1019/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
1020#[inline]
1021#[target_feature(enable = "avx")]
1022#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
1023#[rustc_legacy_const_generics(1)]
1024#[stable(feature = "simd_x86", since = "1.27.0")]
1025pub fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
1026 static_assert_uimm_bits!(IMM1, 1);
1027 unsafe {
1028 let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
1029 transmute(src:dst)
1030 }
1031}
1032
1033/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
1034///
1035/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
1036#[inline]
1037#[target_feature(enable = "avx")]
1038// This intrinsic has no corresponding instruction.
1039#[rustc_legacy_const_generics(1)]
1040#[stable(feature = "simd_x86", since = "1.27.0")]
1041pub fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
1042 static_assert_uimm_bits!(INDEX, 3);
1043 unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) }
1044}
1045
1046/// Returns the first element of the input vector of `[8 x i32]`.
1047///
1048/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
1049#[inline]
1050#[target_feature(enable = "avx")]
1051#[stable(feature = "simd_x86", since = "1.27.0")]
1052pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
1053 unsafe { simd_extract!(a.as_i32x8(), 0) }
1054}
1055
/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroall() {
    // Thin wrapper over the vzeroall intrinsic; it takes no arguments and
    // touches no memory, only vector-register state.
    unsafe { vzeroall() }
}
1066
/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128-bits of the registers are unmodified.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroupper() {
    // Thin wrapper over the vzeroupper intrinsic; it takes no arguments and
    // touches no memory, only vector-register state.
    unsafe { vzeroupper() }
}
1078
1079/// Shuffles single-precision (32-bit) floating-point elements in `a`
1080/// within 128-bit lanes using the control in `b`.
1081///
1082/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
1083#[inline]
1084#[target_feature(enable = "avx")]
1085#[cfg_attr(test, assert_instr(vpermilps))]
1086#[stable(feature = "simd_x86", since = "1.27.0")]
1087pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
1088 unsafe { vpermilps256(a, b.as_i32x8()) }
1089}
1090
1091/// Shuffles single-precision (32-bit) floating-point elements in `a`
1092/// using the control in `b`.
1093///
1094/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
1095#[inline]
1096#[target_feature(enable = "avx")]
1097#[cfg_attr(test, assert_instr(vpermilps))]
1098#[stable(feature = "simd_x86", since = "1.27.0")]
1099pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
1100 unsafe { vpermilps(a, b.as_i32x4()) }
1101}
1102
1103/// Shuffles single-precision (32-bit) floating-point elements in `a`
1104/// within 128-bit lanes using the control in `imm8`.
1105///
1106/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
1107#[inline]
1108#[target_feature(enable = "avx")]
1109#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
1110#[rustc_legacy_const_generics(1)]
1111#[stable(feature = "simd_x86", since = "1.27.0")]
1112pub fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
1113 static_assert_uimm_bits!(IMM8, 8);
1114 unsafe {
1115 simd_shuffle!(
1116 a,
1117 _mm256_undefined_ps(),
1118 [
1119 (IMM8 as u32 >> 0) & 0b11,
1120 (IMM8 as u32 >> 2) & 0b11,
1121 (IMM8 as u32 >> 4) & 0b11,
1122 (IMM8 as u32 >> 6) & 0b11,
1123 ((IMM8 as u32 >> 0) & 0b11) + 4,
1124 ((IMM8 as u32 >> 2) & 0b11) + 4,
1125 ((IMM8 as u32 >> 4) & 0b11) + 4,
1126 ((IMM8 as u32 >> 6) & 0b11) + 4,
1127 ],
1128 )
1129 }
1130}
1131
1132/// Shuffles single-precision (32-bit) floating-point elements in `a`
1133/// using the control in `imm8`.
1134///
1135/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
1136#[inline]
1137#[target_feature(enable = "avx")]
1138#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
1139#[rustc_legacy_const_generics(1)]
1140#[stable(feature = "simd_x86", since = "1.27.0")]
1141pub fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
1142 static_assert_uimm_bits!(IMM8, 8);
1143 unsafe {
1144 simd_shuffle!(
1145 a,
1146 _mm_undefined_ps(),
1147 [
1148 (IMM8 as u32 >> 0) & 0b11,
1149 (IMM8 as u32 >> 2) & 0b11,
1150 (IMM8 as u32 >> 4) & 0b11,
1151 (IMM8 as u32 >> 6) & 0b11,
1152 ],
1153 )
1154 }
1155}
1156
1157/// Shuffles double-precision (64-bit) floating-point elements in `a`
1158/// within 256-bit lanes using the control in `b`.
1159///
1160/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
1161#[inline]
1162#[target_feature(enable = "avx")]
1163#[cfg_attr(test, assert_instr(vpermilpd))]
1164#[stable(feature = "simd_x86", since = "1.27.0")]
1165pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
1166 unsafe { vpermilpd256(a, b.as_i64x4()) }
1167}
1168
1169/// Shuffles double-precision (64-bit) floating-point elements in `a`
1170/// using the control in `b`.
1171///
1172/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
1173#[inline]
1174#[target_feature(enable = "avx")]
1175#[cfg_attr(test, assert_instr(vpermilpd))]
1176#[stable(feature = "simd_x86", since = "1.27.0")]
1177pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
1178 unsafe { vpermilpd(a, b.as_i64x2()) }
1179}
1180
1181/// Shuffles double-precision (64-bit) floating-point elements in `a`
1182/// within 128-bit lanes using the control in `imm8`.
1183///
1184/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
1185#[inline]
1186#[target_feature(enable = "avx")]
1187#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
1188#[rustc_legacy_const_generics(1)]
1189#[stable(feature = "simd_x86", since = "1.27.0")]
1190pub fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
1191 static_assert_uimm_bits!(IMM4, 4);
1192 unsafe {
1193 simd_shuffle!(
1194 a,
1195 _mm256_undefined_pd(),
1196 [
1197 ((IMM4 as u32 >> 0) & 1),
1198 ((IMM4 as u32 >> 1) & 1),
1199 ((IMM4 as u32 >> 2) & 1) + 2,
1200 ((IMM4 as u32 >> 3) & 1) + 2,
1201 ],
1202 )
1203 }
1204}
1205
1206/// Shuffles double-precision (64-bit) floating-point elements in `a`
1207/// using the control in `imm8`.
1208///
1209/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
1210#[inline]
1211#[target_feature(enable = "avx")]
1212#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
1213#[rustc_legacy_const_generics(1)]
1214#[stable(feature = "simd_x86", since = "1.27.0")]
1215pub fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
1216 static_assert_uimm_bits!(IMM2, 2);
1217 unsafe {
1218 simd_shuffle!(
1219 a,
1220 _mm_undefined_pd(),
1221 [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
1222 )
1223 }
1224}
1225
1226/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
1227/// floating-point elements) selected by `imm8` from `a` and `b`.
1228///
1229/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
1230#[inline]
1231#[target_feature(enable = "avx")]
1232#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
1233#[rustc_legacy_const_generics(2)]
1234#[stable(feature = "simd_x86", since = "1.27.0")]
1235pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
1236 static_assert_uimm_bits!(IMM8, 8);
1237 _mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
1238 a:_mm256_castps_si256(a),
1239 b:_mm256_castps_si256(b),
1240 ))
1241}
1242
1243/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
1244/// floating-point elements) selected by `imm8` from `a` and `b`.
1245///
1246/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
1247#[inline]
1248#[target_feature(enable = "avx")]
1249#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
1250#[rustc_legacy_const_generics(2)]
1251#[stable(feature = "simd_x86", since = "1.27.0")]
1252pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
1253 static_assert_uimm_bits!(IMM8, 8);
1254 _mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
1255 a:_mm256_castpd_si256(a),
1256 b:_mm256_castpd_si256(b),
1257 ))
1258}
1259
/// Shuffles 128-bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Maps output i64 position `pos` (0/1 = low result half, 2/3 = high
    // result half) to a source i64 index: the low nibble of `imm8` controls
    // the low half, the high nibble the high half, and bits 1:0 of the
    // relevant nibble select one of the four 128-bit halves of `a`/`b`.
    const fn idx(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        2 * (part as u32 & 0b11) + (pos & 1)
    }
    // Second pass: if bit 3 of the relevant nibble is set, the whole result
    // half is zeroed — index 4 selects from the all-zero second operand.
    const fn idx0(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        if part & 0b1000 != 0 { 4 } else { pos }
    }
    unsafe {
        // First shuffle performs the half selection...
        let r = simd_shuffle!(
            a.as_i64x4(),
            b.as_i64x4(),
            [idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
        );
        // ...second shuffle applies the optional zeroing of either half.
        let r: i64x4 = simd_shuffle!(
            r,
            i64x4::ZERO,
            [idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
        );
        r.as_m256i()
    }
}
1301
1302/// Broadcasts a single-precision (32-bit) floating-point element from memory
1303/// to all elements of the returned vector.
1304///
1305/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
1306#[inline]
1307#[target_feature(enable = "avx")]
1308#[cfg_attr(test, assert_instr(vbroadcastss))]
1309#[stable(feature = "simd_x86", since = "1.27.0")]
1310#[allow(clippy::trivially_copy_pass_by_ref)]
1311pub fn _mm256_broadcast_ss(f: &f32) -> __m256 {
1312 _mm256_set1_ps(*f)
1313}
1314
1315/// Broadcasts a single-precision (32-bit) floating-point element from memory
1316/// to all elements of the returned vector.
1317///
1318/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
1319#[inline]
1320#[target_feature(enable = "avx")]
1321#[cfg_attr(test, assert_instr(vbroadcastss))]
1322#[stable(feature = "simd_x86", since = "1.27.0")]
1323#[allow(clippy::trivially_copy_pass_by_ref)]
1324pub fn _mm_broadcast_ss(f: &f32) -> __m128 {
1325 _mm_set1_ps(*f)
1326}
1327
1328/// Broadcasts a double-precision (64-bit) floating-point element from memory
1329/// to all elements of the returned vector.
1330///
1331/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
1332#[inline]
1333#[target_feature(enable = "avx")]
1334#[cfg_attr(test, assert_instr(vbroadcastsd))]
1335#[stable(feature = "simd_x86", since = "1.27.0")]
1336#[allow(clippy::trivially_copy_pass_by_ref)]
1337pub fn _mm256_broadcast_sd(f: &f64) -> __m256d {
1338 _mm256_set1_pd(*f)
1339}
1340
1341/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
1342/// (32-bit) floating-point elements) to all elements of the returned vector.
1343///
1344/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
1345#[inline]
1346#[target_feature(enable = "avx")]
1347#[cfg_attr(test, assert_instr(vbroadcastf128))]
1348#[stable(feature = "simd_x86", since = "1.27.0")]
1349pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
1350 unsafe { simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3]) }
1351}
1352
1353/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
1354/// (64-bit) floating-point elements) to all elements of the returned vector.
1355///
1356/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
1357#[inline]
1358#[target_feature(enable = "avx")]
1359#[cfg_attr(test, assert_instr(vbroadcastf128))]
1360#[stable(feature = "simd_x86", since = "1.27.0")]
1361pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
1362 unsafe { simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1]) }
1363}
1364
1365/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
1366/// single-precision (32-bit) floating-point elements) from `b` into result
1367/// at the location specified by `imm8`.
1368///
1369/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
1370#[inline]
1371#[target_feature(enable = "avx")]
1372#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1373#[rustc_legacy_const_generics(2)]
1374#[stable(feature = "simd_x86", since = "1.27.0")]
1375pub fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
1376 static_assert_uimm_bits!(IMM1, 1);
1377 unsafe {
1378 simd_shuffle!(
1379 a,
1380 _mm256_castps128_ps256(b),
1381 [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
1382 )
1383 }
1384}
1385
1386/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
1387/// double-precision (64-bit) floating-point elements) from `b` into result
1388/// at the location specified by `imm8`.
1389///
1390/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
1391#[inline]
1392#[target_feature(enable = "avx")]
1393#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1394#[rustc_legacy_const_generics(2)]
1395#[stable(feature = "simd_x86", since = "1.27.0")]
1396pub fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
1397 static_assert_uimm_bits!(IMM1, 1);
1398 unsafe {
1399 simd_shuffle!(
1400 a,
1401 _mm256_castpd128_pd256(b),
1402 [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1403 )
1404 }
1405}
1406
1407/// Copies `a` to result, then inserts 128 bits from `b` into result
1408/// at the location specified by `imm8`.
1409///
1410/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
1411#[inline]
1412#[target_feature(enable = "avx")]
1413#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1414#[rustc_legacy_const_generics(2)]
1415#[stable(feature = "simd_x86", since = "1.27.0")]
1416pub fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1417 static_assert_uimm_bits!(IMM1, 1);
1418 unsafe {
1419 let dst: i64x4 = simd_shuffle!(
1420 a.as_i64x4(),
1421 _mm256_castsi128_si256(b).as_i64x4(),
1422 [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1423 );
1424 transmute(src:dst)
1425 }
1426}
1427
1428/// Copies `a` to result, and inserts the 8-bit integer `i` into result
1429/// at the location specified by `index`.
1430///
1431/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
1432#[inline]
1433#[target_feature(enable = "avx")]
1434// This intrinsic has no corresponding instruction.
1435#[rustc_legacy_const_generics(2)]
1436#[stable(feature = "simd_x86", since = "1.27.0")]
1437pub fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
1438 static_assert_uimm_bits!(INDEX, 5);
1439 unsafe { transmute(src:simd_insert!(a.as_i8x32(), INDEX as u32, i)) }
1440}
1441
1442/// Copies `a` to result, and inserts the 16-bit integer `i` into result
1443/// at the location specified by `index`.
1444///
1445/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
1446#[inline]
1447#[target_feature(enable = "avx")]
1448// This intrinsic has no corresponding instruction.
1449#[rustc_legacy_const_generics(2)]
1450#[stable(feature = "simd_x86", since = "1.27.0")]
1451pub fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
1452 static_assert_uimm_bits!(INDEX, 4);
1453 unsafe { transmute(src:simd_insert!(a.as_i16x16(), INDEX as u32, i)) }
1454}
1455
1456/// Copies `a` to result, and inserts the 32-bit integer `i` into result
1457/// at the location specified by `index`.
1458///
1459/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
1460#[inline]
1461#[target_feature(enable = "avx")]
1462// This intrinsic has no corresponding instruction.
1463#[rustc_legacy_const_generics(2)]
1464#[stable(feature = "simd_x86", since = "1.27.0")]
1465pub fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
1466 static_assert_uimm_bits!(INDEX, 3);
1467 unsafe { transmute(src:simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
1468}
1469
1470/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1471/// floating-point elements) from memory into result.
1472/// `mem_addr` must be aligned on a 32-byte boundary or a
1473/// general-protection exception may be generated.
1474///
1475/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
1476#[inline]
1477#[target_feature(enable = "avx")]
1478#[cfg_attr(
1479 all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1480 assert_instr(vmovap)
1481)]
1482#[stable(feature = "simd_x86", since = "1.27.0")]
1483#[allow(clippy::cast_ptr_alignment)]
1484pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
1485 *(mem_addr as *const __m256d)
1486}
1487
1488/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1489/// floating-point elements) from `a` into memory.
1490/// `mem_addr` must be aligned on a 32-byte boundary or a
1491/// general-protection exception may be generated.
1492///
1493/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
1494#[inline]
1495#[target_feature(enable = "avx")]
1496#[cfg_attr(
1497 all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1498 assert_instr(vmovap)
1499)]
1500#[stable(feature = "simd_x86", since = "1.27.0")]
1501#[allow(clippy::cast_ptr_alignment)]
1502pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
1503 *(mem_addr as *mut __m256d) = a;
1504}
1505
1506/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1507/// floating-point elements) from memory into result.
1508/// `mem_addr` must be aligned on a 32-byte boundary or a
1509/// general-protection exception may be generated.
1510///
1511/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
1512#[inline]
1513#[target_feature(enable = "avx")]
1514#[cfg_attr(
1515 all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1516 assert_instr(vmovaps)
1517)]
1518#[stable(feature = "simd_x86", since = "1.27.0")]
1519#[allow(clippy::cast_ptr_alignment)]
1520pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
1521 *(mem_addr as *const __m256)
1522}
1523
1524/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1525/// floating-point elements) from `a` into memory.
1526/// `mem_addr` must be aligned on a 32-byte boundary or a
1527/// general-protection exception may be generated.
1528///
1529/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
1530#[inline]
1531#[target_feature(enable = "avx")]
1532#[cfg_attr(
1533 all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1534 assert_instr(vmovaps)
1535)]
1536#[stable(feature = "simd_x86", since = "1.27.0")]
1537#[allow(clippy::cast_ptr_alignment)]
1538pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
1539 *(mem_addr as *mut __m256) = a;
1540}
1541
1542/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1543/// floating-point elements) from memory into result.
1544/// `mem_addr` does not need to be aligned on any particular boundary.
1545///
1546/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
1547#[inline]
1548#[target_feature(enable = "avx")]
1549#[cfg_attr(test, assert_instr(vmovup))]
1550#[stable(feature = "simd_x86", since = "1.27.0")]
1551pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
1552 let mut dst: __m256d = _mm256_undefined_pd();
1553 ptr::copy_nonoverlapping(
1554 src:mem_addr as *const u8,
1555 dst:ptr::addr_of_mut!(dst) as *mut u8,
1556 count:mem::size_of::<__m256d>(),
1557 );
1558 dst
1559}
1560
1561/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1562/// floating-point elements) from `a` into memory.
1563/// `mem_addr` does not need to be aligned on any particular boundary.
1564///
1565/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
1566#[inline]
1567#[target_feature(enable = "avx")]
1568#[cfg_attr(test, assert_instr(vmovup))]
1569#[stable(feature = "simd_x86", since = "1.27.0")]
1570pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
1571 mem_addr.cast::<__m256d>().write_unaligned(val:a);
1572}
1573
1574/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1575/// floating-point elements) from memory into result.
1576/// `mem_addr` does not need to be aligned on any particular boundary.
1577///
1578/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
1579#[inline]
1580#[target_feature(enable = "avx")]
1581#[cfg_attr(test, assert_instr(vmovups))]
1582#[stable(feature = "simd_x86", since = "1.27.0")]
1583pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
1584 let mut dst: __m256 = _mm256_undefined_ps();
1585 ptr::copy_nonoverlapping(
1586 src:mem_addr as *const u8,
1587 dst:ptr::addr_of_mut!(dst) as *mut u8,
1588 count:mem::size_of::<__m256>(),
1589 );
1590 dst
1591}
1592
1593/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1594/// floating-point elements) from `a` into memory.
1595/// `mem_addr` does not need to be aligned on any particular boundary.
1596///
1597/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
1598#[inline]
1599#[target_feature(enable = "avx")]
1600#[cfg_attr(test, assert_instr(vmovups))]
1601#[stable(feature = "simd_x86", since = "1.27.0")]
1602pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
1603 mem_addr.cast::<__m256>().write_unaligned(val:a);
1604}
1605
/// Loads 256-bits of integer data from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
    // Plain deref: a `*const __m256i` read requires 32-byte alignment,
    // which the caller must uphold per the contract documented above.
    *mem_addr
}
1621
/// Stores 256-bits of integer data from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
    // Plain deref-assignment: a `*mut __m256i` write requires 32-byte
    // alignment, which the caller must uphold per the contract above.
    *mem_addr = a;
}
1637
1638/// Loads 256-bits of integer data from memory into result.
1639/// `mem_addr` does not need to be aligned on any particular boundary.
1640///
1641/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
1642#[inline]
1643#[target_feature(enable = "avx")]
1644#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1645#[stable(feature = "simd_x86", since = "1.27.0")]
1646pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
1647 let mut dst: __m256i = _mm256_undefined_si256();
1648 ptr::copy_nonoverlapping(
1649 src:mem_addr as *const u8,
1650 dst:ptr::addr_of_mut!(dst) as *mut u8,
1651 count:mem::size_of::<__m256i>(),
1652 );
1653 dst
1654}
1655
1656/// Stores 256-bits of integer data from `a` into memory.
1657/// `mem_addr` does not need to be aligned on any particular boundary.
1658///
1659/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
1660#[inline]
1661#[target_feature(enable = "avx")]
1662#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1663#[stable(feature = "simd_x86", since = "1.27.0")]
1664pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
1665 mem_addr.write_unaligned(val:a);
1666}
1667
1668/// Loads packed double-precision (64-bit) floating-point elements from memory
1669/// into result using `mask` (elements are zeroed out when the high bit of the
1670/// corresponding element is not set).
1671///
1672/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
1673#[inline]
1674#[target_feature(enable = "avx")]
1675#[cfg_attr(test, assert_instr(vmaskmovpd))]
1676#[stable(feature = "simd_x86", since = "1.27.0")]
1677pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
1678 let mask: i64x4 = simd_shr(lhs:mask.as_i64x4(), rhs:i64x4::splat(63));
1679 simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_pd())
1680}
1681
1682/// Stores packed double-precision (64-bit) floating-point elements from `a`
1683/// into memory using `mask`.
1684///
1685/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
1686#[inline]
1687#[target_feature(enable = "avx")]
1688#[cfg_attr(test, assert_instr(vmaskmovpd))]
1689#[stable(feature = "simd_x86", since = "1.27.0")]
1690pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
1691 let mask: i64x4 = simd_shr(lhs:mask.as_i64x4(), rhs:i64x4::splat(63));
1692 simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
1693}
1694
1695/// Loads packed double-precision (64-bit) floating-point elements from memory
1696/// into result using `mask` (elements are zeroed out when the high bit of the
1697/// corresponding element is not set).
1698///
1699/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
1700#[inline]
1701#[target_feature(enable = "avx")]
1702#[cfg_attr(test, assert_instr(vmaskmovpd))]
1703#[stable(feature = "simd_x86", since = "1.27.0")]
1704pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
1705 let mask: i64x2 = simd_shr(lhs:mask.as_i64x2(), rhs:i64x2::splat(63));
1706 simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_pd())
1707}
1708
1709/// Stores packed double-precision (64-bit) floating-point elements from `a`
1710/// into memory using `mask`.
1711///
1712/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
1713#[inline]
1714#[target_feature(enable = "avx")]
1715#[cfg_attr(test, assert_instr(vmaskmovpd))]
1716#[stable(feature = "simd_x86", since = "1.27.0")]
1717pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
1718 let mask: i64x2 = simd_shr(lhs:mask.as_i64x2(), rhs:i64x2::splat(63));
1719 simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
1720}
1721
1722/// Loads packed single-precision (32-bit) floating-point elements from memory
1723/// into result using `mask` (elements are zeroed out when the high bit of the
1724/// corresponding element is not set).
1725///
1726/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
1727#[inline]
1728#[target_feature(enable = "avx")]
1729#[cfg_attr(test, assert_instr(vmaskmovps))]
1730#[stable(feature = "simd_x86", since = "1.27.0")]
1731pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
1732 let mask: i32x8 = simd_shr(lhs:mask.as_i32x8(), rhs:i32x8::splat(31));
1733 simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_ps())
1734}
1735
1736/// Stores packed single-precision (32-bit) floating-point elements from `a`
1737/// into memory using `mask`.
1738///
1739/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
1740#[inline]
1741#[target_feature(enable = "avx")]
1742#[cfg_attr(test, assert_instr(vmaskmovps))]
1743#[stable(feature = "simd_x86", since = "1.27.0")]
1744pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
1745 let mask: i32x8 = simd_shr(lhs:mask.as_i32x8(), rhs:i32x8::splat(31));
1746 simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
1747}
1748
1749/// Loads packed single-precision (32-bit) floating-point elements from memory
1750/// into result using `mask` (elements are zeroed out when the high bit of the
1751/// corresponding element is not set).
1752///
1753/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
1754#[inline]
1755#[target_feature(enable = "avx")]
1756#[cfg_attr(test, assert_instr(vmaskmovps))]
1757#[stable(feature = "simd_x86", since = "1.27.0")]
1758pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
1759 let mask: i32x4 = simd_shr(lhs:mask.as_i32x4(), rhs:i32x4::splat(31));
1760 simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_ps())
1761}
1762
1763/// Stores packed single-precision (32-bit) floating-point elements from `a`
1764/// into memory using `mask`.
1765///
1766/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
1767#[inline]
1768#[target_feature(enable = "avx")]
1769#[cfg_attr(test, assert_instr(vmaskmovps))]
1770#[stable(feature = "simd_x86", since = "1.27.0")]
1771pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
1772 let mask: i32x4 = simd_shr(lhs:mask.as_i32x4(), rhs:i32x4::splat(31));
1773 simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
1774}
1775
/// Duplicate odd-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_movehdup_ps(a: __m256) -> __m256 {
    // Each odd element is written to both itself and the even slot below it:
    // result = [a1, a1, a3, a3, a5, a5, a7, a7].
    unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) }
}
1787
/// Duplicate even-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_moveldup_ps(a: __m256) -> __m256 {
    // Each even element is written to both itself and the odd slot above it:
    // result = [a0, a0, a2, a2, a4, a4, a6, a6].
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) }
}
1799
/// Duplicate even-indexed double-precision (64-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_movedup_pd(a: __m256d) -> __m256d {
    // Each even element is duplicated into the odd slot above it:
    // result = [a0, a0, a2, a2].
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
}
1811
1812/// Loads 256-bits of integer data from unaligned memory into result.
1813/// This intrinsic may perform better than `_mm256_loadu_si256` when the
1814/// data crosses a cache line boundary.
1815///
1816/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
1817#[inline]
1818#[target_feature(enable = "avx")]
1819#[cfg_attr(test, assert_instr(vlddqu))]
1820#[stable(feature = "simd_x86", since = "1.27.0")]
1821pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
1822 transmute(src:vlddqu(mem_addr as *const i8))
1823}
1824
/// Moves integer data from a 256-bit integer vector to a 32-byte
/// aligned memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // NOTE(review): `vps!` presumably renders the `{p}` memory operand with
    // the pointer syntax for the current target width — confirm against the
    // macro's definition elsewhere in this crate.
    crate::arch::asm!(
        vps!("vmovntdq", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1852
/// Moves double-precision values from a 256-bit vector of `[4 x double]`
/// to a 32-byte aligned memory location. To minimize caching, the data is
/// flagged as non-temporal (unlikely to be used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // Inline asm guarantees the non-temporal `vmovntpd` is actually emitted
    // rather than optimized into a regular store.
    crate::arch::asm!(
        vps!("vmovntpd", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1881
/// Moves single-precision floating point values from a 256-bit vector
/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
/// caching, the data is flagged as non-temporal (unlikely to be used again
/// soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // Inline asm guarantees the non-temporal `vmovntps` is actually emitted
    // rather than optimized into a regular store.
    crate::arch::asm!(
        vps!("vmovntps", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1911
/// Computes the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`, and returns the results. The maximum
/// relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
    // Forwards directly to the `vrcpps` intrinsic binding.
    unsafe { vrcpps(a) }
}
1924
/// Computes the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`, and returns the results.
/// The maximum relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
    // Forwards directly to the `vrsqrtps` intrinsic binding.
    unsafe { vrsqrtps(a) }
}
1937
/// Unpacks and interleave double-precision (64-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
    // Indices 0..=3 select from `a`, 4..=7 from `b`:
    // result = [a1, b1, a3, b3] (high element of each 128-bit lane).
    unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) }
}
1949
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
    // Indices 0..=7 select from `a`, 8..=15 from `b`:
    // result = [a2, b2, a3, b3, a6, b6, a7, b7] (upper pair of each lane).
    unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) }
}
1961
/// Unpacks and interleave double-precision (64-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
    // Indices 0..=3 select from `a`, 4..=7 from `b`:
    // result = [a0, b0, a2, b2] (low element of each 128-bit lane).
    unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) }
}
1973
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
    // Indices 0..=7 select from `a`, 8..=15 from `b`:
    // result = [a0, b0, a1, b1, a4, b4, a5, b5] (lower pair of each lane).
    unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) }
}
1985
1986/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
1987/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
1988/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
1989/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
1990///
1991/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
1992#[inline]
1993#[target_feature(enable = "avx")]
1994#[cfg_attr(test, assert_instr(vptest))]
1995#[stable(feature = "simd_x86", since = "1.27.0")]
1996pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
1997 unsafe {
1998 let r: i64x4 = simd_and(x:a.as_i64x4(), y:b.as_i64x4());
1999 (0i64 == simd_reduce_or(r)) as i32
2000 }
2001}
2002
2003/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
2004/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
2005/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
2006/// the result is zero, otherwise set `CF` to 0. Return the `CF` value.
2007///
2008/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
2009#[inline]
2010#[target_feature(enable = "avx")]
2011#[cfg_attr(test, assert_instr(vptest))]
2012#[stable(feature = "simd_x86", since = "1.27.0")]
2013pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
2014 unsafe {
2015 let r: i64x4 = simd_and(x:simd_xor(a.as_i64x4(), i64x4::splat(!0)), y:b.as_i64x4());
2016 (0i64 == simd_reduce_or(r)) as i32
2017 }
2018}
2019
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
/// `CF` values are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
    // Forwards to the `vptest` "not zero and not carry" intrinsic binding.
    unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) }
}
2034
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
    // Forwards to the `vtestpd` (ZF) intrinsic binding.
    unsafe { vtestzpd256(a, b) }
}
2051
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
    // Forwards to the `vtestpd` (CF) intrinsic binding.
    unsafe { vtestcpd256(a, b) }
}
2068
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
    // Forwards to the `vtestpd` "not zero and not carry" intrinsic binding.
    unsafe { vtestnzcpd256(a, b) }
}
2086
2087/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2088/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2089/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2090/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2091/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2092/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2093/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
2094///
2095/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
2096#[inline]
2097#[target_feature(enable = "avx")]
2098#[cfg_attr(test, assert_instr(vtestpd))]
2099#[stable(feature = "simd_x86", since = "1.27.0")]
2100pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
2101 unsafe {
2102 let r: i64x2 = simd_lt(x:transmute(_mm_and_pd(a, b)), y:i64x2::ZERO);
2103 (0i64 == simd_reduce_or(r)) as i32
2104 }
2105}
2106
2107/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2108/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2109/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2110/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2111/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2112/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2113/// is zero, otherwise set `CF` to 0. Return the `CF` value.
2114///
2115/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
2116#[inline]
2117#[target_feature(enable = "avx")]
2118#[cfg_attr(test, assert_instr(vtestpd))]
2119#[stable(feature = "simd_x86", since = "1.27.0")]
2120pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
2121 unsafe {
2122 let r: i64x2 = simd_lt(x:transmute(_mm_andnot_pd(a, b)), y:i64x2::ZERO);
2123 (0i64 == simd_reduce_or(r)) as i32
2124 }
2125}
2126
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
    // Forwards to the `vtestpd` "not zero and not carry" intrinsic binding.
    unsafe { vtestnzcpd(a, b) }
}
2144
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
    // Forwards to the `vtestps` (ZF) intrinsic binding.
    unsafe { vtestzps256(a, b) }
}
2161
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
    // Forwards to the `vtestps` (CF) intrinsic binding.
    unsafe { vtestcps256(a, b) }
}
2178
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
    // Forwards to the `vtestps` "not zero and not carry" intrinsic binding.
    unsafe { vtestnzcps256(a, b) }
}
2196
2197/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2198/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2199/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2200/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2201/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2202/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2203/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
2204///
2205/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
2206#[inline]
2207#[target_feature(enable = "avx")]
2208#[cfg_attr(test, assert_instr(vtestps))]
2209#[stable(feature = "simd_x86", since = "1.27.0")]
2210pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
2211 unsafe {
2212 let r: i32x4 = simd_lt(x:transmute(_mm_and_ps(a, b)), y:i32x4::ZERO);
2213 (0i32 == simd_reduce_or(r)) as i32
2214 }
2215}
2216
2217/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2218/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2219/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2220/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2221/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2222/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2223/// is zero, otherwise set `CF` to 0. Return the `CF` value.
2224///
2225/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
2226#[inline]
2227#[target_feature(enable = "avx")]
2228#[cfg_attr(test, assert_instr(vtestps))]
2229#[stable(feature = "simd_x86", since = "1.27.0")]
2230pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
2231 unsafe {
2232 let r: i32x4 = simd_lt(x:transmute(_mm_andnot_ps(a, b)), y:i32x4::ZERO);
2233 (0i32 == simd_reduce_or(r)) as i32
2234 }
2235}
2236
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
    // Delegates to `vtestnzcps` — presumably the compiler intrinsic for the
    // VTESTPS "not-zero-and-not-carry" result, declared elsewhere in this module.
    unsafe { vtestnzcps(a, b) }
}
2254
2255/// Sets each bit of the returned mask based on the most significant bit of the
2256/// corresponding packed double-precision (64-bit) floating-point element in
2257/// `a`.
2258///
2259/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
2260#[inline]
2261#[target_feature(enable = "avx")]
2262#[cfg_attr(test, assert_instr(vmovmskpd))]
2263#[stable(feature = "simd_x86", since = "1.27.0")]
2264pub fn _mm256_movemask_pd(a: __m256d) -> i32 {
2265 // Propagate the highest bit to the rest, because simd_bitmask
2266 // requires all-1 or all-0.
2267 unsafe {
2268 let mask: i64x4 = simd_lt(x:transmute(a), y:i64x4::ZERO);
2269 simd_bitmask::<i64x4, u8>(mask).into()
2270 }
2271}
2272
2273/// Sets each bit of the returned mask based on the most significant bit of the
2274/// corresponding packed single-precision (32-bit) floating-point element in
2275/// `a`.
2276///
2277/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
2278#[inline]
2279#[target_feature(enable = "avx")]
2280#[cfg_attr(test, assert_instr(vmovmskps))]
2281#[stable(feature = "simd_x86", since = "1.27.0")]
2282pub fn _mm256_movemask_ps(a: __m256) -> i32 {
2283 // Propagate the highest bit to the rest, because simd_bitmask
2284 // requires all-1 or all-0.
2285 unsafe {
2286 let mask: i32x8 = simd_lt(x:transmute(a), y:i32x8::ZERO);
2287 simd_bitmask::<i32x8, u8>(mask).into()
2288 }
2289}
2290
/// Returns vector of type __m256d with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setzero_pd() -> __m256d {
    // Evaluated at compile time; an all-zero bit pattern is a valid __m256d
    // (four +0.0 f64 lanes), so `mem::zeroed` is sound here.
    const { unsafe { mem::zeroed() } }
}
2301
/// Returns vector of type __m256 with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setzero_ps() -> __m256 {
    // Evaluated at compile time; an all-zero bit pattern is a valid __m256
    // (eight +0.0 f32 lanes), so `mem::zeroed` is sound here.
    const { unsafe { mem::zeroed() } }
}
2312
/// Returns vector of type __m256i with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxor))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setzero_si256() -> __m256i {
    // Evaluated at compile time; an all-zero bit pattern is a valid __m256i,
    // so `mem::zeroed` is sound here.
    const { unsafe { mem::zeroed() } }
}
2323
2324/// Sets packed double-precision (64-bit) floating-point elements in returned
2325/// vector with the supplied values.
2326///
2327/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
2328#[inline]
2329#[target_feature(enable = "avx")]
2330// This intrinsic has no corresponding instruction.
2331#[cfg_attr(test, assert_instr(vinsertf128))]
2332#[stable(feature = "simd_x86", since = "1.27.0")]
2333pub fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2334 _mm256_setr_pd(a:d, b:c, c:b, d:a)
2335}
2336
2337/// Sets packed single-precision (32-bit) floating-point elements in returned
2338/// vector with the supplied values.
2339///
2340/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
2341#[inline]
2342#[target_feature(enable = "avx")]
2343// This intrinsic has no corresponding instruction.
2344#[stable(feature = "simd_x86", since = "1.27.0")]
2345pub fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
2346 _mm256_setr_ps(a:h, b:g, c:f, d:e, e:d, f:c, g:b, h:a)
2347}
2348
2349/// Sets packed 8-bit integers in returned vector with the supplied values.
2350///
2351/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
2352#[inline]
2353#[target_feature(enable = "avx")]
2354// This intrinsic has no corresponding instruction.
2355#[stable(feature = "simd_x86", since = "1.27.0")]
2356pub fn _mm256_set_epi8(
2357 e00: i8,
2358 e01: i8,
2359 e02: i8,
2360 e03: i8,
2361 e04: i8,
2362 e05: i8,
2363 e06: i8,
2364 e07: i8,
2365 e08: i8,
2366 e09: i8,
2367 e10: i8,
2368 e11: i8,
2369 e12: i8,
2370 e13: i8,
2371 e14: i8,
2372 e15: i8,
2373 e16: i8,
2374 e17: i8,
2375 e18: i8,
2376 e19: i8,
2377 e20: i8,
2378 e21: i8,
2379 e22: i8,
2380 e23: i8,
2381 e24: i8,
2382 e25: i8,
2383 e26: i8,
2384 e27: i8,
2385 e28: i8,
2386 e29: i8,
2387 e30: i8,
2388 e31: i8,
2389) -> __m256i {
2390 #[rustfmt::skip]
2391 _mm256_setr_epi8(
2392 e00:e31, e01:e30, e02:e29, e03:e28, e04:e27, e05:e26, e06:e25, e07:e24,
2393 e08:e23, e09:e22, e10:e21, e11:e20, e12:e19, e13:e18, e14:e17, e15:e16,
2394 e16:e15, e17:e14, e18:e13, e19:e12, e20:e11, e21:e10, e22:e09, e23:e08,
2395 e24:e07, e25:e06, e26:e05, e27:e04, e28:e03, e29:e02, e30:e01, e31:e00,
2396 )
2397}
2398
2399/// Sets packed 16-bit integers in returned vector with the supplied values.
2400///
2401/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
2402#[inline]
2403#[target_feature(enable = "avx")]
2404// This intrinsic has no corresponding instruction.
2405#[stable(feature = "simd_x86", since = "1.27.0")]
2406pub fn _mm256_set_epi16(
2407 e00: i16,
2408 e01: i16,
2409 e02: i16,
2410 e03: i16,
2411 e04: i16,
2412 e05: i16,
2413 e06: i16,
2414 e07: i16,
2415 e08: i16,
2416 e09: i16,
2417 e10: i16,
2418 e11: i16,
2419 e12: i16,
2420 e13: i16,
2421 e14: i16,
2422 e15: i16,
2423) -> __m256i {
2424 #[rustfmt::skip]
2425 _mm256_setr_epi16(
2426 e00:e15, e01:e14, e02:e13, e03:e12,
2427 e04:e11, e05:e10, e06:e09, e07:e08,
2428 e08:e07, e09:e06, e10:e05, e11:e04,
2429 e12:e03, e13:e02, e14:e01, e15:e00,
2430 )
2431}
2432
2433/// Sets packed 32-bit integers in returned vector with the supplied values.
2434///
2435/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
2436#[inline]
2437#[target_feature(enable = "avx")]
2438// This intrinsic has no corresponding instruction.
2439#[stable(feature = "simd_x86", since = "1.27.0")]
2440pub fn _mm256_set_epi32(
2441 e0: i32,
2442 e1: i32,
2443 e2: i32,
2444 e3: i32,
2445 e4: i32,
2446 e5: i32,
2447 e6: i32,
2448 e7: i32,
2449) -> __m256i {
2450 _mm256_setr_epi32(e0:e7, e1:e6, e2:e5, e3:e4, e4:e3, e5:e2, e6:e1, e7:e0)
2451}
2452
2453/// Sets packed 64-bit integers in returned vector with the supplied values.
2454///
2455/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
2456#[inline]
2457#[target_feature(enable = "avx")]
2458// This intrinsic has no corresponding instruction.
2459#[stable(feature = "simd_x86", since = "1.27.0")]
2460pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2461 _mm256_setr_epi64x(a:d, b:c, c:b, d:a)
2462}
2463
/// Sets packed double-precision (64-bit) floating-point elements in returned
/// vector with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
    // Arguments map directly to lanes in memory order: `a` is lane 0 (lowest).
    __m256d([a, b, c, d])
}
2475
/// Sets packed single-precision (32-bit) floating-point elements in returned
/// vector with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
    // Arguments map directly to lanes in memory order: `a` is lane 0 (lowest).
    __m256([a, b, c, d, e, f, g, h])
}
2487
2488/// Sets packed 8-bit integers in returned vector with the supplied values in
2489/// reverse order.
2490///
2491/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
2492#[inline]
2493#[target_feature(enable = "avx")]
2494// This intrinsic has no corresponding instruction.
2495#[stable(feature = "simd_x86", since = "1.27.0")]
2496pub fn _mm256_setr_epi8(
2497 e00: i8,
2498 e01: i8,
2499 e02: i8,
2500 e03: i8,
2501 e04: i8,
2502 e05: i8,
2503 e06: i8,
2504 e07: i8,
2505 e08: i8,
2506 e09: i8,
2507 e10: i8,
2508 e11: i8,
2509 e12: i8,
2510 e13: i8,
2511 e14: i8,
2512 e15: i8,
2513 e16: i8,
2514 e17: i8,
2515 e18: i8,
2516 e19: i8,
2517 e20: i8,
2518 e21: i8,
2519 e22: i8,
2520 e23: i8,
2521 e24: i8,
2522 e25: i8,
2523 e26: i8,
2524 e27: i8,
2525 e28: i8,
2526 e29: i8,
2527 e30: i8,
2528 e31: i8,
2529) -> __m256i {
2530 unsafe {
2531 #[rustfmt::skip]
2532 transmute(src:i8x32::new(
2533 x0:e00, x1:e01, x2:e02, x3:e03, x4:e04, x5:e05, x6:e06, x7:e07,
2534 x8:e08, x9:e09, x10:e10, x11:e11, x12:e12, x13:e13, x14:e14, x15:e15,
2535 x16:e16, x17:e17, x18:e18, x19:e19, x20:e20, x21:e21, x22:e22, x23:e23,
2536 x24:e24, x25:e25, x26:e26, x27:e27, x28:e28, x29:e29, x30:e30, x31:e31,
2537 ))
2538 }
2539}
2540
2541/// Sets packed 16-bit integers in returned vector with the supplied values in
2542/// reverse order.
2543///
2544/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
2545#[inline]
2546#[target_feature(enable = "avx")]
2547// This intrinsic has no corresponding instruction.
2548#[stable(feature = "simd_x86", since = "1.27.0")]
2549pub fn _mm256_setr_epi16(
2550 e00: i16,
2551 e01: i16,
2552 e02: i16,
2553 e03: i16,
2554 e04: i16,
2555 e05: i16,
2556 e06: i16,
2557 e07: i16,
2558 e08: i16,
2559 e09: i16,
2560 e10: i16,
2561 e11: i16,
2562 e12: i16,
2563 e13: i16,
2564 e14: i16,
2565 e15: i16,
2566) -> __m256i {
2567 unsafe {
2568 #[rustfmt::skip]
2569 transmute(src:i16x16::new(
2570 x0:e00, x1:e01, x2:e02, x3:e03,
2571 x4:e04, x5:e05, x6:e06, x7:e07,
2572 x8:e08, x9:e09, x10:e10, x11:e11,
2573 x12:e12, x13:e13, x14:e14, x15:e15,
2574 ))
2575 }
2576}
2577
2578/// Sets packed 32-bit integers in returned vector with the supplied values in
2579/// reverse order.
2580///
2581/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
2582#[inline]
2583#[target_feature(enable = "avx")]
2584// This intrinsic has no corresponding instruction.
2585#[stable(feature = "simd_x86", since = "1.27.0")]
2586pub fn _mm256_setr_epi32(
2587 e0: i32,
2588 e1: i32,
2589 e2: i32,
2590 e3: i32,
2591 e4: i32,
2592 e5: i32,
2593 e6: i32,
2594 e7: i32,
2595) -> __m256i {
2596 unsafe { transmute(src:i32x8::new(x0:e0, x1:e1, x2:e2, x3:e3, x4:e4, x5:e5, x6:e6, x7:e7)) }
2597}
2598
2599/// Sets packed 64-bit integers in returned vector with the supplied values in
2600/// reverse order.
2601///
2602/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
2603#[inline]
2604#[target_feature(enable = "avx")]
2605// This intrinsic has no corresponding instruction.
2606#[stable(feature = "simd_x86", since = "1.27.0")]
2607pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2608 unsafe { transmute(src:i64x4::new(x0:a, x1:b, x2:c, x3:d)) }
2609}
2610
2611/// Broadcasts double-precision (64-bit) floating-point value `a` to all
2612/// elements of returned vector.
2613///
2614/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
2615#[inline]
2616#[target_feature(enable = "avx")]
2617// This intrinsic has no corresponding instruction.
2618#[stable(feature = "simd_x86", since = "1.27.0")]
2619pub fn _mm256_set1_pd(a: f64) -> __m256d {
2620 _mm256_setr_pd(a, b:a, c:a, d:a)
2621}
2622
2623/// Broadcasts single-precision (32-bit) floating-point value `a` to all
2624/// elements of returned vector.
2625///
2626/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
2627#[inline]
2628#[target_feature(enable = "avx")]
2629// This intrinsic has no corresponding instruction.
2630#[stable(feature = "simd_x86", since = "1.27.0")]
2631pub fn _mm256_set1_ps(a: f32) -> __m256 {
2632 _mm256_setr_ps(a, b:a, c:a, d:a, e:a, f:a, g:a, h:a)
2633}
2634
2635/// Broadcasts 8-bit integer `a` to all elements of returned vector.
2636/// This intrinsic may generate the `vpbroadcastb`.
2637///
2638/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
2639#[inline]
2640#[target_feature(enable = "avx")]
2641// This intrinsic has no corresponding instruction.
2642#[stable(feature = "simd_x86", since = "1.27.0")]
2643pub fn _mm256_set1_epi8(a: i8) -> __m256i {
2644 #[rustfmt::skip]
2645 _mm256_setr_epi8(
2646 e00:a, e01:a, e02:a, e03:a, e04:a, e05:a, e06:a, e07:a,
2647 e08:a, e09:a, e10:a, e11:a, e12:a, e13:a, e14:a, e15:a,
2648 e16:a, e17:a, e18:a, e19:a, e20:a, e21:a, e22:a, e23:a,
2649 e24:a, e25:a, e26:a, e27:a, e28:a, e29:a, e30:a, e31:a,
2650 )
2651}
2652
2653/// Broadcasts 16-bit integer `a` to all elements of returned vector.
2654/// This intrinsic may generate the `vpbroadcastw`.
2655///
2656/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
2657#[inline]
2658#[target_feature(enable = "avx")]
2659//#[cfg_attr(test, assert_instr(vpshufb))]
2660#[cfg_attr(test, assert_instr(vinsertf128))]
2661// This intrinsic has no corresponding instruction.
2662#[stable(feature = "simd_x86", since = "1.27.0")]
2663pub fn _mm256_set1_epi16(a: i16) -> __m256i {
2664 _mm256_setr_epi16(e00:a, e01:a, e02:a, e03:a, e04:a, e05:a, e06:a, e07:a, e08:a, e09:a, e10:a, e11:a, e12:a, e13:a, e14:a, e15:a)
2665}
2666
2667/// Broadcasts 32-bit integer `a` to all elements of returned vector.
2668/// This intrinsic may generate the `vpbroadcastd`.
2669///
2670/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
2671#[inline]
2672#[target_feature(enable = "avx")]
2673// This intrinsic has no corresponding instruction.
2674#[stable(feature = "simd_x86", since = "1.27.0")]
2675pub fn _mm256_set1_epi32(a: i32) -> __m256i {
2676 _mm256_setr_epi32(e0:a, e1:a, e2:a, e3:a, e4:a, e5:a, e6:a, e7:a)
2677}
2678
2679/// Broadcasts 64-bit integer `a` to all elements of returned vector.
2680/// This intrinsic may generate the `vpbroadcastq`.
2681///
2682/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
2683#[inline]
2684#[target_feature(enable = "avx")]
2685#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
2686#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
2687// This intrinsic has no corresponding instruction.
2688#[stable(feature = "simd_x86", since = "1.27.0")]
2689pub fn _mm256_set1_epi64x(a: i64) -> __m256i {
2690 _mm256_setr_epi64x(a, b:a, c:a, d:a)
2691}
2692
2693/// Cast vector of type __m256d to type __m256.
2694///
2695/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
2696#[inline]
2697#[target_feature(enable = "avx")]
2698// This intrinsic is only used for compilation and does not generate any
2699// instructions, thus it has zero latency.
2700#[stable(feature = "simd_x86", since = "1.27.0")]
2701pub fn _mm256_castpd_ps(a: __m256d) -> __m256 {
2702 unsafe { transmute(src:a) }
2703}
2704
2705/// Cast vector of type __m256 to type __m256d.
2706///
2707/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
2708#[inline]
2709#[target_feature(enable = "avx")]
2710// This intrinsic is only used for compilation and does not generate any
2711// instructions, thus it has zero latency.
2712#[stable(feature = "simd_x86", since = "1.27.0")]
2713pub fn _mm256_castps_pd(a: __m256) -> __m256d {
2714 unsafe { transmute(src:a) }
2715}
2716
2717/// Casts vector of type __m256 to type __m256i.
2718///
2719/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
2720#[inline]
2721#[target_feature(enable = "avx")]
2722// This intrinsic is only used for compilation and does not generate any
2723// instructions, thus it has zero latency.
2724#[stable(feature = "simd_x86", since = "1.27.0")]
2725pub fn _mm256_castps_si256(a: __m256) -> __m256i {
2726 unsafe { transmute(src:a) }
2727}
2728
2729/// Casts vector of type __m256i to type __m256.
2730///
2731/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
2732#[inline]
2733#[target_feature(enable = "avx")]
2734// This intrinsic is only used for compilation and does not generate any
2735// instructions, thus it has zero latency.
2736#[stable(feature = "simd_x86", since = "1.27.0")]
2737pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
2738 unsafe { transmute(src:a) }
2739}
2740
2741/// Casts vector of type __m256d to type __m256i.
2742///
2743/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
2744#[inline]
2745#[target_feature(enable = "avx")]
2746// This intrinsic is only used for compilation and does not generate any
2747// instructions, thus it has zero latency.
2748#[stable(feature = "simd_x86", since = "1.27.0")]
2749pub fn _mm256_castpd_si256(a: __m256d) -> __m256i {
2750 unsafe { transmute(src:a) }
2751}
2752
2753/// Casts vector of type __m256i to type __m256d.
2754///
2755/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
2756#[inline]
2757#[target_feature(enable = "avx")]
2758// This intrinsic is only used for compilation and does not generate any
2759// instructions, thus it has zero latency.
2760#[stable(feature = "simd_x86", since = "1.27.0")]
2761pub fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
2762 unsafe { transmute(src:a) }
2763}
2764
/// Casts vector of type __m256 to type __m128.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castps256_ps128(a: __m256) -> __m128 {
    // Truncating shuffle: keep only the low four lanes of `a`.
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }
}
2776
/// Casts vector of type __m256d to type __m128d.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
    // Truncating shuffle: keep only the low two lanes of `a`.
    unsafe { simd_shuffle!(a, a, [0, 1]) }
}
2788
2789/// Casts vector of type __m256i to type __m128i.
2790///
2791/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
2792#[inline]
2793#[target_feature(enable = "avx")]
2794// This intrinsic is only used for compilation and does not generate any
2795// instructions, thus it has zero latency.
2796#[stable(feature = "simd_x86", since = "1.27.0")]
2797pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
2798 unsafe {
2799 let a: i64x4 = a.as_i64x4();
2800 let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
2801 transmute(src:dst)
2802 }
2803}
2804
/// Casts vector of type __m128 to type __m256;
/// the upper 128 bits of the result are undefined.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
    // Indices 0-3 take the low half from `a`; the repeated index 4 fills the
    // upper half with lanes from `_mm_undefined_ps()` (contents unspecified).
    unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
}
2817
/// Casts vector of type __m128d to type __m256d;
/// the upper 128 bits of the result are undefined.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
    // Indices 0-1 take the low half from `a`; the repeated index 2 fills the
    // upper half with a lane from `_mm_undefined_pd()` (contents unspecified).
    unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) }
}
2830
2831/// Casts vector of type __m128i to type __m256i;
2832/// the upper 128 bits of the result are undefined.
2833///
2834/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
2835#[inline]
2836#[target_feature(enable = "avx")]
2837// This intrinsic is only used for compilation and does not generate any
2838// instructions, thus it has zero latency.
2839#[stable(feature = "simd_x86", since = "1.27.0")]
2840pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
2841 unsafe {
2842 let a: i64x2 = a.as_i64x2();
2843 let undefined: i64x2 = i64x2::ZERO;
2844 let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
2845 transmute(src:dst)
2846 }
2847}
2848
/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
/// the value of the source vector. The upper 128 bits are set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
    // Indices 0-3 take the low half from `a`; indices 4-7 pull zero lanes
    // from `_mm_setzero_ps()`, zero-extending into the upper 128 bits.
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
}
2862
2863/// Constructs a 256-bit integer vector from a 128-bit integer vector.
2864/// The lower 128 bits contain the value of the source vector. The upper
2865/// 128 bits are set to zero.
2866///
2867/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
2868#[inline]
2869#[target_feature(enable = "avx")]
2870// This intrinsic is only used for compilation and does not generate any
2871// instructions, thus it has zero latency.
2872#[stable(feature = "simd_x86", since = "1.27.0")]
2873pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
2874 unsafe {
2875 let b: i64x2 = i64x2::ZERO;
2876 let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
2877 transmute(src:dst)
2878 }
2879}
2880
/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
/// contain the value of the source vector. The upper 128 bits are set
/// to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
    // Indices 0-1 take the low half from `a`; indices 2-3 pull zero lanes
    // from `_mm_setzero_pd()`, zero-extending into the upper 128 bits.
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
}
2895
/// Returns vector of type `__m256` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_undefined_ps() -> __m256 {
    // Compile-time zero: a concrete valid value is required (not MaybeUninit).
    const { unsafe { mem::zeroed() } }
}
2909
/// Returns vector of type `__m256d` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_undefined_pd() -> __m256d {
    // Compile-time zero: a concrete valid value is required (not MaybeUninit).
    const { unsafe { mem::zeroed() } }
}
2923
/// Returns vector of type __m256i with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_undefined_si256() -> __m256i {
    // Compile-time zero: a concrete valid value is required (not MaybeUninit).
    const { unsafe { mem::zeroed() } }
}
2937
/// Sets packed __m256 returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
    // Indices 0-3 select lanes of `lo` (low 128 bits); indices 4-7 select
    // lanes of `hi` (high 128 bits).
    unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
}
2948
2949/// Sets packed __m256d returned vector with the supplied values.
2950///
2951/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
2952#[inline]
2953#[target_feature(enable = "avx")]
2954#[cfg_attr(test, assert_instr(vinsertf128))]
2955#[stable(feature = "simd_x86", since = "1.27.0")]
2956pub fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
2957 unsafe {
2958 let hi: __m128 = transmute(src:hi);
2959 let lo: __m128 = transmute(src:lo);
2960 transmute(src:_mm256_set_m128(hi, lo))
2961 }
2962}
2963
2964/// Sets packed __m256i returned vector with the supplied values.
2965///
2966/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
2967#[inline]
2968#[target_feature(enable = "avx")]
2969#[cfg_attr(test, assert_instr(vinsertf128))]
2970#[stable(feature = "simd_x86", since = "1.27.0")]
2971pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
2972 unsafe {
2973 let hi: __m128 = transmute(src:hi);
2974 let lo: __m128 = transmute(src:lo);
2975 transmute(src:_mm256_set_m128(hi, lo))
2976 }
2977}
2978
/// Sets packed __m256 returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
    // `setr` takes (lo, hi); forward to `set`, which takes (hi, lo).
    _mm256_set_m128(hi, lo)
}
2989
/// Sets packed __m256d returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
    // `setr` takes (lo, hi); forward to `set`, which takes (hi, lo).
    _mm256_set_m128d(hi, lo)
}
3000
/// Sets packed __m256i returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
    // `setr` takes (lo, hi); forward to `set`, which takes (hi, lo).
    _mm256_set_m128i(hi, lo)
}
3011
3012/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
3013/// floating-point elements) from memory, and combine them into a 256-bit
3014/// value.
3015/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3016///
3017/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
3018#[inline]
3019#[target_feature(enable = "avx")]
3020// This intrinsic has no corresponding instruction.
3021#[stable(feature = "simd_x86", since = "1.27.0")]
3022pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
3023 let a: __m256 = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
3024 _mm256_insertf128_ps::<1>(a, b:_mm_loadu_ps(hiaddr))
3025}
3026
3027/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
3028/// floating-point elements) from memory, and combine them into a 256-bit
3029/// value.
3030/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3031///
3032/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
3033#[inline]
3034#[target_feature(enable = "avx")]
3035// This intrinsic has no corresponding instruction.
3036#[stable(feature = "simd_x86", since = "1.27.0")]
3037pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
3038 let a: __m256d = _mm256_castpd128_pd256(_mm_loadu_pd(mem_addr:loaddr));
3039 _mm256_insertf128_pd::<1>(a, b:_mm_loadu_pd(mem_addr:hiaddr))
3040}
3041
3042/// Loads two 128-bit values (composed of integer data) from memory, and combine
3043/// them into a 256-bit value.
3044/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3045///
3046/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
3047#[inline]
3048#[target_feature(enable = "avx")]
3049// This intrinsic has no corresponding instruction.
3050#[stable(feature = "simd_x86", since = "1.27.0")]
3051pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
3052 let a: __m256i = _mm256_castsi128_si256(_mm_loadu_si128(mem_addr:loaddr));
3053 _mm256_insertf128_si256::<1>(a, b:_mm_loadu_si128(mem_addr:hiaddr))
3054}
3055
3056/// Stores the high and low 128-bit halves (each composed of 4 packed
3057/// single-precision (32-bit) floating-point elements) from `a` into memory two
3058/// different 128-bit locations.
3059/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3060///
3061/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
3062#[inline]
3063#[target_feature(enable = "avx")]
3064// This intrinsic has no corresponding instruction.
3065#[stable(feature = "simd_x86", since = "1.27.0")]
3066pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
3067 let lo: __m128 = _mm256_castps256_ps128(a);
3068 _mm_storeu_ps(p:loaddr, a:lo);
3069 let hi: __m128 = _mm256_extractf128_ps::<1>(a);
3070 _mm_storeu_ps(p:hiaddr, a:hi);
3071}
3072
3073/// Stores the high and low 128-bit halves (each composed of 2 packed
3074/// double-precision (64-bit) floating-point elements) from `a` into memory two
3075/// different 128-bit locations.
3076/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3077///
3078/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
3079#[inline]
3080#[target_feature(enable = "avx")]
3081// This intrinsic has no corresponding instruction.
3082#[stable(feature = "simd_x86", since = "1.27.0")]
3083pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
3084 let lo: __m128d = _mm256_castpd256_pd128(a);
3085 _mm_storeu_pd(mem_addr:loaddr, a:lo);
3086 let hi: __m128d = _mm256_extractf128_pd::<1>(a);
3087 _mm_storeu_pd(mem_addr:hiaddr, a:hi);
3088}
3089
3090/// Stores the high and low 128-bit halves (each composed of integer data) from
3091/// `a` into memory two different 128-bit locations.
3092/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3093///
3094/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
3095#[inline]
3096#[target_feature(enable = "avx")]
3097// This intrinsic has no corresponding instruction.
3098#[stable(feature = "simd_x86", since = "1.27.0")]
3099pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
3100 let lo: __m128i = _mm256_castsi256_si128(a);
3101 _mm_storeu_si128(mem_addr:loaddr, a:lo);
3102 let hi: __m128i = _mm256_extractf128_si256::<1>(a);
3103 _mm_storeu_si128(mem_addr:hiaddr, a:hi);
3104}
3105
3106/// Returns the first element of the input vector of `[8 x float]`.
3107///
3108/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
3109#[inline]
3110#[target_feature(enable = "avx")]
3111//#[cfg_attr(test, assert_instr(movss))] FIXME
3112#[stable(feature = "simd_x86", since = "1.27.0")]
3113pub fn _mm256_cvtss_f32(a: __m256) -> f32 {
3114 unsafe { simd_extract!(a, 0) }
3115}
3116
3117// LLVM intrinsics used in the above functions
3118#[allow(improper_ctypes)]
3119unsafe extern "C" {
3120 #[link_name = "llvm.x86.avx.round.pd.256"]
3121 unsafefn roundpd256(a: __m256d, b: i32) -> __m256d;
3122 #[link_name = "llvm.x86.avx.round.ps.256"]
3123 unsafefn roundps256(a: __m256, b: i32) -> __m256;
3124 #[link_name = "llvm.x86.avx.dp.ps.256"]
3125 unsafefn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
3126 #[link_name = "llvm.x86.sse2.cmp.pd"]
3127 unsafefn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3128 #[link_name = "llvm.x86.avx.cmp.pd.256"]
3129 unsafefn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
3130 #[link_name = "llvm.x86.sse.cmp.ps"]
3131 unsafefn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
3132 #[link_name = "llvm.x86.avx.cmp.ps.256"]
3133 unsafefn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
3134 #[link_name = "llvm.x86.sse2.cmp.sd"]
3135 unsafefn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3136 #[link_name = "llvm.x86.sse.cmp.ss"]
3137 unsafefn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
3138 #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
3139 unsafefn vcvtps2dq(a: __m256) -> i32x8;
3140 #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
3141 unsafefn vcvttpd2dq(a: __m256d) -> i32x4;
3142 #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
3143 unsafefn vcvtpd2dq(a: __m256d) -> i32x4;
3144 #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
3145 unsafefn vcvttps2dq(a: __m256) -> i32x8;
3146 #[link_name = "llvm.x86.avx.vzeroall"]
3147 unsafefn vzeroall();
3148 #[link_name = "llvm.x86.avx.vzeroupper"]
3149 unsafefn vzeroupper();
3150 #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
3151 unsafefn vpermilps256(a: __m256, b: i32x8) -> __m256;
3152 #[link_name = "llvm.x86.avx.vpermilvar.ps"]
3153 unsafefn vpermilps(a: __m128, b: i32x4) -> __m128;
3154 #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
3155 unsafefn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
3156 #[link_name = "llvm.x86.avx.vpermilvar.pd"]
3157 unsafefn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
3158 #[link_name = "llvm.x86.avx.ldu.dq.256"]
3159 unsafefn vlddqu(mem_addr: *const i8) -> i8x32;
3160 #[link_name = "llvm.x86.avx.rcp.ps.256"]
3161 unsafefn vrcpps(a: __m256) -> __m256;
3162 #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
3163 unsafefn vrsqrtps(a: __m256) -> __m256;
3164 #[link_name = "llvm.x86.avx.ptestnzc.256"]
3165 unsafefn ptestnzc256(a: i64x4, b: i64x4) -> i32;
3166 #[link_name = "llvm.x86.avx.vtestz.pd.256"]
3167 unsafefn vtestzpd256(a: __m256d, b: __m256d) -> i32;
3168 #[link_name = "llvm.x86.avx.vtestc.pd.256"]
3169 unsafefn vtestcpd256(a: __m256d, b: __m256d) -> i32;
3170 #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
3171 unsafefn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
3172 #[link_name = "llvm.x86.avx.vtestnzc.pd"]
3173 unsafefn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
3174 #[link_name = "llvm.x86.avx.vtestz.ps.256"]
3175 unsafefn vtestzps256(a: __m256, b: __m256) -> i32;
3176 #[link_name = "llvm.x86.avx.vtestc.ps.256"]
3177 unsafefn vtestcps256(a: __m256, b: __m256) -> i32;
3178 #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
3179 unsafefn vtestnzcps256(a: __m256, b: __m256) -> i32;
3180 #[link_name = "llvm.x86.avx.vtestnzc.ps"]
3181 unsafefn vtestnzcps(a: __m128, b: __m128) -> i32;
3182 #[link_name = "llvm.x86.avx.min.ps.256"]
3183 unsafefn vminps(a: __m256, b: __m256) -> __m256;
3184 #[link_name = "llvm.x86.avx.max.ps.256"]
3185 unsafefn vmaxps(a: __m256, b: __m256) -> __m256;
3186 #[link_name = "llvm.x86.avx.min.pd.256"]
3187 unsafefn vminpd(a: __m256d, b: __m256d) -> __m256d;
3188 #[link_name = "llvm.x86.avx.max.pd.256"]
3189 unsafefn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
3190}
3191
3192#[cfg(test)]
3193mod tests {
3194 use crate::hint::black_box;
3195 use crate::ptr;
3196 use stdarch_test::simd_test;
3197
3198 use crate::core_arch::x86::*;
3199
    // Lane-wise f64 addition: each result lane is a[i] + b[i].
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_add_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_add_pd(a, b);
        let e = _mm256_setr_pd(6., 8., 10., 12.);
        assert_eq_m256d(r, e);
    }
3208
    // Lane-wise f32 addition: each result lane is a[i] + b[i].
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_add_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_add_ps(a, b);
        let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
        assert_eq_m256(r, e);
    }
3217
    // Bitwise AND of the IEEE-754 bit patterns: 1.0 & 0.6 yields 0.5.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_and_pd() {
        let a = _mm256_set1_pd(1.);
        let b = _mm256_set1_pd(0.6);
        let r = _mm256_and_pd(a, b);
        let e = _mm256_set1_pd(0.5);
        assert_eq_m256d(r, e);
    }
3226
    // Bitwise AND of the IEEE-754 bit patterns: 1.0 & 0.6 yields 0.5.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_and_ps() {
        let a = _mm256_set1_ps(1.);
        let b = _mm256_set1_ps(0.6);
        let r = _mm256_and_ps(a, b);
        let e = _mm256_set1_ps(0.5);
        assert_eq_m256(r, e);
    }
3235
    // Bitwise OR of the IEEE-754 bit patterns: 1.0 | 0.6 yields 1.2.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_or_pd() {
        let a = _mm256_set1_pd(1.);
        let b = _mm256_set1_pd(0.6);
        let r = _mm256_or_pd(a, b);
        let e = _mm256_set1_pd(1.2);
        assert_eq_m256d(r, e);
    }
3244
    // Bitwise OR of the IEEE-754 bit patterns: 1.0 | 0.6 yields 1.2.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_or_ps() {
        let a = _mm256_set1_ps(1.);
        let b = _mm256_set1_ps(0.6);
        let r = _mm256_or_ps(a, b);
        let e = _mm256_set1_ps(1.2);
        assert_eq_m256(r, e);
    }
3253
    // All-ones immediate selects the high element of each 128-bit pair from a and b.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_shuffle_pd() {
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
        let e = _mm256_setr_pd(4., 3., 8., 7.);
        assert_eq_m256d(r, e);
    }
3262
    // Immediate 0b00_00_11_11 picks lane 3 of a twice, then lane 0 of b twice, per 128-bit half.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_shuffle_ps() {
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
        let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
        assert_eq_m256(r, e);
    }
3271
    // (!a) & b with a == 0.0 (all-zero bits) leaves b unchanged.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_andnot_pd() {
        let a = _mm256_set1_pd(0.);
        let b = _mm256_set1_pd(0.6);
        let r = _mm256_andnot_pd(a, b);
        assert_eq_m256d(r, b);
    }
3279
    // (!a) & b with a == 0.0 (all-zero bits) leaves b unchanged.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_andnot_ps() {
        let a = _mm256_set1_ps(0.);
        let b = _mm256_set1_ps(0.6);
        let r = _mm256_andnot_ps(a, b);
        assert_eq_m256(r, b);
    }
3287
    // Lane-wise max, plus the x86-specific tie/NaN rules: for equal-magnitude
    // zeros and for a NaN first operand, the second (source) operand wins.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_max_pd() {
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_max_pd(a, b);
        let e = _mm256_setr_pd(2., 4., 6., 8.);
        assert_eq_m256d(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
        let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
        let wu: [u64; 4] = transmute(w);
        let xu: [u64; 4] = transmute(x);
        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
        assert_eq!(xu, [0u64; 4]);
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
        let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
        let yf: [f64; 4] = transmute(y);
        let zf: [f64; 4] = transmute(z);
        assert_eq!(yf, [0.0; 4]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3313
    // Lane-wise max, plus the x86-specific tie/NaN rules: for equal-magnitude
    // zeros and for a NaN first operand, the second (source) operand wins.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_max_ps() {
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_max_ps(a, b);
        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
        assert_eq_m256(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
        let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
        let wu: [u32; 8] = transmute(w);
        let xu: [u32; 8] = transmute(x);
        assert_eq!(wu, [0x8000_0000u32; 8]);
        assert_eq!(xu, [0u32; 8]);
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
        let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
        let yf: [f32; 8] = transmute(y);
        let zf: [f32; 8] = transmute(z);
        assert_eq!(yf, [0.0; 8]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3339
    // Lane-wise min, plus the x86-specific tie/NaN rules: for equal-magnitude
    // zeros and for a NaN first operand, the second (source) operand wins.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_min_pd() {
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_min_pd(a, b);
        let e = _mm256_setr_pd(1., 3., 5., 7.);
        assert_eq_m256d(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
        let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
        let wu: [u64; 4] = transmute(w);
        let xu: [u64; 4] = transmute(x);
        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
        assert_eq!(xu, [0u64; 4]);
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
        let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
        let yf: [f64; 4] = transmute(y);
        let zf: [f64; 4] = transmute(z);
        assert_eq!(yf, [0.0; 4]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3365
    // Lane-wise min, plus the x86-specific tie/NaN rules: for equal-magnitude
    // zeros and for a NaN first operand, the second (source) operand wins.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_min_ps() {
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_min_ps(a, b);
        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
        assert_eq_m256(r, e);
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
        let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
        let wu: [u32; 8] = transmute(w);
        let xu: [u32; 8] = transmute(x);
        assert_eq!(wu, [0x8000_0000u32; 8]);
        assert_eq!(xu, [0u32; 8]);
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
        let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
        let yf: [f32; 8] = transmute(y);
        let zf: [f32; 8] = transmute(z);
        assert_eq!(yf, [0.0; 8]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3391
    // Lane-wise f64 multiplication: each result lane is a[i] * b[i].
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_mul_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_mul_pd(a, b);
        let e = _mm256_setr_pd(5., 12., 21., 32.);
        assert_eq_m256d(r, e);
    }
3400
    // Lane-wise f32 multiplication: each result lane is a[i] * b[i].
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_mul_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_mul_ps(a, b);
        let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
        assert_eq_m256(r, e);
    }
3409
    // Alternating subtract/add: even lanes a[i]-b[i], odd lanes a[i]+b[i].
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_addsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_addsub_pd(a, b);
        let e = _mm256_setr_pd(-4., 8., -4., 12.);
        assert_eq_m256d(r, e);
    }
3418
    // Alternating subtract/add: even lanes a[i]-b[i], odd lanes a[i]+b[i].
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_addsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
        let r = _mm256_addsub_ps(a, b);
        let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
        assert_eq_m256(r, e);
    }
3427
    // Lane-wise f64 subtraction: each result lane is a[i] - b[i].
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_sub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_sub_pd(a, b);
        let e = _mm256_setr_pd(-4., -4., -4., -4.);
        assert_eq_m256d(r, e);
    }
3436
    // Lane-wise f32 subtraction: each result lane is a[i] - b[i].
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_sub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
        let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
        let r = _mm256_sub_ps(a, b);
        let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
        assert_eq_m256(r, e);
    }
3445
    // Rounding-mode immediates: 0b0000 = nearest, 0b0001 = toward -inf, 0b0010 = toward +inf.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_round_pd() {
        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
        let result_closest = _mm256_round_pd::<0b0000>(a);
        let result_down = _mm256_round_pd::<0b0001>(a);
        let result_up = _mm256_round_pd::<0b0010>(a);
        let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
        assert_eq_m256d(result_closest, expected_closest);
        assert_eq_m256d(result_down, expected_down);
        assert_eq_m256d(result_up, expected_up);
    }
3459
    // Floor rounds every lane toward negative infinity.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_floor_pd() {
        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
        let result_down = _mm256_floor_pd(a);
        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
        assert_eq_m256d(result_down, expected_down);
    }
3467
    // Ceil rounds every lane toward positive infinity.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_ceil_pd() {
        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
        let result_up = _mm256_ceil_pd(a);
        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
        assert_eq_m256d(result_up, expected_up);
    }
3475
    // Rounding-mode immediates: 0b0000 = nearest, 0b0001 = toward -inf, 0b0010 = toward +inf.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_round_ps() {
        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
        let result_closest = _mm256_round_ps::<0b0000>(a);
        let result_down = _mm256_round_ps::<0b0001>(a);
        let result_up = _mm256_round_ps::<0b0010>(a);
        let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
        assert_eq_m256(result_closest, expected_closest);
        assert_eq_m256(result_down, expected_down);
        assert_eq_m256(result_up, expected_up);
    }
3489
    // Floor rounds every lane toward negative infinity.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_floor_ps() {
        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
        let result_down = _mm256_floor_ps(a);
        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
        assert_eq_m256(result_down, expected_down);
    }
3497
    // Ceil rounds every lane toward positive infinity.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_ceil_ps() {
        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
        let result_up = _mm256_ceil_ps(a);
        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
        assert_eq_m256(result_up, expected_up);
    }
3505
    // Lane-wise square root on perfect squares, so results are exact.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_sqrt_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let r = _mm256_sqrt_pd(a);
        let e = _mm256_setr_pd(2., 3., 4., 5.);
        assert_eq_m256d(r, e);
    }
3513
    // Lane-wise square root on perfect squares, so results are exact.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_sqrt_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let r = _mm256_sqrt_ps(a);
        let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
        assert_eq_m256(r, e);
    }
3521
    // Lane-wise f32 division with exactly-representable quotients.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_div_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_div_ps(a, b);
        let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
        assert_eq_m256(r, e);
    }
3530
    // Lane-wise f64 division with exactly-representable quotients.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_div_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_div_pd(a, b);
        let e = _mm256_setr_pd(1., 3., 8., 5.);
        assert_eq_m256d(r, e);
    }
3539
    // Immediate mask selects lanes from b where the bit is set: 0x0 = all a,
    // 0x3 = low two lanes from b, 0xF = all b.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_blend_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_blend_pd::<0x0>(a, b);
        assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
        let r = _mm256_blend_pd::<0x3>(a, b);
        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
        let r = _mm256_blend_pd::<0xF>(a, b);
        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
    }
3551
    // 8-bit immediate mask selects lanes from b where the bit is set.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_blend_ps() {
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_blend_ps::<0x0>(a, b);
        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
        let r = _mm256_blend_ps::<0x3>(a, b);
        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
        let r = _mm256_blend_ps::<0xF>(a, b);
        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
    }
3563
    // Variable blend: lanes where the mask's sign bit is set (all-ones here) take b.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_blendv_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_setr_pd(4., 3., 2., 5.);
        let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
        let r = _mm256_blendv_pd(a, b, c);
        let e = _mm256_setr_pd(4., 9., 2., 5.);
        assert_eq_m256d(r, e);
    }
3573
    // Variable blend: lanes where the mask's sign bit is set (all-ones here) take b.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_blendv_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        #[rustfmt::skip]
        let c = _mm256_setr_ps(
            0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
        );
        let r = _mm256_blendv_ps(a, b, c);
        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
        assert_eq_m256(r, e);
    }
3586
    // Dot product with mask 0xFF: full 4-lane dot product per 128-bit half,
    // broadcast to every lane of that half.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_dp_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_dp_ps::<0xFF>(a, b);
        let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
        assert_eq_m256(r, e);
    }
3595
    // Horizontal add: within each 128-bit half, adjacent pairs of a and b are summed.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_hadd_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_hadd_pd(a, b);
        let e = _mm256_setr_pd(13., 7., 41., 7.);
        assert_eq_m256d(r, e);

        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_hadd_pd(a, b);
        let e = _mm256_setr_pd(3., 11., 7., 15.);
        assert_eq_m256d(r, e);
    }
3610
    // Horizontal add: within each 128-bit half, adjacent pairs of a and b are summed.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_hadd_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_hadd_ps(a, b);
        let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
        assert_eq_m256(r, e);

        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
        let r = _mm256_hadd_ps(a, b);
        let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
        assert_eq_m256(r, e);
    }
3625
    // Horizontal subtract: within each 128-bit half, adjacent pairs are differenced.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_hsub_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_hsub_pd(a, b);
        let e = _mm256_setr_pd(-5., 1., -9., -3.);
        assert_eq_m256d(r, e);

        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_hsub_pd(a, b);
        let e = _mm256_setr_pd(-1., -1., -1., -1.);
        assert_eq_m256d(r, e);
    }
3640
    // Horizontal subtract: within each 128-bit half, adjacent pairs are differenced.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_hsub_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_hsub_ps(a, b);
        let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
        assert_eq_m256(r, e);

        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
        let r = _mm256_hsub_ps(a, b);
        let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
        assert_eq_m256(r, e);
    }
3655
    // XOR with an all-zero operand is the identity.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_xor_pd() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let b = _mm256_set1_pd(0.);
        let r = _mm256_xor_pd(a, b);
        assert_eq_m256d(r, a);
    }
3663
    // XOR with an all-zero operand is the identity.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_xor_ps() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let b = _mm256_set1_ps(0.);
        let r = _mm256_xor_ps(a, b);
        assert_eq_m256(r, a);
    }
3671
    // GE holds in both lanes, so each lane is the all-ones true mask, whose
    // f64 bit pattern is a NaN.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm_cmp_pd() {
        let a = _mm_setr_pd(4., 9.);
        let b = _mm_setr_pd(4., 3.);
        let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
        assert!(get_m128d(r, 0).is_nan());
        assert!(get_m128d(r, 1).is_nan());
    }
3680
    // GE is false in every lane (a < b throughout), so every lane is the
    // all-zero false mask, i.e. 0.0.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cmp_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
        let e = _mm256_set1_pd(0.);
        assert_eq_m256d(r, e);
    }
3689
    // Lane 0 (4 >= 4) is true → all-ones mask reads as NaN; the other lanes
    // are false → 0.0.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm_cmp_ps() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let b = _mm_setr_ps(4., 9., 16., 25.);
        let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
        assert!(get_m128(r, 0).is_nan());
        assert_eq!(get_m128(r, 1), 0.);
        assert_eq!(get_m128(r, 2), 0.);
        assert_eq!(get_m128(r, 3), 0.);
    }
3700
    // GE is false in every lane, so every lane is the all-zero false mask.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cmp_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
        let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
        let e = _mm256_set1_ps(0.);
        assert_eq_m256(r, e);
    }
3709
    // Scalar compare writes only lane 0 (true mask → NaN pattern); lane 1 is
    // passed through from a.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm_cmp_sd() {
        let a = _mm_setr_pd(4., 9.);
        let b = _mm_setr_pd(4., 3.);
        let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
        assert!(get_m128d(r, 0).is_nan());
        assert_eq!(get_m128d(r, 1), 9.);
    }
3718
    // Scalar compare writes only lane 0 (true mask → NaN pattern); lanes 1..4
    // are passed through from a.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm_cmp_ss() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let b = _mm_setr_ps(4., 9., 16., 25.);
        let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
        assert!(get_m128(r, 0).is_nan());
        assert_eq!(get_m128(r, 1), 3.);
        assert_eq!(get_m128(r, 2), 2.);
        assert_eq!(get_m128(r, 3), 5.);
    }
3729
    // Widening i32 → f64 conversion of the four input lanes.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtepi32_pd() {
        let a = _mm_setr_epi32(4, 9, 16, 25);
        let r = _mm256_cvtepi32_pd(a);
        let e = _mm256_setr_pd(4., 9., 16., 25.);
        assert_eq_m256d(r, e);
    }
3737
    // Lane-wise i32 → f32 conversion.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtepi32_ps() {
        let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
        let r = _mm256_cvtepi32_ps(a);
        let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        assert_eq_m256(r, e);
    }
3745
    // Narrowing f64 → f32 conversion of four lanes into a 128-bit result.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtpd_ps() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let r = _mm256_cvtpd_ps(a);
        let e = _mm_setr_ps(4., 9., 16., 25.);
        assert_eq_m128(r, e);
    }
3753
    // Lane-wise f32 → i32 conversion (inputs are exact integers).
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtps_epi32() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let r = _mm256_cvtps_epi32(a);
        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
        assert_eq_m256i(r, e);
    }
3761
    // Widening f32 → f64 conversion of four lanes into a 256-bit result.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtps_pd() {
        let a = _mm_setr_ps(4., 9., 16., 25.);
        let r = _mm256_cvtps_pd(a);
        let e = _mm256_setr_pd(4., 9., 16., 25.);
        assert_eq_m256d(r, e);
    }
3769
    // Extracts the lowest f64 lane.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtsd_f64() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_cvtsd_f64(a);
        assert_eq!(r, 1.);
    }
3776
    // f64 → i32 conversion with truncation (inputs are exact integers).
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvttpd_epi32() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let r = _mm256_cvttpd_epi32(a);
        let e = _mm_setr_epi32(4, 9, 16, 25);
        assert_eq_m128i(r, e);
    }
3784
    // f64 → i32 conversion with rounding (inputs are exact integers).
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtpd_epi32() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let r = _mm256_cvtpd_epi32(a);
        let e = _mm_setr_epi32(4, 9, 16, 25);
        assert_eq_m128i(r, e);
    }
3792
    // f32 → i32 conversion with truncation (inputs are exact integers).
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvttps_epi32() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let r = _mm256_cvttps_epi32(a);
        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
        assert_eq_m256i(r, e);
    }
3800
    // Index 0 extracts the low 128-bit half.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_extractf128_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_extractf128_ps::<0>(a);
        let e = _mm_setr_ps(4., 3., 2., 5.);
        assert_eq_m128(r, e);
    }
3808
    // Index 0 extracts the low 128-bit half.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_extractf128_pd() {
        let a = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_extractf128_pd::<0>(a);
        let e = _mm_setr_pd(4., 3.);
        assert_eq_m128d(r, e);
    }
3816
    // Index 0 extracts the low 128-bit half.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_extractf128_si256() {
        let a = _mm256_setr_epi64x(4, 3, 2, 5);
        let r = _mm256_extractf128_si256::<0>(a);
        let e = _mm_setr_epi64x(4, 3);
        assert_eq_m128i(r, e);
    }
3824
3825 #[simd_test(enable = "avx")]
3826 unsafe fn test_mm256_extract_epi32() {
3827 let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
3828 let r1 = _mm256_extract_epi32::<0>(a);
3829 let r2 = _mm256_extract_epi32::<3>(a);
3830 assert_eq!(r1, -1);
3831 assert_eq!(r2, 3);
3832 }
3833
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtsi256_si32() {
        // Returns the lowest i32 lane as a scalar.
        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_cvtsi256_si32(a);
        assert_eq!(r, 1);
    }

    // The zeroall/zeroupper tests only verify the intrinsics execute without
    // faulting; their effect (clearing YMM register state) is not observable
    // from safe Rust, so there is nothing to assert.
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
    unsafe fn test_mm256_zeroall() {
        _mm256_zeroall();
    }

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
    unsafe fn test_mm256_zeroupper() {
        _mm256_zeroupper();
    }
3852
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permutevar_ps() {
        // Variable per-lane shuffle; only the low 2 bits of each selector are
        // used, and selection stays within each 128-bit half.
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_permutevar_ps(a, b);
        let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_permutevar_ps() {
        // 128-bit variant of the variable shuffle above.
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let b = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_permutevar_ps(a, b);
        let e = _mm_setr_ps(3., 2., 5., 4.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permute_ps() {
        // 0x1b = 0b00_01_10_11 reverses the four lanes within each half.
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_permute_ps::<0x1b>(a);
        let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_permute_ps() {
        // 0x1b reverses the four f32 lanes.
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let r = _mm_permute_ps::<0x1b>(a);
        let e = _mm_setr_ps(5., 2., 3., 4.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permutevar_pd() {
        // For pd the selector is bit 1 of each i64 control word: controls
        // 1,2,3,4 give low-bit-1 selections (keep, keep, swap, swap) here.
        let a = _mm256_setr_pd(4., 3., 2., 5.);
        let b = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_permutevar_pd(a, b);
        let e = _mm256_setr_pd(4., 3., 5., 2.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_permutevar_pd() {
        // Control (3, 0): bit 1 set/clear -> picks lane 1 then lane 0 (swap).
        let a = _mm_setr_pd(4., 3.);
        let b = _mm_setr_epi64x(3, 0);
        let r = _mm_permutevar_pd(a, b);
        let e = _mm_setr_pd(3., 4.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permute_pd() {
        // IMM 5 = 0b0101 swaps the pair of lanes in each 128-bit half.
        let a = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_permute_pd::<5>(a);
        let e = _mm256_setr_pd(3., 4., 5., 2.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_permute_pd() {
        // IMM 1 = 0b01 swaps the two f64 lanes.
        let a = _mm_setr_pd(4., 3.);
        let r = _mm_permute_pd::<1>(a);
        let e = _mm_setr_pd(3., 4.);
        assert_eq_m128d(r, e);
    }
3920
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permute2f128_ps() {
        // IMM selects a 128-bit half per output half: 0b0001_0011 picks
        // b-high (3) into the low result half and a-high (1) into the high.
        let a = _mm256_setr_ps(11., 12., 13., 14., 15., 16., 17., 18.);
        let b = _mm256_setr_ps(21., 22., 23., 24., 25., 26., 27., 28.);
        let r = _mm256_permute2f128_ps::<0b0001_0011>(a, b);
        let e = _mm256_setr_ps(25., 26., 27., 28., 15., 16., 17., 18.);
        assert_eq_m256(r, e);

        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
        let r = _mm256_permute2f128_ps::<0b1001_1011>(a, b);
        let z = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
        assert_eq_m256(r, z);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permute2f128_pd() {
        // 0b0011_0001: a-high into the low result half, b-high into the high.
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_permute2f128_pd::<0b0011_0001>(a, b);
        let e = _mm256_setr_pd(3., 4., 7., 8.);
        assert_eq_m256d(r, e);

        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
        let r = _mm256_permute2f128_pd::<0b1011_1001>(a, b);
        let e = _mm256_setr_pd(0.0, 0.0, 0.0, 0.0);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permute2f128_si256() {
        // 0b0010_0000: a-low into the low result half, b-low into the high.
        let a = _mm256_setr_epi32(11, 12, 13, 14, 15, 16, 17, 18);
        let b = _mm256_setr_epi32(21, 22, 23, 24, 25, 26, 27, 28);
        let r = _mm256_permute2f128_si256::<0b0010_0000>(a, b);
        let e = _mm256_setr_epi32(11, 12, 13, 14, 21, 22, 23, 24);
        assert_eq_m256i(r, e);

        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
        let r = _mm256_permute2f128_si256::<0b1010_1000>(a, b);
        let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(r, e);
    }
3962
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_broadcast_ss() {
        // Splats a single f32 loaded through a reference into all 8 lanes.
        let r = _mm256_broadcast_ss(&3.);
        let e = _mm256_set1_ps(3.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_broadcast_ss() {
        // 128-bit splat of a single f32.
        let r = _mm_broadcast_ss(&3.);
        let e = _mm_set1_ps(3.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_broadcast_sd() {
        // Splats a single f64 into all 4 lanes.
        let r = _mm256_broadcast_sd(&3.);
        let e = _mm256_set1_pd(3.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_broadcast_ps() {
        // Duplicates a whole 128-bit f32 vector into both 256-bit halves.
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let r = _mm256_broadcast_ps(&a);
        let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_broadcast_pd() {
        // Duplicates a whole 128-bit f64 vector into both halves.
        let a = _mm_setr_pd(4., 3.);
        let r = _mm256_broadcast_pd(&a);
        let e = _mm256_setr_pd(4., 3., 4., 3.);
        assert_eq_m256d(r, e);
    }
3999
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_insertf128_ps() {
        // IMM = 0 replaces the lower 128-bit half; the upper half of `a`
        // must be preserved.
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let b = _mm_setr_ps(4., 9., 16., 25.);
        let r = _mm256_insertf128_ps::<0>(a, b);
        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_insertf128_pd() {
        // Lower-half replacement, upper f64 lanes untouched.
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm256_insertf128_pd::<0>(a, b);
        let e = _mm256_setr_pd(5., 6., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_insertf128_si256() {
        // Lower-half replacement for the integer vector type.
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm_setr_epi64x(5, 6);
        let r = _mm256_insertf128_si256::<0>(a, b);
        let e = _mm256_setr_epi64x(5, 6, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_insert_epi8() {
        // Overwrites byte lane 31 (the last) with 0; all other lanes keep
        // their original values.
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm256_insert_epi8::<31>(a, 0);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 0,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_insert_epi16() {
        // Overwrites i16 lane 15 (the last) with 0.
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_insert_epi16::<15>(a, 0);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 0,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_insert_epi32() {
        // Overwrites i32 lane 7 (the last) with 0.
        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_insert_epi32::<7>(a, 0);
        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
        assert_eq_m256i(r, e);
    }
4070
    // Aligned-load tests below take the address of a stack vector value; an
    // __m256d/__m256/__m256i local is 32-byte aligned, satisfying the
    // alignment contract of _mm256_load_*/_mm256_store_*.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_load_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let p = ptr::addr_of!(a) as *const f64;
        let r = _mm256_load_pd(p);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_store_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let mut r = _mm256_undefined_pd();
        _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a);
        assert_eq_m256d(r, a);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_load_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let p = ptr::addr_of!(a) as *const f32;
        let r = _mm256_load_ps(p);
        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_store_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let mut r = _mm256_undefined_ps();
        _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a);
        assert_eq_m256(r, a);
    }

    // Unaligned loads go through black_box so the compiler cannot constant-
    // fold the load away or assume extra alignment.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_loadu_pd() {
        let a = &[1.0f64, 2., 3., 4.];
        let p = a.as_ptr();
        let r = _mm256_loadu_pd(black_box(p));
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_storeu_pd() {
        let a = _mm256_set1_pd(9.);
        let mut r = _mm256_undefined_pd();
        _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a);
        assert_eq_m256d(r, a);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_loadu_ps() {
        let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
        let p = a.as_ptr();
        let r = _mm256_loadu_ps(black_box(p));
        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_storeu_ps() {
        let a = _mm256_set1_ps(9.);
        let mut r = _mm256_undefined_ps();
        _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a);
        assert_eq_m256(r, a);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_load_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let p = ptr::addr_of!(a);
        let r = _mm256_load_si256(p);
        let e = _mm256_setr_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_store_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut r = _mm256_undefined_si256();
        _mm256_store_si256(ptr::addr_of_mut!(r), a);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_loadu_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let p = ptr::addr_of!(a);
        let r = _mm256_loadu_si256(black_box(p));
        let e = _mm256_setr_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_storeu_si256() {
        let a = _mm256_set1_epi8(9);
        let mut r = _mm256_undefined_si256();
        _mm256_storeu_si256(ptr::addr_of_mut!(r), a);
        assert_eq_m256i(r, a);
    }
4172
    // Masked load/store tests: an all-ones element (!0, sign bit set) selects
    // the lane; a zero element leaves it 0 (load) or unmodified (store).
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_maskload_pd() {
        let a = &[1.0f64, 2., 3., 4.];
        let p = a.as_ptr();
        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
        let r = _mm256_maskload_pd(black_box(p), mask);
        let e = _mm256_setr_pd(0., 2., 0., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_maskstore_pd() {
        // Destination starts at 0.0, so unstored lanes remain 0.
        let mut r = _mm256_set1_pd(0.);
        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
        let e = _mm256_setr_pd(0., 2., 0., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_maskload_pd() {
        let a = &[1.0f64, 2.];
        let p = a.as_ptr();
        let mask = _mm_setr_epi64x(0, !0);
        let r = _mm_maskload_pd(black_box(p), mask);
        let e = _mm_setr_pd(0., 2.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_maskstore_pd() {
        let mut r = _mm_set1_pd(0.);
        let mask = _mm_setr_epi64x(0, !0);
        let a = _mm_setr_pd(1., 2.);
        _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
        let e = _mm_setr_pd(0., 2.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_maskload_ps() {
        let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
        let p = a.as_ptr();
        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
        let r = _mm256_maskload_ps(black_box(p), mask);
        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_maskstore_ps() {
        let mut r = _mm256_set1_ps(0.);
        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_maskload_ps() {
        let a = &[1.0f32, 2., 3., 4.];
        let p = a.as_ptr();
        let mask = _mm_setr_epi32(0, !0, 0, !0);
        let r = _mm_maskload_ps(black_box(p), mask);
        let e = _mm_setr_ps(0., 2., 0., 4.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_maskstore_ps() {
        let mut r = _mm_set1_ps(0.);
        let mask = _mm_setr_epi32(0, !0, 0, !0);
        let a = _mm_setr_ps(1., 2., 3., 4.);
        _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
        let e = _mm_setr_ps(0., 2., 0., 4.);
        assert_eq_m128(r, e);
    }
4252
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_movehdup_ps() {
        // Duplicates each odd-indexed f32 lane into the even lane below it.
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_movehdup_ps(a);
        let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_moveldup_ps() {
        // Duplicates each even-indexed f32 lane into the odd lane above it.
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_moveldup_ps(a);
        let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_movedup_pd() {
        // Duplicates the even f64 lane of each 128-bit half.
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_movedup_pd(a);
        let e = _mm256_setr_pd(1., 1., 3., 3.);
        assert_eq_m256d(r, e);
    }
4276
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_lddqu_si256() {
        // Unaligned integer load; result must equal the source bytes exactly.
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let p = ptr::addr_of!(a);
        let r = _mm256_lddqu_si256(black_box(p));
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    unsafe fn test_mm256_stream_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut r = _mm256_undefined_si256();
        _mm256_stream_si256(ptr::addr_of_mut!(r), a);
        // sfence makes the non-temporal store visible before the read-back.
        _mm_sfence();
        assert_eq_m256i(r, a);
    }
4307
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    unsafe fn test_mm256_stream_pd() {
        // repr(align(32)) guarantees the 32-byte alignment the streaming
        // store requires.
        #[repr(align(32))]
        struct Memory {
            pub data: [f64; 4],
        }
        let a = _mm256_set1_pd(7.0);
        let mut mem = Memory { data: [-1.0; 4] };

        _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
        // Fence before reading back the non-temporal store.
        _mm_sfence();
        for i in 0..4 {
            assert_eq!(mem.data[i], get_m256d(a, i));
        }
    }

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    unsafe fn test_mm256_stream_ps() {
        #[repr(align(32))]
        struct Memory {
            pub data: [f32; 8],
        }
        let a = _mm256_set1_ps(7.0);
        let mut mem = Memory { data: [-1.0; 8] };

        _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
        _mm_sfence();
        for i in 0..8 {
            assert_eq!(mem.data[i], get_m256(a, i));
        }
    }
4341
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_rcp_ps() {
        // Approximate reciprocal: the expected values are one hardware's
        // output, so the comparison is approximate with twice the
        // architectural maximum relative error (1.5 * 2^-12).
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_rcp_ps(a);
        #[rustfmt::skip]
        let e = _mm256_setr_ps(
            0.99975586, 0.49987793, 0.33325195, 0.24993896,
            0.19995117, 0.16662598, 0.14282227, 0.12496948,
        );
        let rel_err = 0.00048828125; // 2^-11
        for i in 0..8 {
            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
        }
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_rsqrt_ps() {
        // Approximate reciprocal square root, same tolerance scheme as above.
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_rsqrt_ps(a);
        #[rustfmt::skip]
        let e = _mm256_setr_ps(
            0.99975586, 0.7069092, 0.5772705, 0.49987793,
            0.44714355, 0.40820313, 0.3779297, 0.3534546,
        );
        let rel_err = 0.00048828125; // 2^-11
        for i in 0..8 {
            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
        }
    }
4371
    // Unpack tests: interleaving happens independently within each 128-bit
    // half, which is why e.g. unpackhi_pd yields (2, 6, 4, 8) rather than
    // (3, 7, 4, 8).
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_unpackhi_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_unpackhi_pd(a, b);
        let e = _mm256_setr_pd(2., 6., 4., 8.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_unpackhi_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_unpackhi_ps(a, b);
        let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_unpacklo_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_unpacklo_pd(a, b);
        let e = _mm256_setr_pd(1., 5., 3., 7.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_unpacklo_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_unpacklo_ps(a, b);
        let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
        assert_eq_m256(r, e);
    }
4407
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testz_si256() {
        // testz returns 1 iff (a AND b) is all zero.
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testz_si256(a, b);
        assert_eq!(r, 0); // e.g. 1 & 5 != 0
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_testz_si256(a, b);
        assert_eq!(r, 1); // anything AND 0 is 0
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testc_si256() {
        // testc returns 1 iff ((NOT a) AND b) is all zero.
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testc_si256(a, b);
        assert_eq!(r, 0); // !1 & 5 has bit 2 set
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_testc_si256(a, b);
        assert_eq!(r, 1); // (NOT a) AND 0 is 0
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testnzc_si256() {
        // testnzc returns 1 iff both (a AND b) and ((NOT a) AND b) are nonzero.
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testnzc_si256(a, b);
        assert_eq!(r, 1);
        let a = _mm256_setr_epi64x(0, 0, 0, 0);
        let b = _mm256_setr_epi64x(0, 0, 0, 0);
        let r = _mm256_testnzc_si256(a, b);
        assert_eq!(r, 0);
    }
4441
    // Floating-point test ops examine only the sign bits of the lanes:
    // positive values contribute 0, negative values contribute 1.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testz_pd() {
        // All-positive inputs: no common sign bits -> 1.
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testz_pd(a, b);
        assert_eq!(r, 1);
        // All-negative vs itself: sign bits overlap -> 0.
        let a = _mm256_set1_pd(-1.);
        let r = _mm256_testz_pd(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testc_pd() {
        // b has no sign bits outside those of a -> 1.
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testc_pd(a, b);
        assert_eq!(r, 1);
        // b negative where a is positive -> 0.
        let a = _mm256_set1_pd(1.);
        let b = _mm256_set1_pd(-1.);
        let r = _mm256_testc_pd(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testnzc_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testnzc_pd(a, b);
        assert_eq!(r, 0);
        // Mixed overlap: some sign bits shared, some only in b -> 1.
        let a = _mm256_setr_pd(1., -1., -1., -1.);
        let b = _mm256_setr_pd(-1., -1., 1., 1.);
        let r = _mm256_testnzc_pd(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testz_pd() {
        // 128-bit variant of the sign-bit tests above.
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testz_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm_set1_pd(-1.);
        let r = _mm_testz_pd(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testc_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testc_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(-1.);
        let r = _mm_testc_pd(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testnzc_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testnzc_pd(a, b);
        assert_eq!(r, 0);
        let a = _mm_setr_pd(1., -1.);
        let b = _mm_setr_pd(-1., -1.);
        let r = _mm_testnzc_pd(a, b);
        assert_eq!(r, 1);
    }
4511
    // Single-precision counterparts of the sign-bit test ops above.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testz_ps() {
        // Positive vs itself: no sign bits at all -> 1.
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testz_ps(a, a);
        assert_eq!(r, 1);
        // Negative vs itself: sign bits overlap -> 0.
        let a = _mm256_set1_ps(-1.);
        let r = _mm256_testz_ps(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testc_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testc_ps(a, a);
        assert_eq!(r, 1);
        // b negative where a is positive -> 0.
        let b = _mm256_set1_ps(-1.);
        let r = _mm256_testc_ps(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testnzc_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testnzc_ps(a, a);
        assert_eq!(r, 0);
        // Mixed overlap of sign bits -> 1.
        let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
        let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
        let r = _mm256_testnzc_ps(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testz_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testz_ps(a, a);
        assert_eq!(r, 1);
        let a = _mm_set1_ps(-1.);
        let r = _mm_testz_ps(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testc_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testc_ps(a, a);
        assert_eq!(r, 1);
        let b = _mm_set1_ps(-1.);
        let r = _mm_testc_ps(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testnzc_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testnzc_ps(a, a);
        assert_eq!(r, 0);
        let a = _mm_setr_ps(1., -1., -1., -1.);
        let b = _mm_setr_ps(-1., -1., 1., 1.);
        let r = _mm_testnzc_ps(a, b);
        assert_eq!(r, 1);
    }
4573
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_movemask_pd() {
        // Collects the f64 sign bits: lanes 1 and 3 negative -> 0b1010.
        let a = _mm256_setr_pd(1., -2., 3., -4.);
        let r = _mm256_movemask_pd(a);
        assert_eq!(r, 0xA);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_movemask_ps() {
        // Collects the f32 sign bits: odd lanes negative -> 0b1010_1010.
        let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
        let r = _mm256_movemask_ps(a);
        assert_eq!(r, 0xAA);
    }
4587
    // setzero must produce the all-zero-bits vector of each type.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setzero_pd() {
        let r = _mm256_setzero_pd();
        assert_eq_m256d(r, _mm256_set1_pd(0.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setzero_ps() {
        let r = _mm256_setzero_ps();
        assert_eq_m256(r, _mm256_set1_ps(0.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setzero_si256() {
        let r = _mm256_setzero_si256();
        assert_eq_m256i(r, _mm256_set1_epi8(0));
    }
4605
    // The set_* constructors take arguments highest-lane-first, so each must
    // equal setr_* (lowest-lane-first) with the arguments reversed.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_pd() {
        let r = _mm256_set_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_ps() {
        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_epi8() {
        #[rustfmt::skip]
        let r = _mm256_set_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            32, 31, 30, 29, 28, 27, 26, 25,
            24, 23, 22, 21, 20, 19, 18, 17,
            16, 15, 14, 13, 12, 11, 10, 9,
            8, 7, 6, 5, 4, 3, 2, 1
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_epi16() {
        #[rustfmt::skip]
        let r = _mm256_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            16, 15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_epi32() {
        let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_epi64x() {
        let r = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
    }
4663
    // The setr_* tests compare a vector against itself built the same way;
    // they mainly verify the constructors compile and are self-consistent
    // (deterministic) for each element type.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setr_pd() {
        let r = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setr_ps() {
        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setr_epi8() {
        #[rustfmt::skip]
        let r = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32
        );

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setr_epi16() {
        #[rustfmt::skip]
        let r = _mm256_setr_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setr_epi32() {
        let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setr_epi64x() {
        let r = _mm256_setr_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
    }
4722
    // The set1_* tests, like the setr_* ones, check the splat constructors
    // are deterministic for each element width.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set1_pd() {
        let r = _mm256_set1_pd(1.);
        assert_eq_m256d(r, _mm256_set1_pd(1.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set1_ps() {
        let r = _mm256_set1_ps(1.);
        assert_eq_m256(r, _mm256_set1_ps(1.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set1_epi8() {
        let r = _mm256_set1_epi8(1);
        assert_eq_m256i(r, _mm256_set1_epi8(1));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set1_epi16() {
        let r = _mm256_set1_epi16(1);
        assert_eq_m256i(r, _mm256_set1_epi16(1));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set1_epi32() {
        let r = _mm256_set1_epi32(1);
        assert_eq_m256i(r, _mm256_set1_epi32(1));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set1_epi64x() {
        let r = _mm256_set1_epi64x(1);
        assert_eq_m256i(r, _mm256_set1_epi64x(1));
    }
4758
    // Bit-cast tests: the cast intrinsics reinterpret bits without
    // conversion. E.g. f64 1.0 is 0x3FF0_0000_0000_0000, whose two f32
    // halves are 0.0 (low) and 1.875 (0x3FF00000, high).
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castpd_ps() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd_ps(a);
        let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castps_pd() {
        // Inverse of the previous test: same bit patterns, roles swapped.
        let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        let r = _mm256_castps_pd(a);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castps_si256() {
        // The expected bytes are the little-endian IEEE-754 encodings of
        // 1.0..8.0 (e.g. 1.0f32 = 3F80_0000 -> bytes 0, 0, -128, 63).
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps_si256(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castsi256_ps() {
        // Inverse of the previous test.
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        let r = _mm256_castsi256_ps(a);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castpd_si256() {
        // Round-trips through transmute to confirm the cast is bit-identity.
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd_si256(a);
        assert_eq_m256d(transmute(r), a);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castsi256_pd() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_castsi256_pd(a);
        assert_eq_m256d(r, transmute(a));
    }
4816
    // 256 -> 128 casts keep the lower half; 128 -> 256 casts keep the input
    // as the lower half (the upper half is unspecified, so only the lower
    // half is compared).
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castps256_ps128() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps256_ps128(a);
        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castpd256_pd128() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd256_pd128(a);
        assert_eq_m128d(r, _mm_setr_pd(1., 2.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castsi256_si128() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_castsi256_si128(a);
        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castps128_ps256() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_castps128_ps256(a);
        assert_eq_m128(_mm256_castps256_ps128(r), a);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castpd128_pd256() {
        let a = _mm_setr_pd(1., 2.);
        let r = _mm256_castpd128_pd256(a);
        assert_eq_m128d(_mm256_castpd256_pd128(r), a);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castsi128_si256() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm256_castsi128_si256(a);
        assert_eq_m128i(_mm256_castsi256_si128(r), a);
    }
4858
4859 #[simd_test(enable = "avx")]
4860 unsafe fn test_mm256_zextps128_ps256() {
4861 let a = _mm_setr_ps(1., 2., 3., 4.);
4862 let r = _mm256_zextps128_ps256(a);
4863 let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
4864 assert_eq_m256(r, e);
4865 }
4866
4867 #[simd_test(enable = "avx")]
4868 unsafe fn test_mm256_zextsi128_si256() {
4869 let a = _mm_setr_epi64x(1, 2);
4870 let r = _mm256_zextsi128_si256(a);
4871 let e = _mm256_setr_epi64x(1, 2, 0, 0);
4872 assert_eq_m256i(r, e);
4873 }
4874
4875 #[simd_test(enable = "avx")]
4876 unsafe fn test_mm256_zextpd128_pd256() {
4877 let a = _mm_setr_pd(1., 2.);
4878 let r = _mm256_zextpd128_pd256(a);
4879 let e = _mm256_setr_pd(1., 2., 0., 0.);
4880 assert_eq_m256d(r, e);
4881 }
4882
4883 #[simd_test(enable = "avx")]
4884 unsafe fn test_mm256_set_m128() {
4885 let hi = _mm_setr_ps(5., 6., 7., 8.);
4886 let lo = _mm_setr_ps(1., 2., 3., 4.);
4887 let r = _mm256_set_m128(hi, lo);
4888 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4889 assert_eq_m256(r, e);
4890 }
4891
4892 #[simd_test(enable = "avx")]
4893 unsafe fn test_mm256_set_m128d() {
4894 let hi = _mm_setr_pd(3., 4.);
4895 let lo = _mm_setr_pd(1., 2.);
4896 let r = _mm256_set_m128d(hi, lo);
4897 let e = _mm256_setr_pd(1., 2., 3., 4.);
4898 assert_eq_m256d(r, e);
4899 }
4900
4901 #[simd_test(enable = "avx")]
4902 unsafe fn test_mm256_set_m128i() {
4903 #[rustfmt::skip]
4904 let hi = _mm_setr_epi8(
4905 17, 18, 19, 20,
4906 21, 22, 23, 24,
4907 25, 26, 27, 28,
4908 29, 30, 31, 32,
4909 );
4910 #[rustfmt::skip]
4911 let lo = _mm_setr_epi8(
4912 1, 2, 3, 4,
4913 5, 6, 7, 8,
4914 9, 10, 11, 12,
4915 13, 14, 15, 16,
4916 );
4917 let r = _mm256_set_m128i(hi, lo);
4918 #[rustfmt::skip]
4919 let e = _mm256_setr_epi8(
4920 1, 2, 3, 4, 5, 6, 7, 8,
4921 9, 10, 11, 12, 13, 14, 15, 16,
4922 17, 18, 19, 20, 21, 22, 23, 24,
4923 25, 26, 27, 28, 29, 30, 31, 32,
4924 );
4925 assert_eq_m256i(r, e);
4926 }
4927
4928 #[simd_test(enable = "avx")]
4929 unsafe fn test_mm256_setr_m128() {
4930 let lo = _mm_setr_ps(1., 2., 3., 4.);
4931 let hi = _mm_setr_ps(5., 6., 7., 8.);
4932 let r = _mm256_setr_m128(lo, hi);
4933 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4934 assert_eq_m256(r, e);
4935 }
4936
4937 #[simd_test(enable = "avx")]
4938 unsafe fn test_mm256_setr_m128d() {
4939 let lo = _mm_setr_pd(1., 2.);
4940 let hi = _mm_setr_pd(3., 4.);
4941 let r = _mm256_setr_m128d(lo, hi);
4942 let e = _mm256_setr_pd(1., 2., 3., 4.);
4943 assert_eq_m256d(r, e);
4944 }
4945
4946 #[simd_test(enable = "avx")]
4947 unsafe fn test_mm256_setr_m128i() {
4948 #[rustfmt::skip]
4949 let lo = _mm_setr_epi8(
4950 1, 2, 3, 4,
4951 5, 6, 7, 8,
4952 9, 10, 11, 12,
4953 13, 14, 15, 16,
4954 );
4955 #[rustfmt::skip]
4956 let hi = _mm_setr_epi8(
4957 17, 18, 19, 20, 21, 22, 23, 24,
4958 25, 26, 27, 28, 29, 30, 31, 32,
4959 );
4960 let r = _mm256_setr_m128i(lo, hi);
4961 #[rustfmt::skip]
4962 let e = _mm256_setr_epi8(
4963 1, 2, 3, 4, 5, 6, 7, 8,
4964 9, 10, 11, 12, 13, 14, 15, 16,
4965 17, 18, 19, 20, 21, 22, 23, 24,
4966 25, 26, 27, 28, 29, 30, 31, 32,
4967 );
4968 assert_eq_m256i(r, e);
4969 }
4970
4971 #[simd_test(enable = "avx")]
4972 unsafe fn test_mm256_loadu2_m128() {
4973 let hi = &[5., 6., 7., 8.];
4974 let hiaddr = hi.as_ptr();
4975 let lo = &[1., 2., 3., 4.];
4976 let loaddr = lo.as_ptr();
4977 let r = _mm256_loadu2_m128(hiaddr, loaddr);
4978 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4979 assert_eq_m256(r, e);
4980 }
4981
4982 #[simd_test(enable = "avx")]
4983 unsafe fn test_mm256_loadu2_m128d() {
4984 let hi = &[3., 4.];
4985 let hiaddr = hi.as_ptr();
4986 let lo = &[1., 2.];
4987 let loaddr = lo.as_ptr();
4988 let r = _mm256_loadu2_m128d(hiaddr, loaddr);
4989 let e = _mm256_setr_pd(1., 2., 3., 4.);
4990 assert_eq_m256d(r, e);
4991 }
4992
4993 #[simd_test(enable = "avx")]
4994 unsafe fn test_mm256_loadu2_m128i() {
4995 #[rustfmt::skip]
4996 let hi = _mm_setr_epi8(
4997 17, 18, 19, 20, 21, 22, 23, 24,
4998 25, 26, 27, 28, 29, 30, 31, 32,
4999 );
5000 #[rustfmt::skip]
5001 let lo = _mm_setr_epi8(
5002 1, 2, 3, 4, 5, 6, 7, 8,
5003 9, 10, 11, 12, 13, 14, 15, 16,
5004 );
5005 let r = _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _);
5006 #[rustfmt::skip]
5007 let e = _mm256_setr_epi8(
5008 1, 2, 3, 4, 5, 6, 7, 8,
5009 9, 10, 11, 12, 13, 14, 15, 16,
5010 17, 18, 19, 20, 21, 22, 23, 24,
5011 25, 26, 27, 28, 29, 30, 31, 32,
5012 );
5013 assert_eq_m256i(r, e);
5014 }
5015
5016 #[simd_test(enable = "avx")]
5017 unsafe fn test_mm256_storeu2_m128() {
5018 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5019 let mut hi = _mm_undefined_ps();
5020 let mut lo = _mm_undefined_ps();
5021 _mm256_storeu2_m128(
5022 ptr::addr_of_mut!(hi) as *mut f32,
5023 ptr::addr_of_mut!(lo) as *mut f32,
5024 a,
5025 );
5026 assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
5027 assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
5028 }
5029
5030 #[simd_test(enable = "avx")]
5031 unsafe fn test_mm256_storeu2_m128d() {
5032 let a = _mm256_setr_pd(1., 2., 3., 4.);
5033 let mut hi = _mm_undefined_pd();
5034 let mut lo = _mm_undefined_pd();
5035 _mm256_storeu2_m128d(
5036 ptr::addr_of_mut!(hi) as *mut f64,
5037 ptr::addr_of_mut!(lo) as *mut f64,
5038 a,
5039 );
5040 assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
5041 assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
5042 }
5043
5044 #[simd_test(enable = "avx")]
5045 unsafe fn test_mm256_storeu2_m128i() {
5046 #[rustfmt::skip]
5047 let a = _mm256_setr_epi8(
5048 1, 2, 3, 4, 5, 6, 7, 8,
5049 9, 10, 11, 12, 13, 14, 15, 16,
5050 17, 18, 19, 20, 21, 22, 23, 24,
5051 25, 26, 27, 28, 29, 30, 31, 32,
5052 );
5053 let mut hi = _mm_undefined_si128();
5054 let mut lo = _mm_undefined_si128();
5055 _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a);
5056 #[rustfmt::skip]
5057 let e_hi = _mm_setr_epi8(
5058 17, 18, 19, 20, 21, 22, 23, 24,
5059 25, 26, 27, 28, 29, 30, 31, 32
5060 );
5061 #[rustfmt::skip]
5062 let e_lo = _mm_setr_epi8(
5063 1, 2, 3, 4, 5, 6, 7, 8,
5064 9, 10, 11, 12, 13, 14, 15, 16
5065 );
5066
5067 assert_eq_m128i(hi, e_hi);
5068 assert_eq_m128i(lo, e_lo);
5069 }
5070
5071 #[simd_test(enable = "avx")]
5072 unsafe fn test_mm256_cvtss_f32() {
5073 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5074 let r = _mm256_cvtss_f32(a);
5075 assert_eq!(r, 1.);
5076 }
5077}
5078