1//! [AVX512BF16 intrinsics].
2//!
3//! [AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16
4
5use crate::{
6 core_arch::{simd::*, simd_llvm::*, x86::*},
7 mem::transmute,
8};
9
10#[cfg(test)]
11use stdarch_test::assert_instr;
12
// Declarations of the LLVM `llvm.x86.avx512bf16.*` intrinsics that the public wrappers in this
// file call. The numeric suffix of each link name (128/256/512) matches the bit width of the
// `f32` vector argument(s) in the signature bound to it.
// NOTE(review): `improper_ctypes` is presumably silenced because the SIMD vector types used in
// these signatures trip the FFI-safety lint — confirm.
#[allow(improper_ctypes)]
extern "C" {
    // Two `f32` vectors in, one vector with twice as many 16-bit lanes out.
    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.128"]
    fn cvtne2ps2bf16(a: f32x4, b: f32x4) -> i16x8;
    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.256"]
    fn cvtne2ps2bf16_256(a: f32x8, b: f32x8) -> i16x16;
    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.512"]
    fn cvtne2ps2bf16_512(a: f32x16, b: f32x16) -> i16x32;
    // One `f32` vector in, one vector with the same number of 16-bit lanes out.
    #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.256"]
    fn cvtneps2bf16_256(a: f32x8) -> i16x8;
    #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.512"]
    fn cvtneps2bf16_512(a: f32x16) -> i16x16;
    // `f32` accumulator plus two operands passed as `i32` lanes; the result has the
    // accumulator's type.
    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.128"]
    fn dpbf16ps(a: f32x4, b: i32x4, c: i32x4) -> f32x4;
    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.256"]
    fn dpbf16ps_256(a: f32x8, b: i32x8, c: i32x8) -> f32x8;
    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.512"]
    fn dpbf16ps_512(a: f32x16, b: i32x16, c: i32x16) -> f32x16;
}
32
33/// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors
34/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
35/// 128-bit wide vector.
36/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_cvtne2ps_pbh)
37#[inline]
38#[target_feature(enable = "avx512bf16,avx512vl")]
39#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
40pub unsafe fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh {
41 transmute(src:cvtne2ps2bf16(a:a.as_f32x4(), b:b.as_f32x4()))
42}
43
44/// Convert packed single-precision (32-bit) floating-point elements in two vectors
45/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
46/// in single vector dst using writemask k (elements are copied from src when the
47/// corresponding mask bit is not set).
48/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_mask_cvtne2ps_pbh)
49#[inline]
50#[target_feature(enable = "avx512bf16,avx512vl")]
51#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
52pub unsafe fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh {
53 let cvt: u16x8 = _mm_cvtne2ps_pbh(a, b).as_u16x8();
54 transmute(src:simd_select_bitmask(m:k, a:cvt, b:src.as_u16x8()))
55}
56
57/// Convert packed single-precision (32-bit) floating-point elements in two vectors
58/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
59/// in single vector dst using zeromask k (elements are zeroed out when the corresponding
60/// mask bit is not set).
61/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_maskz_cvtne2ps_pbh)
62#[inline]
63#[target_feature(enable = "avx512bf16,avx512vl")]
64#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
65pub unsafe fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh {
66 let cvt: u16x8 = _mm_cvtne2ps_pbh(a, b).as_u16x8();
67 let zero: u16x8 = _mm_setzero_si128().as_u16x8();
68 transmute(src:simd_select_bitmask(m:k, a:cvt, b:zero))
69}
70
71/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors
72/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
73/// 256-bit wide vector.
74/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_cvtne2ps_pbh)
75#[inline]
76#[target_feature(enable = "avx512bf16,avx512vl")]
77#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
78pub unsafe fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh {
79 transmute(src:cvtne2ps2bf16_256(a:a.as_f32x8(), b:b.as_f32x8()))
80}
81
82/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
83/// to packed BF16 (16-bit) floating-point elements and store the results in single vector
84/// dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
85/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh)
86#[inline]
87#[target_feature(enable = "avx512bf16,avx512vl")]
88#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
89pub unsafe fn _mm256_mask_cvtne2ps_pbh(
90 src: __m256bh,
91 k: __mmask16,
92 a: __m256,
93 b: __m256,
94) -> __m256bh {
95 let cvt: u16x16 = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
96 transmute(src:simd_select_bitmask(m:k, a:cvt, b:src.as_u16x16()))
97}
98
99/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
100/// to packed BF16 (16-bit) floating-point elements, and store the results in single vector
101/// dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
102/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtne2ps_pbh)
103#[inline]
104#[target_feature(enable = "avx512bf16,avx512vl")]
105#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
106pub unsafe fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh {
107 let cvt: u16x16 = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
108 let zero: u16x16 = _mm256_setzero_si256().as_u16x16();
109 transmute(src:simd_select_bitmask(m:k, a:cvt, b:zero))
110}
111
112/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors
113/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
114/// 512-bit wide vector.
115/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_cvtne2ps_pbh)
116#[inline]
117#[target_feature(enable = "avx512bf16,avx512f")]
118#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
119pub unsafe fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh {
120 transmute(src:cvtne2ps2bf16_512(a:a.as_f32x16(), b:b.as_f32x16()))
121}
122
123/// Convert packed single-precision (32-bit) floating-point elements in two vectors
124/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
125/// in single vector dst using writemask k (elements are copied from src when the
126/// corresponding mask bit is not set).
127/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_mask_cvtne2ps_pbh)
128#[inline]
129#[target_feature(enable = "avx512bf16,avx512f")]
130#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
131pub unsafe fn _mm512_mask_cvtne2ps_pbh(
132 src: __m512bh,
133 k: __mmask32,
134 a: __m512,
135 b: __m512,
136) -> __m512bh {
137 let cvt: u16x32 = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
138 transmute(src:simd_select_bitmask(m:k, a:cvt, b:src.as_u16x32()))
139}
140
141/// Convert packed single-precision (32-bit) floating-point elements in two vectors
142/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
143/// in single vector dst using zeromask k (elements are zeroed out when the corresponding
144/// mask bit is not set).
145/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtne2ps_pbh)
146#[inline]
147#[target_feature(enable = "avx512bf16,avx512f")]
148#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
149pub unsafe fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh {
150 let cvt: u16x32 = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
151 let zero: u16x32 = _mm512_setzero_si512().as_u16x32();
152 transmute(src:simd_select_bitmask(m:k, a:cvt, b:zero))
153}
154
155/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
156/// floating-point elements, and store the results in dst.
157/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_cvtneps_pbh)
158#[inline]
159#[target_feature(enable = "avx512bf16,avx512vl")]
160#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
161pub unsafe fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh {
162 transmute(src:cvtneps2bf16_256(a.as_f32x8()))
163}
164
165/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
166/// floating-point elements, and store the results in dst using writemask k
167/// (elements are copied from src when the corresponding mask bit is not set).
168/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_cvtneps_pbh)
169#[inline]
170#[target_feature(enable = "avx512bf16,avx512vl")]
171#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
172pub unsafe fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh {
173 let cvt: u16x8 = _mm256_cvtneps_pbh(a).as_u16x8();
174 transmute(src:simd_select_bitmask(m:k, a:cvt, b:src.as_u16x8()))
175}
176
177/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
178/// floating-point elements, and store the results in dst using zeromask k
179/// (elements are zeroed out when the corresponding mask bit is not set).
180/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtneps_pbh)
181#[inline]
182#[target_feature(enable = "avx512bf16,avx512vl")]
183#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
184pub unsafe fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh {
185 let cvt: u16x8 = _mm256_cvtneps_pbh(a).as_u16x8();
186 let zero: u16x8 = _mm_setzero_si128().as_u16x8();
187 transmute(src:simd_select_bitmask(m:k, a:cvt, b:zero))
188}
189
190/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
191/// floating-point elements, and store the results in dst.
192/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_cvtneps_pbh)
193#[inline]
194#[target_feature(enable = "avx512bf16,avx512f")]
195#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
196pub unsafe fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh {
197 transmute(src:cvtneps2bf16_512(a.as_f32x16()))
198}
199
200/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
201/// floating-point elements, and store the results in dst using writemask k
202/// (elements are copied from src when the corresponding mask bit is not set).
203/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_cvtneps_pbh)
204#[inline]
205#[target_feature(enable = "avx512bf16,avx512f")]
206#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
207pub unsafe fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh {
208 let cvt: u16x16 = _mm512_cvtneps_pbh(a).as_u16x16();
209 transmute(src:simd_select_bitmask(m:k, a:cvt, b:src.as_u16x16()))
210}
211
212/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
213/// floating-point elements, and store the results in dst using zeromask k
214/// (elements are zeroed out when the corresponding mask bit is not set).
215/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtneps_pbh)
216#[inline]
217#[target_feature(enable = "avx512bf16,avx512f")]
218#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
219pub unsafe fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh {
220 let cvt: u16x16 = _mm512_cvtneps_pbh(a).as_u16x16();
221 let zero: u16x16 = _mm256_setzero_si256().as_u16x16();
222 transmute(src:simd_select_bitmask(m:k, a:cvt, b:zero))
223}
224
225/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
226/// accumulating the intermediate single-precision (32-bit) floating-point elements
227/// with elements in src, and store the results in dst.
228/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_dpbf16_ps)
229#[inline]
230#[target_feature(enable = "avx512bf16,avx512vl")]
231#[cfg_attr(test, assert_instr("vdpbf16ps"))]
232pub unsafe fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
233 transmute(src:dpbf16ps(a:src.as_f32x4(), b:a.as_i32x4(), c:b.as_i32x4()))
234}
235
236/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
237/// accumulating the intermediate single-precision (32-bit) floating-point elements
238/// with elements in src, and store the results in dst using writemask k
239/// (elements are copied from src when the corresponding mask bit is not set).
240/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_mask_dpbf16_ps)
241#[inline]
242#[target_feature(enable = "avx512bf16,avx512vl")]
243#[cfg_attr(test, assert_instr("vdpbf16ps"))]
244pub unsafe fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 {
245 let rst: f32x4 = _mm_dpbf16_ps(src, a, b).as_f32x4();
246 transmute(src:simd_select_bitmask(m:k, a:rst, b:src.as_f32x4()))
247}
248
249/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
250/// accumulating the intermediate single-precision (32-bit) floating-point elements
251/// with elements in src, and store the results in dst using zeromask k
252/// (elements are zeroed out when the corresponding mask bit is not set).
253/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_maskz_dpbf16_ps)
254#[inline]
255#[target_feature(enable = "avx512bf16,avx512vl")]
256#[cfg_attr(test, assert_instr("vdpbf16ps"))]
257pub unsafe fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
258 let rst: f32x4 = _mm_dpbf16_ps(src, a, b).as_f32x4();
259 let zero: f32x4 = _mm_set1_ps(0.0_f32).as_f32x4();
260 transmute(src:simd_select_bitmask(m:k, a:rst, b:zero))
261}
262
263/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
264/// accumulating the intermediate single-precision (32-bit) floating-point elements
265/// with elements in src, and store the results in dst.
266/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_dpbf16_ps)
267#[inline]
268#[target_feature(enable = "avx512bf16,avx512vl")]
269#[cfg_attr(test, assert_instr("vdpbf16ps"))]
270pub unsafe fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
271 transmute(src:dpbf16ps_256(a:src.as_f32x8(), b:a.as_i32x8(), c:b.as_i32x8()))
272}
273
274/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
275/// accumulating the intermediate single-precision (32-bit) floating-point elements
276/// with elements in src, and store the results in dst using writemask k
277/// (elements are copied from src when the corresponding mask bit is not set).
278/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_dpbf16_ps)
279#[inline]
280#[target_feature(enable = "avx512bf16,avx512vl")]
281#[cfg_attr(test, assert_instr("vdpbf16ps"))]
282pub unsafe fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 {
283 let rst: f32x8 = _mm256_dpbf16_ps(src, a, b).as_f32x8();
284 transmute(src:simd_select_bitmask(m:k, a:rst, b:src.as_f32x8()))
285}
286
287/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
288/// accumulating the intermediate single-precision (32-bit) floating-point elements
289/// with elements in src, and store the results in dst using zeromask k
290/// (elements are zeroed out when the corresponding mask bit is not set).
291/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_dpbf16_ps)
292#[inline]
293#[target_feature(enable = "avx512bf16,avx512vl")]
294#[cfg_attr(test, assert_instr("vdpbf16ps"))]
295pub unsafe fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
296 let rst: f32x8 = _mm256_dpbf16_ps(src, a, b).as_f32x8();
297 let zero: f32x8 = _mm256_setzero_ps().as_f32x8();
298 transmute(src:simd_select_bitmask(m:k, a:rst, b:zero))
299}
300
301/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
302/// accumulating the intermediate single-precision (32-bit) floating-point elements
303/// with elements in src, and store the results in dst.Compute dot-product of BF16 (16-bit)
304/// floating-point pairs in a and b, accumulating the intermediate single-precision (32-bit)
305/// floating-point elements with elements in src, and store the results in dst.
306/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_dpbf16_ps)
307#[inline]
308#[target_feature(enable = "avx512bf16,avx512f")]
309#[cfg_attr(test, assert_instr("vdpbf16ps"))]
310pub unsafe fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
311 transmute(src:dpbf16ps_512(a:src.as_f32x16(), b:a.as_i32x16(), c:b.as_i32x16()))
312}
313
314/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
315/// accumulating the intermediate single-precision (32-bit) floating-point elements
316/// with elements in src, and store the results in dst using writemask k
317/// (elements are copied from src when the corresponding mask bit is not set).
318/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_dpbf16_ps)
319#[inline]
320#[target_feature(enable = "avx512bf16,avx512f")]
321#[cfg_attr(test, assert_instr("vdpbf16ps"))]
322pub unsafe fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 {
323 let rst: f32x16 = _mm512_dpbf16_ps(src, a, b).as_f32x16();
324 transmute(src:simd_select_bitmask(m:k, a:rst, b:src.as_f32x16()))
325}
326
327/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
328/// accumulating the intermediate single-precision (32-bit) floating-point elements
329/// with elements in src, and store the results in dst using zeromask k
330/// (elements are zeroed out when the corresponding mask bit is not set).
331/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_dpbf16_ps)
332#[inline]
333#[target_feature(enable = "avx512bf16,avx512f")]
334#[cfg_attr(test, assert_instr("vdpbf16ps"))]
335pub unsafe fn _mm512_maskz_dpbf16_ps(
336 k: __mmask16,
337 src: __m512,
338 a: __m512bh,
339 b: __m512bh,
340) -> __m512 {
341 let rst: f32x16 = _mm512_dpbf16_ps(src, a, b).as_f32x16();
342 let zero: f32x16 = _mm512_setzero_ps().as_f32x16();
343 transmute(src:simd_select_bitmask(m:k, a:rst, b:zero))
344}
345
346#[cfg(test)]
347mod tests {
348 use crate::{core_arch::x86::*, mem::transmute};
349 use stdarch_test::simd_test;
350
351 #[simd_test(enable = "avx512bf16,avx512vl")]
352 unsafe fn test_mm_cvtne2ps_pbh() {
353 let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
354 let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
355 let a: __m128 = transmute(a_array);
356 let b: __m128 = transmute(b_array);
357 let c: __m128bh = _mm_cvtne2ps_pbh(a, b);
358 let result: [u16; 8] = transmute(c.as_u16x8());
359 #[rustfmt::skip]
360 let expected_result: [u16; 8] = [
361 0b1_10000110_0110010,
362 0b1_10000010_0101000,
363 0b1_10000000_1110000,
364 0b1_10000100_1001001,
365 0b0_10000110_0110010,
366 0b0_10000010_0101000,
367 0b0_10000000_1110000,
368 0b0_10000100_1001001,
369 ];
370 assert_eq!(result, expected_result);
371 }
372
373 #[simd_test(enable = "avx512bf16,avx512vl")]
374 unsafe fn test_mm_mask_cvtne2ps_pbh() {
375 let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
376 let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
377 #[rustfmt::skip]
378 let src_array: [u16; 8] = [
379 0b0_10000110_0110010,
380 0b0_10000010_0101000,
381 0b0_10000000_1110000,
382 0b0_10000100_1001001,
383 0b0_10000110_0110010,
384 0b0_10000010_0101000,
385 0b0_10000000_1110000,
386 0b0_10000100_1001001,
387 ];
388 let src: __m128bh = transmute(src_array);
389 let a: __m128 = transmute(a_array);
390 let b: __m128 = transmute(b_array);
391 let k: __mmask8 = 0b1111_1111;
392 let c: __m128bh = _mm_mask_cvtne2ps_pbh(src, k, a, b);
393 let result: [u16; 8] = transmute(c.as_u16x8());
394 #[rustfmt::skip]
395 let expected_result: [u16; 8] = [
396 0b1_10000110_0110010,
397 0b1_10000010_0101000,
398 0b1_10000000_1110000,
399 0b1_10000100_1001001,
400 0b0_10000110_0110010,
401 0b0_10000010_0101000,
402 0b0_10000000_1110000,
403 0b0_10000100_1001001,
404 ];
405 assert_eq!(result, expected_result);
406 let k = 0b0000_0000;
407 let c = _mm_mask_cvtne2ps_pbh(src, k, a, b);
408 let result: [u16; 8] = transmute(c.as_u16x8());
409 let expected_result = src_array;
410 assert_eq!(result, expected_result);
411 }
412
413 #[simd_test(enable = "avx512bf16,avx512vl")]
414 unsafe fn test_mm_maskz_cvtne2ps_pbh() {
415 let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
416 let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
417 let a: __m128 = transmute(a_array);
418 let b: __m128 = transmute(b_array);
419 let k: __mmask8 = 0b1111_1111;
420 let c: __m128bh = _mm_maskz_cvtne2ps_pbh(k, a, b);
421 let result: [u16; 8] = transmute(c.as_u16x8());
422 #[rustfmt::skip]
423 let expected_result: [u16; 8] = [
424 0b1_10000110_0110010,
425 0b1_10000010_0101000,
426 0b1_10000000_1110000,
427 0b1_10000100_1001001,
428 0b0_10000110_0110010,
429 0b0_10000010_0101000,
430 0b0_10000000_1110000,
431 0b0_10000100_1001001,
432 ];
433 assert_eq!(result, expected_result);
434 let k = 0b0011_1100;
435 let c = _mm_maskz_cvtne2ps_pbh(k, a, b);
436 let result: [u16; 8] = transmute(c.as_u16x8());
437 #[rustfmt::skip]
438 let expected_result: [u16; 8] = [
439 0,
440 0,
441 0b1_10000000_1110000,
442 0b1_10000100_1001001,
443 0b0_10000110_0110010,
444 0b0_10000010_0101000,
445 0,
446 0,
447 ];
448 assert_eq!(result, expected_result);
449 }
450
451 #[simd_test(enable = "avx512bf16,avx512vl")]
452 unsafe fn test_mm256_cvtne2ps_pbh() {
453 #[rustfmt::skip]
454 let a_array = [
455 178.125_f32,
456 10.5_f32,
457 3.75_f32,
458 50.25_f32,
459 16.5_f32,
460 255.11_f32,
461 1000.158_f32,
462 575.575_f32,
463 ];
464 let b_array = [
465 -178.125_f32,
466 -10.5_f32,
467 -3.75_f32,
468 -50.25_f32,
469 -16.5_f32,
470 -255.11_f32,
471 -1000.158_f32,
472 -575.575_f32,
473 ];
474 let a: __m256 = transmute(a_array);
475 let b: __m256 = transmute(b_array);
476 let c: __m256bh = _mm256_cvtne2ps_pbh(a, b);
477 let result: [u16; 16] = transmute(c.as_u16x16());
478 #[rustfmt::skip]
479 let expected_result: [u16; 16] = [
480 0b1_10000110_0110010,
481 0b1_10000010_0101000,
482 0b1_10000000_1110000,
483 0b1_10000100_1001001,
484 0b1_10000011_0000100,
485 0b1_10000110_1111111,
486 0b1_10001000_1111010,
487 0b1_10001000_0010000,
488 0b0_10000110_0110010,
489 0b0_10000010_0101000,
490 0b0_10000000_1110000,
491 0b0_10000100_1001001,
492 0b0_10000011_0000100,
493 0b0_10000110_1111111,
494 0b0_10001000_1111010,
495 0b0_10001000_0010000,
496 ];
497 assert_eq!(result, expected_result);
498 }
499
500 #[simd_test(enable = "avx512bf16,avx512vl")]
501 unsafe fn test_mm256_mask_cvtne2ps_pbh() {
502 #[rustfmt::skip]
503 let a_array = [
504 178.125_f32,
505 10.5_f32,
506 3.75_f32,
507 50.25_f32,
508 16.5_f32,
509 255.11_f32,
510 1000.158_f32,
511 575.575_f32,
512 ];
513 let b_array = [
514 -178.125_f32,
515 -10.5_f32,
516 -3.75_f32,
517 -50.25_f32,
518 -16.5_f32,
519 -255.11_f32,
520 -1000.158_f32,
521 -575.575_f32,
522 ];
523 let src_array: [u16; 16] = [
524 0b0_10000110_0110010,
525 0b0_10000010_0101000,
526 0b0_10000000_1110000,
527 0b0_10000100_1001001,
528 0b0_10000110_0110010,
529 0b0_10000010_0101000,
530 0b0_10000000_1110000,
531 0b0_10000100_1001001,
532 0b0_10000110_0110010,
533 0b0_10000010_0101000,
534 0b0_10000000_1110000,
535 0b0_10000100_1001001,
536 0b0_10000110_0110010,
537 0b0_10000010_0101000,
538 0b0_10000000_1110000,
539 0b0_10000100_1001001,
540 ];
541 let src: __m256bh = transmute(src_array);
542 let a: __m256 = transmute(a_array);
543 let b: __m256 = transmute(b_array);
544 let k: __mmask16 = 0xffff;
545 let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
546 let result: [u16; 16] = transmute(c.as_u16x16());
547 #[rustfmt::skip]
548 let expected_result: [u16; 16] = [
549 0b1_10000110_0110010,
550 0b1_10000010_0101000,
551 0b1_10000000_1110000,
552 0b1_10000100_1001001,
553 0b1_10000011_0000100,
554 0b1_10000110_1111111,
555 0b1_10001000_1111010,
556 0b1_10001000_0010000,
557 0b0_10000110_0110010,
558 0b0_10000010_0101000,
559 0b0_10000000_1110000,
560 0b0_10000100_1001001,
561 0b0_10000011_0000100,
562 0b0_10000110_1111111,
563 0b0_10001000_1111010,
564 0b0_10001000_0010000,
565 ];
566 assert_eq!(result, expected_result);
567 let k: __mmask16 = 0;
568 let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
569 let result: [u16; 16] = transmute(c.as_u16x16());
570 let expected_result = src_array;
571 assert_eq!(result, expected_result);
572 }
573
574 #[simd_test(enable = "avx512bf16,avx512vl")]
575 unsafe fn test_mm256_maskz_cvtne2ps_pbh() {
576 #[rustfmt::skip]
577 let a_array = [
578 178.125_f32,
579 10.5_f32,
580 3.75_f32,
581 50.25_f32,
582 16.5_f32,
583 255.11_f32,
584 1000.158_f32,
585 575.575_f32,
586 ];
587 let b_array = [
588 -178.125_f32,
589 -10.5_f32,
590 -3.75_f32,
591 -50.25_f32,
592 -16.5_f32,
593 -255.11_f32,
594 -1000.158_f32,
595 -575.575_f32,
596 ];
597 let a: __m256 = transmute(a_array);
598 let b: __m256 = transmute(b_array);
599 let k: __mmask16 = 0xffff;
600 let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
601 let result: [u16; 16] = transmute(c.as_u16x16());
602 #[rustfmt::skip]
603 let expected_result: [u16; 16] = [
604 0b1_10000110_0110010,
605 0b1_10000010_0101000,
606 0b1_10000000_1110000,
607 0b1_10000100_1001001,
608 0b1_10000011_0000100,
609 0b1_10000110_1111111,
610 0b1_10001000_1111010,
611 0b1_10001000_0010000,
612 0b0_10000110_0110010,
613 0b0_10000010_0101000,
614 0b0_10000000_1110000,
615 0b0_10000100_1001001,
616 0b0_10000011_0000100,
617 0b0_10000110_1111111,
618 0b0_10001000_1111010,
619 0b0_10001000_0010000,
620 ];
621 assert_eq!(result, expected_result);
622 let k: __mmask16 = 0b0110_1100_0011_0110;
623 let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
624 let result: [u16; 16] = transmute(c.as_u16x16());
625 #[rustfmt::skip]
626 let expected_result: [u16; 16] = [
627 0,
628 0b1_10000010_0101000,
629 0b1_10000000_1110000,
630 0,
631 0b1_10000011_0000100,
632 0b1_10000110_1111111,
633 0,
634 0,
635 0,
636 0,
637 0b0_10000000_1110000,
638 0b0_10000100_1001001,
639 0,
640 0b0_10000110_1111111,
641 0b0_10001000_1111010,
642 0,
643 ];
644 assert_eq!(result, expected_result);
645 }
646
647 #[simd_test(enable = "avx512bf16,avx512f")]
648 unsafe fn test_mm512_cvtne2ps_pbh() {
649 #[rustfmt::skip]
650 let a_array = [
651 178.125_f32,
652 10.5_f32,
653 3.75_f32,
654 50.25_f32,
655 16.5_f32,
656 255.11_f32,
657 1000.158_f32,
658 575.575_f32,
659 178.125_f32,
660 10.5_f32,
661 3.75_f32,
662 50.25_f32,
663 16.5_f32,
664 255.11_f32,
665 1000.158_f32,
666 575.575_f32,
667 ];
668 let b_array = [
669 -178.125_f32,
670 -10.5_f32,
671 -3.75_f32,
672 -50.25_f32,
673 -16.5_f32,
674 -255.11_f32,
675 -1000.158_f32,
676 -575.575_f32,
677 -178.125_f32,
678 -10.5_f32,
679 -3.75_f32,
680 -50.25_f32,
681 -16.5_f32,
682 -255.11_f32,
683 -1000.158_f32,
684 -575.575_f32,
685 ];
686 let a: __m512 = transmute(a_array);
687 let b: __m512 = transmute(b_array);
688 let c: __m512bh = _mm512_cvtne2ps_pbh(a, b);
689 let result: [u16; 32] = transmute(c.as_u16x32());
690 #[rustfmt::skip]
691 let expected_result: [u16; 32] = [
692 0b1_10000110_0110010,
693 0b1_10000010_0101000,
694 0b1_10000000_1110000,
695 0b1_10000100_1001001,
696 0b1_10000011_0000100,
697 0b1_10000110_1111111,
698 0b1_10001000_1111010,
699 0b1_10001000_0010000,
700 0b1_10000110_0110010,
701 0b1_10000010_0101000,
702 0b1_10000000_1110000,
703 0b1_10000100_1001001,
704 0b1_10000011_0000100,
705 0b1_10000110_1111111,
706 0b1_10001000_1111010,
707 0b1_10001000_0010000,
708 0b0_10000110_0110010,
709 0b0_10000010_0101000,
710 0b0_10000000_1110000,
711 0b0_10000100_1001001,
712 0b0_10000011_0000100,
713 0b0_10000110_1111111,
714 0b0_10001000_1111010,
715 0b0_10001000_0010000,
716 0b0_10000110_0110010,
717 0b0_10000010_0101000,
718 0b0_10000000_1110000,
719 0b0_10000100_1001001,
720 0b0_10000011_0000100,
721 0b0_10000110_1111111,
722 0b0_10001000_1111010,
723 0b0_10001000_0010000,
724 ];
725 assert_eq!(result, expected_result);
726 }
727
728 #[simd_test(enable = "avx512bf16,avx512f")]
729 unsafe fn test_mm512_mask_cvtne2ps_pbh() {
730 #[rustfmt::skip]
731 let a_array = [
732 178.125_f32,
733 10.5_f32,
734 3.75_f32,
735 50.25_f32,
736 16.5_f32,
737 255.11_f32,
738 1000.158_f32,
739 575.575_f32,
740 178.125_f32,
741 10.5_f32,
742 3.75_f32,
743 50.25_f32,
744 16.5_f32,
745 255.11_f32,
746 1000.158_f32,
747 575.575_f32,
748 ];
749 let b_array = [
750 -178.125_f32,
751 -10.5_f32,
752 -3.75_f32,
753 -50.25_f32,
754 -16.5_f32,
755 -255.11_f32,
756 -1000.158_f32,
757 -575.575_f32,
758 -178.125_f32,
759 -10.5_f32,
760 -3.75_f32,
761 -50.25_f32,
762 -16.5_f32,
763 -255.11_f32,
764 -1000.158_f32,
765 -575.575_f32,
766 ];
767 let src_array: [u16; 32] = [
768 0b0_10000110_0110010,
769 0b0_10000010_0101000,
770 0b0_10000000_1110000,
771 0b0_10000100_1001001,
772 0b0_10000110_0110010,
773 0b0_10000010_0101000,
774 0b0_10000000_1110000,
775 0b0_10000100_1001001,
776 0b0_10000110_0110010,
777 0b0_10000010_0101000,
778 0b0_10000000_1110000,
779 0b0_10000100_1001001,
780 0b0_10000110_0110010,
781 0b0_10000010_0101000,
782 0b0_10000000_1110000,
783 0b0_10000100_1001001,
784 0b0_10000110_0110010,
785 0b0_10000010_0101000,
786 0b0_10000000_1110000,
787 0b0_10000100_1001001,
788 0b0_10000110_0110010,
789 0b0_10000010_0101000,
790 0b0_10000000_1110000,
791 0b0_10000100_1001001,
792 0b0_10000110_0110010,
793 0b0_10000010_0101000,
794 0b0_10000000_1110000,
795 0b0_10000100_1001001,
796 0b0_10000110_0110010,
797 0b0_10000010_0101000,
798 0b0_10000000_1110000,
799 0b0_10000100_1001001,
800 ];
801 let src: __m512bh = transmute(src_array);
802 let a: __m512 = transmute(a_array);
803 let b: __m512 = transmute(b_array);
804 let k: __mmask32 = 0xffffffff;
805 let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
806 let result: [u16; 32] = transmute(c.as_u16x32());
807 #[rustfmt::skip]
808 let expected_result: [u16; 32] = [
809 0b1_10000110_0110010,
810 0b1_10000010_0101000,
811 0b1_10000000_1110000,
812 0b1_10000100_1001001,
813 0b1_10000011_0000100,
814 0b1_10000110_1111111,
815 0b1_10001000_1111010,
816 0b1_10001000_0010000,
817 0b1_10000110_0110010,
818 0b1_10000010_0101000,
819 0b1_10000000_1110000,
820 0b1_10000100_1001001,
821 0b1_10000011_0000100,
822 0b1_10000110_1111111,
823 0b1_10001000_1111010,
824 0b1_10001000_0010000,
825 0b0_10000110_0110010,
826 0b0_10000010_0101000,
827 0b0_10000000_1110000,
828 0b0_10000100_1001001,
829 0b0_10000011_0000100,
830 0b0_10000110_1111111,
831 0b0_10001000_1111010,
832 0b0_10001000_0010000,
833 0b0_10000110_0110010,
834 0b0_10000010_0101000,
835 0b0_10000000_1110000,
836 0b0_10000100_1001001,
837 0b0_10000011_0000100,
838 0b0_10000110_1111111,
839 0b0_10001000_1111010,
840 0b0_10001000_0010000,
841 ];
842 assert_eq!(result, expected_result);
843 let k: __mmask32 = 0;
844 let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
845 let result: [u16; 32] = transmute(c.as_u16x32());
846 let expected_result = src_array;
847 assert_eq!(result, expected_result);
848 }
849
850 #[simd_test(enable = "avx512bf16,avx512f")]
851 unsafe fn test_mm512_maskz_cvtne2ps_pbh() {
852 #[rustfmt::skip]
853 let a_array = [
854 178.125_f32,
855 10.5_f32,
856 3.75_f32,
857 50.25_f32,
858 16.5_f32,
859 255.11_f32,
860 1000.158_f32,
861 575.575_f32,
862 178.125_f32,
863 10.5_f32,
864 3.75_f32,
865 50.25_f32,
866 16.5_f32,
867 255.11_f32,
868 1000.158_f32,
869 575.575_f32,
870 ];
871 let b_array = [
872 -178.125_f32,
873 -10.5_f32,
874 -3.75_f32,
875 -50.25_f32,
876 -16.5_f32,
877 -255.11_f32,
878 -1000.158_f32,
879 -575.575_f32,
880 -178.125_f32,
881 -10.5_f32,
882 -3.75_f32,
883 -50.25_f32,
884 -16.5_f32,
885 -255.11_f32,
886 -1000.158_f32,
887 -575.575_f32,
888 ];
889 let a: __m512 = transmute(a_array);
890 let b: __m512 = transmute(b_array);
891 let k: __mmask32 = 0xffffffff;
892 let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
893 let result: [u16; 32] = transmute(c.as_u16x32());
894 #[rustfmt::skip]
895 let expected_result: [u16; 32] = [
896 0b1_10000110_0110010,
897 0b1_10000010_0101000,
898 0b1_10000000_1110000,
899 0b1_10000100_1001001,
900 0b1_10000011_0000100,
901 0b1_10000110_1111111,
902 0b1_10001000_1111010,
903 0b1_10001000_0010000,
904 0b1_10000110_0110010,
905 0b1_10000010_0101000,
906 0b1_10000000_1110000,
907 0b1_10000100_1001001,
908 0b1_10000011_0000100,
909 0b1_10000110_1111111,
910 0b1_10001000_1111010,
911 0b1_10001000_0010000,
912 0b0_10000110_0110010,
913 0b0_10000010_0101000,
914 0b0_10000000_1110000,
915 0b0_10000100_1001001,
916 0b0_10000011_0000100,
917 0b0_10000110_1111111,
918 0b0_10001000_1111010,
919 0b0_10001000_0010000,
920 0b0_10000110_0110010,
921 0b0_10000010_0101000,
922 0b0_10000000_1110000,
923 0b0_10000100_1001001,
924 0b0_10000011_0000100,
925 0b0_10000110_1111111,
926 0b0_10001000_1111010,
927 0b0_10001000_0010000,
928 ];
929 assert_eq!(result, expected_result);
930 let k: __mmask32 = 0b1100_1010_1001_0110_1010_0011_0101_0110;
931 let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
932 let result: [u16; 32] = transmute(c.as_u16x32());
933 #[rustfmt::skip]
934 let expected_result: [u16; 32] = [
935 0,
936 0b1_10000010_0101000,
937 0b1_10000000_1110000,
938 0,
939 0b1_10000011_0000100,
940 0,
941 0b1_10001000_1111010,
942 0,
943 0b1_10000110_0110010,
944 0b1_10000010_0101000,
945 0,
946 0,
947 0,
948 0b1_10000110_1111111,
949 0,
950 0b1_10001000_0010000,
951 0,
952 0b0_10000010_0101000,
953 0b0_10000000_1110000,
954 0,
955 0b0_10000011_0000100,
956 0,
957 0,
958 0b0_10001000_0010000,
959 0,
960 0b0_10000010_0101000,
961 0,
962 0b0_10000100_1001001,
963 0,
964 0,
965 0b0_10001000_1111010,
966 0b0_10001000_0010000,
967 ];
968 assert_eq!(result, expected_result);
969 }
970
971 #[simd_test(enable = "avx512bf16,avx512vl")]
972 unsafe fn test_mm256_cvtneps_pbh() {
973 #[rustfmt::skip]
974 let a_array = [
975 178.125_f32,
976 10.5_f32,
977 3.75_f32,
978 50.25_f32,
979 16.5_f32,
980 255.11_f32,
981 1000.158_f32,
982 575.575_f32,
983 ];
984 let a: __m256 = transmute(a_array);
985 let c: __m128bh = _mm256_cvtneps_pbh(a);
986 let result: [u16; 8] = transmute(c.as_u16x8());
987 #[rustfmt::skip]
988 let expected_result: [u16; 8] = [
989 0b0_10000110_0110010,
990 0b0_10000010_0101000,
991 0b0_10000000_1110000,
992 0b0_10000100_1001001,
993 0b0_10000011_0000100,
994 0b0_10000110_1111111,
995 0b0_10001000_1111010,
996 0b0_10001000_0010000,
997 ];
998 assert_eq!(result, expected_result);
999 }
1000
1001 #[simd_test(enable = "avx512bf16,avx512vl")]
1002 unsafe fn test_mm256_mask_cvtneps_pbh() {
1003 #[rustfmt::skip]
1004 let a_array = [
1005 178.125_f32,
1006 10.5_f32,
1007 3.75_f32,
1008 50.25_f32,
1009 16.5_f32,
1010 255.11_f32,
1011 1000.158_f32,
1012 575.575_f32,
1013 ];
1014 let src_array: [u16; 8] = [
1015 0b1_10000110_0110010,
1016 0b1_10000010_0101000,
1017 0b1_10000000_1110000,
1018 0b1_10000100_1001001,
1019 0b1_10000011_0000100,
1020 0b1_10000110_1111111,
1021 0b1_10001000_1111010,
1022 0b1_10001000_0010000,
1023 ];
1024 let src: __m128bh = transmute(src_array);
1025 let a: __m256 = transmute(a_array);
1026 let k: __mmask8 = 0xff;
1027 let b = _mm256_mask_cvtneps_pbh(src, k, a);
1028 let result: [u16; 8] = transmute(b.as_u16x8());
1029 #[rustfmt::skip]
1030 let expected_result: [u16; 8] = [
1031 0b0_10000110_0110010,
1032 0b0_10000010_0101000,
1033 0b0_10000000_1110000,
1034 0b0_10000100_1001001,
1035 0b0_10000011_0000100,
1036 0b0_10000110_1111111,
1037 0b0_10001000_1111010,
1038 0b0_10001000_0010000,
1039 ];
1040 assert_eq!(result, expected_result);
1041 let k: __mmask8 = 0x0;
1042 let b: __m128bh = _mm256_mask_cvtneps_pbh(src, k, a);
1043 let result: [u16; 8] = transmute(b.as_u16x8());
1044 let expected_result: [u16; 8] = src_array;
1045 assert_eq!(result, expected_result);
1046 }
1047
1048 #[simd_test(enable = "avx512bf16,avx512vl")]
1049 unsafe fn test_mm256_maskz_cvtneps_pbh() {
1050 #[rustfmt::skip]
1051 let a_array = [
1052 178.125_f32,
1053 10.5_f32,
1054 3.75_f32,
1055 50.25_f32,
1056 16.5_f32,
1057 255.11_f32,
1058 1000.158_f32,
1059 575.575_f32,
1060 ];
1061 let a: __m256 = transmute(a_array);
1062 let k: __mmask8 = 0xff;
1063 let b = _mm256_maskz_cvtneps_pbh(k, a);
1064 let result: [u16; 8] = transmute(b.as_u16x8());
1065 #[rustfmt::skip]
1066 let expected_result: [u16; 8] = [
1067 0b0_10000110_0110010,
1068 0b0_10000010_0101000,
1069 0b0_10000000_1110000,
1070 0b0_10000100_1001001,
1071 0b0_10000011_0000100,
1072 0b0_10000110_1111111,
1073 0b0_10001000_1111010,
1074 0b0_10001000_0010000,
1075 ];
1076 assert_eq!(result, expected_result);
1077 let k: __mmask8 = 0x6;
1078 let b: __m128bh = _mm256_maskz_cvtneps_pbh(k, a);
1079 let result: [u16; 8] = transmute(b.as_u16x8());
1080 let expected_result: [u16; 8] =
1081 [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0];
1082 assert_eq!(result, expected_result);
1083 }
1084
1085 #[simd_test(enable = "avx512bf16,avx512f")]
1086 unsafe fn test_mm512_cvtneps_pbh() {
1087 #[rustfmt::skip]
1088 let a_array = [
1089 178.125_f32,
1090 10.5_f32,
1091 3.75_f32,
1092 50.25_f32,
1093 16.5_f32,
1094 255.11_f32,
1095 1000.158_f32,
1096 575.575_f32,
1097 178.125_f32,
1098 10.5_f32,
1099 3.75_f32,
1100 50.25_f32,
1101 16.5_f32,
1102 255.11_f32,
1103 1000.158_f32,
1104 575.575_f32,
1105 ];
1106 let a: __m512 = transmute(a_array);
1107 let c: __m256bh = _mm512_cvtneps_pbh(a);
1108 let result: [u16; 16] = transmute(c.as_u16x16());
1109 #[rustfmt::skip]
1110 let expected_result: [u16; 16] = [
1111 0b0_10000110_0110010,
1112 0b0_10000010_0101000,
1113 0b0_10000000_1110000,
1114 0b0_10000100_1001001,
1115 0b0_10000011_0000100,
1116 0b0_10000110_1111111,
1117 0b0_10001000_1111010,
1118 0b0_10001000_0010000,
1119 0b0_10000110_0110010,
1120 0b0_10000010_0101000,
1121 0b0_10000000_1110000,
1122 0b0_10000100_1001001,
1123 0b0_10000011_0000100,
1124 0b0_10000110_1111111,
1125 0b0_10001000_1111010,
1126 0b0_10001000_0010000,
1127 ];
1128 assert_eq!(result, expected_result);
1129 }
1130
1131 #[simd_test(enable = "avx512bf16,avx512f")]
1132 unsafe fn test_mm512_mask_cvtneps_pbh() {
1133 #[rustfmt::skip]
1134 let a_array = [
1135 178.125_f32,
1136 10.5_f32,
1137 3.75_f32,
1138 50.25_f32,
1139 16.5_f32,
1140 255.11_f32,
1141 1000.158_f32,
1142 575.575_f32,
1143 178.125_f32,
1144 10.5_f32,
1145 3.75_f32,
1146 50.25_f32,
1147 16.5_f32,
1148 255.11_f32,
1149 1000.158_f32,
1150 575.575_f32,
1151 ];
1152 let src_array: [u16; 16] = [
1153 0b1_10000110_0110010,
1154 0b1_10000010_0101000,
1155 0b1_10000000_1110000,
1156 0b1_10000100_1001001,
1157 0b1_10000011_0000100,
1158 0b1_10000110_1111111,
1159 0b1_10001000_1111010,
1160 0b1_10001000_0010000,
1161 0b1_10000110_0110010,
1162 0b1_10000010_0101000,
1163 0b1_10000000_1110000,
1164 0b1_10000100_1001001,
1165 0b1_10000011_0000100,
1166 0b1_10000110_1111111,
1167 0b1_10001000_1111010,
1168 0b1_10001000_0010000,
1169 ];
1170 let src: __m256bh = transmute(src_array);
1171 let a: __m512 = transmute(a_array);
1172 let k: __mmask16 = 0xffff;
1173 let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
1174 let result: [u16; 16] = transmute(c.as_u16x16());
1175 #[rustfmt::skip]
1176 let expected_result: [u16; 16] = [
1177 0b0_10000110_0110010,
1178 0b0_10000010_0101000,
1179 0b0_10000000_1110000,
1180 0b0_10000100_1001001,
1181 0b0_10000011_0000100,
1182 0b0_10000110_1111111,
1183 0b0_10001000_1111010,
1184 0b0_10001000_0010000,
1185 0b0_10000110_0110010,
1186 0b0_10000010_0101000,
1187 0b0_10000000_1110000,
1188 0b0_10000100_1001001,
1189 0b0_10000011_0000100,
1190 0b0_10000110_1111111,
1191 0b0_10001000_1111010,
1192 0b0_10001000_0010000,
1193 ];
1194 assert_eq!(result, expected_result);
1195 let k: __mmask16 = 0;
1196 let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
1197 let result: [u16; 16] = transmute(c.as_u16x16());
1198 let expected_result = src_array;
1199 assert_eq!(result, expected_result);
1200 }
1201
1202 #[simd_test(enable = "avx512bf16,avx512f")]
1203 unsafe fn test_mm512_maskz_cvtneps_pbh() {
1204 #[rustfmt::skip]
1205 let a_array = [
1206 178.125_f32,
1207 10.5_f32,
1208 3.75_f32,
1209 50.25_f32,
1210 16.5_f32,
1211 255.11_f32,
1212 1000.158_f32,
1213 575.575_f32,
1214 178.125_f32,
1215 10.5_f32,
1216 3.75_f32,
1217 50.25_f32,
1218 16.5_f32,
1219 255.11_f32,
1220 1000.158_f32,
1221 575.575_f32,
1222 ];
1223 let a: __m512 = transmute(a_array);
1224 let k: __mmask16 = 0xffff;
1225 let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
1226 let result: [u16; 16] = transmute(c.as_u16x16());
1227 #[rustfmt::skip]
1228 let expected_result: [u16; 16] = [
1229 0b0_10000110_0110010,
1230 0b0_10000010_0101000,
1231 0b0_10000000_1110000,
1232 0b0_10000100_1001001,
1233 0b0_10000011_0000100,
1234 0b0_10000110_1111111,
1235 0b0_10001000_1111010,
1236 0b0_10001000_0010000,
1237 0b0_10000110_0110010,
1238 0b0_10000010_0101000,
1239 0b0_10000000_1110000,
1240 0b0_10000100_1001001,
1241 0b0_10000011_0000100,
1242 0b0_10000110_1111111,
1243 0b0_10001000_1111010,
1244 0b0_10001000_0010000,
1245 ];
1246 assert_eq!(result, expected_result);
1247 let k: __mmask16 = 0x653a;
1248 let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
1249 let result: [u16; 16] = transmute(c.as_u16x16());
1250 #[rustfmt::skip]
1251 let expected_result: [u16; 16] = [
1252 0,
1253 0b0_10000010_0101000,
1254 0,
1255 0b0_10000100_1001001,
1256 0b0_10000011_0000100,
1257 0b0_10000110_1111111,
1258 0,
1259 0,
1260 0b0_10000110_0110010,
1261 0,
1262 0b0_10000000_1110000,
1263 0,
1264 0,
1265 0b0_10000110_1111111,
1266 0b0_10001000_1111010,
1267 0,
1268 ];
1269 assert_eq!(result, expected_result);
1270 }
1271
1272 #[simd_test(enable = "avx512bf16,avx512vl")]
1273 unsafe fn test_mm_dpbf16_ps() {
1274 let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
1275 let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
1276 let a1: __m128 = transmute(a_array);
1277 let b1: __m128 = transmute(b_array);
1278 let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
1279 let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
1280 let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
1281 let c: __m128 = _mm_dpbf16_ps(src, a, b);
1282 let result: [f32; 4] = transmute(c.as_f32x4());
1283 let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
1284 assert_eq!(result, expected_result);
1285 }
1286
1287 #[simd_test(enable = "avx512bf16,avx512vl")]
1288 unsafe fn test_mm_mask_dpbf16_ps() {
1289 let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
1290 let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
1291 let a1: __m128 = transmute(a_array);
1292 let b1: __m128 = transmute(b_array);
1293 let k: __mmask8 = 0xf3;
1294 let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
1295 let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
1296 let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
1297 let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
1298 let result: [f32; 4] = transmute(c.as_f32x4());
1299 let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32];
1300 assert_eq!(result, expected_result);
1301 let k: __mmask8 = 0xff;
1302 let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
1303 let result: [f32; 4] = transmute(c.as_f32x4());
1304 let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
1305 assert_eq!(result, expected_result);
1306 let k: __mmask8 = 0;
1307 let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
1308 let result: [f32; 4] = transmute(c.as_f32x4());
1309 let expected_result: [f32; 4] = [1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32];
1310 assert_eq!(result, expected_result);
1311 }
1312
1313 #[simd_test(enable = "avx512bf16,avx512vl")]
1314 unsafe fn test_mm_maskz_dpbf16_ps() {
1315 let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
1316 let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
1317 let a1: __m128 = transmute(a_array);
1318 let b1: __m128 = transmute(b_array);
1319 let k: __mmask8 = 0xf3;
1320 let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
1321 let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
1322 let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
1323 let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
1324 let result: [f32; 4] = transmute(c.as_f32x4());
1325 let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 0.0, 0.0];
1326 assert_eq!(result, expected_result);
1327 let k: __mmask8 = 0xff;
1328 let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
1329 let result: [f32; 4] = transmute(c.as_f32x4());
1330 let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
1331 assert_eq!(result, expected_result);
1332 let k: __mmask8 = 0;
1333 let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
1334 let result: [f32; 4] = transmute(c.as_f32x4());
1335 let expected_result: [f32; 4] = [0.0, 0.0, 0.0, 0.0];
1336 assert_eq!(result, expected_result);
1337 }
1338
1339 #[simd_test(enable = "avx512bf16,avx512vl")]
1340 unsafe fn test_mm256_dpbf16_ps() {
1341 #[rustfmt::skip]
1342 let a_array = [
1343 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1344 ];
1345 let b_array = [
1346 -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1347 ];
1348 let a1: __m256 = transmute(a_array);
1349 let b1: __m256 = transmute(b_array);
1350 #[rustfmt::skip]
1351 let src: __m256 = transmute([
1352 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1353 ]);
1354 let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
1355 let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
1356 let c: __m256 = _mm256_dpbf16_ps(src, a, b);
1357 let result: [f32; 8] = transmute(c.as_f32x8());
1358 #[rustfmt::skip]
1359 let expected_result: [f32; 8] = [
1360 -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1361 ];
1362 assert_eq!(result, expected_result);
1363 }
1364
1365 #[simd_test(enable = "avx512bf16,avx512vl")]
1366 unsafe fn test_mm256_mask_dpbf16_ps() {
1367 #[rustfmt::skip]
1368 let a_array = [
1369 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1370 ];
1371 let b_array = [
1372 -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1373 ];
1374 let a1: __m256 = transmute(a_array);
1375 let b1: __m256 = transmute(b_array);
1376 let k: __mmask8 = 0x33;
1377 #[rustfmt::skip]
1378 let src: __m256 = transmute([
1379 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1380 ]);
1381 let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
1382 let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
1383 let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
1384 let result: [f32; 8] = transmute(c.as_f32x8());
1385 #[rustfmt::skip]
1386 let expected_result: [f32; 8] = [
1387 -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
1388 ];
1389 assert_eq!(result, expected_result);
1390 let k: __mmask8 = 0xff;
1391 let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
1392 let result: [f32; 8] = transmute(c.as_f32x8());
1393 #[rustfmt::skip]
1394 let expected_result: [f32; 8] = [
1395 -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1396 ];
1397 assert_eq!(result, expected_result);
1398 let k: __mmask8 = 0;
1399 let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
1400 let result: [f32; 8] = transmute(c.as_f32x8());
1401 #[rustfmt::skip]
1402 let expected_result: [f32; 8] = [
1403 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1404 ];
1405 assert_eq!(result, expected_result);
1406 }
1407
1408 #[simd_test(enable = "avx512bf16,avx512vl")]
1409 unsafe fn test_mm256_maskz_dpbf16_ps() {
1410 #[rustfmt::skip]
1411 let a_array = [
1412 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1413 ];
1414 let b_array = [
1415 -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1416 ];
1417 let a1: __m256 = transmute(a_array);
1418 let b1: __m256 = transmute(b_array);
1419 let k: __mmask8 = 0x33;
1420 #[rustfmt::skip]
1421 let src: __m256 = transmute([
1422 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1423 ]);
1424 let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
1425 let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
1426 let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
1427 let result: [f32; 8] = transmute(c.as_f32x8());
1428 #[rustfmt::skip]
1429 let expected_result: [f32; 8] = [
1430 -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
1431 ];
1432 assert_eq!(result, expected_result);
1433 let k: __mmask8 = 0xff;
1434 let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
1435 let result: [f32; 8] = transmute(c.as_f32x8());
1436 #[rustfmt::skip]
1437 let expected_result: [f32; 8] = [
1438 -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1439 ];
1440 assert_eq!(result, expected_result);
1441 let k: __mmask8 = 0;
1442 let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
1443 let result: [f32; 8] = transmute(c.as_f32x8());
1444 let expected_result: [f32; 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
1445 assert_eq!(result, expected_result);
1446 }
1447
1448 #[simd_test(enable = "avx512bf16,avx512f")]
1449 unsafe fn test_mm512_dpbf16_ps() {
1450 #[rustfmt::skip]
1451 let a_array = [
1452 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1453 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1454 ];
1455 let b_array = [
1456 -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1457 -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1458 ];
1459 let a1: __m512 = transmute(a_array);
1460 let b1: __m512 = transmute(b_array);
1461 let src: __m512 = transmute([
1462 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1463 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1464 ]);
1465 let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
1466 let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
1467 let c: __m512 = _mm512_dpbf16_ps(src, a, b);
1468 let result: [f32; 16] = transmute(c.as_f32x16());
1469 #[rustfmt::skip]
1470 let expected_result: [f32; 16] = [
1471 -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1472 -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1473 ];
1474 assert_eq!(result, expected_result);
1475 }
1476
1477 #[simd_test(enable = "avx512bf16,avx512f")]
1478 unsafe fn test_mm512_mask_dpbf16_ps() {
1479 #[rustfmt::skip]
1480 let a_array = [
1481 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1482 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1483 ];
1484 let b_array = [
1485 -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1486 -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1487 ];
1488 let a1: __m512 = transmute(a_array);
1489 let b1: __m512 = transmute(b_array);
1490 let k: __mmask16 = 0x3333;
1491 #[rustfmt::skip]
1492 let src: __m512 = transmute([
1493 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1494 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1495 ]);
1496 let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
1497 let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
1498 let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
1499 let result: [f32; 16] = transmute(c.as_f32x16());
1500 #[rustfmt::skip]
1501 let expected_result: [f32; 16] = [
1502 -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
1503 -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
1504 ];
1505 assert_eq!(result, expected_result);
1506 let k: __mmask16 = 0xffff;
1507 let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
1508 let result: [f32; 16] = transmute(c.as_f32x16());
1509 #[rustfmt::skip]
1510 let expected_result: [f32; 16] = [
1511 -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1512 -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1513 ];
1514 assert_eq!(result, expected_result);
1515 let k: __mmask16 = 0;
1516 let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
1517 let result: [f32; 16] = transmute(c.as_f32x16());
1518 #[rustfmt::skip]
1519 let expected_result: [f32; 16] = [
1520 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1521 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1522 ];
1523 assert_eq!(result, expected_result);
1524 }
1525
1526 #[simd_test(enable = "avx512bf16,avx512f")]
1527 unsafe fn test_mm512_maskz_dpbf16_ps() {
1528 #[rustfmt::skip]
1529 let a_array = [
1530 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1531 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1532 ];
1533 let b_array = [
1534 -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1535 -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1536 ];
1537 let a1: __m512 = transmute(a_array);
1538 let b1: __m512 = transmute(b_array);
1539 let k: __mmask16 = 0x3333;
1540 #[rustfmt::skip]
1541 let src: __m512 = transmute([
1542 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1543 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1544 ]);
1545 let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
1546 let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
1547 let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
1548 let result: [f32; 16] = transmute(c.as_f32x16());
1549 #[rustfmt::skip]
1550 let expected_result: [f32; 16] = [
1551 -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32,
1552 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
1553 ];
1554 assert_eq!(result, expected_result);
1555 let k: __mmask16 = 0xffff;
1556 let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
1557 let result: [f32; 16] = transmute(c.as_f32x16());
1558 #[rustfmt::skip]
1559 let expected_result: [f32; 16] = [
1560 -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1561 -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1562 ];
1563 assert_eq!(result, expected_result);
1564 let k: __mmask16 = 0;
1565 let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
1566 let result: [f32; 16] = transmute(c.as_f32x16());
1567 #[rustfmt::skip]
1568 let expected_result: [f32; 16] = [
1569 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
1570 ];
1571 assert_eq!(result, expected_result);
1572 }
1573}
1574