1 | //! [AVX512BF16 intrinsics]. |
2 | //! |
3 | //! [AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16 |
4 | |
5 | use crate::{ |
6 | core_arch::{simd::*, simd_llvm::*, x86::*}, |
7 | mem::transmute, |
8 | }; |
9 | |
10 | #[cfg (test)] |
11 | use stdarch_test::assert_instr; |
12 | |
// FFI declarations of the LLVM intrinsics that implement the AVX512BF16
// operations wrapped below. BF16 values are carried as raw bit patterns in
// `i16xN` lanes; f32 inputs/outputs use `f32xN`.
#[allow(improper_ctypes)]
extern "C" {
    // Two-source f32 -> BF16 conversions at 128/256/512-bit width
    // (`ne` presumably denotes round-to-nearest-even — matches the Intel
    // `vcvtne2ps2bf16` instruction these map to).
    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.128"]
    fn cvtne2ps2bf16(a: f32x4, b: f32x4) -> i16x8;
    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.256"]
    fn cvtne2ps2bf16_256(a: f32x8, b: f32x8) -> i16x16;
    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.512"]
    fn cvtne2ps2bf16_512(a: f32x16, b: f32x16) -> i16x32;
    // Single-source f32 -> BF16 conversions (result is half the input width).
    #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.256"]
    fn cvtneps2bf16_256(a: f32x8) -> i16x8;
    #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.512"]
    fn cvtneps2bf16_512(a: f32x16) -> i16x16;
    // BF16 pair dot-product accumulating into f32 lanes (`vdpbf16ps`);
    // BF16 pairs are passed as packed `i32` lanes.
    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.128"]
    fn dpbf16ps(a: f32x4, b: i32x4, c: i32x4) -> f32x4;
    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.256"]
    fn dpbf16ps_256(a: f32x8, b: i32x8, c: i32x8) -> f32x8;
    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.512"]
    fn dpbf16ps_512(a: f32x16, b: i32x16, c: i32x16) -> f32x16;
}
32 | |
33 | /// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors |
34 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a |
35 | /// 128-bit wide vector. |
36 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_cvtne2ps_pbh) |
37 | #[inline ] |
38 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
39 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
40 | pub unsafe fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh { |
41 | transmute(src:cvtne2ps2bf16(a:a.as_f32x4(), b:b.as_f32x4())) |
42 | } |
43 | |
44 | /// Convert packed single-precision (32-bit) floating-point elements in two vectors |
45 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results |
46 | /// in single vector dst using writemask k (elements are copied from src when the |
47 | /// corresponding mask bit is not set). |
48 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_mask_cvtne2ps_pbh) |
49 | #[inline ] |
50 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
51 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
52 | pub unsafe fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh { |
53 | let cvt: u16x8 = _mm_cvtne2ps_pbh(a, b).as_u16x8(); |
54 | transmute(src:simd_select_bitmask(m:k, a:cvt, b:src.as_u16x8())) |
55 | } |
56 | |
57 | /// Convert packed single-precision (32-bit) floating-point elements in two vectors |
58 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results |
59 | /// in single vector dst using zeromask k (elements are zeroed out when the corresponding |
60 | /// mask bit is not set). |
61 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_maskz_cvtne2ps_pbh) |
62 | #[inline ] |
63 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
64 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
65 | pub unsafe fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh { |
66 | let cvt: u16x8 = _mm_cvtne2ps_pbh(a, b).as_u16x8(); |
67 | let zero: u16x8 = _mm_setzero_si128().as_u16x8(); |
68 | transmute(src:simd_select_bitmask(m:k, a:cvt, b:zero)) |
69 | } |
70 | |
71 | /// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors |
72 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a |
73 | /// 256-bit wide vector. |
74 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_cvtne2ps_pbh) |
75 | #[inline ] |
76 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
77 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
78 | pub unsafe fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh { |
79 | transmute(src:cvtne2ps2bf16_256(a:a.as_f32x8(), b:b.as_f32x8())) |
80 | } |
81 | |
82 | /// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b |
83 | /// to packed BF16 (16-bit) floating-point elements and store the results in single vector |
84 | /// dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
85 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh) |
86 | #[inline ] |
87 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
88 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
89 | pub unsafe fn _mm256_mask_cvtne2ps_pbh( |
90 | src: __m256bh, |
91 | k: __mmask16, |
92 | a: __m256, |
93 | b: __m256, |
94 | ) -> __m256bh { |
95 | let cvt: u16x16 = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); |
96 | transmute(src:simd_select_bitmask(m:k, a:cvt, b:src.as_u16x16())) |
97 | } |
98 | |
99 | /// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b |
100 | /// to packed BF16 (16-bit) floating-point elements, and store the results in single vector |
101 | /// dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
102 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtne2ps_pbh) |
103 | #[inline ] |
104 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
105 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
106 | pub unsafe fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh { |
107 | let cvt: u16x16 = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); |
108 | let zero: u16x16 = _mm256_setzero_si256().as_u16x16(); |
109 | transmute(src:simd_select_bitmask(m:k, a:cvt, b:zero)) |
110 | } |
111 | |
112 | /// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors |
113 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a |
114 | /// 512-bit wide vector. |
115 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_cvtne2ps_pbh) |
116 | #[inline ] |
117 | #[target_feature (enable = "avx512bf16,avx512f" )] |
118 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
119 | pub unsafe fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh { |
120 | transmute(src:cvtne2ps2bf16_512(a:a.as_f32x16(), b:b.as_f32x16())) |
121 | } |
122 | |
123 | /// Convert packed single-precision (32-bit) floating-point elements in two vectors |
124 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results |
125 | /// in single vector dst using writemask k (elements are copied from src when the |
126 | /// corresponding mask bit is not set). |
127 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_mask_cvtne2ps_pbh) |
128 | #[inline ] |
129 | #[target_feature (enable = "avx512bf16,avx512f" )] |
130 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
131 | pub unsafe fn _mm512_mask_cvtne2ps_pbh( |
132 | src: __m512bh, |
133 | k: __mmask32, |
134 | a: __m512, |
135 | b: __m512, |
136 | ) -> __m512bh { |
137 | let cvt: u16x32 = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); |
138 | transmute(src:simd_select_bitmask(m:k, a:cvt, b:src.as_u16x32())) |
139 | } |
140 | |
141 | /// Convert packed single-precision (32-bit) floating-point elements in two vectors |
142 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results |
143 | /// in single vector dst using zeromask k (elements are zeroed out when the corresponding |
144 | /// mask bit is not set). |
145 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtne2ps_pbh) |
146 | #[inline ] |
147 | #[target_feature (enable = "avx512bf16,avx512f" )] |
148 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
149 | pub unsafe fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh { |
150 | let cvt: u16x32 = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); |
151 | let zero: u16x32 = _mm512_setzero_si512().as_u16x32(); |
152 | transmute(src:simd_select_bitmask(m:k, a:cvt, b:zero)) |
153 | } |
154 | |
155 | /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
156 | /// floating-point elements, and store the results in dst. |
157 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_cvtneps_pbh) |
158 | #[inline ] |
159 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
160 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
161 | pub unsafe fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh { |
162 | transmute(src:cvtneps2bf16_256(a.as_f32x8())) |
163 | } |
164 | |
165 | /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
166 | /// floating-point elements, and store the results in dst using writemask k |
167 | /// (elements are copied from src when the corresponding mask bit is not set). |
168 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_cvtneps_pbh) |
169 | #[inline ] |
170 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
171 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
172 | pub unsafe fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh { |
173 | let cvt: u16x8 = _mm256_cvtneps_pbh(a).as_u16x8(); |
174 | transmute(src:simd_select_bitmask(m:k, a:cvt, b:src.as_u16x8())) |
175 | } |
176 | |
177 | /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
178 | /// floating-point elements, and store the results in dst using zeromask k |
179 | /// (elements are zeroed out when the corresponding mask bit is not set). |
180 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtneps_pbh) |
181 | #[inline ] |
182 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
183 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
184 | pub unsafe fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh { |
185 | let cvt: u16x8 = _mm256_cvtneps_pbh(a).as_u16x8(); |
186 | let zero: u16x8 = _mm_setzero_si128().as_u16x8(); |
187 | transmute(src:simd_select_bitmask(m:k, a:cvt, b:zero)) |
188 | } |
189 | |
190 | /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
191 | /// floating-point elements, and store the results in dst. |
192 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_cvtneps_pbh) |
193 | #[inline ] |
194 | #[target_feature (enable = "avx512bf16,avx512f" )] |
195 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
196 | pub unsafe fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh { |
197 | transmute(src:cvtneps2bf16_512(a.as_f32x16())) |
198 | } |
199 | |
200 | /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
201 | /// floating-point elements, and store the results in dst using writemask k |
202 | /// (elements are copied from src when the corresponding mask bit is not set). |
203 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_cvtneps_pbh) |
204 | #[inline ] |
205 | #[target_feature (enable = "avx512bf16,avx512f" )] |
206 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
207 | pub unsafe fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh { |
208 | let cvt: u16x16 = _mm512_cvtneps_pbh(a).as_u16x16(); |
209 | transmute(src:simd_select_bitmask(m:k, a:cvt, b:src.as_u16x16())) |
210 | } |
211 | |
212 | /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
213 | /// floating-point elements, and store the results in dst using zeromask k |
214 | /// (elements are zeroed out when the corresponding mask bit is not set). |
215 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtneps_pbh) |
216 | #[inline ] |
217 | #[target_feature (enable = "avx512bf16,avx512f" )] |
218 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
219 | pub unsafe fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh { |
220 | let cvt: u16x16 = _mm512_cvtneps_pbh(a).as_u16x16(); |
221 | let zero: u16x16 = _mm256_setzero_si256().as_u16x16(); |
222 | transmute(src:simd_select_bitmask(m:k, a:cvt, b:zero)) |
223 | } |
224 | |
225 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
226 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
227 | /// with elements in src, and store the results in dst. |
228 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_dpbf16_ps) |
229 | #[inline ] |
230 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
231 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
232 | pub unsafe fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 { |
233 | transmute(src:dpbf16ps(a:src.as_f32x4(), b:a.as_i32x4(), c:b.as_i32x4())) |
234 | } |
235 | |
236 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
237 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
238 | /// with elements in src, and store the results in dst using writemask k |
239 | /// (elements are copied from src when the corresponding mask bit is not set). |
240 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_mask_dpbf16_ps) |
241 | #[inline ] |
242 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
243 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
244 | pub unsafe fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 { |
245 | let rst: f32x4 = _mm_dpbf16_ps(src, a, b).as_f32x4(); |
246 | transmute(src:simd_select_bitmask(m:k, a:rst, b:src.as_f32x4())) |
247 | } |
248 | |
249 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
250 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
251 | /// with elements in src, and store the results in dst using zeromask k |
252 | /// (elements are zeroed out when the corresponding mask bit is not set). |
253 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_maskz_dpbf16_ps) |
254 | #[inline ] |
255 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
256 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
257 | pub unsafe fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 { |
258 | let rst: f32x4 = _mm_dpbf16_ps(src, a, b).as_f32x4(); |
259 | let zero: f32x4 = _mm_set1_ps(0.0_f32).as_f32x4(); |
260 | transmute(src:simd_select_bitmask(m:k, a:rst, b:zero)) |
261 | } |
262 | |
263 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
264 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
265 | /// with elements in src, and store the results in dst. |
266 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_dpbf16_ps) |
267 | #[inline ] |
268 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
269 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
270 | pub unsafe fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 { |
271 | transmute(src:dpbf16ps_256(a:src.as_f32x8(), b:a.as_i32x8(), c:b.as_i32x8())) |
272 | } |
273 | |
274 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
275 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
276 | /// with elements in src, and store the results in dst using writemask k |
277 | /// (elements are copied from src when the corresponding mask bit is not set). |
278 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_dpbf16_ps) |
279 | #[inline ] |
280 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
281 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
282 | pub unsafe fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 { |
283 | let rst: f32x8 = _mm256_dpbf16_ps(src, a, b).as_f32x8(); |
284 | transmute(src:simd_select_bitmask(m:k, a:rst, b:src.as_f32x8())) |
285 | } |
286 | |
287 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
288 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
289 | /// with elements in src, and store the results in dst using zeromask k |
290 | /// (elements are zeroed out when the corresponding mask bit is not set). |
291 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_dpbf16_ps) |
292 | #[inline ] |
293 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
294 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
295 | pub unsafe fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 { |
296 | let rst: f32x8 = _mm256_dpbf16_ps(src, a, b).as_f32x8(); |
297 | let zero: f32x8 = _mm256_setzero_ps().as_f32x8(); |
298 | transmute(src:simd_select_bitmask(m:k, a:rst, b:zero)) |
299 | } |
300 | |
301 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
302 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
303 | /// with elements in src, and store the results in dst.Compute dot-product of BF16 (16-bit) |
304 | /// floating-point pairs in a and b, accumulating the intermediate single-precision (32-bit) |
305 | /// floating-point elements with elements in src, and store the results in dst. |
306 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_dpbf16_ps) |
307 | #[inline ] |
308 | #[target_feature (enable = "avx512bf16,avx512f" )] |
309 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
310 | pub unsafe fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 { |
311 | transmute(src:dpbf16ps_512(a:src.as_f32x16(), b:a.as_i32x16(), c:b.as_i32x16())) |
312 | } |
313 | |
314 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
315 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
316 | /// with elements in src, and store the results in dst using writemask k |
317 | /// (elements are copied from src when the corresponding mask bit is not set). |
318 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_dpbf16_ps) |
319 | #[inline ] |
320 | #[target_feature (enable = "avx512bf16,avx512f" )] |
321 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
322 | pub unsafe fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 { |
323 | let rst: f32x16 = _mm512_dpbf16_ps(src, a, b).as_f32x16(); |
324 | transmute(src:simd_select_bitmask(m:k, a:rst, b:src.as_f32x16())) |
325 | } |
326 | |
327 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
328 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
329 | /// with elements in src, and store the results in dst using zeromask k |
330 | /// (elements are zeroed out when the corresponding mask bit is not set). |
331 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_dpbf16_ps) |
332 | #[inline ] |
333 | #[target_feature (enable = "avx512bf16,avx512f" )] |
334 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
335 | pub unsafe fn _mm512_maskz_dpbf16_ps( |
336 | k: __mmask16, |
337 | src: __m512, |
338 | a: __m512bh, |
339 | b: __m512bh, |
340 | ) -> __m512 { |
341 | let rst: f32x16 = _mm512_dpbf16_ps(src, a, b).as_f32x16(); |
342 | let zero: f32x16 = _mm512_setzero_ps().as_f32x16(); |
343 | transmute(src:simd_select_bitmask(m:k, a:rst, b:zero)) |
344 | } |
345 | |
346 | #[cfg (test)] |
347 | mod tests { |
348 | use crate::{core_arch::x86::*, mem::transmute}; |
349 | use stdarch_test::simd_test; |
350 | |
    #[simd_test(enable = "avx512bf16,avx512vl")]
    unsafe fn test_mm_cvtne2ps_pbh() {
        // `b` is the element-wise negation of `a`: the expected BF16 patterns
        // below differ only in the sign bit between the two halves.
        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
        let a: __m128 = transmute(a_array);
        let b: __m128 = transmute(b_array);
        let c: __m128bh = _mm_cvtne2ps_pbh(a, b);
        let result: [u16; 8] = transmute(c.as_u16x8());
        // BF16 patterns written as sign_exponent_mantissa (1+8+7 bits);
        // `b`'s converted lanes (sign = 1) occupy the low half, `a`'s the high half.
        #[rustfmt::skip]
        let expected_result: [u16; 8] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
        ];
        assert_eq!(result, expected_result);
    }
372 | |
    #[simd_test(enable = "avx512bf16,avx512vl")]
    unsafe fn test_mm_mask_cvtne2ps_pbh() {
        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
        // src holds distinct (all-positive) BF16 patterns so that copied-through
        // lanes are distinguishable from freshly converted ones.
        #[rustfmt::skip]
        let src_array: [u16; 8] = [
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
        ];
        let src: __m128bh = transmute(src_array);
        let a: __m128 = transmute(a_array);
        let b: __m128 = transmute(b_array);
        // All mask bits set: every lane comes from the conversion.
        let k: __mmask8 = 0b1111_1111;
        let c: __m128bh = _mm_mask_cvtne2ps_pbh(src, k, a, b);
        let result: [u16; 8] = transmute(c.as_u16x8());
        #[rustfmt::skip]
        let expected_result: [u16; 8] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
        ];
        assert_eq!(result, expected_result);
        // All mask bits clear: the result must be src unchanged.
        let k = 0b0000_0000;
        let c = _mm_mask_cvtne2ps_pbh(src, k, a, b);
        let result: [u16; 8] = transmute(c.as_u16x8());
        let expected_result = src_array;
        assert_eq!(result, expected_result);
    }
412 | |
    #[simd_test(enable = "avx512bf16,avx512vl")]
    unsafe fn test_mm_maskz_cvtne2ps_pbh() {
        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
        let a: __m128 = transmute(a_array);
        let b: __m128 = transmute(b_array);
        // All mask bits set: every lane comes from the conversion.
        let k: __mmask8 = 0b1111_1111;
        let c: __m128bh = _mm_maskz_cvtne2ps_pbh(k, a, b);
        let result: [u16; 8] = transmute(c.as_u16x8());
        #[rustfmt::skip]
        let expected_result: [u16; 8] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
        ];
        assert_eq!(result, expected_result);
        // Partial mask: lanes whose mask bit is clear must read as zero.
        let k = 0b0011_1100;
        let c = _mm_maskz_cvtne2ps_pbh(k, a, b);
        let result: [u16; 8] = transmute(c.as_u16x8());
        #[rustfmt::skip]
        let expected_result: [u16; 8] = [
            0,
            0,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0,
            0,
        ];
        assert_eq!(result, expected_result);
    }
450 | |
    #[simd_test(enable = "avx512bf16,avx512vl")]
    unsafe fn test_mm256_cvtne2ps_pbh() {
        // `b` is the element-wise negation of `a`; expected halves below differ
        // only in the BF16 sign bit.
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let b_array = [
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
        ];
        let a: __m256 = transmute(a_array);
        let b: __m256 = transmute(b_array);
        let c: __m256bh = _mm256_cvtne2ps_pbh(a, b);
        let result: [u16; 16] = transmute(c.as_u16x16());
        // BF16 patterns as sign_exponent_mantissa; `b`'s lanes first, then `a`'s.
        #[rustfmt::skip]
        let expected_result: [u16; 16] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
    }
499 | |
    #[simd_test(enable = "avx512bf16,avx512vl")]
    unsafe fn test_mm256_mask_cvtne2ps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let b_array = [
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
        ];
        // src lanes (all-positive patterns) are distinguishable from the
        // converted lanes, so mask pass-through is observable.
        let src_array: [u16; 16] = [
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
        ];
        let src: __m256bh = transmute(src_array);
        let a: __m256 = transmute(a_array);
        let b: __m256 = transmute(b_array);
        // All mask bits set: every lane comes from the conversion.
        let k: __mmask16 = 0xffff;
        let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
        let result: [u16; 16] = transmute(c.as_u16x16());
        #[rustfmt::skip]
        let expected_result: [u16; 16] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
        // All mask bits clear: the result must be src unchanged.
        let k: __mmask16 = 0;
        let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
        let result: [u16; 16] = transmute(c.as_u16x16());
        let expected_result = src_array;
        assert_eq!(result, expected_result);
    }
573 | |
    #[simd_test(enable = "avx512bf16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtne2ps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let b_array = [
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
        ];
        let a: __m256 = transmute(a_array);
        let b: __m256 = transmute(b_array);
        // All mask bits set: every lane comes from the conversion.
        let k: __mmask16 = 0xffff;
        let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
        let result: [u16; 16] = transmute(c.as_u16x16());
        #[rustfmt::skip]
        let expected_result: [u16; 16] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
        // Mixed mask: lanes whose mask bit is clear must read as zero.
        let k: __mmask16 = 0b0110_1100_0011_0110;
        let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
        let result: [u16; 16] = transmute(c.as_u16x16());
        #[rustfmt::skip]
        let expected_result: [u16; 16] = [
            0,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0,
            0,
            0,
            0,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0,
        ];
        assert_eq!(result, expected_result);
    }
646 | |
    #[simd_test(enable = "avx512bf16,avx512f")]
    unsafe fn test_mm512_cvtne2ps_pbh() {
        // 16 f32 inputs per source (the 8-value pattern repeated twice);
        // `b` is the element-wise negation of `a`.
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let b_array = [
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
        ];
        let a: __m512 = transmute(a_array);
        let b: __m512 = transmute(b_array);
        let c: __m512bh = _mm512_cvtne2ps_pbh(a, b);
        let result: [u16; 32] = transmute(c.as_u16x32());
        // BF16 patterns as sign_exponent_mantissa; `b`'s 16 lanes first, then `a`'s.
        #[rustfmt::skip]
        let expected_result: [u16; 32] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
    }
727 | |
728 | #[simd_test(enable = "avx512bf16,avx512f" )] |
729 | unsafe fn test_mm512_mask_cvtne2ps_pbh() { |
730 | #[rustfmt::skip] |
731 | let a_array = [ |
732 | 178.125_f32, |
733 | 10.5_f32, |
734 | 3.75_f32, |
735 | 50.25_f32, |
736 | 16.5_f32, |
737 | 255.11_f32, |
738 | 1000.158_f32, |
739 | 575.575_f32, |
740 | 178.125_f32, |
741 | 10.5_f32, |
742 | 3.75_f32, |
743 | 50.25_f32, |
744 | 16.5_f32, |
745 | 255.11_f32, |
746 | 1000.158_f32, |
747 | 575.575_f32, |
748 | ]; |
749 | let b_array = [ |
750 | -178.125_f32, |
751 | -10.5_f32, |
752 | -3.75_f32, |
753 | -50.25_f32, |
754 | -16.5_f32, |
755 | -255.11_f32, |
756 | -1000.158_f32, |
757 | -575.575_f32, |
758 | -178.125_f32, |
759 | -10.5_f32, |
760 | -3.75_f32, |
761 | -50.25_f32, |
762 | -16.5_f32, |
763 | -255.11_f32, |
764 | -1000.158_f32, |
765 | -575.575_f32, |
766 | ]; |
767 | let src_array: [u16; 32] = [ |
768 | 0b0_10000110_0110010, |
769 | 0b0_10000010_0101000, |
770 | 0b0_10000000_1110000, |
771 | 0b0_10000100_1001001, |
772 | 0b0_10000110_0110010, |
773 | 0b0_10000010_0101000, |
774 | 0b0_10000000_1110000, |
775 | 0b0_10000100_1001001, |
776 | 0b0_10000110_0110010, |
777 | 0b0_10000010_0101000, |
778 | 0b0_10000000_1110000, |
779 | 0b0_10000100_1001001, |
780 | 0b0_10000110_0110010, |
781 | 0b0_10000010_0101000, |
782 | 0b0_10000000_1110000, |
783 | 0b0_10000100_1001001, |
784 | 0b0_10000110_0110010, |
785 | 0b0_10000010_0101000, |
786 | 0b0_10000000_1110000, |
787 | 0b0_10000100_1001001, |
788 | 0b0_10000110_0110010, |
789 | 0b0_10000010_0101000, |
790 | 0b0_10000000_1110000, |
791 | 0b0_10000100_1001001, |
792 | 0b0_10000110_0110010, |
793 | 0b0_10000010_0101000, |
794 | 0b0_10000000_1110000, |
795 | 0b0_10000100_1001001, |
796 | 0b0_10000110_0110010, |
797 | 0b0_10000010_0101000, |
798 | 0b0_10000000_1110000, |
799 | 0b0_10000100_1001001, |
800 | ]; |
801 | let src: __m512bh = transmute(src_array); |
802 | let a: __m512 = transmute(a_array); |
803 | let b: __m512 = transmute(b_array); |
804 | let k: __mmask32 = 0xffffffff; |
805 | let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b); |
806 | let result: [u16; 32] = transmute(c.as_u16x32()); |
807 | #[rustfmt::skip] |
808 | let expected_result: [u16; 32] = [ |
809 | 0b1_10000110_0110010, |
810 | 0b1_10000010_0101000, |
811 | 0b1_10000000_1110000, |
812 | 0b1_10000100_1001001, |
813 | 0b1_10000011_0000100, |
814 | 0b1_10000110_1111111, |
815 | 0b1_10001000_1111010, |
816 | 0b1_10001000_0010000, |
817 | 0b1_10000110_0110010, |
818 | 0b1_10000010_0101000, |
819 | 0b1_10000000_1110000, |
820 | 0b1_10000100_1001001, |
821 | 0b1_10000011_0000100, |
822 | 0b1_10000110_1111111, |
823 | 0b1_10001000_1111010, |
824 | 0b1_10001000_0010000, |
825 | 0b0_10000110_0110010, |
826 | 0b0_10000010_0101000, |
827 | 0b0_10000000_1110000, |
828 | 0b0_10000100_1001001, |
829 | 0b0_10000011_0000100, |
830 | 0b0_10000110_1111111, |
831 | 0b0_10001000_1111010, |
832 | 0b0_10001000_0010000, |
833 | 0b0_10000110_0110010, |
834 | 0b0_10000010_0101000, |
835 | 0b0_10000000_1110000, |
836 | 0b0_10000100_1001001, |
837 | 0b0_10000011_0000100, |
838 | 0b0_10000110_1111111, |
839 | 0b0_10001000_1111010, |
840 | 0b0_10001000_0010000, |
841 | ]; |
842 | assert_eq!(result, expected_result); |
843 | let k: __mmask32 = 0; |
844 | let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b); |
845 | let result: [u16; 32] = transmute(c.as_u16x32()); |
846 | let expected_result = src_array; |
847 | assert_eq!(result, expected_result); |
848 | } |
849 | |
850 | #[simd_test(enable = "avx512bf16,avx512f" )] |
851 | unsafe fn test_mm512_maskz_cvtne2ps_pbh() { |
852 | #[rustfmt::skip] |
853 | let a_array = [ |
854 | 178.125_f32, |
855 | 10.5_f32, |
856 | 3.75_f32, |
857 | 50.25_f32, |
858 | 16.5_f32, |
859 | 255.11_f32, |
860 | 1000.158_f32, |
861 | 575.575_f32, |
862 | 178.125_f32, |
863 | 10.5_f32, |
864 | 3.75_f32, |
865 | 50.25_f32, |
866 | 16.5_f32, |
867 | 255.11_f32, |
868 | 1000.158_f32, |
869 | 575.575_f32, |
870 | ]; |
871 | let b_array = [ |
872 | -178.125_f32, |
873 | -10.5_f32, |
874 | -3.75_f32, |
875 | -50.25_f32, |
876 | -16.5_f32, |
877 | -255.11_f32, |
878 | -1000.158_f32, |
879 | -575.575_f32, |
880 | -178.125_f32, |
881 | -10.5_f32, |
882 | -3.75_f32, |
883 | -50.25_f32, |
884 | -16.5_f32, |
885 | -255.11_f32, |
886 | -1000.158_f32, |
887 | -575.575_f32, |
888 | ]; |
889 | let a: __m512 = transmute(a_array); |
890 | let b: __m512 = transmute(b_array); |
891 | let k: __mmask32 = 0xffffffff; |
892 | let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b); |
893 | let result: [u16; 32] = transmute(c.as_u16x32()); |
894 | #[rustfmt::skip] |
895 | let expected_result: [u16; 32] = [ |
896 | 0b1_10000110_0110010, |
897 | 0b1_10000010_0101000, |
898 | 0b1_10000000_1110000, |
899 | 0b1_10000100_1001001, |
900 | 0b1_10000011_0000100, |
901 | 0b1_10000110_1111111, |
902 | 0b1_10001000_1111010, |
903 | 0b1_10001000_0010000, |
904 | 0b1_10000110_0110010, |
905 | 0b1_10000010_0101000, |
906 | 0b1_10000000_1110000, |
907 | 0b1_10000100_1001001, |
908 | 0b1_10000011_0000100, |
909 | 0b1_10000110_1111111, |
910 | 0b1_10001000_1111010, |
911 | 0b1_10001000_0010000, |
912 | 0b0_10000110_0110010, |
913 | 0b0_10000010_0101000, |
914 | 0b0_10000000_1110000, |
915 | 0b0_10000100_1001001, |
916 | 0b0_10000011_0000100, |
917 | 0b0_10000110_1111111, |
918 | 0b0_10001000_1111010, |
919 | 0b0_10001000_0010000, |
920 | 0b0_10000110_0110010, |
921 | 0b0_10000010_0101000, |
922 | 0b0_10000000_1110000, |
923 | 0b0_10000100_1001001, |
924 | 0b0_10000011_0000100, |
925 | 0b0_10000110_1111111, |
926 | 0b0_10001000_1111010, |
927 | 0b0_10001000_0010000, |
928 | ]; |
929 | assert_eq!(result, expected_result); |
930 | let k: __mmask32 = 0b1100_1010_1001_0110_1010_0011_0101_0110; |
931 | let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b); |
932 | let result: [u16; 32] = transmute(c.as_u16x32()); |
933 | #[rustfmt::skip] |
934 | let expected_result: [u16; 32] = [ |
935 | 0, |
936 | 0b1_10000010_0101000, |
937 | 0b1_10000000_1110000, |
938 | 0, |
939 | 0b1_10000011_0000100, |
940 | 0, |
941 | 0b1_10001000_1111010, |
942 | 0, |
943 | 0b1_10000110_0110010, |
944 | 0b1_10000010_0101000, |
945 | 0, |
946 | 0, |
947 | 0, |
948 | 0b1_10000110_1111111, |
949 | 0, |
950 | 0b1_10001000_0010000, |
951 | 0, |
952 | 0b0_10000010_0101000, |
953 | 0b0_10000000_1110000, |
954 | 0, |
955 | 0b0_10000011_0000100, |
956 | 0, |
957 | 0, |
958 | 0b0_10001000_0010000, |
959 | 0, |
960 | 0b0_10000010_0101000, |
961 | 0, |
962 | 0b0_10000100_1001001, |
963 | 0, |
964 | 0, |
965 | 0b0_10001000_1111010, |
966 | 0b0_10001000_0010000, |
967 | ]; |
968 | assert_eq!(result, expected_result); |
969 | } |
970 | |
971 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
972 | unsafe fn test_mm256_cvtneps_pbh() { |
973 | #[rustfmt::skip] |
974 | let a_array = [ |
975 | 178.125_f32, |
976 | 10.5_f32, |
977 | 3.75_f32, |
978 | 50.25_f32, |
979 | 16.5_f32, |
980 | 255.11_f32, |
981 | 1000.158_f32, |
982 | 575.575_f32, |
983 | ]; |
984 | let a: __m256 = transmute(a_array); |
985 | let c: __m128bh = _mm256_cvtneps_pbh(a); |
986 | let result: [u16; 8] = transmute(c.as_u16x8()); |
987 | #[rustfmt::skip] |
988 | let expected_result: [u16; 8] = [ |
989 | 0b0_10000110_0110010, |
990 | 0b0_10000010_0101000, |
991 | 0b0_10000000_1110000, |
992 | 0b0_10000100_1001001, |
993 | 0b0_10000011_0000100, |
994 | 0b0_10000110_1111111, |
995 | 0b0_10001000_1111010, |
996 | 0b0_10001000_0010000, |
997 | ]; |
998 | assert_eq!(result, expected_result); |
999 | } |
1000 | |
1001 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1002 | unsafe fn test_mm256_mask_cvtneps_pbh() { |
1003 | #[rustfmt::skip] |
1004 | let a_array = [ |
1005 | 178.125_f32, |
1006 | 10.5_f32, |
1007 | 3.75_f32, |
1008 | 50.25_f32, |
1009 | 16.5_f32, |
1010 | 255.11_f32, |
1011 | 1000.158_f32, |
1012 | 575.575_f32, |
1013 | ]; |
1014 | let src_array: [u16; 8] = [ |
1015 | 0b1_10000110_0110010, |
1016 | 0b1_10000010_0101000, |
1017 | 0b1_10000000_1110000, |
1018 | 0b1_10000100_1001001, |
1019 | 0b1_10000011_0000100, |
1020 | 0b1_10000110_1111111, |
1021 | 0b1_10001000_1111010, |
1022 | 0b1_10001000_0010000, |
1023 | ]; |
1024 | let src: __m128bh = transmute(src_array); |
1025 | let a: __m256 = transmute(a_array); |
1026 | let k: __mmask8 = 0xff; |
1027 | let b = _mm256_mask_cvtneps_pbh(src, k, a); |
1028 | let result: [u16; 8] = transmute(b.as_u16x8()); |
1029 | #[rustfmt::skip] |
1030 | let expected_result: [u16; 8] = [ |
1031 | 0b0_10000110_0110010, |
1032 | 0b0_10000010_0101000, |
1033 | 0b0_10000000_1110000, |
1034 | 0b0_10000100_1001001, |
1035 | 0b0_10000011_0000100, |
1036 | 0b0_10000110_1111111, |
1037 | 0b0_10001000_1111010, |
1038 | 0b0_10001000_0010000, |
1039 | ]; |
1040 | assert_eq!(result, expected_result); |
1041 | let k: __mmask8 = 0x0; |
1042 | let b: __m128bh = _mm256_mask_cvtneps_pbh(src, k, a); |
1043 | let result: [u16; 8] = transmute(b.as_u16x8()); |
1044 | let expected_result: [u16; 8] = src_array; |
1045 | assert_eq!(result, expected_result); |
1046 | } |
1047 | |
1048 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1049 | unsafe fn test_mm256_maskz_cvtneps_pbh() { |
1050 | #[rustfmt::skip] |
1051 | let a_array = [ |
1052 | 178.125_f32, |
1053 | 10.5_f32, |
1054 | 3.75_f32, |
1055 | 50.25_f32, |
1056 | 16.5_f32, |
1057 | 255.11_f32, |
1058 | 1000.158_f32, |
1059 | 575.575_f32, |
1060 | ]; |
1061 | let a: __m256 = transmute(a_array); |
1062 | let k: __mmask8 = 0xff; |
1063 | let b = _mm256_maskz_cvtneps_pbh(k, a); |
1064 | let result: [u16; 8] = transmute(b.as_u16x8()); |
1065 | #[rustfmt::skip] |
1066 | let expected_result: [u16; 8] = [ |
1067 | 0b0_10000110_0110010, |
1068 | 0b0_10000010_0101000, |
1069 | 0b0_10000000_1110000, |
1070 | 0b0_10000100_1001001, |
1071 | 0b0_10000011_0000100, |
1072 | 0b0_10000110_1111111, |
1073 | 0b0_10001000_1111010, |
1074 | 0b0_10001000_0010000, |
1075 | ]; |
1076 | assert_eq!(result, expected_result); |
1077 | let k: __mmask8 = 0x6; |
1078 | let b: __m128bh = _mm256_maskz_cvtneps_pbh(k, a); |
1079 | let result: [u16; 8] = transmute(b.as_u16x8()); |
1080 | let expected_result: [u16; 8] = |
1081 | [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0]; |
1082 | assert_eq!(result, expected_result); |
1083 | } |
1084 | |
1085 | #[simd_test(enable = "avx512bf16,avx512f" )] |
1086 | unsafe fn test_mm512_cvtneps_pbh() { |
1087 | #[rustfmt::skip] |
1088 | let a_array = [ |
1089 | 178.125_f32, |
1090 | 10.5_f32, |
1091 | 3.75_f32, |
1092 | 50.25_f32, |
1093 | 16.5_f32, |
1094 | 255.11_f32, |
1095 | 1000.158_f32, |
1096 | 575.575_f32, |
1097 | 178.125_f32, |
1098 | 10.5_f32, |
1099 | 3.75_f32, |
1100 | 50.25_f32, |
1101 | 16.5_f32, |
1102 | 255.11_f32, |
1103 | 1000.158_f32, |
1104 | 575.575_f32, |
1105 | ]; |
1106 | let a: __m512 = transmute(a_array); |
1107 | let c: __m256bh = _mm512_cvtneps_pbh(a); |
1108 | let result: [u16; 16] = transmute(c.as_u16x16()); |
1109 | #[rustfmt::skip] |
1110 | let expected_result: [u16; 16] = [ |
1111 | 0b0_10000110_0110010, |
1112 | 0b0_10000010_0101000, |
1113 | 0b0_10000000_1110000, |
1114 | 0b0_10000100_1001001, |
1115 | 0b0_10000011_0000100, |
1116 | 0b0_10000110_1111111, |
1117 | 0b0_10001000_1111010, |
1118 | 0b0_10001000_0010000, |
1119 | 0b0_10000110_0110010, |
1120 | 0b0_10000010_0101000, |
1121 | 0b0_10000000_1110000, |
1122 | 0b0_10000100_1001001, |
1123 | 0b0_10000011_0000100, |
1124 | 0b0_10000110_1111111, |
1125 | 0b0_10001000_1111010, |
1126 | 0b0_10001000_0010000, |
1127 | ]; |
1128 | assert_eq!(result, expected_result); |
1129 | } |
1130 | |
1131 | #[simd_test(enable = "avx512bf16,avx512f" )] |
1132 | unsafe fn test_mm512_mask_cvtneps_pbh() { |
1133 | #[rustfmt::skip] |
1134 | let a_array = [ |
1135 | 178.125_f32, |
1136 | 10.5_f32, |
1137 | 3.75_f32, |
1138 | 50.25_f32, |
1139 | 16.5_f32, |
1140 | 255.11_f32, |
1141 | 1000.158_f32, |
1142 | 575.575_f32, |
1143 | 178.125_f32, |
1144 | 10.5_f32, |
1145 | 3.75_f32, |
1146 | 50.25_f32, |
1147 | 16.5_f32, |
1148 | 255.11_f32, |
1149 | 1000.158_f32, |
1150 | 575.575_f32, |
1151 | ]; |
1152 | let src_array: [u16; 16] = [ |
1153 | 0b1_10000110_0110010, |
1154 | 0b1_10000010_0101000, |
1155 | 0b1_10000000_1110000, |
1156 | 0b1_10000100_1001001, |
1157 | 0b1_10000011_0000100, |
1158 | 0b1_10000110_1111111, |
1159 | 0b1_10001000_1111010, |
1160 | 0b1_10001000_0010000, |
1161 | 0b1_10000110_0110010, |
1162 | 0b1_10000010_0101000, |
1163 | 0b1_10000000_1110000, |
1164 | 0b1_10000100_1001001, |
1165 | 0b1_10000011_0000100, |
1166 | 0b1_10000110_1111111, |
1167 | 0b1_10001000_1111010, |
1168 | 0b1_10001000_0010000, |
1169 | ]; |
1170 | let src: __m256bh = transmute(src_array); |
1171 | let a: __m512 = transmute(a_array); |
1172 | let k: __mmask16 = 0xffff; |
1173 | let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a); |
1174 | let result: [u16; 16] = transmute(c.as_u16x16()); |
1175 | #[rustfmt::skip] |
1176 | let expected_result: [u16; 16] = [ |
1177 | 0b0_10000110_0110010, |
1178 | 0b0_10000010_0101000, |
1179 | 0b0_10000000_1110000, |
1180 | 0b0_10000100_1001001, |
1181 | 0b0_10000011_0000100, |
1182 | 0b0_10000110_1111111, |
1183 | 0b0_10001000_1111010, |
1184 | 0b0_10001000_0010000, |
1185 | 0b0_10000110_0110010, |
1186 | 0b0_10000010_0101000, |
1187 | 0b0_10000000_1110000, |
1188 | 0b0_10000100_1001001, |
1189 | 0b0_10000011_0000100, |
1190 | 0b0_10000110_1111111, |
1191 | 0b0_10001000_1111010, |
1192 | 0b0_10001000_0010000, |
1193 | ]; |
1194 | assert_eq!(result, expected_result); |
1195 | let k: __mmask16 = 0; |
1196 | let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a); |
1197 | let result: [u16; 16] = transmute(c.as_u16x16()); |
1198 | let expected_result = src_array; |
1199 | assert_eq!(result, expected_result); |
1200 | } |
1201 | |
1202 | #[simd_test(enable = "avx512bf16,avx512f" )] |
1203 | unsafe fn test_mm512_maskz_cvtneps_pbh() { |
1204 | #[rustfmt::skip] |
1205 | let a_array = [ |
1206 | 178.125_f32, |
1207 | 10.5_f32, |
1208 | 3.75_f32, |
1209 | 50.25_f32, |
1210 | 16.5_f32, |
1211 | 255.11_f32, |
1212 | 1000.158_f32, |
1213 | 575.575_f32, |
1214 | 178.125_f32, |
1215 | 10.5_f32, |
1216 | 3.75_f32, |
1217 | 50.25_f32, |
1218 | 16.5_f32, |
1219 | 255.11_f32, |
1220 | 1000.158_f32, |
1221 | 575.575_f32, |
1222 | ]; |
1223 | let a: __m512 = transmute(a_array); |
1224 | let k: __mmask16 = 0xffff; |
1225 | let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a); |
1226 | let result: [u16; 16] = transmute(c.as_u16x16()); |
1227 | #[rustfmt::skip] |
1228 | let expected_result: [u16; 16] = [ |
1229 | 0b0_10000110_0110010, |
1230 | 0b0_10000010_0101000, |
1231 | 0b0_10000000_1110000, |
1232 | 0b0_10000100_1001001, |
1233 | 0b0_10000011_0000100, |
1234 | 0b0_10000110_1111111, |
1235 | 0b0_10001000_1111010, |
1236 | 0b0_10001000_0010000, |
1237 | 0b0_10000110_0110010, |
1238 | 0b0_10000010_0101000, |
1239 | 0b0_10000000_1110000, |
1240 | 0b0_10000100_1001001, |
1241 | 0b0_10000011_0000100, |
1242 | 0b0_10000110_1111111, |
1243 | 0b0_10001000_1111010, |
1244 | 0b0_10001000_0010000, |
1245 | ]; |
1246 | assert_eq!(result, expected_result); |
1247 | let k: __mmask16 = 0x653a; |
1248 | let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a); |
1249 | let result: [u16; 16] = transmute(c.as_u16x16()); |
1250 | #[rustfmt::skip] |
1251 | let expected_result: [u16; 16] = [ |
1252 | 0, |
1253 | 0b0_10000010_0101000, |
1254 | 0, |
1255 | 0b0_10000100_1001001, |
1256 | 0b0_10000011_0000100, |
1257 | 0b0_10000110_1111111, |
1258 | 0, |
1259 | 0, |
1260 | 0b0_10000110_0110010, |
1261 | 0, |
1262 | 0b0_10000000_1110000, |
1263 | 0, |
1264 | 0, |
1265 | 0b0_10000110_1111111, |
1266 | 0b0_10001000_1111010, |
1267 | 0, |
1268 | ]; |
1269 | assert_eq!(result, expected_result); |
1270 | } |
1271 | |
1272 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1273 | unsafe fn test_mm_dpbf16_ps() { |
1274 | let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; |
1275 | let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; |
1276 | let a1: __m128 = transmute(a_array); |
1277 | let b1: __m128 = transmute(b_array); |
1278 | let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); |
1279 | let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); |
1280 | let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); |
1281 | let c: __m128 = _mm_dpbf16_ps(src, a, b); |
1282 | let result: [f32; 4] = transmute(c.as_f32x4()); |
1283 | let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; |
1284 | assert_eq!(result, expected_result); |
1285 | } |
1286 | |
1287 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1288 | unsafe fn test_mm_mask_dpbf16_ps() { |
1289 | let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; |
1290 | let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; |
1291 | let a1: __m128 = transmute(a_array); |
1292 | let b1: __m128 = transmute(b_array); |
1293 | let k: __mmask8 = 0xf3; |
1294 | let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); |
1295 | let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); |
1296 | let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); |
1297 | let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); |
1298 | let result: [f32; 4] = transmute(c.as_f32x4()); |
1299 | let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32]; |
1300 | assert_eq!(result, expected_result); |
1301 | let k: __mmask8 = 0xff; |
1302 | let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); |
1303 | let result: [f32; 4] = transmute(c.as_f32x4()); |
1304 | let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; |
1305 | assert_eq!(result, expected_result); |
1306 | let k: __mmask8 = 0; |
1307 | let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); |
1308 | let result: [f32; 4] = transmute(c.as_f32x4()); |
1309 | let expected_result: [f32; 4] = [1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]; |
1310 | assert_eq!(result, expected_result); |
1311 | } |
1312 | |
1313 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1314 | unsafe fn test_mm_maskz_dpbf16_ps() { |
1315 | let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; |
1316 | let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; |
1317 | let a1: __m128 = transmute(a_array); |
1318 | let b1: __m128 = transmute(b_array); |
1319 | let k: __mmask8 = 0xf3; |
1320 | let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); |
1321 | let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); |
1322 | let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); |
1323 | let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); |
1324 | let result: [f32; 4] = transmute(c.as_f32x4()); |
1325 | let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 0.0, 0.0]; |
1326 | assert_eq!(result, expected_result); |
1327 | let k: __mmask8 = 0xff; |
1328 | let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); |
1329 | let result: [f32; 4] = transmute(c.as_f32x4()); |
1330 | let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; |
1331 | assert_eq!(result, expected_result); |
1332 | let k: __mmask8 = 0; |
1333 | let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); |
1334 | let result: [f32; 4] = transmute(c.as_f32x4()); |
1335 | let expected_result: [f32; 4] = [0.0, 0.0, 0.0, 0.0]; |
1336 | assert_eq!(result, expected_result); |
1337 | } |
1338 | |
1339 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1340 | unsafe fn test_mm256_dpbf16_ps() { |
1341 | #[rustfmt::skip] |
1342 | let a_array = [ |
1343 | 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, |
1344 | ]; |
1345 | let b_array = [ |
1346 | -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, |
1347 | ]; |
1348 | let a1: __m256 = transmute(a_array); |
1349 | let b1: __m256 = transmute(b_array); |
1350 | #[rustfmt::skip] |
1351 | let src: __m256 = transmute([ |
1352 | 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, |
1353 | ]); |
1354 | let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); |
1355 | let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); |
1356 | let c: __m256 = _mm256_dpbf16_ps(src, a, b); |
1357 | let result: [f32; 8] = transmute(c.as_f32x8()); |
1358 | #[rustfmt::skip] |
1359 | let expected_result: [f32; 8] = [ |
1360 | -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, |
1361 | ]; |
1362 | assert_eq!(result, expected_result); |
1363 | } |
1364 | |
1365 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1366 | unsafe fn test_mm256_mask_dpbf16_ps() { |
1367 | #[rustfmt::skip] |
1368 | let a_array = [ |
1369 | 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, |
1370 | ]; |
1371 | let b_array = [ |
1372 | -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, |
1373 | ]; |
1374 | let a1: __m256 = transmute(a_array); |
1375 | let b1: __m256 = transmute(b_array); |
1376 | let k: __mmask8 = 0x33; |
1377 | #[rustfmt::skip] |
1378 | let src: __m256 = transmute([ |
1379 | 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, |
1380 | ]); |
1381 | let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); |
1382 | let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); |
1383 | let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); |
1384 | let result: [f32; 8] = transmute(c.as_f32x8()); |
1385 | #[rustfmt::skip] |
1386 | let expected_result: [f32; 8] = [ |
1387 | -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, |
1388 | ]; |
1389 | assert_eq!(result, expected_result); |
1390 | let k: __mmask8 = 0xff; |
1391 | let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); |
1392 | let result: [f32; 8] = transmute(c.as_f32x8()); |
1393 | #[rustfmt::skip] |
1394 | let expected_result: [f32; 8] = [ |
1395 | -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, |
1396 | ]; |
1397 | assert_eq!(result, expected_result); |
1398 | let k: __mmask8 = 0; |
1399 | let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); |
1400 | let result: [f32; 8] = transmute(c.as_f32x8()); |
1401 | #[rustfmt::skip] |
1402 | let expected_result: [f32; 8] = [ |
1403 | 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, |
1404 | ]; |
1405 | assert_eq!(result, expected_result); |
1406 | } |
1407 | |
1408 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1409 | unsafe fn test_mm256_maskz_dpbf16_ps() { |
1410 | #[rustfmt::skip] |
1411 | let a_array = [ |
1412 | 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, |
1413 | ]; |
1414 | let b_array = [ |
1415 | -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, |
1416 | ]; |
1417 | let a1: __m256 = transmute(a_array); |
1418 | let b1: __m256 = transmute(b_array); |
1419 | let k: __mmask8 = 0x33; |
1420 | #[rustfmt::skip] |
1421 | let src: __m256 = transmute([ |
1422 | 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, |
1423 | ]); |
1424 | let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); |
1425 | let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); |
1426 | let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); |
1427 | let result: [f32; 8] = transmute(c.as_f32x8()); |
1428 | #[rustfmt::skip] |
1429 | let expected_result: [f32; 8] = [ |
1430 | -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, |
1431 | ]; |
1432 | assert_eq!(result, expected_result); |
1433 | let k: __mmask8 = 0xff; |
1434 | let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); |
1435 | let result: [f32; 8] = transmute(c.as_f32x8()); |
1436 | #[rustfmt::skip] |
1437 | let expected_result: [f32; 8] = [ |
1438 | -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, |
1439 | ]; |
1440 | assert_eq!(result, expected_result); |
1441 | let k: __mmask8 = 0; |
1442 | let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); |
1443 | let result: [f32; 8] = transmute(c.as_f32x8()); |
1444 | let expected_result: [f32; 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]; |
1445 | assert_eq!(result, expected_result); |
1446 | } |
1447 | |
1448 | #[simd_test(enable = "avx512bf16,avx512f" )] |
1449 | unsafe fn test_mm512_dpbf16_ps() { |
1450 | #[rustfmt::skip] |
1451 | let a_array = [ |
1452 | 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, |
1453 | 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, |
1454 | ]; |
1455 | let b_array = [ |
1456 | -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, |
1457 | -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, |
1458 | ]; |
1459 | let a1: __m512 = transmute(a_array); |
1460 | let b1: __m512 = transmute(b_array); |
1461 | let src: __m512 = transmute([ |
1462 | 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, |
1463 | 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, |
1464 | ]); |
1465 | let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); |
1466 | let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); |
1467 | let c: __m512 = _mm512_dpbf16_ps(src, a, b); |
1468 | let result: [f32; 16] = transmute(c.as_f32x16()); |
1469 | #[rustfmt::skip] |
1470 | let expected_result: [f32; 16] = [ |
1471 | -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, |
1472 | -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, |
1473 | ]; |
1474 | assert_eq!(result, expected_result); |
1475 | } |
1476 | |
1477 | #[simd_test(enable = "avx512bf16,avx512f" )] |
1478 | unsafe fn test_mm512_mask_dpbf16_ps() { |
1479 | #[rustfmt::skip] |
1480 | let a_array = [ |
1481 | 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, |
1482 | 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, |
1483 | ]; |
1484 | let b_array = [ |
1485 | -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, |
1486 | -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, |
1487 | ]; |
1488 | let a1: __m512 = transmute(a_array); |
1489 | let b1: __m512 = transmute(b_array); |
1490 | let k: __mmask16 = 0x3333; |
1491 | #[rustfmt::skip] |
1492 | let src: __m512 = transmute([ |
1493 | 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, |
1494 | 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, |
1495 | ]); |
1496 | let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); |
1497 | let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); |
1498 | let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); |
1499 | let result: [f32; 16] = transmute(c.as_f32x16()); |
1500 | #[rustfmt::skip] |
1501 | let expected_result: [f32; 16] = [ |
1502 | -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, |
1503 | -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, |
1504 | ]; |
1505 | assert_eq!(result, expected_result); |
1506 | let k: __mmask16 = 0xffff; |
1507 | let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); |
1508 | let result: [f32; 16] = transmute(c.as_f32x16()); |
1509 | #[rustfmt::skip] |
1510 | let expected_result: [f32; 16] = [ |
1511 | -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, |
1512 | -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, |
1513 | ]; |
1514 | assert_eq!(result, expected_result); |
1515 | let k: __mmask16 = 0; |
1516 | let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); |
1517 | let result: [f32; 16] = transmute(c.as_f32x16()); |
1518 | #[rustfmt::skip] |
1519 | let expected_result: [f32; 16] = [ |
1520 | 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, |
1521 | 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, |
1522 | ]; |
1523 | assert_eq!(result, expected_result); |
1524 | } |
1525 | |
1526 | #[simd_test(enable = "avx512bf16,avx512f" )] |
1527 | unsafe fn test_mm512_maskz_dpbf16_ps() { |
1528 | #[rustfmt::skip] |
1529 | let a_array = [ |
1530 | 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, |
1531 | 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, |
1532 | ]; |
1533 | let b_array = [ |
1534 | -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, |
1535 | -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, |
1536 | ]; |
1537 | let a1: __m512 = transmute(a_array); |
1538 | let b1: __m512 = transmute(b_array); |
1539 | let k: __mmask16 = 0x3333; |
1540 | #[rustfmt::skip] |
1541 | let src: __m512 = transmute([ |
1542 | 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, |
1543 | 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, |
1544 | ]); |
1545 | let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); |
1546 | let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); |
1547 | let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); |
1548 | let result: [f32; 16] = transmute(c.as_f32x16()); |
1549 | #[rustfmt::skip] |
1550 | let expected_result: [f32; 16] = [ |
1551 | -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, |
1552 | 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, |
1553 | ]; |
1554 | assert_eq!(result, expected_result); |
1555 | let k: __mmask16 = 0xffff; |
1556 | let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); |
1557 | let result: [f32; 16] = transmute(c.as_f32x16()); |
1558 | #[rustfmt::skip] |
1559 | let expected_result: [f32; 16] = [ |
1560 | -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, |
1561 | -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, |
1562 | ]; |
1563 | assert_eq!(result, expected_result); |
1564 | let k: __mmask16 = 0; |
1565 | let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); |
1566 | let result: [f32; 16] = transmute(c.as_f32x16()); |
1567 | #[rustfmt::skip] |
1568 | let expected_result: [f32; 16] = [ |
1569 | 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, |
1570 | ]; |
1571 | assert_eq!(result, expected_result); |
1572 | } |
1573 | } |
1574 | |