1 | //! [AVX512BF16 intrinsics]. |
2 | //! |
3 | //! [AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16 |
4 | |
5 | use crate::arch::asm; |
6 | use crate::core_arch::{simd::*, x86::*}; |
7 | use crate::intrinsics::simd::*; |
8 | |
9 | #[cfg (test)] |
10 | use stdarch_test::assert_instr; |
11 | |
12 | #[allow (improper_ctypes)] |
13 | unsafe extern "C" { |
14 | #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.128" ] |
15 | unsafefn cvtne2ps2bf16(a: f32x4, b: f32x4) -> i16x8; |
16 | #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.256" ] |
17 | unsafefn cvtne2ps2bf16_256(a: f32x8, b: f32x8) -> i16x16; |
18 | #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.512" ] |
19 | unsafefn cvtne2ps2bf16_512(a: f32x16, b: f32x16) -> i16x32; |
20 | #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.256" ] |
21 | unsafefn cvtneps2bf16_256(a: f32x8) -> i16x8; |
22 | #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.512" ] |
23 | unsafefn cvtneps2bf16_512(a: f32x16) -> i16x16; |
24 | #[link_name = "llvm.x86.avx512bf16.dpbf16ps.128" ] |
25 | unsafefn dpbf16ps(a: f32x4, b: i32x4, c: i32x4) -> f32x4; |
26 | #[link_name = "llvm.x86.avx512bf16.dpbf16ps.256" ] |
27 | unsafefn dpbf16ps_256(a: f32x8, b: i32x8, c: i32x8) -> f32x8; |
28 | #[link_name = "llvm.x86.avx512bf16.dpbf16ps.512" ] |
29 | unsafefn dpbf16ps_512(a: f32x16, b: i32x16, c: i32x16) -> f32x16; |
30 | } |
31 | |
32 | /// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors |
33 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a |
34 | /// 128-bit wide vector. |
35 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_cvtne2ps_pbh) |
36 | #[inline ] |
37 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
38 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
39 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
40 | pub fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh { |
41 | unsafe { transmute(src:cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4())) } |
42 | } |
43 | |
44 | /// Convert packed single-precision (32-bit) floating-point elements in two vectors |
45 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results |
46 | /// in single vector dst using writemask k (elements are copied from src when the |
47 | /// corresponding mask bit is not set). |
48 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_mask_cvtne2ps_pbh) |
49 | #[inline ] |
50 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
51 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
52 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
53 | pub fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh { |
54 | unsafe { |
55 | let cvt: u16x8 = _mm_cvtne2ps_pbh(a, b).as_u16x8(); |
56 | transmute(src:simd_select_bitmask(m:k, yes:cvt, no:src.as_u16x8())) |
57 | } |
58 | } |
59 | |
60 | /// Convert packed single-precision (32-bit) floating-point elements in two vectors |
61 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results |
62 | /// in single vector dst using zeromask k (elements are zeroed out when the corresponding |
63 | /// mask bit is not set). |
64 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_maskz_cvtne2ps_pbh) |
65 | #[inline ] |
66 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
67 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
68 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
69 | pub fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh { |
70 | unsafe { |
71 | let cvt: u16x8 = _mm_cvtne2ps_pbh(a, b).as_u16x8(); |
72 | transmute(src:simd_select_bitmask(m:k, yes:cvt, no:u16x8::ZERO)) |
73 | } |
74 | } |
75 | |
76 | /// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors |
77 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a |
78 | /// 256-bit wide vector. |
79 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_cvtne2ps_pbh) |
80 | #[inline ] |
81 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
82 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
83 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
84 | pub fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh { |
85 | unsafe { transmute(src:cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8())) } |
86 | } |
87 | |
88 | /// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b |
89 | /// to packed BF16 (16-bit) floating-point elements and store the results in single vector |
90 | /// dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
91 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh) |
92 | #[inline ] |
93 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
94 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
95 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
96 | pub fn _mm256_mask_cvtne2ps_pbh(src: __m256bh, k: __mmask16, a: __m256, b: __m256) -> __m256bh { |
97 | unsafe { |
98 | let cvt: u16x16 = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); |
99 | transmute(src:simd_select_bitmask(m:k, yes:cvt, no:src.as_u16x16())) |
100 | } |
101 | } |
102 | |
103 | /// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b |
104 | /// to packed BF16 (16-bit) floating-point elements, and store the results in single vector |
105 | /// dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
106 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtne2ps_pbh) |
107 | #[inline ] |
108 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
109 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
110 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
111 | pub fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh { |
112 | unsafe { |
113 | let cvt: u16x16 = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); |
114 | transmute(src:simd_select_bitmask(m:k, yes:cvt, no:u16x16::ZERO)) |
115 | } |
116 | } |
117 | |
118 | /// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors |
119 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a |
120 | /// 512-bit wide vector. |
121 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_cvtne2ps_pbh) |
122 | #[inline ] |
123 | #[target_feature (enable = "avx512bf16,avx512f" )] |
124 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
125 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
126 | pub fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh { |
127 | unsafe { transmute(src:cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16())) } |
128 | } |
129 | |
130 | /// Convert packed single-precision (32-bit) floating-point elements in two vectors |
131 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results |
132 | /// in single vector dst using writemask k (elements are copied from src when the |
133 | /// corresponding mask bit is not set). |
134 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_mask_cvtne2ps_pbh) |
135 | #[inline ] |
136 | #[target_feature (enable = "avx512bf16,avx512f" )] |
137 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
138 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
139 | pub fn _mm512_mask_cvtne2ps_pbh(src: __m512bh, k: __mmask32, a: __m512, b: __m512) -> __m512bh { |
140 | unsafe { |
141 | let cvt: u16x32 = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); |
142 | transmute(src:simd_select_bitmask(m:k, yes:cvt, no:src.as_u16x32())) |
143 | } |
144 | } |
145 | |
146 | /// Convert packed single-precision (32-bit) floating-point elements in two vectors |
147 | /// a and b to packed BF16 (16-bit) floating-point elements, and store the results |
148 | /// in single vector dst using zeromask k (elements are zeroed out when the corresponding |
149 | /// mask bit is not set). |
150 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtne2ps_pbh) |
151 | #[inline ] |
152 | #[target_feature (enable = "avx512bf16,avx512f" )] |
153 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
154 | #[cfg_attr (test, assert_instr("vcvtne2ps2bf16" ))] |
155 | pub fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh { |
156 | unsafe { |
157 | let cvt: u16x32 = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); |
158 | transmute(src:simd_select_bitmask(m:k, yes:cvt, no:u16x32::ZERO)) |
159 | } |
160 | } |
161 | |
162 | /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
163 | /// floating-point elements, and store the results in dst. |
164 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_cvtneps_pbh) |
165 | #[inline ] |
166 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
167 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
168 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
169 | pub fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh { |
170 | unsafe { transmute(src:cvtneps2bf16_256(a.as_f32x8())) } |
171 | } |
172 | |
173 | /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
174 | /// floating-point elements, and store the results in dst using writemask k |
175 | /// (elements are copied from src when the corresponding mask bit is not set). |
176 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_cvtneps_pbh) |
177 | #[inline ] |
178 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
179 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
180 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
181 | pub fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh { |
182 | unsafe { |
183 | let cvt: u16x8 = _mm256_cvtneps_pbh(a).as_u16x8(); |
184 | transmute(src:simd_select_bitmask(m:k, yes:cvt, no:src.as_u16x8())) |
185 | } |
186 | } |
187 | |
188 | /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
189 | /// floating-point elements, and store the results in dst using zeromask k |
190 | /// (elements are zeroed out when the corresponding mask bit is not set). |
191 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtneps_pbh) |
192 | #[inline ] |
193 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
194 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
195 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
196 | pub fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh { |
197 | unsafe { |
198 | let cvt: u16x8 = _mm256_cvtneps_pbh(a).as_u16x8(); |
199 | transmute(src:simd_select_bitmask(m:k, yes:cvt, no:u16x8::ZERO)) |
200 | } |
201 | } |
202 | |
203 | /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
204 | /// floating-point elements, and store the results in dst. |
205 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_cvtneps_pbh) |
206 | #[inline ] |
207 | #[target_feature (enable = "avx512bf16,avx512f" )] |
208 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
209 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
210 | pub fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh { |
211 | unsafe { transmute(src:cvtneps2bf16_512(a.as_f32x16())) } |
212 | } |
213 | |
214 | /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
215 | /// floating-point elements, and store the results in dst using writemask k |
216 | /// (elements are copied from src when the corresponding mask bit is not set). |
217 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_cvtneps_pbh) |
218 | #[inline ] |
219 | #[target_feature (enable = "avx512bf16,avx512f" )] |
220 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
221 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
222 | pub fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh { |
223 | unsafe { |
224 | let cvt: u16x16 = _mm512_cvtneps_pbh(a).as_u16x16(); |
225 | transmute(src:simd_select_bitmask(m:k, yes:cvt, no:src.as_u16x16())) |
226 | } |
227 | } |
228 | |
229 | /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
230 | /// floating-point elements, and store the results in dst using zeromask k |
231 | /// (elements are zeroed out when the corresponding mask bit is not set). |
232 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtneps_pbh) |
233 | #[inline ] |
234 | #[target_feature (enable = "avx512bf16,avx512f" )] |
235 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
236 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
237 | pub fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh { |
238 | unsafe { |
239 | let cvt: u16x16 = _mm512_cvtneps_pbh(a).as_u16x16(); |
240 | transmute(src:simd_select_bitmask(m:k, yes:cvt, no:u16x16::ZERO)) |
241 | } |
242 | } |
243 | |
244 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
245 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
246 | /// with elements in src, and store the results in dst. |
247 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_dpbf16_ps) |
248 | #[inline ] |
249 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
250 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
251 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
252 | pub fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 { |
253 | unsafe { transmute(src:dpbf16ps(a:src.as_f32x4(), b:a.as_i32x4(), c:b.as_i32x4())) } |
254 | } |
255 | |
256 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
257 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
258 | /// with elements in src, and store the results in dst using writemask k |
259 | /// (elements are copied from src when the corresponding mask bit is not set). |
260 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_mask_dpbf16_ps) |
261 | #[inline ] |
262 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
263 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
264 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
265 | pub fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 { |
266 | unsafe { |
267 | let rst: f32x4 = _mm_dpbf16_ps(src, a, b).as_f32x4(); |
268 | transmute(src:simd_select_bitmask(m:k, yes:rst, no:src.as_f32x4())) |
269 | } |
270 | } |
271 | |
272 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
273 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
274 | /// with elements in src, and store the results in dst using zeromask k |
275 | /// (elements are zeroed out when the corresponding mask bit is not set). |
276 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_maskz_dpbf16_ps) |
277 | #[inline ] |
278 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
279 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
280 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
281 | pub fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 { |
282 | unsafe { |
283 | let rst: f32x4 = _mm_dpbf16_ps(src, a, b).as_f32x4(); |
284 | let zero: f32x4 = _mm_set1_ps(0.0_f32).as_f32x4(); |
285 | transmute(src:simd_select_bitmask(m:k, yes:rst, no:zero)) |
286 | } |
287 | } |
288 | |
289 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
290 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
291 | /// with elements in src, and store the results in dst. |
292 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_dpbf16_ps) |
293 | #[inline ] |
294 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
295 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
296 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
297 | pub fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 { |
298 | unsafe { transmute(src:dpbf16ps_256(a:src.as_f32x8(), b:a.as_i32x8(), c:b.as_i32x8())) } |
299 | } |
300 | |
301 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
302 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
303 | /// with elements in src, and store the results in dst using writemask k |
304 | /// (elements are copied from src when the corresponding mask bit is not set). |
305 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_dpbf16_ps) |
306 | #[inline ] |
307 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
308 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
309 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
310 | pub fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 { |
311 | unsafe { |
312 | let rst: f32x8 = _mm256_dpbf16_ps(src, a, b).as_f32x8(); |
313 | transmute(src:simd_select_bitmask(m:k, yes:rst, no:src.as_f32x8())) |
314 | } |
315 | } |
316 | |
317 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
318 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
319 | /// with elements in src, and store the results in dst using zeromask k |
320 | /// (elements are zeroed out when the corresponding mask bit is not set). |
321 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_dpbf16_ps) |
322 | #[inline ] |
323 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
324 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
325 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
326 | pub fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 { |
327 | unsafe { |
328 | let rst: f32x8 = _mm256_dpbf16_ps(src, a, b).as_f32x8(); |
329 | transmute(src:simd_select_bitmask(m:k, yes:rst, no:f32x8::ZERO)) |
330 | } |
331 | } |
332 | |
333 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
334 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
335 | /// with elements in src, and store the results in dst.Compute dot-product of BF16 (16-bit) |
336 | /// floating-point pairs in a and b, accumulating the intermediate single-precision (32-bit) |
337 | /// floating-point elements with elements in src, and store the results in dst. |
338 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_dpbf16_ps) |
339 | #[inline ] |
340 | #[target_feature (enable = "avx512bf16,avx512f" )] |
341 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
342 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
343 | pub fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 { |
344 | unsafe { transmute(src:dpbf16ps_512(a:src.as_f32x16(), b:a.as_i32x16(), c:b.as_i32x16())) } |
345 | } |
346 | |
347 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
348 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
349 | /// with elements in src, and store the results in dst using writemask k |
350 | /// (elements are copied from src when the corresponding mask bit is not set). |
351 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_dpbf16_ps) |
352 | #[inline ] |
353 | #[target_feature (enable = "avx512bf16,avx512f" )] |
354 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
355 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
356 | pub fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 { |
357 | unsafe { |
358 | let rst: f32x16 = _mm512_dpbf16_ps(src, a, b).as_f32x16(); |
359 | transmute(src:simd_select_bitmask(m:k, yes:rst, no:src.as_f32x16())) |
360 | } |
361 | } |
362 | |
363 | /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, |
364 | /// accumulating the intermediate single-precision (32-bit) floating-point elements |
365 | /// with elements in src, and store the results in dst using zeromask k |
366 | /// (elements are zeroed out when the corresponding mask bit is not set). |
367 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_dpbf16_ps) |
368 | #[inline ] |
369 | #[target_feature (enable = "avx512bf16,avx512f" )] |
370 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
371 | #[cfg_attr (test, assert_instr("vdpbf16ps" ))] |
372 | pub fn _mm512_maskz_dpbf16_ps(k: __mmask16, src: __m512, a: __m512bh, b: __m512bh) -> __m512 { |
373 | unsafe { |
374 | let rst: f32x16 = _mm512_dpbf16_ps(src, a, b).as_f32x16(); |
375 | transmute(src:simd_select_bitmask(m:k, yes:rst, no:f32x16::ZERO)) |
376 | } |
377 | } |
378 | |
379 | /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) |
380 | /// floating-point elements, and store the results in dst. |
381 | /// |
382 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps) |
383 | #[inline ] |
384 | #[target_feature (enable = "avx512bf16,avx512f" )] |
385 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
386 | pub fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 { |
387 | unsafe { _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(src:a)))) } |
388 | } |
389 | |
390 | /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) |
391 | /// floating-point elements, and store the results in dst using writemask k (elements are copied |
392 | /// from src when the corresponding mask bit is not set). |
393 | /// |
394 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpbh_ps) |
395 | #[inline ] |
396 | #[target_feature (enable = "avx512bf16,avx512f" )] |
397 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
398 | pub fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 { |
399 | unsafe { |
400 | let cvt: __m512 = _mm512_cvtpbh_ps(a); |
401 | transmute(src:simd_select_bitmask(m:k, yes:cvt.as_f32x16(), no:src.as_f32x16())) |
402 | } |
403 | } |
404 | |
405 | /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) |
406 | /// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out |
407 | /// when the corresponding mask bit is not set). |
408 | /// |
409 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpbh_ps) |
410 | #[inline ] |
411 | #[target_feature (enable = "avx512bf16,avx512f" )] |
412 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
413 | pub fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 { |
414 | unsafe { |
415 | let cvt: __m512 = _mm512_cvtpbh_ps(a); |
416 | transmute(src:simd_select_bitmask(m:k, yes:cvt.as_f32x16(), no:f32x16::ZERO)) |
417 | } |
418 | } |
419 | |
420 | /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) |
421 | /// floating-point elements, and store the results in dst. |
422 | /// |
423 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpbh_ps) |
424 | #[inline ] |
425 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
426 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
427 | pub fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 { |
428 | unsafe { _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(src:a)))) } |
429 | } |
430 | |
431 | /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) |
432 | /// floating-point elements, and store the results in dst using writemask k (elements are copied |
433 | /// from src when the corresponding mask bit is not set). |
434 | /// |
435 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpbh_ps) |
436 | #[inline ] |
437 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
438 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
439 | pub fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __m256 { |
440 | unsafe { |
441 | let cvt: __m256 = _mm256_cvtpbh_ps(a); |
442 | transmute(src:simd_select_bitmask(m:k, yes:cvt.as_f32x8(), no:src.as_f32x8())) |
443 | } |
444 | } |
445 | |
446 | /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) |
447 | /// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out |
448 | /// when the corresponding mask bit is not set). |
449 | /// |
450 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpbh_ps) |
451 | #[inline ] |
452 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
453 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
454 | pub fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 { |
455 | unsafe { |
456 | let cvt: __m256 = _mm256_cvtpbh_ps(a); |
457 | transmute(src:simd_select_bitmask(m:k, yes:cvt.as_f32x8(), no:f32x8::ZERO)) |
458 | } |
459 | } |
460 | |
461 | /// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point |
462 | /// elements, and store the results in dst. |
463 | /// |
464 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpbh_ps) |
465 | #[inline ] |
466 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
467 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
468 | pub fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 { |
469 | unsafe { _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(src:a)))) } |
470 | } |
471 | |
472 | /// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point |
473 | /// elements, and store the results in dst using writemask k (elements are copied from src when the corresponding |
474 | /// mask bit is not set). |
475 | /// |
476 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpbh_ps) |
477 | #[inline ] |
478 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
479 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
480 | pub fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 { |
481 | unsafe { |
482 | let cvt: __m128 = _mm_cvtpbh_ps(a); |
483 | transmute(src:simd_select_bitmask(m:k, yes:cvt.as_f32x4(), no:src.as_f32x4())) |
484 | } |
485 | } |
486 | |
487 | /// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point |
488 | /// elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding |
489 | /// mask bit is not set). |
490 | /// |
491 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpbh_ps) |
492 | #[inline ] |
493 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
494 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
495 | pub fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 { |
496 | unsafe { |
497 | let cvt: __m128 = _mm_cvtpbh_ps(a); |
498 | transmute(src:simd_select_bitmask(m:k, yes:cvt.as_f32x4(), no:f32x4::ZERO)) |
499 | } |
500 | } |
501 | |
502 | /// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point |
503 | /// element, and store the result in dst. |
504 | /// |
505 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss) |
506 | #[inline ] |
507 | #[target_feature (enable = "avx512bf16,avx512f" )] |
508 | #[unstable (feature = "stdarch_x86_avx512_bf16" , issue = "127356" )] |
509 | pub fn _mm_cvtsbh_ss(a: bf16) -> f32 { |
510 | f32::from_bits((a.to_bits() as u32) << 16) |
511 | } |
512 | |
513 | /// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
514 | /// floating-point elements, and store the results in dst. |
515 | /// |
516 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh) |
517 | #[inline ] |
518 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
519 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
520 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
521 | pub fn _mm_cvtneps_pbh(a: __m128) -> __m128bh { |
522 | unsafe { |
523 | let mut dst: __m128bh; |
524 | asm!( |
525 | "vcvtneps2bf16 { dst}, { src}" , |
526 | dst = lateout(xmm_reg) dst, |
527 | src = in(xmm_reg) a, |
528 | options(pure, nomem, nostack, preserves_flags) |
529 | ); |
530 | dst |
531 | } |
532 | } |
533 | |
534 | /// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
535 | /// floating-point elements, and store the results in dst using writemask k (elements are copied |
536 | /// from src when the corresponding mask bit is not set). |
537 | /// |
538 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh) |
539 | #[inline ] |
540 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
541 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
542 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
543 | pub fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh { |
544 | unsafe { |
545 | let mut dst: __m128bh = src; |
546 | asm!( |
547 | "vcvtneps2bf16 { dst}{{{ k}}},{ src}" , |
548 | dst = inlateout(xmm_reg) dst, |
549 | src = in(xmm_reg) a, |
550 | k = in(kreg) k, |
551 | options(pure, nomem, nostack, preserves_flags) |
552 | ); |
553 | dst |
554 | } |
555 | } |
556 | |
557 | /// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) |
558 | /// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out |
559 | /// when the corresponding mask bit is not set). |
560 | /// |
561 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh) |
562 | #[inline ] |
563 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
564 | #[cfg_attr (test, assert_instr("vcvtneps2bf16" ))] |
565 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
566 | pub fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh { |
567 | unsafe { |
568 | let mut dst: __m128bh; |
569 | asm!( |
570 | "vcvtneps2bf16 { dst}{{{ k}}}{{z}},{ src}" , |
571 | dst = lateout(xmm_reg) dst, |
572 | src = in(xmm_reg) a, |
573 | k = in(kreg) k, |
574 | options(pure, nomem, nostack, preserves_flags) |
575 | ); |
576 | dst |
577 | } |
578 | } |
579 | |
580 | /// Converts a single-precision (32-bit) floating-point element in a to a BF16 (16-bit) floating-point |
581 | /// element, and store the result in dst. |
582 | /// |
583 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh) |
584 | #[inline ] |
585 | #[target_feature (enable = "avx512bf16,avx512vl" )] |
586 | #[unstable (feature = "stdarch_x86_avx512_bf16" , issue = "127356" )] |
587 | pub fn _mm_cvtness_sbh(a: f32) -> bf16 { |
588 | unsafe { |
589 | let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0); |
590 | bf16::from_bits(value) |
591 | } |
592 | } |
593 | |
594 | #[cfg (test)] |
595 | mod tests { |
596 | use crate::core_arch::simd::u16x4; |
597 | use crate::{ |
598 | core_arch::x86::*, |
599 | mem::{transmute, transmute_copy}, |
600 | }; |
601 | use stdarch_test::simd_test; |
602 | |
    // Round-trip check: converting 4+4 f32s yields the expected raw BF16 bit
    // patterns (sign_exponent_mantissa). Low lanes come from `b` (sign bit 1),
    // high lanes from `a`.
    #[simd_test(enable = "avx512bf16,avx512vl" )]
    unsafe fn test_mm_cvtne2ps_pbh() {
        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
        let a: __m128 = transmute(a_array);
        let b: __m128 = transmute(b_array);
        let c: __m128bh = _mm_cvtne2ps_pbh(a, b);
        let result: [u16; 8] = transmute(c.as_u16x8());
        #[rustfmt::skip]
        let expected_result: [u16; 8] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
        ];
        assert_eq!(result, expected_result);
    }
624 | |
    // Merge-masking: with k = all-ones every lane is converted; with k = 0
    // every lane is copied unchanged from `src`.
    #[simd_test(enable = "avx512bf16,avx512vl" )]
    unsafe fn test_mm_mask_cvtne2ps_pbh() {
        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
        #[rustfmt::skip]
        let src_array: [u16; 8] = [
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
        ];
        let src: __m128bh = transmute(src_array);
        let a: __m128 = transmute(a_array);
        let b: __m128 = transmute(b_array);
        let k: __mmask8 = 0b1111_1111;
        let c: __m128bh = _mm_mask_cvtne2ps_pbh(src, k, a, b);
        let result: [u16; 8] = transmute(c.as_u16x8());
        #[rustfmt::skip]
        let expected_result: [u16; 8] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
        ];
        assert_eq!(result, expected_result);
        // All-zero mask: the result must be exactly `src`.
        let k = 0b0000_0000;
        let c = _mm_mask_cvtne2ps_pbh(src, k, a, b);
        let result: [u16; 8] = transmute(c.as_u16x8());
        let expected_result = src_array;
        assert_eq!(result, expected_result);
    }
664 | |
    // Zero-masking: lanes whose mask bit is clear become 0 instead of being
    // copied from a source vector.
    #[simd_test(enable = "avx512bf16,avx512vl" )]
    unsafe fn test_mm_maskz_cvtne2ps_pbh() {
        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
        let a: __m128 = transmute(a_array);
        let b: __m128 = transmute(b_array);
        let k: __mmask8 = 0b1111_1111;
        let c: __m128bh = _mm_maskz_cvtne2ps_pbh(k, a, b);
        let result: [u16; 8] = transmute(c.as_u16x8());
        #[rustfmt::skip]
        let expected_result: [u16; 8] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
        ];
        assert_eq!(result, expected_result);
        // Partial mask: cleared bits (0, 1, 6, 7) must come out as zero lanes.
        let k = 0b0011_1100;
        let c = _mm_maskz_cvtne2ps_pbh(k, a, b);
        let result: [u16; 8] = transmute(c.as_u16x8());
        #[rustfmt::skip]
        let expected_result: [u16; 8] = [
            0,
            0,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0,
            0,
        ];
        assert_eq!(result, expected_result);
    }
702 | |
    // 256-bit variant of the two-vector conversion: 8+8 f32 inputs, 16 BF16
    // outputs; low 8 lanes from `b`, high 8 from `a`.
    #[simd_test(enable = "avx512bf16,avx512vl" )]
    unsafe fn test_mm256_cvtne2ps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let b_array = [
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
        ];
        let a: __m256 = transmute(a_array);
        let b: __m256 = transmute(b_array);
        let c: __m256bh = _mm256_cvtne2ps_pbh(a, b);
        let result: [u16; 16] = transmute(c.as_u16x16());
        #[rustfmt::skip]
        let expected_result: [u16; 16] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
    }
751 | |
    // 256-bit merge-masking: all-ones mask converts every lane; zero mask
    // returns `src` untouched.
    #[simd_test(enable = "avx512bf16,avx512vl" )]
    unsafe fn test_mm256_mask_cvtne2ps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let b_array = [
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
        ];
        let src_array: [u16; 16] = [
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
        ];
        let src: __m256bh = transmute(src_array);
        let a: __m256 = transmute(a_array);
        let b: __m256 = transmute(b_array);
        let k: __mmask16 = 0xffff;
        let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
        let result: [u16; 16] = transmute(c.as_u16x16());
        #[rustfmt::skip]
        let expected_result: [u16; 16] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
        // All-zero mask: result must equal `src`.
        let k: __mmask16 = 0;
        let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
        let result: [u16; 16] = transmute(c.as_u16x16());
        let expected_result = src_array;
        assert_eq!(result, expected_result);
    }
825 | |
    // 256-bit zero-masking: cleared mask bits produce zero lanes; a mixed
    // mask pattern is checked lane-by-lane.
    #[simd_test(enable = "avx512bf16,avx512vl" )]
    unsafe fn test_mm256_maskz_cvtne2ps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let b_array = [
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
        ];
        let a: __m256 = transmute(a_array);
        let b: __m256 = transmute(b_array);
        let k: __mmask16 = 0xffff;
        let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
        let result: [u16; 16] = transmute(c.as_u16x16());
        #[rustfmt::skip]
        let expected_result: [u16; 16] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
        // Mixed mask: zero lanes line up with the clear bits of `k`.
        let k: __mmask16 = 0b0110_1100_0011_0110;
        let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
        let result: [u16; 16] = transmute(c.as_u16x16());
        #[rustfmt::skip]
        let expected_result: [u16; 16] = [
            0,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0,
            0,
            0,
            0,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0,
        ];
        assert_eq!(result, expected_result);
    }
898 | |
    // 512-bit variant: 16+16 f32 inputs produce 32 BF16 outputs; low 16 lanes
    // from `b`, high 16 from `a`.
    #[simd_test(enable = "avx512bf16,avx512f" )]
    unsafe fn test_mm512_cvtne2ps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let b_array = [
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
        ];
        let a: __m512 = transmute(a_array);
        let b: __m512 = transmute(b_array);
        let c: __m512bh = _mm512_cvtne2ps_pbh(a, b);
        let result: [u16; 32] = transmute(c.as_u16x32());
        #[rustfmt::skip]
        let expected_result: [u16; 32] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
    }
979 | |
    // 512-bit merge-masking: all-ones mask converts every lane; zero mask
    // returns `src` untouched.
    #[simd_test(enable = "avx512bf16,avx512f" )]
    unsafe fn test_mm512_mask_cvtne2ps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let b_array = [
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
        ];
        let src_array: [u16; 32] = [
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
        ];
        let src: __m512bh = transmute(src_array);
        let a: __m512 = transmute(a_array);
        let b: __m512 = transmute(b_array);
        let k: __mmask32 = 0xffffffff;
        let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
        let result: [u16; 32] = transmute(c.as_u16x32());
        #[rustfmt::skip]
        let expected_result: [u16; 32] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
        // All-zero mask: result must equal `src`.
        let k: __mmask32 = 0;
        let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
        let result: [u16; 32] = transmute(c.as_u16x32());
        let expected_result = src_array;
        assert_eq!(result, expected_result);
    }
1101 | |
    // 512-bit zero-masking: full mask converts all 32 lanes; a mixed 32-bit
    // mask zeroes exactly the lanes whose bits are clear.
    #[simd_test(enable = "avx512bf16,avx512f" )]
    unsafe fn test_mm512_maskz_cvtne2ps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let b_array = [
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
            -178.125_f32,
            -10.5_f32,
            -3.75_f32,
            -50.25_f32,
            -16.5_f32,
            -255.11_f32,
            -1000.158_f32,
            -575.575_f32,
        ];
        let a: __m512 = transmute(a_array);
        let b: __m512 = transmute(b_array);
        let k: __mmask32 = 0xffffffff;
        let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
        let result: [u16; 32] = transmute(c.as_u16x32());
        #[rustfmt::skip]
        let expected_result: [u16; 32] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
        // Mixed mask: zero lanes line up with the clear bits of `k`.
        let k: __mmask32 = 0b1100_1010_1001_0110_1010_0011_0101_0110;
        let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
        let result: [u16; 32] = transmute(c.as_u16x32());
        #[rustfmt::skip]
        let expected_result: [u16; 32] = [
            0,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0,
            0b1_10000011_0000100,
            0,
            0b1_10001000_1111010,
            0,
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0,
            0,
            0,
            0b1_10000110_1111111,
            0,
            0b1_10001000_0010000,
            0,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0,
            0b0_10000011_0000100,
            0,
            0,
            0b0_10001000_0010000,
            0,
            0b0_10000010_0101000,
            0,
            0b0_10000100_1001001,
            0,
            0,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
    }
1222 | |
    // Single-vector narrowing: 8 f32s -> 8 BF16s (128-bit result); all signs
    // positive, so every expected pattern has sign bit 0.
    #[simd_test(enable = "avx512bf16,avx512vl" )]
    unsafe fn test_mm256_cvtneps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let a: __m256 = transmute(a_array);
        let c: __m128bh = _mm256_cvtneps_pbh(a);
        let result: [u16; 8] = transmute(c.as_u16x8());
        #[rustfmt::skip]
        let expected_result: [u16; 8] = [
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
    }
1252 | |
    // Merge-masked narrowing: full mask converts all lanes; zero mask keeps
    // the `src` lanes (which here have sign bit 1 to be distinguishable).
    #[simd_test(enable = "avx512bf16,avx512vl" )]
    unsafe fn test_mm256_mask_cvtneps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let src_array: [u16; 8] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
        ];
        let src: __m128bh = transmute(src_array);
        let a: __m256 = transmute(a_array);
        let k: __mmask8 = 0xff;
        let b = _mm256_mask_cvtneps_pbh(src, k, a);
        let result: [u16; 8] = transmute(b.as_u16x8());
        #[rustfmt::skip]
        let expected_result: [u16; 8] = [
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
        // All-zero mask: result must equal `src`.
        let k: __mmask8 = 0x0;
        let b: __m128bh = _mm256_mask_cvtneps_pbh(src, k, a);
        let result: [u16; 8] = transmute(b.as_u16x8());
        let expected_result: [u16; 8] = src_array;
        assert_eq!(result, expected_result);
    }
1299 | |
    // Zero-masked narrowing: full mask converts everything; k = 0x6 keeps
    // only lanes 1 and 2, zeroing the rest.
    #[simd_test(enable = "avx512bf16,avx512vl" )]
    unsafe fn test_mm256_maskz_cvtneps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let a: __m256 = transmute(a_array);
        let k: __mmask8 = 0xff;
        let b = _mm256_maskz_cvtneps_pbh(k, a);
        let result: [u16; 8] = transmute(b.as_u16x8());
        #[rustfmt::skip]
        let expected_result: [u16; 8] = [
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
        let k: __mmask8 = 0x6;
        let b: __m128bh = _mm256_maskz_cvtneps_pbh(k, a);
        let result: [u16; 8] = transmute(b.as_u16x8());
        let expected_result: [u16; 8] =
            [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0];
        assert_eq!(result, expected_result);
    }
1336 | |
    // 512-bit single-vector narrowing: 16 f32s -> 16 BF16s (256-bit result).
    #[simd_test(enable = "avx512bf16,avx512f" )]
    unsafe fn test_mm512_cvtneps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let a: __m512 = transmute(a_array);
        let c: __m256bh = _mm512_cvtneps_pbh(a);
        let result: [u16; 16] = transmute(c.as_u16x16());
        #[rustfmt::skip]
        let expected_result: [u16; 16] = [
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
    }
1382 | |
    // 512-bit merge-masked narrowing: full mask converts all lanes; zero mask
    // returns `src` untouched.
    #[simd_test(enable = "avx512bf16,avx512f" )]
    unsafe fn test_mm512_mask_cvtneps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let src_array: [u16; 16] = [
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
            0b1_10000110_0110010,
            0b1_10000010_0101000,
            0b1_10000000_1110000,
            0b1_10000100_1001001,
            0b1_10000011_0000100,
            0b1_10000110_1111111,
            0b1_10001000_1111010,
            0b1_10001000_0010000,
        ];
        let src: __m256bh = transmute(src_array);
        let a: __m512 = transmute(a_array);
        let k: __mmask16 = 0xffff;
        let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
        let result: [u16; 16] = transmute(c.as_u16x16());
        #[rustfmt::skip]
        let expected_result: [u16; 16] = [
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
        // All-zero mask: result must equal `src`.
        let k: __mmask16 = 0;
        let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
        let result: [u16; 16] = transmute(c.as_u16x16());
        let expected_result = src_array;
        assert_eq!(result, expected_result);
    }
1453 | |
    // 512-bit zero-masked narrowing: full mask converts all lanes; mask
    // 0x653a zeroes exactly the lanes whose bits are clear.
    #[simd_test(enable = "avx512bf16,avx512f" )]
    unsafe fn test_mm512_maskz_cvtneps_pbh() {
        #[rustfmt::skip]
        let a_array = [
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
            178.125_f32,
            10.5_f32,
            3.75_f32,
            50.25_f32,
            16.5_f32,
            255.11_f32,
            1000.158_f32,
            575.575_f32,
        ];
        let a: __m512 = transmute(a_array);
        let k: __mmask16 = 0xffff;
        let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
        let result: [u16; 16] = transmute(c.as_u16x16());
        #[rustfmt::skip]
        let expected_result: [u16; 16] = [
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
            0b0_10000110_0110010,
            0b0_10000010_0101000,
            0b0_10000000_1110000,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0b0_10001000_0010000,
        ];
        assert_eq!(result, expected_result);
        let k: __mmask16 = 0x653a;
        let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
        let result: [u16; 16] = transmute(c.as_u16x16());
        #[rustfmt::skip]
        let expected_result: [u16; 16] = [
            0,
            0b0_10000010_0101000,
            0,
            0b0_10000100_1001001,
            0b0_10000011_0000100,
            0b0_10000110_1111111,
            0,
            0,
            0b0_10000110_0110010,
            0,
            0b0_10000000_1110000,
            0,
            0,
            0b0_10000110_1111111,
            0b0_10001000_1111010,
            0,
        ];
        assert_eq!(result, expected_result);
    }
1523 | |
    // BF16 dot product: each f32 lane accumulates src + a0*b0 + a1*b1 over a
    // pair of BF16 elements (e.g. lane 0: 1 + (-8.5) + (-10.5) = -18).
    #[simd_test(enable = "avx512bf16,avx512vl" )]
    unsafe fn test_mm_dpbf16_ps() {
        let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
        let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
        let a1: __m128 = transmute(a_array);
        let b1: __m128 = transmute(b_array);
        let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
        let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
        let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
        let c: __m128 = _mm_dpbf16_ps(src, a, b);
        let result: [f32; 4] = transmute(c.as_f32x4());
        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
        assert_eq!(result, expected_result);
    }
1538 | |
    // Merge-masked dot product: masked-off lanes keep the `src` values; the
    // three sub-cases cover a partial mask, a full mask, and a zero mask.
    #[simd_test(enable = "avx512bf16,avx512vl" )]
    unsafe fn test_mm_mask_dpbf16_ps() {
        let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
        let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
        let a1: __m128 = transmute(a_array);
        let b1: __m128 = transmute(b_array);
        let k: __mmask8 = 0xf3;
        let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
        let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
        let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
        let result: [f32; 4] = transmute(c.as_f32x4());
        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32];
        assert_eq!(result, expected_result);
        let k: __mmask8 = 0xff;
        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
        let result: [f32; 4] = transmute(c.as_f32x4());
        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
        assert_eq!(result, expected_result);
        let k: __mmask8 = 0;
        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
        let result: [f32; 4] = transmute(c.as_f32x4());
        let expected_result: [f32; 4] = [1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32];
        assert_eq!(result, expected_result);
    }
1564 | |
    // Zero-masked dot product: masked-off lanes become 0.0; the three
    // sub-cases cover a partial mask, a full mask, and a zero mask.
    #[simd_test(enable = "avx512bf16,avx512vl" )]
    unsafe fn test_mm_maskz_dpbf16_ps() {
        let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
        let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
        let a1: __m128 = transmute(a_array);
        let b1: __m128 = transmute(b_array);
        let k: __mmask8 = 0xf3;
        let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
        let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
        let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
        let result: [f32; 4] = transmute(c.as_f32x4());
        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 0.0, 0.0];
        assert_eq!(result, expected_result);
        let k: __mmask8 = 0xff;
        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
        let result: [f32; 4] = transmute(c.as_f32x4());
        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
        assert_eq!(result, expected_result);
        let k: __mmask8 = 0;
        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
        let result: [f32; 4] = transmute(c.as_f32x4());
        let expected_result: [f32; 4] = [0.0, 0.0, 0.0, 0.0];
        assert_eq!(result, expected_result);
    }
1590 | |
// _mm256_dpbf16_ps: unmasked 256-bit bf16 dot-product accumulation. Each of
// the 8 f32 output lanes is src[i] + a[2i]*b[2i] + a[2i+1]*b[2i+1]. Inputs
// are the 128-bit pattern repeated twice, so the expected result is the
// 128-bit expectation [-18, -52, -16, -50] repeated twice.
#[simd_test(enable = "avx512bf16,avx512vl" )]
unsafe fn test_mm256_dpbf16_ps() {
#[rustfmt::skip]
let a_array = [
8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
];
let b_array = [
-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
];
let a1: __m256 = transmute(a_array);
let b1: __m256 = transmute(b_array);
#[rustfmt::skip]
let src: __m256 = transmute([
1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
]);
// Both bf16 operands pack the same source twice (cvtne2 concatenates its args).
let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
let c: __m256 = _mm256_dpbf16_ps(src, a, b);
let result: [f32; 8] = transmute(c.as_f32x8());
#[rustfmt::skip]
let expected_result: [f32; 8] = [
-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
];
assert_eq!(result, expected_result);
}
1616 | |
// _mm256_mask_dpbf16_ps: merge-masked 8-lane bf16 dot-product accumulation.
// Mask-set lanes get src[i] + a[2i]*b[2i] + a[2i+1]*b[2i+1]; clear lanes
// keep src[i]. Checks a partial mask, the full mask, and the zero mask.
#[simd_test(enable = "avx512bf16,avx512vl" )]
unsafe fn test_mm256_mask_dpbf16_ps() {
#[rustfmt::skip]
let a_array = [
8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
];
let b_array = [
-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
];
let a1: __m256 = transmute(a_array);
let b1: __m256 = transmute(b_array);
// 0x33 = 0b0011_0011 -> lanes 0, 1, 4, 5 active.
let k: __mmask8 = 0x33;
#[rustfmt::skip]
let src: __m256 = transmute([
1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
]);
let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
let result: [f32; 8] = transmute(c.as_f32x8());
// Inactive lanes (2, 3, 6, 7) pass src through.
#[rustfmt::skip]
let expected_result: [f32; 8] = [
-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
];
assert_eq!(result, expected_result);
// Full mask: every lane accumulates.
let k: __mmask8 = 0xff;
let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
let result: [f32; 8] = transmute(c.as_f32x8());
#[rustfmt::skip]
let expected_result: [f32; 8] = [
-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
];
assert_eq!(result, expected_result);
// Zero mask: src is returned unchanged.
let k: __mmask8 = 0;
let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
let result: [f32; 8] = transmute(c.as_f32x8());
#[rustfmt::skip]
let expected_result: [f32; 8] = [
1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
];
assert_eq!(result, expected_result);
}
1659 | |
// _mm256_maskz_dpbf16_ps: zero-masked 8-lane bf16 dot-product accumulation.
// Mask-set lanes accumulate; clear lanes are zeroed (src is NOT kept).
#[simd_test(enable = "avx512bf16,avx512vl" )]
unsafe fn test_mm256_maskz_dpbf16_ps() {
#[rustfmt::skip]
let a_array = [
8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
];
let b_array = [
-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
];
let a1: __m256 = transmute(a_array);
let b1: __m256 = transmute(b_array);
// 0x33 = 0b0011_0011 -> lanes 0, 1, 4, 5 active.
let k: __mmask8 = 0x33;
#[rustfmt::skip]
let src: __m256 = transmute([
1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
]);
let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
let result: [f32; 8] = transmute(c.as_f32x8());
// Inactive lanes become 0.0.
#[rustfmt::skip]
let expected_result: [f32; 8] = [
-18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
];
assert_eq!(result, expected_result);
// Full mask: every lane accumulates.
let k: __mmask8 = 0xff;
let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
let result: [f32; 8] = transmute(c.as_f32x8());
#[rustfmt::skip]
let expected_result: [f32; 8] = [
-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
];
assert_eq!(result, expected_result);
// Zero mask: the entire result is zeroed.
let k: __mmask8 = 0;
let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
let result: [f32; 8] = transmute(c.as_f32x8());
let expected_result: [f32; 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
assert_eq!(result, expected_result);
}
1699 | |
// _mm512_dpbf16_ps: unmasked 16-lane bf16 dot-product accumulation. Each
// f32 lane is src[i] + a[2i]*b[2i] + a[2i+1]*b[2i+1]; inputs repeat the
// 128-bit pattern four times, so the expectation is [-18, -52, -16, -50] x4.
#[simd_test(enable = "avx512bf16,avx512f" )]
unsafe fn test_mm512_dpbf16_ps() {
#[rustfmt::skip]
let a_array = [
8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
];
let b_array = [
-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
];
let a1: __m512 = transmute(a_array);
let b1: __m512 = transmute(b_array);
let src: __m512 = transmute([
1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
]);
// Both bf16 operands pack the same source twice (cvtne2 concatenates its args).
let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
let c: __m512 = _mm512_dpbf16_ps(src, a, b);
let result: [f32; 16] = transmute(c.as_f32x16());
#[rustfmt::skip]
let expected_result: [f32; 16] = [
-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
];
assert_eq!(result, expected_result);
}
1728 | |
// _mm512_mask_dpbf16_ps: merge-masked 16-lane bf16 dot-product accumulation.
// Mask-set lanes get src[i] + a[2i]*b[2i] + a[2i+1]*b[2i+1]; clear lanes
// keep src[i]. Checks a partial mask, the full mask, and the zero mask.
#[simd_test(enable = "avx512bf16,avx512f" )]
unsafe fn test_mm512_mask_dpbf16_ps() {
#[rustfmt::skip]
let a_array = [
8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
];
let b_array = [
-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
];
let a1: __m512 = transmute(a_array);
let b1: __m512 = transmute(b_array);
// 0x3333 -> lanes 0, 1, 4, 5, 8, 9, 12, 13 active.
let k: __mmask16 = 0x3333;
#[rustfmt::skip]
let src: __m512 = transmute([
1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
]);
let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
let result: [f32; 16] = transmute(c.as_f32x16());
// Inactive lanes pass src (3.0, 4.0) through.
#[rustfmt::skip]
let expected_result: [f32; 16] = [
-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
];
assert_eq!(result, expected_result);
// Full mask: every lane accumulates.
let k: __mmask16 = 0xffff;
let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
let result: [f32; 16] = transmute(c.as_f32x16());
#[rustfmt::skip]
let expected_result: [f32; 16] = [
-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
];
assert_eq!(result, expected_result);
// Zero mask: src is returned unchanged.
let k: __mmask16 = 0;
let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
let result: [f32; 16] = transmute(c.as_f32x16());
#[rustfmt::skip]
let expected_result: [f32; 16] = [
1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
];
assert_eq!(result, expected_result);
}
1777 | |
// _mm512_maskz_dpbf16_ps: zero-masked 16-lane bf16 dot-product accumulation.
// Mask-set lanes accumulate; clear lanes are zeroed (src is NOT kept).
#[simd_test(enable = "avx512bf16,avx512f" )]
unsafe fn test_mm512_maskz_dpbf16_ps() {
#[rustfmt::skip]
let a_array = [
8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
];
let b_array = [
-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
];
let a1: __m512 = transmute(a_array);
let b1: __m512 = transmute(b_array);
// 0x3333 -> lanes 0, 1, 4, 5, 8, 9, 12, 13 active.
let k: __mmask16 = 0x3333;
#[rustfmt::skip]
let src: __m512 = transmute([
1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
]);
let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
let result: [f32; 16] = transmute(c.as_f32x16());
// Inactive lanes become 0.0.
#[rustfmt::skip]
let expected_result: [f32; 16] = [
-18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32,
0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
];
assert_eq!(result, expected_result);
// Full mask: every lane accumulates.
let k: __mmask16 = 0xffff;
let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
let result: [f32; 16] = transmute(c.as_f32x16());
#[rustfmt::skip]
let expected_result: [f32; 16] = [
-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
];
assert_eq!(result, expected_result);
// Zero mask: the entire result is zeroed.
let k: __mmask16 = 0;
let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
let result: [f32; 16] = transmute(c.as_f32x16());
#[rustfmt::skip]
let expected_result: [f32; 16] = [
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
];
assert_eq!(result, expected_result);
}
1825 | |
// bf16 bit patterns, written sign _ 8-bit exponent _ 7-bit mantissa
// (exponent bias 127). The values 1.0..=8.0 are all exactly representable.
const BF16_ONE: u16 = 0b0_01111111_0000000; // 1.0  = 1.00 * 2^0
const BF16_TWO: u16 = 0b0_10000000_0000000; // 2.0  = 1.00 * 2^1
const BF16_THREE: u16 = 0b0_10000000_1000000; // 3.0 = 1.50 * 2^1
const BF16_FOUR: u16 = 0b0_10000001_0000000; // 4.0 = 1.00 * 2^2
const BF16_FIVE: u16 = 0b0_10000001_0100000; // 5.0 = 1.25 * 2^2
const BF16_SIX: u16 = 0b0_10000001_1000000; // 6.0 = 1.50 * 2^2
const BF16_SEVEN: u16 = 0b0_10000001_1100000; // 7.0 = 1.75 * 2^2
const BF16_EIGHT: u16 = 0b0_10000010_0000000; // 8.0 = 1.00 * 2^3
1834 | |
1835 | #[simd_test(enable = "avx512bf16" )] |
1836 | unsafe fn test_mm512_cvtpbh_ps() { |
1837 | let a = __m256bh([ |
1838 | BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, |
1839 | BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, |
1840 | ]); |
1841 | let r = _mm512_cvtpbh_ps(a); |
1842 | let e = _mm512_setr_ps( |
1843 | 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, |
1844 | ); |
1845 | assert_eq_m512(r, e); |
1846 | } |
1847 | |
1848 | #[simd_test(enable = "avx512bf16" )] |
1849 | unsafe fn test_mm512_mask_cvtpbh_ps() { |
1850 | let a = __m256bh([ |
1851 | BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, |
1852 | BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, |
1853 | ]); |
1854 | let src = _mm512_setr_ps( |
1855 | 9., 10., 11., 12., 13., 14., 15., 16., 9., 10., 11., 12., 13., 14., 15., 16., |
1856 | ); |
1857 | let k = 0b1010_1010_1010_1010; |
1858 | let r = _mm512_mask_cvtpbh_ps(src, k, a); |
1859 | let e = _mm512_setr_ps( |
1860 | 9., 2., 11., 4., 13., 6., 15., 8., 9., 2., 11., 4., 13., 6., 15., 8., |
1861 | ); |
1862 | assert_eq_m512(r, e); |
1863 | } |
1864 | |
1865 | #[simd_test(enable = "avx512bf16" )] |
1866 | unsafe fn test_mm512_maskz_cvtpbh_ps() { |
1867 | let a = __m256bh([ |
1868 | BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, |
1869 | BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, |
1870 | ]); |
1871 | let k = 0b1010_1010_1010_1010; |
1872 | let r = _mm512_maskz_cvtpbh_ps(k, a); |
1873 | let e = _mm512_setr_ps( |
1874 | 0., 2., 0., 4., 0., 6., 0., 8., 0., 2., 0., 4., 0., 6., 0., 8., |
1875 | ); |
1876 | assert_eq_m512(r, e); |
1877 | } |
1878 | |
1879 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1880 | unsafe fn test_mm256_cvtpbh_ps() { |
1881 | let a = __m128bh([ |
1882 | BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, |
1883 | ]); |
1884 | let r = _mm256_cvtpbh_ps(a); |
1885 | let e = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); |
1886 | assert_eq_m256(r, e); |
1887 | } |
1888 | |
1889 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1890 | unsafe fn test_mm256_mask_cvtpbh_ps() { |
1891 | let a = __m128bh([ |
1892 | BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, |
1893 | ]); |
1894 | let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); |
1895 | let k = 0b1010_1010; |
1896 | let r = _mm256_mask_cvtpbh_ps(src, k, a); |
1897 | let e = _mm256_setr_ps(9., 2., 11., 4., 13., 6., 15., 8.); |
1898 | assert_eq_m256(r, e); |
1899 | } |
1900 | |
1901 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1902 | unsafe fn test_mm256_maskz_cvtpbh_ps() { |
1903 | let a = __m128bh([ |
1904 | BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, |
1905 | ]); |
1906 | let k = 0b1010_1010; |
1907 | let r = _mm256_maskz_cvtpbh_ps(k, a); |
1908 | let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.); |
1909 | assert_eq_m256(r, e); |
1910 | } |
1911 | |
1912 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1913 | unsafe fn test_mm_cvtpbh_ps() { |
1914 | let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]); |
1915 | let r = _mm_cvtpbh_ps(a); |
1916 | let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
1917 | assert_eq_m128(r, e); |
1918 | } |
1919 | |
1920 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1921 | unsafe fn test_mm_mask_cvtpbh_ps() { |
1922 | let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]); |
1923 | let src = _mm_setr_ps(9., 10., 11., 12.); |
1924 | let k = 0b1010; |
1925 | let r = _mm_mask_cvtpbh_ps(src, k, a); |
1926 | let e = _mm_setr_ps(9., 2., 11., 4.); |
1927 | assert_eq_m128(r, e); |
1928 | } |
1929 | |
1930 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1931 | unsafe fn test_mm_maskz_cvtpbh_ps() { |
1932 | let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]); |
1933 | let k = 0b1010; |
1934 | let r = _mm_maskz_cvtpbh_ps(k, a); |
1935 | let e = _mm_setr_ps(0., 2., 0., 4.); |
1936 | assert_eq_m128(r, e); |
1937 | } |
1938 | |
1939 | #[simd_test(enable = "avx512bf16" )] |
1940 | unsafe fn test_mm_cvtsbh_ss() { |
1941 | let r = _mm_cvtsbh_ss(bf16::from_bits(BF16_ONE)); |
1942 | assert_eq!(r, 1.); |
1943 | } |
1944 | |
1945 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1946 | unsafe fn test_mm_cvtneps_pbh() { |
1947 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
1948 | let r: u16x4 = transmute_copy(&_mm_cvtneps_pbh(a)); |
1949 | let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR); |
1950 | assert_eq!(r, e); |
1951 | } |
1952 | |
1953 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1954 | unsafe fn test_mm_mask_cvtneps_pbh() { |
1955 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
1956 | let src = __m128bh([5, 6, 7, 8, !0, !0, !0, !0]); |
1957 | let k = 0b1010; |
1958 | let r: u16x4 = transmute_copy(&_mm_mask_cvtneps_pbh(src, k, a)); |
1959 | let e = u16x4::new(5, BF16_TWO, 7, BF16_FOUR); |
1960 | assert_eq!(r, e); |
1961 | } |
1962 | |
1963 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1964 | unsafe fn test_mm_maskz_cvtneps_pbh() { |
1965 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); |
1966 | let k = 0b1010; |
1967 | let r: u16x4 = transmute_copy(&_mm_maskz_cvtneps_pbh(k, a)); |
1968 | let e = u16x4::new(0, BF16_TWO, 0, BF16_FOUR); |
1969 | assert_eq!(r, e); |
1970 | } |
1971 | |
1972 | #[simd_test(enable = "avx512bf16,avx512vl" )] |
1973 | unsafe fn test_mm_cvtness_sbh() { |
1974 | let r = _mm_cvtness_sbh(1.); |
1975 | assert_eq!(r.to_bits(), BF16_ONE); |
1976 | } |
1977 | } |
1978 | |