use crate::arch::asm;
use crate::core_arch::x86::*;

#[cfg(test)]
use stdarch_test::assert_instr;

/// Convert scalar BF16 (16-bit) floating-point element stored at memory locations starting at location
/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vbcstnebf162ps))]
#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
pub unsafe fn _mm_bcstnebf16_ps(a: *const bf16) -> __m128 {
    bcstnebf162ps_128(a)
}

/// Convert scalar BF16 (16-bit) floating-point element stored at memory locations starting at location
/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point
/// elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vbcstnebf162ps))]
#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
pub unsafe fn _mm256_bcstnebf16_ps(a: *const bf16) -> __m256 {
    bcstnebf162ps_256(a)
}
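
// Usage sketch (illustrative only, not part of the module): broadcast one bf16
// value from memory into every f32 lane, assuming `avxneconvert` support has
// already been verified at runtime.
//
//     let one = bf16::from_bits(0x3F80); // 1.0 encoded as bf16
//     let v: __m128 = unsafe { _mm_bcstnebf16_ps(&one) }; // [1.0, 1.0, 1.0, 1.0]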

/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting
/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision
/// (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vbcstnesh2ps))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_bcstnesh_ps(a: *const f16) -> __m128 {
    bcstnesh2ps_128(a)
}

/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting
/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision
/// (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vbcstnesh2ps))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_bcstnesh_ps(a: *const f16) -> __m256 {
    bcstnesh2ps_256(a)
}
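
// Usage sketch (illustrative): the f16 variants read an IEEE half-precision
// value instead of a bf16 value, but broadcast it in the same way.
//
//     let x = 1.0_f16;
//     let v: __m256 = unsafe { _mm256_bcstnesh_ps(&x) }; // all eight lanes hold 1.0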

/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneebf162ps))]
#[stable(feature = "stdarch_x86_avx512", since = "1.89.0")]
pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 {
    transmute(cvtneebf162ps_128(a))
}

/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneebf162ps))]
#[stable(feature = "stdarch_x86_avx512", since = "1.89.0")]
pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 {
    transmute(cvtneebf162ps_256(a))
}
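
// Usage sketch (illustrative): only the even-indexed bf16 elements (0, 2, 4, 6)
// of the 128-bit source in memory are widened to f32.
//
//     // Bit patterns for the bf16 values 1.0 through 8.0.
//     let a = __m128bh([0x3F80, 0x4000, 0x4040, 0x4080, 0x40A0, 0x40C0, 0x40E0, 0x4100]);
//     let even: __m128 = unsafe { _mm_cvtneebf16_ps(&a) }; // [1.0, 3.0, 5.0, 7.0]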

/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneeph2ps))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtneeph_ps(a: *const __m128h) -> __m128 {
    transmute(cvtneeph2ps_128(a))
}

/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneeph2ps))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtneeph_ps(a: *const __m256h) -> __m256 {
    transmute(cvtneeph2ps_256(a))
}
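
// Usage sketch (illustrative): the f16 even-index variants behave the same way,
// converting elements 0, 2, 4, ... of an __m128h / __m256h read from memory.
//
//     let h = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
//     let even: __m128 = unsafe { _mm_cvtneeph_ps(&h) }; // [1.0, 3.0, 5.0, 7.0]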

/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneobf162ps))]
#[stable(feature = "stdarch_x86_avx512", since = "1.89.0")]
pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 {
    transmute(cvtneobf162ps_128(a))
}

/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneobf162ps))]
#[stable(feature = "stdarch_x86_avx512", since = "1.89.0")]
pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 {
    transmute(cvtneobf162ps_256(a))
}
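
// Usage sketch (illustrative): the odd-index variants pick elements 1, 3, 5, ...;
// paired with the even-index variants they split one bf16 vector into two f32
// vectors.
//
//     let a = __m128bh([0x3F80, 0x4000, 0x4040, 0x4080, 0x40A0, 0x40C0, 0x40E0, 0x4100]);
//     let odd: __m128 = unsafe { _mm_cvtneobf16_ps(&a) }; // [2.0, 4.0, 6.0, 8.0]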

/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneoph2ps))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtneoph_ps(a: *const __m128h) -> __m128 {
    transmute(cvtneoph2ps_128(a))
}

/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneoph2ps))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtneoph_ps(a: *const __m256h) -> __m256 {
    transmute(cvtneoph2ps_256(a))
}
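
// Usage sketch (illustrative): likewise for f16 input, the odd-index variants
// return elements 1, 3, 5, ... widened to f32.
//
//     let h = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
//     let odd: __m128 = unsafe { _mm_cvtneoph_ps(&h) }; // [2.0, 4.0, 6.0, 8.0]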

/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point
/// elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneps2bf16))]
#[stable(feature = "stdarch_x86_avx512", since = "1.89.0")]
pub fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh {
    unsafe {
        let mut dst: __m128bh;
        // The {vex} prefix forces the VEX encoding of vcvtneps2bf16 (the
        // AVX-NE-CONVERT form) rather than the EVEX-encoded AVX-512 form.
        asm!(
            "{{vex}}vcvtneps2bf16 {dst},{src}",
            dst = lateout(xmm_reg) dst,
            src = in(xmm_reg) a,
            options(pure, nomem, nostack, preserves_flags)
        );
        dst
    }
}

/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point
/// elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(test, assert_instr(vcvtneps2bf16))]
#[stable(feature = "stdarch_x86_avx512", since = "1.89.0")]
pub fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh {
    unsafe {
        let mut dst: __m128bh;
        asm!(
            "{{vex}}vcvtneps2bf16 {dst},{src}",
            dst = lateout(xmm_reg) dst,
            src = in(ymm_reg) a,
            options(pure, nomem, nostack, preserves_flags)
        );
        dst
    }
}
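
// Usage sketch (illustrative): narrowing f32 lanes to bf16. Note that the result
// of the 256-bit variant is still a 128-bit vector, since each element shrinks
// from 32 to 16 bits.
//
//     let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
//     let bh: __m128bh = unsafe { _mm256_cvtneps_avx_pbh(a) }; // eight bf16 values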

// LLVM intrinsics used to implement the public functions above.
#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.vbcstnebf162ps128"]
    unsafe fn bcstnebf162ps_128(a: *const bf16) -> __m128;
    #[link_name = "llvm.x86.vbcstnebf162ps256"]
    unsafe fn bcstnebf162ps_256(a: *const bf16) -> __m256;
    #[link_name = "llvm.x86.vbcstnesh2ps128"]
    unsafe fn bcstnesh2ps_128(a: *const f16) -> __m128;
    #[link_name = "llvm.x86.vbcstnesh2ps256"]
    unsafe fn bcstnesh2ps_256(a: *const f16) -> __m256;

    #[link_name = "llvm.x86.vcvtneebf162ps128"]
    unsafe fn cvtneebf162ps_128(a: *const __m128bh) -> __m128;
    #[link_name = "llvm.x86.vcvtneebf162ps256"]
    unsafe fn cvtneebf162ps_256(a: *const __m256bh) -> __m256;
    #[link_name = "llvm.x86.vcvtneeph2ps128"]
    unsafe fn cvtneeph2ps_128(a: *const __m128h) -> __m128;
    #[link_name = "llvm.x86.vcvtneeph2ps256"]
    unsafe fn cvtneeph2ps_256(a: *const __m256h) -> __m256;

    #[link_name = "llvm.x86.vcvtneobf162ps128"]
    unsafe fn cvtneobf162ps_128(a: *const __m128bh) -> __m128;
    #[link_name = "llvm.x86.vcvtneobf162ps256"]
    unsafe fn cvtneobf162ps_256(a: *const __m256bh) -> __m256;
    #[link_name = "llvm.x86.vcvtneoph2ps128"]
    unsafe fn cvtneoph2ps_128(a: *const __m128h) -> __m128;
    #[link_name = "llvm.x86.vcvtneoph2ps256"]
    unsafe fn cvtneoph2ps_256(a: *const __m256h) -> __m256;
}

#[cfg(test)]
mod tests {
    use crate::core_arch::simd::{u16x4, u16x8};
    use crate::core_arch::x86::*;
    use crate::mem::transmute_copy;
    use std::ptr::addr_of;
    use stdarch_test::simd_test;

    // bf16 bit patterns (1 sign bit, 8 exponent bits, 7 mantissa bits) for the
    // values 1.0 through 8.0, used as inputs and expected outputs below.
    const BF16_ONE: u16 = 0b0_01111111_0000000;
    const BF16_TWO: u16 = 0b0_10000000_0000000;
    const BF16_THREE: u16 = 0b0_10000000_1000000;
    const BF16_FOUR: u16 = 0b0_10000001_0000000;
    const BF16_FIVE: u16 = 0b0_10000001_0100000;
    const BF16_SIX: u16 = 0b0_10000001_1000000;
    const BF16_SEVEN: u16 = 0b0_10000001_1100000;
    const BF16_EIGHT: u16 = 0b0_10000010_0000000;

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnebf16_ps() {
        let a = bf16::from_bits(BF16_ONE);
        let r = _mm_bcstnebf16_ps(addr_of!(a));
        let e = _mm_set_ps(1., 1., 1., 1.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnebf16_ps() {
        let a = bf16::from_bits(BF16_ONE);
        let r = _mm256_bcstnebf16_ps(addr_of!(a));
        let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnesh_ps() {
        let a = 1.0_f16;
        let r = _mm_bcstnesh_ps(addr_of!(a));
        let e = _mm_set_ps(1., 1., 1., 1.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnesh_ps() {
        let a = 1.0_f16;
        let r = _mm256_bcstnesh_ps(addr_of!(a));
        let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneebf16_ps() {
        let a = __m128bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm_cvtneebf16_ps(addr_of!(a));
        let e = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneebf16_ps() {
        let a = __m256bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm256_cvtneebf16_ps(addr_of!(a));
        let e = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneeph_ps() {
        let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let r = _mm_cvtneeph_ps(addr_of!(a));
        let e = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneeph_ps() {
        let a = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let r = _mm256_cvtneeph_ps(addr_of!(a));
        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneobf16_ps() {
        let a = __m128bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm_cvtneobf16_ps(addr_of!(a));
        let e = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneobf16_ps() {
        let a = __m256bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm256_cvtneobf16_ps(addr_of!(a));
        let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneoph_ps() {
        let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let r = _mm_cvtneoph_ps(addr_of!(a));
        let e = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneoph_ps() {
        let a = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let r = _mm256_cvtneoph_ps(addr_of!(a));
        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneps_avx_pbh() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let r: u16x4 = transmute_copy(&_mm_cvtneps_avx_pbh(a));
        let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
        assert_eq!(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneps_avx_pbh() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r: u16x8 = transmute(_mm256_cvtneps_avx_pbh(a));
        let e = u16x8::new(
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        );
        assert_eq!(r, e);
    }
}