1use crate::arch::asm;
2use crate::core_arch::x86::*;
3
4#[cfg(test)]
5use stdarch_test::assert_instr;
6
7/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location
8/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit)
9/// floating-point elements, and store the results in dst.
10///
11/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps)
12#[inline]
13#[target_feature(enable = "avxneconvert")]
14#[cfg_attr(
15 all(test, any(target_os = "linux", target_env = "msvc")),
16 assert_instr(vbcstnebf162ps)
17)]
18#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
19pub unsafe fn _mm_bcstnebf16_ps(a: *const bf16) -> __m128 {
20 bcstnebf162ps_128(a)
21}
22
23/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location
24/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point
25/// elements, and store the results in dst.
26///
27/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps)
28#[inline]
29#[target_feature(enable = "avxneconvert")]
30#[cfg_attr(
31 all(test, any(target_os = "linux", target_env = "msvc")),
32 assert_instr(vbcstnebf162ps)
33)]
34#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
35pub unsafe fn _mm256_bcstnebf16_ps(a: *const bf16) -> __m256 {
36 bcstnebf162ps_256(a)
37}
38
39/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting
40/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision
41/// (32-bit) floating-point elements, and store the results in dst.
42///
43/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps)
44#[inline]
45#[target_feature(enable = "avxneconvert")]
46#[cfg_attr(
47 all(test, any(target_os = "linux", target_env = "msvc")),
48 assert_instr(vbcstnesh2ps)
49)]
50#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
51pub unsafe fn _mm_bcstnesh_ps(a: *const f16) -> __m128 {
52 bcstnesh2ps_128(a)
53}
54
55/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting
56/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision
57/// (32-bit) floating-point elements, and store the results in dst.
58///
59/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps)
60#[inline]
61#[target_feature(enable = "avxneconvert")]
62#[cfg_attr(
63 all(test, any(target_os = "linux", target_env = "msvc")),
64 assert_instr(vbcstnesh2ps)
65)]
66#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
67pub unsafe fn _mm256_bcstnesh_ps(a: *const f16) -> __m256 {
68 bcstnesh2ps_256(a)
69}
70
71/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
72/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
73///
74/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps)
75#[inline]
76#[target_feature(enable = "avxneconvert")]
77#[cfg_attr(
78 all(test, any(target_os = "linux", target_env = "msvc")),
79 assert_instr(vcvtneebf162ps)
80)]
81#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
82pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 {
83 transmute(src:cvtneebf162ps_128(a))
84}
85
86/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
87/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
88///
89/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps)
90#[inline]
91#[target_feature(enable = "avxneconvert")]
92#[cfg_attr(
93 all(test, any(target_os = "linux", target_env = "msvc")),
94 assert_instr(vcvtneebf162ps)
95)]
96#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
97pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 {
98 transmute(src:cvtneebf162ps_256(a))
99}
100
101/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at
102/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
103///
104/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps)
105#[inline]
106#[target_feature(enable = "avxneconvert")]
107#[cfg_attr(
108 all(test, any(target_os = "linux", target_env = "msvc")),
109 assert_instr(vcvtneeph2ps)
110)]
111#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
112pub unsafe fn _mm_cvtneeph_ps(a: *const __m128h) -> __m128 {
113 transmute(src:cvtneeph2ps_128(a))
114}
115
116/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at
117/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
118///
119/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps)
120#[inline]
121#[target_feature(enable = "avxneconvert")]
122#[cfg_attr(
123 all(test, any(target_os = "linux", target_env = "msvc")),
124 assert_instr(vcvtneeph2ps)
125)]
126#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
127pub unsafe fn _mm256_cvtneeph_ps(a: *const __m256h) -> __m256 {
128 transmute(src:cvtneeph2ps_256(a))
129}
130
131/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
132/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
133///
134/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps)
135#[inline]
136#[target_feature(enable = "avxneconvert")]
137#[cfg_attr(
138 all(test, any(target_os = "linux", target_env = "msvc")),
139 assert_instr(vcvtneobf162ps)
140)]
141#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
142pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 {
143 transmute(src:cvtneobf162ps_128(a))
144}
145
146/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
147/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
148///
149/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps)
150#[inline]
151#[target_feature(enable = "avxneconvert")]
152#[cfg_attr(
153 all(test, any(target_os = "linux", target_env = "msvc")),
154 assert_instr(vcvtneobf162ps)
155)]
156#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
157pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 {
158 transmute(src:cvtneobf162ps_256(a))
159}
160
161/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at
162/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
163///
164/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps)
165#[inline]
166#[target_feature(enable = "avxneconvert")]
167#[cfg_attr(
168 all(test, any(target_os = "linux", target_env = "msvc")),
169 assert_instr(vcvtneoph2ps)
170)]
171#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
172pub unsafe fn _mm_cvtneoph_ps(a: *const __m128h) -> __m128 {
173 transmute(src:cvtneoph2ps_128(a))
174}
175
176/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at
177/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
178///
179/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps)
180#[inline]
181#[target_feature(enable = "avxneconvert")]
182#[cfg_attr(
183 all(test, any(target_os = "linux", target_env = "msvc")),
184 assert_instr(vcvtneoph2ps)
185)]
186#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
187pub unsafe fn _mm256_cvtneoph_ps(a: *const __m256h) -> __m256 {
188 transmute(src:cvtneoph2ps_256(a))
189}
190
191/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point
192/// elements, and store the results in dst.
193///
194/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh)
195#[inline]
196#[target_feature(enable = "avxneconvert")]
197#[cfg_attr(
198 all(test, any(target_os = "linux", target_env = "msvc")),
199 assert_instr(vcvtneps2bf16)
200)]
201#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
202pub fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh {
203 unsafe {
204 let mut dst: __m128bh;
205 asm!(
206 "{{vex}}vcvtneps2bf16 {dst},{src}",
207 dst = lateout(xmm_reg) dst,
208 src = in(xmm_reg) a,
209 options(pure, nomem, nostack, preserves_flags)
210 );
211 dst
212 }
213}
214
215/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point
216/// elements, and store the results in dst.
217///
218/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh)
219#[inline]
220#[target_feature(enable = "avxneconvert")]
221#[cfg_attr(
222 all(test, any(target_os = "linux", target_env = "msvc")),
223 assert_instr(vcvtneps2bf16)
224)]
225#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
226pub fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh {
227 unsafe {
228 let mut dst: __m128bh;
229 asm!(
230 "{{vex}}vcvtneps2bf16 {dst},{src}",
231 dst = lateout(xmm_reg) dst,
232 src = in(ymm_reg) a,
233 options(pure, nomem, nostack, preserves_flags)
234 );
235 dst
236 }
237}
238
239#[allow(improper_ctypes)]
240unsafe extern "C" {
241 #[link_name = "llvm.x86.vbcstnebf162ps128"]
242 unsafefn bcstnebf162ps_128(a: *const bf16) -> __m128;
243 #[link_name = "llvm.x86.vbcstnebf162ps256"]
244 unsafefn bcstnebf162ps_256(a: *const bf16) -> __m256;
245 #[link_name = "llvm.x86.vbcstnesh2ps128"]
246 unsafefn bcstnesh2ps_128(a: *const f16) -> __m128;
247 #[link_name = "llvm.x86.vbcstnesh2ps256"]
248 unsafefn bcstnesh2ps_256(a: *const f16) -> __m256;
249
250 #[link_name = "llvm.x86.vcvtneebf162ps128"]
251 unsafefn cvtneebf162ps_128(a: *const __m128bh) -> __m128;
252 #[link_name = "llvm.x86.vcvtneebf162ps256"]
253 unsafefn cvtneebf162ps_256(a: *const __m256bh) -> __m256;
254 #[link_name = "llvm.x86.vcvtneeph2ps128"]
255 unsafefn cvtneeph2ps_128(a: *const __m128h) -> __m128;
256 #[link_name = "llvm.x86.vcvtneeph2ps256"]
257 unsafefn cvtneeph2ps_256(a: *const __m256h) -> __m256;
258
259 #[link_name = "llvm.x86.vcvtneobf162ps128"]
260 unsafefn cvtneobf162ps_128(a: *const __m128bh) -> __m128;
261 #[link_name = "llvm.x86.vcvtneobf162ps256"]
262 unsafefn cvtneobf162ps_256(a: *const __m256bh) -> __m256;
263 #[link_name = "llvm.x86.vcvtneoph2ps128"]
264 unsafefn cvtneoph2ps_128(a: *const __m128h) -> __m128;
265 #[link_name = "llvm.x86.vcvtneoph2ps256"]
266 unsafefn cvtneoph2ps_256(a: *const __m256h) -> __m256;
267}
268
#[cfg(test)]
mod tests {
    use crate::core_arch::simd::{u16x4, u16x8};
    use crate::core_arch::x86::*;
    use crate::mem::transmute_copy;
    use std::ptr::addr_of;
    use stdarch_test::simd_test;

    // BF16 bit patterns, written as sign_exponent_mantissa (1 + 8 + 7 bits), for 1.0 to 8.0.
    const BF16_ONE: u16 = 0b0_01111111_0000000;
    const BF16_TWO: u16 = 0b0_10000000_0000000;
    const BF16_THREE: u16 = 0b0_10000000_1000000;
    const BF16_FOUR: u16 = 0b0_10000001_0000000;
    const BF16_FIVE: u16 = 0b0_10000001_0100000;
    const BF16_SIX: u16 = 0b0_10000001_1000000;
    const BF16_SEVEN: u16 = 0b0_10000001_1100000;
    const BF16_EIGHT: u16 = 0b0_10000010_0000000;

    // The values 1.0..=8.0 in order: input for the 128-bit even/odd conversion tests.
    const BF16_RAMP: [u16; 8] = [
        BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
    ];

    // The same ramp repeated twice: input for the 256-bit even/odd conversion tests.
    const BF16_RAMP_TWICE: [u16; 16] = [
        BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
    ];

    // --- scalar load + broadcast ---

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnebf16_ps() {
        let scalar = bf16::from_bits(BF16_ONE);
        let got = _mm_bcstnebf16_ps(addr_of!(scalar));
        assert_eq_m128(got, _mm_set1_ps(1.));
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnebf16_ps() {
        let scalar = bf16::from_bits(BF16_ONE);
        let got = _mm256_bcstnebf16_ps(addr_of!(scalar));
        assert_eq_m256(got, _mm256_set1_ps(1.));
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnesh_ps() {
        let scalar = 1.0_f16;
        let got = _mm_bcstnesh_ps(addr_of!(scalar));
        assert_eq_m128(got, _mm_set1_ps(1.));
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnesh_ps() {
        let scalar = 1.0_f16;
        let got = _mm256_bcstnesh_ps(addr_of!(scalar));
        assert_eq_m256(got, _mm256_set1_ps(1.));
    }

    // --- even-indexed elements ---

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneebf16_ps() {
        let input = __m128bh(BF16_RAMP);
        let got = _mm_cvtneebf16_ps(addr_of!(input));
        let want = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneebf16_ps() {
        let input = __m256bh(BF16_RAMP_TWICE);
        let got = _mm256_cvtneebf16_ps(addr_of!(input));
        let want = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneeph_ps() {
        let input = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let got = _mm_cvtneeph_ps(addr_of!(input));
        let want = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneeph_ps() {
        let input = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let got = _mm256_cvtneeph_ps(addr_of!(input));
        let want = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
        assert_eq_m256(got, want);
    }

    // --- odd-indexed elements ---

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneobf16_ps() {
        let input = __m128bh(BF16_RAMP);
        let got = _mm_cvtneobf16_ps(addr_of!(input));
        let want = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneobf16_ps() {
        let input = __m256bh(BF16_RAMP_TWICE);
        let got = _mm256_cvtneobf16_ps(addr_of!(input));
        let want = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneoph_ps() {
        let input = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let got = _mm_cvtneoph_ps(addr_of!(input));
        let want = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneoph_ps() {
        let input = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let got = _mm256_cvtneoph_ps(addr_of!(input));
        let want = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
        assert_eq_m256(got, want);
    }

    // --- f32 -> BF16 narrowing ---

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneps_avx_pbh() {
        let input = _mm_setr_ps(1., 2., 3., 4.);
        let narrowed = _mm_cvtneps_avx_pbh(input);
        // Only the first four 16-bit lanes are inspected, hence `transmute_copy` into `u16x4`.
        let got: u16x4 = transmute_copy(&narrowed);
        let want = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
        assert_eq!(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneps_avx_pbh() {
        let input = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let got: u16x8 = transmute(_mm256_cvtneps_avx_pbh(input));
        let want = u16x8::new(
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        );
        assert_eq!(got, want);
    }
}
414