1 | use crate::arch::asm; |
2 | use crate::core_arch::x86::*; |
3 | |
4 | #[cfg (test)] |
5 | use stdarch_test::assert_instr; |
6 | |
7 | /// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location |
8 | /// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) |
9 | /// floating-point elements, and store the results in dst. |
10 | /// |
11 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps) |
12 | #[inline ] |
13 | #[target_feature (enable = "avxneconvert" )] |
14 | #[cfg_attr ( |
15 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
16 | assert_instr(vbcstnebf162ps) |
17 | )] |
18 | #[unstable (feature = "stdarch_x86_avx512_bf16" , issue = "127356" )] |
19 | pub unsafe fn _mm_bcstnebf16_ps(a: *const bf16) -> __m128 { |
20 | bcstnebf162ps_128(a) |
21 | } |
22 | |
23 | /// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location |
24 | /// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point |
25 | /// elements, and store the results in dst. |
26 | /// |
27 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps) |
28 | #[inline ] |
29 | #[target_feature (enable = "avxneconvert" )] |
30 | #[cfg_attr ( |
31 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
32 | assert_instr(vbcstnebf162ps) |
33 | )] |
34 | #[unstable (feature = "stdarch_x86_avx512_bf16" , issue = "127356" )] |
35 | pub unsafe fn _mm256_bcstnebf16_ps(a: *const bf16) -> __m256 { |
36 | bcstnebf162ps_256(a) |
37 | } |
38 | |
39 | /// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting |
40 | /// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision |
41 | /// (32-bit) floating-point elements, and store the results in dst. |
42 | /// |
43 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps) |
44 | #[inline ] |
45 | #[target_feature (enable = "avxneconvert" )] |
46 | #[cfg_attr ( |
47 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
48 | assert_instr(vbcstnesh2ps) |
49 | )] |
50 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
51 | pub unsafe fn _mm_bcstnesh_ps(a: *const f16) -> __m128 { |
52 | bcstnesh2ps_128(a) |
53 | } |
54 | |
55 | /// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting |
56 | /// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision |
57 | /// (32-bit) floating-point elements, and store the results in dst. |
58 | /// |
59 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps) |
60 | #[inline ] |
61 | #[target_feature (enable = "avxneconvert" )] |
62 | #[cfg_attr ( |
63 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
64 | assert_instr(vbcstnesh2ps) |
65 | )] |
66 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
67 | pub unsafe fn _mm256_bcstnesh_ps(a: *const f16) -> __m256 { |
68 | bcstnesh2ps_256(a) |
69 | } |
70 | |
71 | /// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at |
72 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
73 | /// |
74 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps) |
75 | #[inline ] |
76 | #[target_feature (enable = "avxneconvert" )] |
77 | #[cfg_attr ( |
78 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
79 | assert_instr(vcvtneebf162ps) |
80 | )] |
81 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
82 | pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 { |
83 | transmute(src:cvtneebf162ps_128(a)) |
84 | } |
85 | |
86 | /// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at |
87 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
88 | /// |
89 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps) |
90 | #[inline ] |
91 | #[target_feature (enable = "avxneconvert" )] |
92 | #[cfg_attr ( |
93 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
94 | assert_instr(vcvtneebf162ps) |
95 | )] |
96 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
97 | pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 { |
98 | transmute(src:cvtneebf162ps_256(a)) |
99 | } |
100 | |
101 | /// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at |
102 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
103 | /// |
104 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps) |
105 | #[inline ] |
106 | #[target_feature (enable = "avxneconvert" )] |
107 | #[cfg_attr ( |
108 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
109 | assert_instr(vcvtneeph2ps) |
110 | )] |
111 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
112 | pub unsafe fn _mm_cvtneeph_ps(a: *const __m128h) -> __m128 { |
113 | transmute(src:cvtneeph2ps_128(a)) |
114 | } |
115 | |
116 | /// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at |
117 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
118 | /// |
119 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps) |
120 | #[inline ] |
121 | #[target_feature (enable = "avxneconvert" )] |
122 | #[cfg_attr ( |
123 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
124 | assert_instr(vcvtneeph2ps) |
125 | )] |
126 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
127 | pub unsafe fn _mm256_cvtneeph_ps(a: *const __m256h) -> __m256 { |
128 | transmute(src:cvtneeph2ps_256(a)) |
129 | } |
130 | |
131 | /// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at |
132 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
133 | /// |
134 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps) |
135 | #[inline ] |
136 | #[target_feature (enable = "avxneconvert" )] |
137 | #[cfg_attr ( |
138 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
139 | assert_instr(vcvtneobf162ps) |
140 | )] |
141 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
142 | pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 { |
143 | transmute(src:cvtneobf162ps_128(a)) |
144 | } |
145 | |
146 | /// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at |
147 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
148 | /// |
149 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps) |
150 | #[inline ] |
151 | #[target_feature (enable = "avxneconvert" )] |
152 | #[cfg_attr ( |
153 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
154 | assert_instr(vcvtneobf162ps) |
155 | )] |
156 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
157 | pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 { |
158 | transmute(src:cvtneobf162ps_256(a)) |
159 | } |
160 | |
161 | /// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at |
162 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
163 | /// |
164 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps) |
165 | #[inline ] |
166 | #[target_feature (enable = "avxneconvert" )] |
167 | #[cfg_attr ( |
168 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
169 | assert_instr(vcvtneoph2ps) |
170 | )] |
171 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
172 | pub unsafe fn _mm_cvtneoph_ps(a: *const __m128h) -> __m128 { |
173 | transmute(src:cvtneoph2ps_128(a)) |
174 | } |
175 | |
176 | /// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at |
177 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
178 | /// |
179 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps) |
180 | #[inline ] |
181 | #[target_feature (enable = "avxneconvert" )] |
182 | #[cfg_attr ( |
183 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
184 | assert_instr(vcvtneoph2ps) |
185 | )] |
186 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
187 | pub unsafe fn _mm256_cvtneoph_ps(a: *const __m256h) -> __m256 { |
188 | transmute(src:cvtneoph2ps_256(a)) |
189 | } |
190 | |
191 | /// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point |
192 | /// elements, and store the results in dst. |
193 | /// |
194 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh) |
195 | #[inline ] |
196 | #[target_feature (enable = "avxneconvert" )] |
197 | #[cfg_attr ( |
198 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
199 | assert_instr(vcvtneps2bf16) |
200 | )] |
201 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
202 | pub fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh { |
203 | unsafe { |
204 | let mut dst: __m128bh; |
205 | asm!( |
206 | "{{vex}}vcvtneps2bf16 { dst},{ src}" , |
207 | dst = lateout(xmm_reg) dst, |
208 | src = in(xmm_reg) a, |
209 | options(pure, nomem, nostack, preserves_flags) |
210 | ); |
211 | dst |
212 | } |
213 | } |
214 | |
215 | /// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point |
216 | /// elements, and store the results in dst. |
217 | /// |
218 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh) |
219 | #[inline ] |
220 | #[target_feature (enable = "avxneconvert" )] |
221 | #[cfg_attr ( |
222 | all(test, any(target_os = "linux" , target_env = "msvc" )), |
223 | assert_instr(vcvtneps2bf16) |
224 | )] |
225 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
226 | pub fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh { |
227 | unsafe { |
228 | let mut dst: __m128bh; |
229 | asm!( |
230 | "{{vex}}vcvtneps2bf16 { dst},{ src}" , |
231 | dst = lateout(xmm_reg) dst, |
232 | src = in(ymm_reg) a, |
233 | options(pure, nomem, nostack, preserves_flags) |
234 | ); |
235 | dst |
236 | } |
237 | } |
238 | |
239 | #[allow (improper_ctypes)] |
240 | unsafe extern "C" { |
241 | #[link_name = "llvm.x86.vbcstnebf162ps128" ] |
242 | unsafefn bcstnebf162ps_128(a: *const bf16) -> __m128; |
243 | #[link_name = "llvm.x86.vbcstnebf162ps256" ] |
244 | unsafefn bcstnebf162ps_256(a: *const bf16) -> __m256; |
245 | #[link_name = "llvm.x86.vbcstnesh2ps128" ] |
246 | unsafefn bcstnesh2ps_128(a: *const f16) -> __m128; |
247 | #[link_name = "llvm.x86.vbcstnesh2ps256" ] |
248 | unsafefn bcstnesh2ps_256(a: *const f16) -> __m256; |
249 | |
250 | #[link_name = "llvm.x86.vcvtneebf162ps128" ] |
251 | unsafefn cvtneebf162ps_128(a: *const __m128bh) -> __m128; |
252 | #[link_name = "llvm.x86.vcvtneebf162ps256" ] |
253 | unsafefn cvtneebf162ps_256(a: *const __m256bh) -> __m256; |
254 | #[link_name = "llvm.x86.vcvtneeph2ps128" ] |
255 | unsafefn cvtneeph2ps_128(a: *const __m128h) -> __m128; |
256 | #[link_name = "llvm.x86.vcvtneeph2ps256" ] |
257 | unsafefn cvtneeph2ps_256(a: *const __m256h) -> __m256; |
258 | |
259 | #[link_name = "llvm.x86.vcvtneobf162ps128" ] |
260 | unsafefn cvtneobf162ps_128(a: *const __m128bh) -> __m128; |
261 | #[link_name = "llvm.x86.vcvtneobf162ps256" ] |
262 | unsafefn cvtneobf162ps_256(a: *const __m256bh) -> __m256; |
263 | #[link_name = "llvm.x86.vcvtneoph2ps128" ] |
264 | unsafefn cvtneoph2ps_128(a: *const __m128h) -> __m128; |
265 | #[link_name = "llvm.x86.vcvtneoph2ps256" ] |
266 | unsafefn cvtneoph2ps_256(a: *const __m256h) -> __m256; |
267 | } |
268 | |
#[cfg(test)]
mod tests {
    use crate::core_arch::simd::{u16x4, u16x8};
    use crate::core_arch::x86::*;
    use crate::mem::transmute_copy;
    use std::ptr::addr_of;
    use stdarch_test::simd_test;

    // BF16 bit patterns for 1.0 through 8.0, written as sign_exponent_mantissa.
    const BF16_ONE: u16 = 0b0_01111111_0000000;
    const BF16_TWO: u16 = 0b0_10000000_0000000;
    const BF16_THREE: u16 = 0b0_10000000_1000000;
    const BF16_FOUR: u16 = 0b0_10000001_0000000;
    const BF16_FIVE: u16 = 0b0_10000001_0100000;
    const BF16_SIX: u16 = 0b0_10000001_1000000;
    const BF16_SEVEN: u16 = 0b0_10000001_1100000;
    const BF16_EIGHT: u16 = 0b0_10000010_0000000;

    // --- scalar broadcast ---

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnebf16_ps() {
        let src = bf16::from_bits(BF16_ONE);
        let got = _mm_bcstnebf16_ps(addr_of!(src));
        let want = _mm_set1_ps(1.);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnebf16_ps() {
        let src = bf16::from_bits(BF16_ONE);
        let got = _mm256_bcstnebf16_ps(addr_of!(src));
        let want = _mm256_set1_ps(1.);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnesh_ps() {
        let src = 1.0_f16;
        let got = _mm_bcstnesh_ps(addr_of!(src));
        let want = _mm_set1_ps(1.);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnesh_ps() {
        let src = 1.0_f16;
        let got = _mm256_bcstnesh_ps(addr_of!(src));
        let want = _mm256_set1_ps(1.);
        assert_eq_m256(got, want);
    }

    // --- even-indexed elements ---

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneebf16_ps() {
        let src = __m128bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let got = _mm_cvtneebf16_ps(addr_of!(src));
        let want = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneebf16_ps() {
        let src = __m256bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let got = _mm256_cvtneebf16_ps(addr_of!(src));
        let want = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneeph_ps() {
        let src = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let got = _mm_cvtneeph_ps(addr_of!(src));
        let want = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneeph_ps() {
        let src = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let got = _mm256_cvtneeph_ps(addr_of!(src));
        let want = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
        assert_eq_m256(got, want);
    }

    // --- odd-indexed elements ---

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneobf16_ps() {
        let src = __m128bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let got = _mm_cvtneobf16_ps(addr_of!(src));
        let want = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneobf16_ps() {
        let src = __m256bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let got = _mm256_cvtneobf16_ps(addr_of!(src));
        let want = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneoph_ps() {
        let src = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let got = _mm_cvtneoph_ps(addr_of!(src));
        let want = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneoph_ps() {
        let src = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let got = _mm256_cvtneoph_ps(addr_of!(src));
        let want = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
        assert_eq_m256(got, want);
    }

    // --- f32 -> BF16 narrowing ---

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneps_avx_pbh() {
        let src = _mm_setr_ps(1., 2., 3., 4.);
        let packed = _mm_cvtneps_avx_pbh(src);
        // `transmute_copy` reads only the first four 16-bit lanes of the 128-bit result.
        let got: u16x4 = transmute_copy(&packed);
        let want = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
        assert_eq!(got, want);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneps_avx_pbh() {
        let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let packed = _mm256_cvtneps_avx_pbh(src);
        let got: u16x8 = transmute(packed);
        let want = u16x8::new(
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        );
        assert_eq!(got, want);
    }
}
414 | |