1 | use crate::arch::asm; |
2 | use crate::core_arch::x86::*; |
3 | |
4 | #[cfg (test)] |
5 | use stdarch_test::assert_instr; |
6 | |
7 | /// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location |
8 | /// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) |
9 | /// floating-point elements, and store the results in dst. |
10 | /// |
11 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps) |
12 | #[inline ] |
13 | #[target_feature (enable = "avxneconvert" )] |
14 | #[cfg_attr (test, assert_instr(vbcstnebf162ps))] |
15 | #[unstable (feature = "stdarch_x86_avx512_bf16" , issue = "127356" )] |
16 | pub unsafe fn _mm_bcstnebf16_ps(a: *const bf16) -> __m128 { |
17 | bcstnebf162ps_128(a) |
18 | } |
19 | |
20 | /// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location |
21 | /// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point |
22 | /// elements, and store the results in dst. |
23 | /// |
24 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps) |
25 | #[inline ] |
26 | #[target_feature (enable = "avxneconvert" )] |
27 | #[cfg_attr (test, assert_instr(vbcstnebf162ps))] |
28 | #[unstable (feature = "stdarch_x86_avx512_bf16" , issue = "127356" )] |
29 | pub unsafe fn _mm256_bcstnebf16_ps(a: *const bf16) -> __m256 { |
30 | bcstnebf162ps_256(a) |
31 | } |
32 | |
33 | /// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting |
34 | /// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision |
35 | /// (32-bit) floating-point elements, and store the results in dst. |
36 | /// |
37 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps) |
38 | #[inline ] |
39 | #[target_feature (enable = "avxneconvert" )] |
40 | #[cfg_attr (test, assert_instr(vbcstnesh2ps))] |
41 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
42 | pub unsafe fn _mm_bcstnesh_ps(a: *const f16) -> __m128 { |
43 | bcstnesh2ps_128(a) |
44 | } |
45 | |
46 | /// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting |
47 | /// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision |
48 | /// (32-bit) floating-point elements, and store the results in dst. |
49 | /// |
50 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps) |
51 | #[inline ] |
52 | #[target_feature (enable = "avxneconvert" )] |
53 | #[cfg_attr (test, assert_instr(vbcstnesh2ps))] |
54 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
55 | pub unsafe fn _mm256_bcstnesh_ps(a: *const f16) -> __m256 { |
56 | bcstnesh2ps_256(a) |
57 | } |
58 | |
59 | /// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at |
60 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
61 | /// |
62 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps) |
63 | #[inline ] |
64 | #[target_feature (enable = "avxneconvert" )] |
65 | #[cfg_attr (test, assert_instr(vcvtneebf162ps))] |
66 | #[stable (feature = "stdarch_x86_avx512" , since = "1.89" )] |
67 | pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 { |
68 | transmute(src:cvtneebf162ps_128(a)) |
69 | } |
70 | |
71 | /// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at |
72 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
73 | /// |
74 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps) |
75 | #[inline ] |
76 | #[target_feature (enable = "avxneconvert" )] |
77 | #[cfg_attr (test, assert_instr(vcvtneebf162ps))] |
78 | #[stable (feature = "stdarch_x86_avx512" , since = "1.89" )] |
79 | pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 { |
80 | transmute(src:cvtneebf162ps_256(a)) |
81 | } |
82 | |
83 | /// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at |
84 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
85 | /// |
86 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps) |
87 | #[inline ] |
88 | #[target_feature (enable = "avxneconvert" )] |
89 | #[cfg_attr (test, assert_instr(vcvtneeph2ps))] |
90 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
91 | pub unsafe fn _mm_cvtneeph_ps(a: *const __m128h) -> __m128 { |
92 | transmute(src:cvtneeph2ps_128(a)) |
93 | } |
94 | |
95 | /// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at |
96 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
97 | /// |
98 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps) |
99 | #[inline ] |
100 | #[target_feature (enable = "avxneconvert" )] |
101 | #[cfg_attr (test, assert_instr(vcvtneeph2ps))] |
102 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
103 | pub unsafe fn _mm256_cvtneeph_ps(a: *const __m256h) -> __m256 { |
104 | transmute(src:cvtneeph2ps_256(a)) |
105 | } |
106 | |
107 | /// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at |
108 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
109 | /// |
110 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps) |
111 | #[inline ] |
112 | #[target_feature (enable = "avxneconvert" )] |
113 | #[cfg_attr (test, assert_instr(vcvtneobf162ps))] |
114 | #[stable (feature = "stdarch_x86_avx512" , since = "1.89" )] |
115 | pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 { |
116 | transmute(src:cvtneobf162ps_128(a)) |
117 | } |
118 | |
119 | /// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at |
120 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
121 | /// |
122 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps) |
123 | #[inline ] |
124 | #[target_feature (enable = "avxneconvert" )] |
125 | #[cfg_attr (test, assert_instr(vcvtneobf162ps))] |
126 | #[stable (feature = "stdarch_x86_avx512" , since = "1.89" )] |
127 | pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 { |
128 | transmute(src:cvtneobf162ps_256(a)) |
129 | } |
130 | |
131 | /// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at |
132 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
133 | /// |
134 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps) |
135 | #[inline ] |
136 | #[target_feature (enable = "avxneconvert" )] |
137 | #[cfg_attr (test, assert_instr(vcvtneoph2ps))] |
138 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
139 | pub unsafe fn _mm_cvtneoph_ps(a: *const __m128h) -> __m128 { |
140 | transmute(src:cvtneoph2ps_128(a)) |
141 | } |
142 | |
143 | /// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at |
144 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
145 | /// |
146 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps) |
147 | #[inline ] |
148 | #[target_feature (enable = "avxneconvert" )] |
149 | #[cfg_attr (test, assert_instr(vcvtneoph2ps))] |
150 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
151 | pub unsafe fn _mm256_cvtneoph_ps(a: *const __m256h) -> __m256 { |
152 | transmute(src:cvtneoph2ps_256(a)) |
153 | } |
154 | |
155 | /// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point |
156 | /// elements, and store the results in dst. |
157 | /// |
158 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh) |
159 | #[inline ] |
160 | #[target_feature (enable = "avxneconvert" )] |
161 | #[cfg_attr (test, assert_instr(vcvtneps2bf16))] |
162 | #[stable (feature = "stdarch_x86_avx512" , since = "1.89" )] |
163 | pub fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh { |
164 | unsafe { |
165 | let mut dst: __m128bh; |
166 | asm!( |
167 | "{{vex}}vcvtneps2bf16 { dst},{ src}" , |
168 | dst = lateout(xmm_reg) dst, |
169 | src = in(xmm_reg) a, |
170 | options(pure, nomem, nostack, preserves_flags) |
171 | ); |
172 | dst |
173 | } |
174 | } |
175 | |
176 | /// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point |
177 | /// elements, and store the results in dst. |
178 | /// |
179 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh) |
180 | #[inline ] |
181 | #[target_feature (enable = "avxneconvert" )] |
182 | #[cfg_attr (test, assert_instr(vcvtneps2bf16))] |
183 | #[stable (feature = "stdarch_x86_avx512" , since = "1.89" )] |
184 | pub fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh { |
185 | unsafe { |
186 | let mut dst: __m128bh; |
187 | asm!( |
188 | "{{vex}}vcvtneps2bf16 { dst},{ src}" , |
189 | dst = lateout(xmm_reg) dst, |
190 | src = in(ymm_reg) a, |
191 | options(pure, nomem, nostack, preserves_flags) |
192 | ); |
193 | dst |
194 | } |
195 | } |
196 | |
197 | #[allow (improper_ctypes)] |
198 | unsafe extern "C" { |
199 | #[link_name = "llvm.x86.vbcstnebf162ps128" ] |
200 | unsafefn bcstnebf162ps_128(a: *const bf16) -> __m128; |
201 | #[link_name = "llvm.x86.vbcstnebf162ps256" ] |
202 | unsafefn bcstnebf162ps_256(a: *const bf16) -> __m256; |
203 | #[link_name = "llvm.x86.vbcstnesh2ps128" ] |
204 | unsafefn bcstnesh2ps_128(a: *const f16) -> __m128; |
205 | #[link_name = "llvm.x86.vbcstnesh2ps256" ] |
206 | unsafefn bcstnesh2ps_256(a: *const f16) -> __m256; |
207 | |
208 | #[link_name = "llvm.x86.vcvtneebf162ps128" ] |
209 | unsafefn cvtneebf162ps_128(a: *const __m128bh) -> __m128; |
210 | #[link_name = "llvm.x86.vcvtneebf162ps256" ] |
211 | unsafefn cvtneebf162ps_256(a: *const __m256bh) -> __m256; |
212 | #[link_name = "llvm.x86.vcvtneeph2ps128" ] |
213 | unsafefn cvtneeph2ps_128(a: *const __m128h) -> __m128; |
214 | #[link_name = "llvm.x86.vcvtneeph2ps256" ] |
215 | unsafefn cvtneeph2ps_256(a: *const __m256h) -> __m256; |
216 | |
217 | #[link_name = "llvm.x86.vcvtneobf162ps128" ] |
218 | unsafefn cvtneobf162ps_128(a: *const __m128bh) -> __m128; |
219 | #[link_name = "llvm.x86.vcvtneobf162ps256" ] |
220 | unsafefn cvtneobf162ps_256(a: *const __m256bh) -> __m256; |
221 | #[link_name = "llvm.x86.vcvtneoph2ps128" ] |
222 | unsafefn cvtneoph2ps_128(a: *const __m128h) -> __m128; |
223 | #[link_name = "llvm.x86.vcvtneoph2ps256" ] |
224 | unsafefn cvtneoph2ps_256(a: *const __m256h) -> __m256; |
225 | } |
226 | |
#[cfg(test)]
mod tests {
    use crate::core_arch::simd::{u16x4, u16x8};
    use crate::core_arch::x86::*;
    use crate::mem::transmute_copy;
    use std::ptr::addr_of;
    use stdarch_test::simd_test;

    // Bit patterns of small integers encoded as BF16
    // (1 sign bit, 8 exponent bits, 7 mantissa bits).
    const BF16_ONE: u16 = 0b0_01111111_0000000;
    const BF16_TWO: u16 = 0b0_10000000_0000000;
    const BF16_THREE: u16 = 0b0_10000000_1000000;
    const BF16_FOUR: u16 = 0b0_10000001_0000000;
    const BF16_FIVE: u16 = 0b0_10000001_0100000;
    const BF16_SIX: u16 = 0b0_10000001_1000000;
    const BF16_SEVEN: u16 = 0b0_10000001_1100000;
    const BF16_EIGHT: u16 = 0b0_10000010_0000000;

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnebf16_ps() {
        let a = bf16::from_bits(BF16_ONE);
        let r = _mm_bcstnebf16_ps(addr_of!(a));
        let e = _mm_set_ps(1., 1., 1., 1.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnebf16_ps() {
        let a = bf16::from_bits(BF16_ONE);
        let r = _mm256_bcstnebf16_ps(addr_of!(a));
        let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnesh_ps() {
        let a = 1.0_f16;
        let r = _mm_bcstnesh_ps(addr_of!(a));
        let e = _mm_set_ps(1., 1., 1., 1.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnesh_ps() {
        let a = 1.0_f16;
        let r = _mm256_bcstnesh_ps(addr_of!(a));
        let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneebf16_ps() {
        let a = __m128bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm_cvtneebf16_ps(addr_of!(a));
        // Even-indexed source elements: 1, 3, 5, 7.
        let e = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneebf16_ps() {
        let a = __m256bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm256_cvtneebf16_ps(addr_of!(a));
        let e = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneeph_ps() {
        let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let r = _mm_cvtneeph_ps(addr_of!(a));
        let e = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneeph_ps() {
        let a = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let r = _mm256_cvtneeph_ps(addr_of!(a));
        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneobf16_ps() {
        let a = __m128bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm_cvtneobf16_ps(addr_of!(a));
        // Odd-indexed source elements: 2, 4, 6, 8.
        let e = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneobf16_ps() {
        let a = __m256bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm256_cvtneobf16_ps(addr_of!(a));
        let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneoph_ps() {
        let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let r = _mm_cvtneoph_ps(addr_of!(a));
        let e = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneoph_ps() {
        let a = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let r = _mm256_cvtneoph_ps(addr_of!(a));
        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneps_avx_pbh() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        // Only the low 4 lanes of the 128-bit result are meaningful, so use
        // `transmute_copy` to read them as a 64-bit u16x4.
        let r: u16x4 = transmute_copy(&_mm_cvtneps_avx_pbh(a));
        let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
        assert_eq!(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneps_avx_pbh() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r: u16x8 = transmute(_mm256_cvtneps_avx_pbh(a));
        let e = u16x8::new(
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        );
        assert_eq!(r, e);
    }
}
372 | |