1 | use crate::arch::asm; |
2 | use crate::core_arch::x86::*; |
3 | |
4 | #[cfg (test)] |
5 | use stdarch_test::assert_instr; |
6 | |
7 | /// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location |
8 | /// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) |
9 | /// floating-point elements, and store the results in dst. |
10 | /// |
11 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps) |
12 | #[inline ] |
13 | #[target_feature (enable = "avxneconvert" )] |
14 | #[cfg_attr (test, assert_instr(vbcstnebf162ps))] |
15 | #[unstable (feature = "stdarch_x86_avx512_bf16" , issue = "127356" )] |
16 | pub unsafe fn _mm_bcstnebf16_ps(a: *const bf16) -> __m128 { |
17 | bcstnebf162ps_128(a) |
18 | } |
19 | |
20 | /// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location |
21 | /// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point |
22 | /// elements, and store the results in dst. |
23 | /// |
24 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps) |
25 | #[inline ] |
26 | #[target_feature (enable = "avxneconvert" )] |
27 | #[cfg_attr (test, assert_instr(vbcstnebf162ps))] |
28 | #[unstable (feature = "stdarch_x86_avx512_bf16" , issue = "127356" )] |
29 | pub unsafe fn _mm256_bcstnebf16_ps(a: *const bf16) -> __m256 { |
30 | bcstnebf162ps_256(a) |
31 | } |
32 | |
33 | /// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting |
34 | /// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision |
35 | /// (32-bit) floating-point elements, and store the results in dst. |
36 | /// |
37 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps) |
38 | #[inline ] |
39 | #[target_feature (enable = "avxneconvert" )] |
40 | #[cfg_attr (test, assert_instr(vbcstnesh2ps))] |
41 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
42 | pub unsafe fn _mm_bcstnesh_ps(a: *const f16) -> __m128 { |
43 | bcstnesh2ps_128(a) |
44 | } |
45 | |
46 | /// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting |
47 | /// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision |
48 | /// (32-bit) floating-point elements, and store the results in dst. |
49 | /// |
50 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps) |
51 | #[inline ] |
52 | #[target_feature (enable = "avxneconvert" )] |
53 | #[cfg_attr (test, assert_instr(vbcstnesh2ps))] |
54 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
55 | pub unsafe fn _mm256_bcstnesh_ps(a: *const f16) -> __m256 { |
56 | bcstnesh2ps_256(a) |
57 | } |
58 | |
59 | /// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at |
60 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
61 | /// |
62 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps) |
63 | #[inline ] |
64 | #[target_feature (enable = "avxneconvert" )] |
65 | #[cfg_attr (test, assert_instr(vcvtneebf162ps))] |
66 | #[stable (feature = "stdarch_x86_avx512" , since = "1.89" )] |
67 | pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 { |
68 | transmute(src:cvtneebf162ps_128(a)) |
69 | } |
70 | |
71 | /// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at |
72 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
73 | /// |
74 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps) |
75 | #[inline ] |
76 | #[target_feature (enable = "avxneconvert" )] |
77 | #[cfg_attr (test, assert_instr(vcvtneebf162ps))] |
78 | #[stable (feature = "stdarch_x86_avx512" , since = "1.89" )] |
79 | pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 { |
80 | transmute(src:cvtneebf162ps_256(a)) |
81 | } |
82 | |
83 | /// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at |
84 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
85 | /// |
86 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps) |
87 | #[inline ] |
88 | #[target_feature (enable = "avxneconvert" )] |
89 | #[cfg_attr (test, assert_instr(vcvtneeph2ps))] |
90 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
91 | pub unsafe fn _mm_cvtneeph_ps(a: *const __m128h) -> __m128 { |
92 | transmute(src:cvtneeph2ps_128(a)) |
93 | } |
94 | |
95 | /// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at |
96 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
97 | /// |
98 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps) |
99 | #[inline ] |
100 | #[target_feature (enable = "avxneconvert" )] |
101 | #[cfg_attr (test, assert_instr(vcvtneeph2ps))] |
102 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
103 | pub unsafe fn _mm256_cvtneeph_ps(a: *const __m256h) -> __m256 { |
104 | transmute(src:cvtneeph2ps_256(a)) |
105 | } |
106 | |
107 | /// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at |
108 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
109 | /// |
110 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps) |
111 | #[inline ] |
112 | #[target_feature (enable = "avxneconvert" )] |
113 | #[cfg_attr (test, assert_instr(vcvtneobf162ps))] |
114 | #[stable (feature = "stdarch_x86_avx512" , since = "1.89" )] |
115 | pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 { |
116 | transmute(src:cvtneobf162ps_128(a)) |
117 | } |
118 | |
119 | /// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at |
120 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
121 | /// |
122 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps) |
123 | #[inline ] |
124 | #[target_feature (enable = "avxneconvert" )] |
125 | #[cfg_attr (test, assert_instr(vcvtneobf162ps))] |
126 | #[stable (feature = "stdarch_x86_avx512" , since = "1.89" )] |
127 | pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 { |
128 | transmute(src:cvtneobf162ps_256(a)) |
129 | } |
130 | |
131 | /// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at |
132 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
133 | /// |
134 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps) |
135 | #[inline ] |
136 | #[target_feature (enable = "avxneconvert" )] |
137 | #[cfg_attr (test, assert_instr(vcvtneoph2ps))] |
138 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
139 | pub unsafe fn _mm_cvtneoph_ps(a: *const __m128h) -> __m128 { |
140 | transmute(src:cvtneoph2ps_128(a)) |
141 | } |
142 | |
143 | /// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at |
144 | /// location a to single precision (32-bit) floating-point elements, and store the results in dst. |
145 | /// |
146 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps) |
147 | #[inline ] |
148 | #[target_feature (enable = "avxneconvert" )] |
149 | #[cfg_attr (test, assert_instr(vcvtneoph2ps))] |
150 | #[unstable (feature = "stdarch_x86_avx512_f16" , issue = "127213" )] |
151 | pub unsafe fn _mm256_cvtneoph_ps(a: *const __m256h) -> __m256 { |
152 | transmute(src:cvtneoph2ps_256(a)) |
153 | } |
154 | |
155 | /// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point |
156 | /// elements, and store the results in dst. |
157 | /// |
158 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh) |
159 | #[inline ] |
160 | #[target_feature (enable = "avxneconvert" )] |
161 | #[cfg_attr (test, assert_instr(vcvtneps2bf16))] |
162 | #[stable (feature = "stdarch_x86_avx512" , since = "1.89" )] |
163 | pub fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh { |
164 | unsafe { |
165 | let mut dst: __m128bh; |
166 | asm!( |
167 | "{{vex}}vcvtneps2bf16 { dst},{ src}" , |
168 | dst = lateout(xmm_reg) dst, |
169 | src = in(xmm_reg) a, |
170 | options(pure, nomem, nostack, preserves_flags) |
171 | ); |
172 | dst |
173 | } |
174 | } |
175 | |
176 | /// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point |
177 | /// elements, and store the results in dst. |
178 | /// |
179 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh) |
180 | #[inline ] |
181 | #[target_feature (enable = "avxneconvert" )] |
182 | #[cfg_attr (test, assert_instr(vcvtneps2bf16))] |
183 | #[stable (feature = "stdarch_x86_avx512" , since = "1.89" )] |
184 | pub fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh { |
185 | unsafe { |
186 | let mut dst: __m128bh; |
187 | asm!( |
188 | "{{vex}}vcvtneps2bf16 { dst},{ src}" , |
189 | dst = lateout(xmm_reg) dst, |
190 | src = in(ymm_reg) a, |
191 | options(pure, nomem, nostack, preserves_flags) |
192 | ); |
193 | dst |
194 | } |
195 | } |
196 | |
197 | #[allow (improper_ctypes)] |
198 | unsafe extern "C" { |
199 | #[link_name = "llvm.x86.vbcstnebf162ps128" ] |
200 | unsafefn bcstnebf162ps_128(a: *const bf16) -> __m128; |
201 | #[link_name = "llvm.x86.vbcstnebf162ps256" ] |
202 | unsafefn bcstnebf162ps_256(a: *const bf16) -> __m256; |
203 | #[link_name = "llvm.x86.vbcstnesh2ps128" ] |
204 | unsafefn bcstnesh2ps_128(a: *const f16) -> __m128; |
205 | #[link_name = "llvm.x86.vbcstnesh2ps256" ] |
206 | unsafefn bcstnesh2ps_256(a: *const f16) -> __m256; |
207 | |
208 | #[link_name = "llvm.x86.vcvtneebf162ps128" ] |
209 | unsafefn cvtneebf162ps_128(a: *const __m128bh) -> __m128; |
210 | #[link_name = "llvm.x86.vcvtneebf162ps256" ] |
211 | unsafefn cvtneebf162ps_256(a: *const __m256bh) -> __m256; |
212 | #[link_name = "llvm.x86.vcvtneeph2ps128" ] |
213 | unsafefn cvtneeph2ps_128(a: *const __m128h) -> __m128; |
214 | #[link_name = "llvm.x86.vcvtneeph2ps256" ] |
215 | unsafefn cvtneeph2ps_256(a: *const __m256h) -> __m256; |
216 | |
217 | #[link_name = "llvm.x86.vcvtneobf162ps128" ] |
218 | unsafefn cvtneobf162ps_128(a: *const __m128bh) -> __m128; |
219 | #[link_name = "llvm.x86.vcvtneobf162ps256" ] |
220 | unsafefn cvtneobf162ps_256(a: *const __m256bh) -> __m256; |
221 | #[link_name = "llvm.x86.vcvtneoph2ps128" ] |
222 | unsafefn cvtneoph2ps_128(a: *const __m128h) -> __m128; |
223 | #[link_name = "llvm.x86.vcvtneoph2ps256" ] |
224 | unsafefn cvtneoph2ps_256(a: *const __m256h) -> __m256; |
225 | } |
226 | |
#[cfg(test)]
mod tests {
    use crate::core_arch::simd::{u16x4, u16x8};
    use crate::core_arch::x86::*;
    use crate::mem::transmute_copy;
    use std::ptr::addr_of;
    use stdarch_test::simd_test;

    // Bit patterns of small integers encoded as BF16
    // (1 sign bit, 8 exponent bits, 7 mantissa bits).
    const BF16_ONE: u16 = 0b0_01111111_0000000;
    const BF16_TWO: u16 = 0b0_10000000_0000000;
    const BF16_THREE: u16 = 0b0_10000000_1000000;
    const BF16_FOUR: u16 = 0b0_10000001_0000000;
    const BF16_FIVE: u16 = 0b0_10000001_0100000;
    const BF16_SIX: u16 = 0b0_10000001_1000000;
    const BF16_SEVEN: u16 = 0b0_10000001_1100000;
    const BF16_EIGHT: u16 = 0b0_10000010_0000000;

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnebf16_ps() {
        let a = bf16::from_bits(BF16_ONE);
        let r = _mm_bcstnebf16_ps(addr_of!(a));
        let e = _mm_set_ps(1., 1., 1., 1.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnebf16_ps() {
        let a = bf16::from_bits(BF16_ONE);
        let r = _mm256_bcstnebf16_ps(addr_of!(a));
        let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnesh_ps() {
        let a = 1.0_f16;
        let r = _mm_bcstnesh_ps(addr_of!(a));
        let e = _mm_set_ps(1., 1., 1., 1.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnesh_ps() {
        let a = 1.0_f16;
        let r = _mm256_bcstnesh_ps(addr_of!(a));
        let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneebf16_ps() {
        let a = __m128bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm_cvtneebf16_ps(addr_of!(a));
        // Even-indexed source elements: 1, 3, 5, 7.
        let e = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneebf16_ps() {
        let a = __m256bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm256_cvtneebf16_ps(addr_of!(a));
        let e = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneeph_ps() {
        let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let r = _mm_cvtneeph_ps(addr_of!(a));
        let e = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneeph_ps() {
        let a = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let r = _mm256_cvtneeph_ps(addr_of!(a));
        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneobf16_ps() {
        let a = __m128bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm_cvtneobf16_ps(addr_of!(a));
        // Odd-indexed source elements: 2, 4, 6, 8.
        let e = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneobf16_ps() {
        let a = __m256bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let r = _mm256_cvtneobf16_ps(addr_of!(a));
        let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneoph_ps() {
        let a = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let r = _mm_cvtneoph_ps(addr_of!(a));
        let e = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneoph_ps() {
        let a = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let r = _mm256_cvtneoph_ps(addr_of!(a));
        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneps_avx_pbh() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        // Only the low 4 lanes of the 128-bit result are meaningful, so use
        // `transmute_copy` to read them as a 64-bit u16x4.
        let r: u16x4 = transmute_copy(&_mm_cvtneps_avx_pbh(a));
        let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
        assert_eq!(r, e);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneps_avx_pbh() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r: u16x8 = transmute(_mm256_cvtneps_avx_pbh(a));
        let e = u16x8::new(
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        );
        assert_eq!(r, e);
    }
}
372 | |