x86.rs source code [crates/half-2.4.0/src/binary16/arch/x86.rs]

1	use core::{mem::MaybeUninit, ptr};
2
3	#[cfg(target_arch = "x86")]
4	use core::arch::x86::{
5	__m128, __m128i, __m256, _mm256_cvtph_ps, _mm256_cvtps_ph, _mm_cvtph_ps,
6	_MM_FROUND_TO_NEAREST_INT,
7	};
8	#[cfg(target_arch = "x86_64")]
9	use core::arch::x86_64::{
10	__m128, __m128i, __m256, _mm256_cvtph_ps, _mm256_cvtps_ph, _mm_cvtph_ps, _mm_cvtps_ph,
11	_MM_FROUND_TO_NEAREST_INT,
12	};
13
14	#[cfg(target_arch = "x86")]
15	use core::arch::x86::_mm_cvtps_ph;
16
17	use super::convert_chunked_slice_8;
18
19	/////////////// x86/x86_64 f16c ////////////////
20
21	#[target_feature(enable = "f16c")]
22	#[inline]
23	pub(super) unsafe fn f16_to_f32_x86_f16c(i: u16) -> f32 {
24	let mut vec: MaybeUninit<__m128i> = MaybeUninit::<__m128i>::zeroed();
25	vec.as_mut_ptr().cast::<u16>().write(val:i);
26	let retval: __m128 = _mm_cvtph_ps(vec.assume_init());
27	(&retval as const __m128).cast()
28	}
29
30	#[target_feature(enable = "f16c")]
31	#[inline]
32	pub(super) unsafe fn f32_to_f16_x86_f16c(f: f32) -> u16 {
33	let mut vec: MaybeUninit<__m128> = MaybeUninit::<__m128>::zeroed();
34	vec.as_mut_ptr().cast::<f32>().write(val:f);
35	let retval: __m128i = _mm_cvtps_ph(vec.assume_init(), _MM_FROUND_TO_NEAREST_INT);
36	(&retval as const __m128i).cast()
37	}
38
39	#[target_feature(enable = "f16c")]
40	#[inline]
41	pub(super) unsafe fn f16x4_to_f32x4_x86_f16c(v: &[u16; `4`]) -> [f32; `4`] {
42	let mut vec: MaybeUninit<__m128i> = MaybeUninit::<__m128i>::zeroed();
43	ptr::copy_nonoverlapping(src:v.as_ptr(), dst:vec.as_mut_ptr().cast(), count:`4`);
44	let retval: __m128 = _mm_cvtph_ps(vec.assume_init());
45	(&retval as const __m128).cast()
46	}
47
48	#[target_feature(enable = "f16c")]
49	#[inline]
50	pub(super) unsafe fn f32x4_to_f16x4_x86_f16c(v: &[f32; `4`]) -> [u16; `4`] {
51	let mut vec: MaybeUninit<__m128> = MaybeUninit::<__m128>::uninit();
52	ptr::copy_nonoverlapping(src:v.as_ptr(), dst:vec.as_mut_ptr().cast(), count:`4`);
53	let retval: __m128i = _mm_cvtps_ph(vec.assume_init(), _MM_FROUND_TO_NEAREST_INT);
54	(&retval as const __m128i).cast()
55	}
56
57	#[target_feature(enable = "f16c")]
58	#[inline]
59	pub(super) unsafe fn f16x4_to_f64x4_x86_f16c(v: &[u16; `4`]) -> [f64; `4`] {
60	let array: [f32; 4] = f16x4_to_f32x4_x86_f16c(v);
61	// Let compiler vectorize this regular cast for now.
62	// TODO: investigate auto-detecting sse2/avx convert features
63	[
64	array[`0`] as f64,
65	array[`1`] as f64,
66	array[`2`] as f64,
67	array[`3`] as f64,
68	]
69	}
70
71	#[target_feature(enable = "f16c")]
72	#[inline]
73	pub(super) unsafe fn f64x4_to_f16x4_x86_f16c(v: &[f64; `4`]) -> [u16; `4`] {
74	// Let compiler vectorize this regular cast for now.
75	// TODO: investigate auto-detecting sse2/avx convert features
76	let v: [f32; 4] = [v[`0`] as f32, v[`1`] as f32, v[`2`] as f32, v[`3`] as f32];
77	f32x4_to_f16x4_x86_f16c(&v)
78	}
79
80	#[target_feature(enable = "f16c")]
81	#[inline]
82	pub(super) unsafe fn f16x8_to_f32x8_x86_f16c(v: &[u16; `8`]) -> [f32; `8`] {
83	let mut vec: MaybeUninit<__m128i> = MaybeUninit::<__m128i>::zeroed();
84	ptr::copy_nonoverlapping(src:v.as_ptr(), dst:vec.as_mut_ptr().cast(), count:`8`);
85	let retval: __m256 = _mm256_cvtph_ps(vec.assume_init());
86	(&retval as const __m256).cast()
87	}
88
89	#[target_feature(enable = "f16c")]
90	#[inline]
91	pub(super) unsafe fn f32x8_to_f16x8_x86_f16c(v: &[f32; `8`]) -> [u16; `8`] {
92	let mut vec: MaybeUninit<__m256> = MaybeUninit::<__m256>::uninit();
93	ptr::copy_nonoverlapping(src:v.as_ptr(), dst:vec.as_mut_ptr().cast(), count:`8`);
94	let retval: __m128i = _mm256_cvtps_ph(vec.assume_init(), _MM_FROUND_TO_NEAREST_INT);
95	(&retval as const __m128i).cast()
96	}
97
98	#[target_feature(enable = "f16c")]
99	#[inline]
100	pub(super) unsafe fn f16x8_to_f64x8_x86_f16c(v: &[u16; `8`]) -> [f64; `8`] {
101	let array: [f32; 8] = f16x8_to_f32x8_x86_f16c(v);
102	// Let compiler vectorize this regular cast for now.
103	// TODO: investigate auto-detecting sse2/avx convert features
104	[
105	array[`0`] as f64,
106	array[`1`] as f64,
107	array[`2`] as f64,
108	array[`3`] as f64,
109	array[`4`] as f64,
110	array[`5`] as f64,
111	array[`6`] as f64,
112	array[`7`] as f64,
113	]
114	}
115
116	#[target_feature(enable = "f16c")]
117	#[inline]
118	pub(super) unsafe fn f64x8_to_f16x8_x86_f16c(v: &[f64; `8`]) -> [u16; `8`] {
119	// Let compiler vectorize this regular cast for now.
120	// TODO: investigate auto-detecting sse2/avx convert features
121	let v: [f32; 8] = [
122	v[`0`] as f32,
123	v[`1`] as f32,
124	v[`2`] as f32,
125	v[`3`] as f32,
126	v[`4`] as f32,
127	v[`5`] as f32,
128	v[`6`] as f32,
129	v[`7`] as f32,
130	];
131	f32x8_to_f16x8_x86_f16c(&v)
132	}
133