1 | //! [F16C intrinsics]. |
2 | //! |
3 | //! [F16C intrinsics]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=fp16&expand=1769 |
4 | |
5 | use crate::{ |
6 | core_arch::{simd::*, x86::*}, |
7 | // hint::unreachable_unchecked, |
8 | mem::transmute, |
9 | }; |
10 | |
11 | #[cfg (test)] |
12 | use stdarch_test::assert_instr; |
13 | |
14 | #[allow (improper_ctypes)] |
15 | extern "unadjusted" { |
16 | #[link_name = "llvm.x86.vcvtph2ps.128" ] |
17 | fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4; |
18 | #[link_name = "llvm.x86.vcvtph2ps.256" ] |
19 | fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8; |
20 | #[link_name = "llvm.x86.vcvtps2ph.128" ] |
21 | fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8; |
22 | #[link_name = "llvm.x86.vcvtps2ph.256" ] |
23 | fn llvm_vcvtps2ph_256(a: f32x8, rounding: i32) -> i16x8; |
24 | } |
25 | |
26 | /// Converts the 4 x 16-bit half-precision float values in the lowest 64-bit of |
27 | /// the 128-bit vector `a` into 4 x 32-bit float values stored in a 128-bit wide |
28 | /// vector. |
29 | #[inline ] |
30 | #[target_feature (enable = "f16c" )] |
31 | #[cfg_attr (test, assert_instr("vcvtph2ps" ))] |
32 | #[stable (feature = "x86_f16c_intrinsics" , since = "1.68.0" )] |
33 | pub unsafe fn _mm_cvtph_ps(a: __m128i) -> __m128 { |
34 | transmute(src:llvm_vcvtph2ps_128(transmute(src:a))) |
35 | } |
36 | |
37 | /// Converts the 8 x 16-bit half-precision float values in the 128-bit vector |
38 | /// `a` into 8 x 32-bit float values stored in a 256-bit wide vector. |
39 | #[inline ] |
40 | #[target_feature (enable = "f16c" )] |
41 | #[cfg_attr (test, assert_instr("vcvtph2ps" ))] |
42 | #[stable (feature = "x86_f16c_intrinsics" , since = "1.68.0" )] |
43 | pub unsafe fn _mm256_cvtph_ps(a: __m128i) -> __m256 { |
44 | transmute(src:llvm_vcvtph2ps_256(transmute(src:a))) |
45 | } |
46 | |
47 | /// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x |
48 | /// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit |
49 | /// vector. |
50 | /// |
51 | /// Rounding is done according to the `imm_rounding` parameter, which can be one of: |
52 | /// |
53 | /// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions, |
54 | /// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions, |
55 | /// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions, |
56 | /// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions, |
57 | /// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]. |
58 | #[inline ] |
59 | #[target_feature (enable = "f16c" )] |
60 | #[cfg_attr (test, assert_instr("vcvtps2ph" , IMM_ROUNDING = 0))] |
61 | #[rustc_legacy_const_generics (1)] |
62 | #[stable (feature = "x86_f16c_intrinsics" , since = "1.68.0" )] |
63 | pub unsafe fn _mm_cvtps_ph<const IMM_ROUNDING: i32>(a: __m128) -> __m128i { |
64 | static_assert_uimm_bits!(IMM_ROUNDING, 3); |
65 | let a: f32x4 = a.as_f32x4(); |
66 | let r: i16x8 = llvm_vcvtps2ph_128(a, IMM_ROUNDING); |
67 | transmute(src:r) |
68 | } |
69 | |
70 | /// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x |
71 | /// 16-bit half-precision float values stored in a 128-bit wide vector. |
72 | /// |
73 | /// Rounding is done according to the `imm_rounding` parameter, which can be one of: |
74 | /// |
75 | /// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions, |
76 | /// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions, |
77 | /// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions, |
78 | /// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions, |
79 | /// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]. |
80 | #[inline ] |
81 | #[target_feature (enable = "f16c" )] |
82 | #[cfg_attr (test, assert_instr("vcvtps2ph" , IMM_ROUNDING = 0))] |
83 | #[rustc_legacy_const_generics (1)] |
84 | #[stable (feature = "x86_f16c_intrinsics" , since = "1.68.0" )] |
85 | pub unsafe fn _mm256_cvtps_ph<const IMM_ROUNDING: i32>(a: __m256) -> __m128i { |
86 | static_assert_uimm_bits!(IMM_ROUNDING, 3); |
87 | let a: f32x8 = a.as_f32x8(); |
88 | let r: i16x8 = llvm_vcvtps2ph_256(a, IMM_ROUNDING); |
89 | transmute(src:r) |
90 | } |
91 | |
#[cfg(test)]
mod tests {
    use crate::{core_arch::x86::*, mem::transmute};
    use stdarch_test::simd_test;

    // Round-trips f32 -> f16 -> f32 through the 128-bit intrinsics. The inputs
    // are small integers, which binary16 represents exactly, so the round trip
    // must be lossless.
    #[simd_test(enable = "f16c")]
    unsafe fn test_mm_cvtph_ps() {
        let array = [1_f32, 2_f32, 3_f32, 4_f32];
        let float_vec: __m128 = transmute(array);
        let halfs: __m128i = _mm_cvtps_ph::<0>(float_vec);

        // Pin the intermediate IEEE 754 binary16 encodings so that errors that
        // cancel out across the two conversions cannot go unnoticed. Only the
        // four low lanes carry converted values.
        let [h0, h1, h2, h3, ..]: [u16; 8] = transmute(halfs);
        assert_eq!([h0, h1, h2, h3], [0x3C00_u16, 0x4000, 0x4200, 0x4400]);

        let floats: __m128 = _mm_cvtph_ps(halfs);
        let result: [f32; 4] = transmute(floats);
        assert_eq!(result, array);
    }

    // Round-trips f32 -> f16 -> f32 through the 256-bit intrinsics.
    #[simd_test(enable = "f16c")]
    unsafe fn test_mm256_cvtph_ps() {
        let array = [1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32];
        let float_vec: __m256 = transmute(array);
        let halfs: __m128i = _mm256_cvtps_ph::<0>(float_vec);

        // All eight lanes of the 128-bit result hold converted values.
        let half_bits: [u16; 8] = transmute(halfs);
        assert_eq!(
            half_bits,
            [0x3C00_u16, 0x4000, 0x4200, 0x4400, 0x4500, 0x4600, 0x4700, 0x4800]
        );

        let floats: __m256 = _mm256_cvtph_ps(halfs);
        let result: [f32; 8] = transmute(floats);
        assert_eq!(result, array);
    }
}
117 | |