//! [F16C intrinsics].
//!
//! [F16C intrinsics]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=fp16&expand=1769
|---|
| 4 |  | 
|---|
| 5 | use crate::core_arch::{simd::*, x86::*}; | 
|---|
| 6 |  | 
|---|
| 7 | #[ cfg(test)] | 
|---|
| 8 | use stdarch_test::assert_instr; | 
|---|
| 9 |  | 
|---|
| 10 | #[ allow(improper_ctypes)] | 
|---|
| 11 | unsafe extern "unadjusted"{ | 
|---|
| 12 | #[ link_name= "llvm.x86.vcvtph2ps.128"] | 
|---|
| 13 | unsafefn llvm_vcvtph2ps_128(a: i16x8) -> f32x4; | 
|---|
| 14 | #[ link_name= "llvm.x86.vcvtph2ps.256"] | 
|---|
| 15 | unsafefn llvm_vcvtph2ps_256(a: i16x8) -> f32x8; | 
|---|
| 16 | #[ link_name= "llvm.x86.vcvtps2ph.128"] | 
|---|
| 17 | unsafefn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8; | 
|---|
| 18 | #[ link_name= "llvm.x86.vcvtps2ph.256"] | 
|---|
| 19 | unsafefn llvm_vcvtps2ph_256(a: f32x8, rounding: i32) -> i16x8; | 
|---|
| 20 | } | 
|---|
| 21 |  | 
|---|
| 22 | /// Converts the 4 x 16-bit half-precision float values in the lowest 64-bit of | 
|---|
| 23 | /// the 128-bit vector `a` into 4 x 32-bit float values stored in a 128-bit wide | 
|---|
| 24 | /// vector. | 
|---|
| 25 | /// | 
|---|
| 26 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_ps) | 
|---|
| 27 | #[ inline] | 
|---|
| 28 | #[ target_feature(enable = "f16c")] | 
|---|
| 29 | #[ cfg_attr(test, assert_instr( "vcvtph2ps"))] | 
|---|
| 30 | #[ stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] | 
|---|
| 31 | pub fn _mm_cvtph_ps(a: __m128i) -> __m128 { | 
|---|
| 32 | unsafe { transmute(src:llvm_vcvtph2ps_128(transmute(src:a))) } | 
|---|
| 33 | } | 
|---|
| 34 |  | 
|---|
| 35 | /// Converts the 8 x 16-bit half-precision float values in the 128-bit vector | 
|---|
| 36 | /// `a` into 8 x 32-bit float values stored in a 256-bit wide vector. | 
|---|
| 37 | /// | 
|---|
| 38 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_ps) | 
|---|
| 39 | #[ inline] | 
|---|
| 40 | #[ target_feature(enable = "f16c")] | 
|---|
| 41 | #[ cfg_attr(test, assert_instr( "vcvtph2ps"))] | 
|---|
| 42 | #[ stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] | 
|---|
| 43 | pub fn _mm256_cvtph_ps(a: __m128i) -> __m256 { | 
|---|
| 44 | unsafe { transmute(src:llvm_vcvtph2ps_256(transmute(src:a))) } | 
|---|
| 45 | } | 
|---|
| 46 |  | 
|---|
| 47 | /// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x | 
|---|
| 48 | /// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit | 
|---|
| 49 | /// vector. | 
|---|
| 50 | /// | 
|---|
| 51 | /// Rounding is done according to the `imm_rounding` parameter, which can be one of: | 
|---|
| 52 | /// | 
|---|
| 53 | /// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions | 
|---|
| 54 | /// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions | 
|---|
| 55 | /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions | 
|---|
| 56 | /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions | 
|---|
| 57 | /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] | 
|---|
| 58 | /// | 
|---|
| 59 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_ph) | 
|---|
| 60 | #[ inline] | 
|---|
| 61 | #[ target_feature(enable = "f16c")] | 
|---|
| 62 | #[ cfg_attr(test, assert_instr( "vcvtps2ph", IMM_ROUNDING = 0))] | 
|---|
| 63 | #[ rustc_legacy_const_generics(1)] | 
|---|
| 64 | #[ stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] | 
|---|
| 65 | pub fn _mm_cvtps_ph<const IMM_ROUNDING: i32>(a: __m128) -> __m128i { | 
|---|
| 66 | static_assert_uimm_bits!(IMM_ROUNDING, 3); | 
|---|
| 67 | unsafe { | 
|---|
| 68 | let a: f32x4 = a.as_f32x4(); | 
|---|
| 69 | let r: i16x8 = llvm_vcvtps2ph_128(a, IMM_ROUNDING); | 
|---|
| 70 | transmute(src:r) | 
|---|
| 71 | } | 
|---|
| 72 | } | 
|---|
| 73 |  | 
|---|
| 74 | /// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x | 
|---|
| 75 | /// 16-bit half-precision float values stored in a 128-bit wide vector. | 
|---|
| 76 | /// | 
|---|
| 77 | /// Rounding is done according to the `imm_rounding` parameter, which can be one of: | 
|---|
| 78 | /// | 
|---|
| 79 | /// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions | 
|---|
| 80 | /// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions | 
|---|
| 81 | /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions | 
|---|
| 82 | /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions | 
|---|
| 83 | /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] | 
|---|
| 84 | /// | 
|---|
| 85 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_ph) | 
|---|
| 86 | #[ inline] | 
|---|
| 87 | #[ target_feature(enable = "f16c")] | 
|---|
| 88 | #[ cfg_attr(test, assert_instr( "vcvtps2ph", IMM_ROUNDING = 0))] | 
|---|
| 89 | #[ rustc_legacy_const_generics(1)] | 
|---|
| 90 | #[ stable(feature = "x86_f16c_intrinsics", since = "1.68.0")] | 
|---|
| 91 | pub fn _mm256_cvtps_ph<const IMM_ROUNDING: i32>(a: __m256) -> __m128i { | 
|---|
| 92 | static_assert_uimm_bits!(IMM_ROUNDING, 3); | 
|---|
| 93 | unsafe { | 
|---|
| 94 | let a: f32x8 = a.as_f32x8(); | 
|---|
| 95 | let r: i16x8 = llvm_vcvtps2ph_256(a, IMM_ROUNDING); | 
|---|
| 96 | transmute(src:r) | 
|---|
| 97 | } | 
|---|
| 98 | } | 
|---|
| 99 |  | 
|---|
#[cfg(test)]
mod tests {
    use crate::{core_arch::x86::*, mem::transmute};
    use stdarch_test::simd_test;

    // Bit patterns of the half-precision values 1.0 through 8.0, held as
    // `i16` so they can be fed to `_mm_set_epi16`.
    const F16_ONE: i16 = 0x3c00;
    const F16_TWO: i16 = 0x4000;
    const F16_THREE: i16 = 0x4200;
    const F16_FOUR: i16 = 0x4400;
    const F16_FIVE: i16 = 0x4500;
    const F16_SIX: i16 = 0x4600;
    const F16_SEVEN: i16 = 0x4700;
    const F16_EIGHT: i16 = 0x4800;

    /// Four halves in the low 64 bits widen to the matching four floats.
    #[simd_test(enable = "f16c")]
    unsafe fn test_mm_cvtph_ps() {
        let halves = _mm_set_epi16(0, 0, 0, 0, F16_ONE, F16_TWO, F16_THREE, F16_FOUR);
        let expected = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
        let actual = _mm_cvtph_ps(halves);
        assert_eq_m128(actual, expected);
    }

    /// Eight halves widen to the matching eight floats.
    #[simd_test(enable = "f16c")]
    unsafe fn test_mm256_cvtph_ps() {
        let halves = _mm_set_epi16(
            F16_ONE, F16_TWO, F16_THREE, F16_FOUR, F16_FIVE, F16_SIX, F16_SEVEN, F16_EIGHT,
        );
        let expected = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let actual = _mm256_cvtph_ps(halves);
        assert_eq_m256(actual, expected);
    }

    /// Four floats narrow to the matching halves; the upper lanes are zero.
    #[simd_test(enable = "f16c")]
    unsafe fn test_mm_cvtps_ph() {
        let floats = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
        let expected = _mm_set_epi16(0, 0, 0, 0, F16_ONE, F16_TWO, F16_THREE, F16_FOUR);
        let actual = _mm_cvtps_ph::<_MM_FROUND_CUR_DIRECTION>(floats);
        assert_eq_m128i(actual, expected);
    }

    /// Eight floats narrow to the matching eight halves.
    #[simd_test(enable = "f16c")]
    unsafe fn test_mm256_cvtps_ph() {
        let floats = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let expected = _mm_set_epi16(
            F16_ONE, F16_TWO, F16_THREE, F16_FOUR, F16_FIVE, F16_SIX, F16_SEVEN, F16_EIGHT,
        );
        let actual = _mm256_cvtps_ph::<_MM_FROUND_CUR_DIRECTION>(floats);
        assert_eq_m128i(actual, expected);
    }
}
|---|
| 150 |  | 
|---|