| 1 | //! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`) |
| 2 | |
| 3 | use crate::core_arch::{simd::*, x86::*}; |
| 4 | |
| 5 | #[cfg (test)] |
| 6 | use stdarch_test::assert_instr; |
| 7 | |
| 8 | #[allow (improper_ctypes)] |
| 9 | unsafe extern "C" { |
| 10 | #[link_name = "llvm.x86.sse4a.extrq" ] |
| 11 | unsafefn extrq(x: i64x2, y: i8x16) -> i64x2; |
| 12 | #[link_name = "llvm.x86.sse4a.extrqi" ] |
| 13 | unsafefn extrqi(x: i64x2, len: u8, idx: u8) -> i64x2; |
| 14 | #[link_name = "llvm.x86.sse4a.insertq" ] |
| 15 | unsafefn insertq(x: i64x2, y: i64x2) -> i64x2; |
| 16 | #[link_name = "llvm.x86.sse4a.insertqi" ] |
| 17 | unsafefn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2; |
| 18 | #[link_name = "llvm.x86.sse4a.movnt.sd" ] |
| 19 | unsafefn movntsd(x: *mut f64, y: __m128d); |
| 20 | #[link_name = "llvm.x86.sse4a.movnt.ss" ] |
| 21 | unsafefn movntss(x: *mut f32, y: __m128); |
| 22 | } |
| 23 | |
| 24 | /// Extracts the bit range specified by `y` from the lower 64 bits of `x`. |
| 25 | /// |
| 26 | /// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The |
| 27 | /// `[5:0]` bits of `y` specify the length of the bit-range to extract. All |
| 28 | /// other bits are ignored. |
| 29 | /// |
| 30 | /// If the length is zero, it is interpreted as `64`. If the length and index |
| 31 | /// are zero, the lower 64 bits of `x` are extracted. |
| 32 | /// |
| 33 | /// If `length == 0 && index > 0` or `length + index > 64` the result is |
| 34 | /// undefined. |
| 35 | #[inline ] |
| 36 | #[target_feature (enable = "sse4a" )] |
| 37 | #[cfg_attr (test, assert_instr(extrq))] |
| 38 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
| 39 | pub fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { |
| 40 | unsafe { transmute(src:extrq(x.as_i64x2(), y.as_i8x16())) } |
| 41 | } |
| 42 | |
| 43 | /// Extracts the specified bits from the lower 64 bits of the 128-bit integer vector operand at the |
| 44 | /// index `idx` and of the length `len`. |
| 45 | /// |
| 46 | /// `idx` specifies the index of the LSB. `len` specifies the number of bits to extract. If length |
| 47 | /// and index are both zero, bits `[63:0]` of parameter `x` are extracted. It is a compile-time error |
| 48 | /// for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero. |
| 49 | /// |
| 50 | /// Returns a 128-bit integer vector whose lower 64 bits contain the extracted bits. |
| 51 | #[inline ] |
| 52 | #[target_feature (enable = "sse4a" )] |
| 53 | #[cfg_attr (test, assert_instr(extrq, LEN = 5, IDX = 5))] |
| 54 | #[rustc_legacy_const_generics (1, 2)] |
| 55 | #[stable (feature = "simd_x86_updates" , since = "1.82.0" )] |
| 56 | pub fn _mm_extracti_si64<const LEN: i32, const IDX: i32>(x: __m128i) -> __m128i { |
| 57 | // LLVM mentions that it is UB if these are not satisfied |
| 58 | static_assert_uimm_bits!(LEN, 6); |
| 59 | static_assert_uimm_bits!(IDX, 6); |
| 60 | static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64)); |
| 61 | unsafe { transmute(src:extrqi(x.as_i64x2(), LEN as u8, IDX as u8)) } |
| 62 | } |
| 63 | |
| 64 | /// Inserts the `[length:0]` bits of `y` into `x` at `index`. |
| 65 | /// |
| 66 | /// The bits of `y`: |
| 67 | /// |
| 68 | /// - `[69:64]` specify the `length`, |
| 69 | /// - `[77:72]` specify the index. |
| 70 | /// |
| 71 | /// If the `length` is zero it is interpreted as `64`. If `index + length > 64` |
| 72 | /// or `index > 0 && length == 0` the result is undefined. |
| 73 | #[inline ] |
| 74 | #[target_feature (enable = "sse4a" )] |
| 75 | #[cfg_attr (test, assert_instr(insertq))] |
| 76 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
| 77 | pub fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { |
| 78 | unsafe { transmute(src:insertq(x.as_i64x2(), y.as_i64x2())) } |
| 79 | } |
| 80 | |
| 81 | /// Inserts the `len` least-significant bits from the lower 64 bits of the 128-bit integer vector operand `y` into |
| 82 | /// the lower 64 bits of the 128-bit integer vector operand `x` at the index `idx` and of the length `len`. |
| 83 | /// |
| 84 | /// `idx` specifies the index of the LSB. `len` specifies the number of bits to insert. If length and index |
| 85 | /// are both zero, bits `[63:0]` of parameter `x` are replaced with bits `[63:0]` of parameter `y`. It is a |
| 86 | /// compile-time error for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero. |
| 87 | #[inline ] |
| 88 | #[target_feature (enable = "sse4a" )] |
| 89 | #[cfg_attr (test, assert_instr(insertq, LEN = 5, IDX = 5))] |
| 90 | #[rustc_legacy_const_generics (2, 3)] |
| 91 | #[stable (feature = "simd_x86_updates" , since = "1.82.0" )] |
| 92 | pub fn _mm_inserti_si64<const LEN: i32, const IDX: i32>(x: __m128i, y: __m128i) -> __m128i { |
| 93 | // LLVM mentions that it is UB if these are not satisfied |
| 94 | static_assert_uimm_bits!(LEN, 6); |
| 95 | static_assert_uimm_bits!(IDX, 6); |
| 96 | static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64)); |
| 97 | unsafe { transmute(src:insertqi(x.as_i64x2(), y.as_i64x2(), LEN as u8, IDX as u8)) } |
| 98 | } |
| 99 | |
| 100 | /// Non-temporal store of `a.0` into `p`. |
| 101 | /// |
| 102 | /// Writes 64-bit data to a memory location without polluting the caches. |
| 103 | /// |
| 104 | /// # Safety of non-temporal stores |
| 105 | /// |
| 106 | /// After using this intrinsic, but before any other access to the memory that this intrinsic |
| 107 | /// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In |
| 108 | /// particular, functions that call this intrinsic should generally call `_mm_sfence` before they |
| 109 | /// return. |
| 110 | /// |
| 111 | /// See [`_mm_sfence`] for details. |
| 112 | #[inline ] |
| 113 | #[target_feature (enable = "sse4a" )] |
| 114 | #[cfg_attr (test, assert_instr(movntsd))] |
| 115 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
| 116 | pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { |
| 117 | movntsd(x:p, y:a); |
| 118 | } |
| 119 | |
| 120 | /// Non-temporal store of `a.0` into `p`. |
| 121 | /// |
| 122 | /// Writes 32-bit data to a memory location without polluting the caches. |
| 123 | /// |
| 124 | /// # Safety of non-temporal stores |
| 125 | /// |
| 126 | /// After using this intrinsic, but before any other access to the memory that this intrinsic |
| 127 | /// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In |
| 128 | /// particular, functions that call this intrinsic should generally call `_mm_sfence` before they |
| 129 | /// return. |
| 130 | /// |
| 131 | /// See [`_mm_sfence`] for details. |
| 132 | #[inline ] |
| 133 | #[target_feature (enable = "sse4a" )] |
| 134 | #[cfg_attr (test, assert_instr(movntss))] |
| 135 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
| 136 | pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { |
| 137 | movntss(x:p, y:a); |
| 138 | } |
| 139 | |
#[cfg(test)]
mod tests {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    /// Register-controlled extract: length 4 at index 8 pulls `0b0110` out of
    /// bits `[11:8]` of the source.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_extract_si64() {
        let b = 0b0110_0000_0000_i64;
        //        ^^^^ bit range extracted
        let x = _mm_setr_epi64x(b, 0);
        let v = 0b001000___00___000100_i64;
        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
        let y = _mm_setr_epi64x(v, 0);
        let e = _mm_setr_epi64x(0b0110_i64, 0);
        let r = _mm_extract_si64(x, y);
        assert_eq_m128i(r, e);
    }

    /// Immediate-controlled extract: 8 bits starting at bit 8 of
    /// `0x0123456789abcdef` are `0xcd`.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_extracti_si64() {
        let a = _mm_setr_epi64x(0x0123456789abcdef, 0);
        let r = _mm_extracti_si64::<8, 8>(a);
        let e = _mm_setr_epi64x(0xcd, 0);
        assert_eq_m128i(r, e);
    }

    /// Register-controlled insert: the control fields live in the upper 64
    /// bits of `y`, the bits to insert in its lower 64 bits.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_insert_si64() {
        let i = 0b0110_i64;
        //        ^^^^ bit range inserted
        let z = 0b1010_1010_1010i64;
        //        ^^^^ bit range replaced
        let e = 0b0110_1010_1010i64;
        //        ^^^^ replaced 1010 with 0110
        let x = _mm_setr_epi64x(z, 0);
        let expected = _mm_setr_epi64x(e, 0);
        let v = 0b001000___00___000100_i64;
        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
        let y = _mm_setr_epi64x(i, v);
        let r = _mm_insert_si64(x, y);
        assert_eq_m128i(r, expected);
    }

    /// Immediate-controlled insert: the low byte of `b` (`0x77`) replaces
    /// bits `[15:8]` of `a`.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_inserti_si64() {
        let a = _mm_setr_epi64x(0x0123456789abcdef, 0);
        let b = _mm_setr_epi64x(0x0011223344556677, 0);
        let r = _mm_inserti_si64::<8, 8>(a, b);
        let e = _mm_setr_epi64x(0x0123456789ab77ef, 0);
        assert_eq_m128i(r, e);
    }

    /// 16-byte-aligned scratch buffer for the `f64` streaming-store test.
    #[repr(align(16))]
    struct MemoryF64 {
        data: [f64; 2],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_sd() {
        let mut mem = MemoryF64 {
            data: [1.0_f64, 2.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = _mm_setr_pd(3.0, 4.0);

            _mm_stream_sd(d, x);
            // The documented contract of `_mm_stream_sd` requires an
            // `_mm_sfence` before the stored memory is accessed again.
            _mm_sfence();
        }
        // Only the first lane is written; the second element must be untouched.
        assert_eq!(mem.data[0], 3.0);
        assert_eq!(mem.data[1], 2.0);
    }

    /// 16-byte-aligned scratch buffer for the `f32` streaming-store test.
    #[repr(align(16))]
    struct MemoryF32 {
        data: [f32; 4],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_ss() {
        let mut mem = MemoryF32 {
            data: [1.0_f32, 2.0, 3.0, 4.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);

            _mm_stream_ss(d, x);
            // The documented contract of `_mm_stream_ss` requires an
            // `_mm_sfence` before the stored memory is accessed again.
            _mm_sfence();
        }
        // Only the first lane is written; the remaining elements must be untouched.
        assert_eq!(mem.data[0], 5.0);
        assert_eq!(mem.data[1], 2.0);
        assert_eq!(mem.data[2], 3.0);
        assert_eq!(mem.data[3], 4.0);
    }
}
| 244 | |