| 1 | //! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`) | 
| 2 |  | 
|---|
| 3 | use crate::core_arch::{simd::*, x86::*}; | 
|---|
| 4 |  | 
|---|
| 5 | #[ cfg(test)] | 
|---|
| 6 | use stdarch_test::assert_instr; | 
|---|
| 7 |  | 
|---|
| 8 | #[ allow(improper_ctypes)] | 
|---|
| 9 | unsafe extern "C"{ | 
|---|
| 10 | #[ link_name= "llvm.x86.sse4a.extrq"] | 
|---|
| 11 | unsafefn extrq(x: i64x2, y: i8x16) -> i64x2; | 
|---|
| 12 | #[ link_name= "llvm.x86.sse4a.extrqi"] | 
|---|
| 13 | unsafefn extrqi(x: i64x2, len: u8, idx: u8) -> i64x2; | 
|---|
| 14 | #[ link_name= "llvm.x86.sse4a.insertq"] | 
|---|
| 15 | unsafefn insertq(x: i64x2, y: i64x2) -> i64x2; | 
|---|
| 16 | #[ link_name= "llvm.x86.sse4a.insertqi"] | 
|---|
| 17 | unsafefn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2; | 
|---|
| 18 | #[ link_name= "llvm.x86.sse4a.movnt.sd"] | 
|---|
| 19 | unsafefn movntsd(x: *mut f64, y: __m128d); | 
|---|
| 20 | #[ link_name= "llvm.x86.sse4a.movnt.ss"] | 
|---|
| 21 | unsafefn movntss(x: *mut f32, y: __m128); | 
|---|
| 22 | } | 
|---|
| 23 |  | 
|---|
| 24 | /// Extracts the bit range specified by `y` from the lower 64 bits of `x`. | 
|---|
| 25 | /// | 
|---|
| 26 | /// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The | 
|---|
| 27 | /// `[5:0]` bits of `y` specify the length of the bit-range to extract. All | 
|---|
| 28 | /// other bits are ignored. | 
|---|
| 29 | /// | 
|---|
| 30 | /// If the length is zero, it is interpreted as `64`. If the length and index | 
|---|
| 31 | /// are zero, the lower 64 bits of `x` are extracted. | 
|---|
| 32 | /// | 
|---|
| 33 | /// If `length == 0 && index > 0` or `length + index > 64` the result is | 
|---|
| 34 | /// undefined. | 
|---|
| 35 | #[ inline] | 
|---|
| 36 | #[ target_feature(enable = "sse4a")] | 
|---|
| 37 | #[ cfg_attr(test, assert_instr(extrq))] | 
|---|
| 38 | #[ stable(feature = "simd_x86", since = "1.27.0")] | 
|---|
| 39 | pub fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { | 
|---|
| 40 | unsafe { transmute(src:extrq(x.as_i64x2(), y.as_i8x16())) } | 
|---|
| 41 | } | 
|---|
| 42 |  | 
|---|
| 43 | /// Extracts the specified bits from the lower 64 bits of the 128-bit integer vector operand at the | 
|---|
| 44 | /// index `idx` and of the length `len`. | 
|---|
| 45 | /// | 
|---|
| 46 | /// `idx` specifies the index of the LSB. `len` specifies the number of bits to extract. If length | 
|---|
| 47 | /// and index are both zero, bits `[63:0]` of parameter `x` are extracted. It is a compile-time error | 
|---|
| 48 | /// for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero. | 
|---|
| 49 | /// | 
|---|
| 50 | /// Returns a 128-bit integer vector whose lower 64 bits contain the extracted bits. | 
|---|
| 51 | #[ inline] | 
|---|
| 52 | #[ target_feature(enable = "sse4a")] | 
|---|
| 53 | #[ cfg_attr(test, assert_instr(extrq, LEN = 5, IDX = 5))] | 
|---|
| 54 | #[ rustc_legacy_const_generics(1, 2)] | 
|---|
| 55 | #[ stable(feature = "simd_x86_updates", since = "1.82.0")] | 
|---|
| 56 | pub fn _mm_extracti_si64<const LEN: i32, const IDX: i32>(x: __m128i) -> __m128i { | 
|---|
| 57 | // LLVM mentions that it is UB if these are not satisfied | 
|---|
| 58 | static_assert_uimm_bits!(LEN, 6); | 
|---|
| 59 | static_assert_uimm_bits!(IDX, 6); | 
|---|
| 60 | static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64)); | 
|---|
| 61 | unsafe { transmute(src:extrqi(x.as_i64x2(), LEN as u8, IDX as u8)) } | 
|---|
| 62 | } | 
|---|
| 63 |  | 
|---|
| 64 | /// Inserts the `[length:0]` bits of `y` into `x` at `index`. | 
|---|
| 65 | /// | 
|---|
| 66 | /// The bits of `y`: | 
|---|
| 67 | /// | 
|---|
| 68 | /// - `[69:64]` specify the `length`, | 
|---|
| 69 | /// - `[77:72]` specify the index. | 
|---|
| 70 | /// | 
|---|
| 71 | /// If the `length` is zero it is interpreted as `64`. If `index + length > 64` | 
|---|
| 72 | /// or `index > 0 && length == 0` the result is undefined. | 
|---|
| 73 | #[ inline] | 
|---|
| 74 | #[ target_feature(enable = "sse4a")] | 
|---|
| 75 | #[ cfg_attr(test, assert_instr(insertq))] | 
|---|
| 76 | #[ stable(feature = "simd_x86", since = "1.27.0")] | 
|---|
| 77 | pub fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { | 
|---|
| 78 | unsafe { transmute(src:insertq(x.as_i64x2(), y.as_i64x2())) } | 
|---|
| 79 | } | 
|---|
| 80 |  | 
|---|
| 81 | /// Inserts the `len` least-significant bits from the lower 64 bits of the 128-bit integer vector operand `y` into | 
|---|
| 82 | /// the lower 64 bits of the 128-bit integer vector operand `x` at the index `idx` and of the length `len`. | 
|---|
| 83 | /// | 
|---|
| 84 | /// `idx` specifies the index of the LSB. `len` specifies the number of bits to insert. If length and index | 
|---|
| 85 | /// are both zero, bits `[63:0]` of parameter `x` are replaced with bits `[63:0]` of parameter `y`. It is a | 
|---|
| 86 | /// compile-time error for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero. | 
|---|
| 87 | #[ inline] | 
|---|
| 88 | #[ target_feature(enable = "sse4a")] | 
|---|
| 89 | #[ cfg_attr(test, assert_instr(insertq, LEN = 5, IDX = 5))] | 
|---|
| 90 | #[ rustc_legacy_const_generics(2, 3)] | 
|---|
| 91 | #[ stable(feature = "simd_x86_updates", since = "1.82.0")] | 
|---|
| 92 | pub fn _mm_inserti_si64<const LEN: i32, const IDX: i32>(x: __m128i, y: __m128i) -> __m128i { | 
|---|
| 93 | // LLVM mentions that it is UB if these are not satisfied | 
|---|
| 94 | static_assert_uimm_bits!(LEN, 6); | 
|---|
| 95 | static_assert_uimm_bits!(IDX, 6); | 
|---|
| 96 | static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64)); | 
|---|
| 97 | unsafe { transmute(src:insertqi(x.as_i64x2(), y.as_i64x2(), LEN as u8, IDX as u8)) } | 
|---|
| 98 | } | 
|---|
| 99 |  | 
|---|
| 100 | /// Non-temporal store of `a.0` into `p`. | 
|---|
| 101 | /// | 
|---|
| 102 | /// Writes 64-bit data to a memory location without polluting the caches. | 
|---|
| 103 | /// | 
|---|
| 104 | /// # Safety of non-temporal stores | 
|---|
| 105 | /// | 
|---|
| 106 | /// After using this intrinsic, but before any other access to the memory that this intrinsic | 
|---|
| 107 | /// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In | 
|---|
| 108 | /// particular, functions that call this intrinsic should generally call `_mm_sfence` before they | 
|---|
| 109 | /// return. | 
|---|
| 110 | /// | 
|---|
| 111 | /// See [`_mm_sfence`] for details. | 
|---|
| 112 | #[ inline] | 
|---|
| 113 | #[ target_feature(enable = "sse4a")] | 
|---|
| 114 | #[ cfg_attr(test, assert_instr(movntsd))] | 
|---|
| 115 | #[ stable(feature = "simd_x86", since = "1.27.0")] | 
|---|
| 116 | pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { | 
|---|
| 117 | movntsd(x:p, y:a); | 
|---|
| 118 | } | 
|---|
| 119 |  | 
|---|
| 120 | /// Non-temporal store of `a.0` into `p`. | 
|---|
| 121 | /// | 
|---|
| 122 | /// Writes 32-bit data to a memory location without polluting the caches. | 
|---|
| 123 | /// | 
|---|
| 124 | /// # Safety of non-temporal stores | 
|---|
| 125 | /// | 
|---|
| 126 | /// After using this intrinsic, but before any other access to the memory that this intrinsic | 
|---|
| 127 | /// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In | 
|---|
| 128 | /// particular, functions that call this intrinsic should generally call `_mm_sfence` before they | 
|---|
| 129 | /// return. | 
|---|
| 130 | /// | 
|---|
| 131 | /// See [`_mm_sfence`] for details. | 
|---|
| 132 | #[ inline] | 
|---|
| 133 | #[ target_feature(enable = "sse4a")] | 
|---|
| 134 | #[ cfg_attr(test, assert_instr(movntss))] | 
|---|
| 135 | #[ stable(feature = "simd_x86", since = "1.27.0")] | 
|---|
| 136 | pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { | 
|---|
| 137 | movntss(x:p, y:a); | 
|---|
| 138 | } | 
|---|
| 139 |  | 
|---|
// Unit tests for the SSE4a intrinsics; each test runs only when the `sse4a`
// feature is requested through `simd_test`.
#[cfg(test)]
mod tests {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_extract_si64() {
        let b = 0b0110_0000_0000_i64;
        //        ^^^^ bit range extracted
        let x = _mm_setr_epi64x(b, 0);
        let v = 0b001000___00___000100_i64;
        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
        let y = _mm_setr_epi64x(v, 0);
        let e = _mm_setr_epi64x(0b0110_i64, 0);
        let r = _mm_extract_si64(x, y);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_extracti_si64() {
        // Extracting 8 bits starting at bit 8 yields the second-lowest byte.
        let a = _mm_setr_epi64x(0x0123456789abcdef, 0);
        let r = _mm_extracti_si64::<8, 8>(a);
        let e = _mm_setr_epi64x(0xcd, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_insert_si64() {
        let i = 0b0110_i64;
        //        ^^^^ bit range inserted
        let z = 0b1010_1010_1010i64;
        //        ^^^^ bit range replaced
        let e = 0b0110_1010_1010i64;
        //        ^^^^ replaced 1010 with 0110
        let x = _mm_setr_epi64x(z, 0);
        let expected = _mm_setr_epi64x(e, 0);
        let v = 0b001000___00___000100_i64;
        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
        // The control word lives in the upper 64 bits of `y`.
        let y = _mm_setr_epi64x(i, v);
        let r = _mm_insert_si64(x, y);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_inserti_si64() {
        // The low byte of `b` (0x77) replaces bits [15:8] of `a`.
        let a = _mm_setr_epi64x(0x0123456789abcdef, 0);
        let b = _mm_setr_epi64x(0x0011223344556677, 0);
        let r = _mm_inserti_si64::<8, 8>(a, b);
        let e = _mm_setr_epi64x(0x0123456789ab77ef, 0);
        assert_eq_m128i(r, e);
    }

    #[repr(align(16))]
    struct MemoryF64 {
        data: [f64; 2],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_sd() {
        let mut mem = MemoryF64 {
            data: [1.0_f64, 2.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = _mm_setr_pd(3.0, 4.0);

            _mm_stream_sd(d, x);
            // The intrinsic's contract requires a fence before the written
            // memory is accessed again.
            _mm_sfence();
        }
        // Only the first element is overwritten.
        assert_eq!(mem.data[0], 3.0);
        assert_eq!(mem.data[1], 2.0);
    }

    #[repr(align(16))]
    struct MemoryF32 {
        data: [f32; 4],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_ss() {
        let mut mem = MemoryF32 {
            data: [1.0_f32, 2.0, 3.0, 4.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);

            _mm_stream_ss(d, x);
            // The intrinsic's contract requires a fence before the written
            // memory is accessed again.
            _mm_sfence();
        }
        // Only the first element is overwritten.
        assert_eq!(mem.data[0], 5.0);
        assert_eq!(mem.data[1], 2.0);
        assert_eq!(mem.data[2], 3.0);
        assert_eq!(mem.data[3], 4.0);
    }
}
|---|
| 244 |  | 
|---|