1 | //! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`) |
2 | |
3 | use crate::core_arch::{simd::*, x86::*}; |
4 | |
5 | #[cfg (test)] |
6 | use stdarch_test::assert_instr; |
7 | |
8 | #[allow (improper_ctypes)] |
9 | unsafe extern "C" { |
10 | #[link_name = "llvm.x86.sse4a.extrq" ] |
11 | unsafefn extrq(x: i64x2, y: i8x16) -> i64x2; |
12 | #[link_name = "llvm.x86.sse4a.extrqi" ] |
13 | unsafefn extrqi(x: i64x2, len: u8, idx: u8) -> i64x2; |
14 | #[link_name = "llvm.x86.sse4a.insertq" ] |
15 | unsafefn insertq(x: i64x2, y: i64x2) -> i64x2; |
16 | #[link_name = "llvm.x86.sse4a.insertqi" ] |
17 | unsafefn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2; |
18 | #[link_name = "llvm.x86.sse4a.movnt.sd" ] |
19 | unsafefn movntsd(x: *mut f64, y: __m128d); |
20 | #[link_name = "llvm.x86.sse4a.movnt.ss" ] |
21 | unsafefn movntss(x: *mut f32, y: __m128); |
22 | } |
23 | |
24 | /// Extracts the bit range specified by `y` from the lower 64 bits of `x`. |
25 | /// |
26 | /// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The |
27 | /// `[5:0]` bits of `y` specify the length of the bit-range to extract. All |
28 | /// other bits are ignored. |
29 | /// |
30 | /// If the length is zero, it is interpreted as `64`. If the length and index |
31 | /// are zero, the lower 64 bits of `x` are extracted. |
32 | /// |
33 | /// If `length == 0 && index > 0` or `length + index > 64` the result is |
34 | /// undefined. |
35 | #[inline ] |
36 | #[target_feature (enable = "sse4a" )] |
37 | #[cfg_attr (test, assert_instr(extrq))] |
38 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
39 | pub fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { |
40 | unsafe { transmute(src:extrq(x.as_i64x2(), y.as_i8x16())) } |
41 | } |
42 | |
43 | /// Extracts the specified bits from the lower 64 bits of the 128-bit integer vector operand at the |
44 | /// index `idx` and of the length `len`. |
45 | /// |
46 | /// `idx` specifies the index of the LSB. `len` specifies the number of bits to extract. If length |
47 | /// and index are both zero, bits `[63:0]` of parameter `x` are extracted. It is a compile-time error |
48 | /// for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero. |
49 | /// |
50 | /// Returns a 128-bit integer vector whose lower 64 bits contain the extracted bits. |
51 | #[inline ] |
52 | #[target_feature (enable = "sse4a" )] |
53 | #[cfg_attr (test, assert_instr(extrq, LEN = 5, IDX = 5))] |
54 | #[rustc_legacy_const_generics (1, 2)] |
55 | #[stable (feature = "simd_x86_updates" , since = "1.82.0" )] |
56 | pub fn _mm_extracti_si64<const LEN: i32, const IDX: i32>(x: __m128i) -> __m128i { |
57 | // LLVM mentions that it is UB if these are not satisfied |
58 | static_assert_uimm_bits!(LEN, 6); |
59 | static_assert_uimm_bits!(IDX, 6); |
60 | static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64)); |
61 | unsafe { transmute(src:extrqi(x.as_i64x2(), LEN as u8, IDX as u8)) } |
62 | } |
63 | |
64 | /// Inserts the `[length:0]` bits of `y` into `x` at `index`. |
65 | /// |
66 | /// The bits of `y`: |
67 | /// |
68 | /// - `[69:64]` specify the `length`, |
69 | /// - `[77:72]` specify the index. |
70 | /// |
71 | /// If the `length` is zero it is interpreted as `64`. If `index + length > 64` |
72 | /// or `index > 0 && length == 0` the result is undefined. |
73 | #[inline ] |
74 | #[target_feature (enable = "sse4a" )] |
75 | #[cfg_attr (test, assert_instr(insertq))] |
76 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
77 | pub fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { |
78 | unsafe { transmute(src:insertq(x.as_i64x2(), y.as_i64x2())) } |
79 | } |
80 | |
81 | /// Inserts the `len` least-significant bits from the lower 64 bits of the 128-bit integer vector operand `y` into |
82 | /// the lower 64 bits of the 128-bit integer vector operand `x` at the index `idx` and of the length `len`. |
83 | /// |
84 | /// `idx` specifies the index of the LSB. `len` specifies the number of bits to insert. If length and index |
85 | /// are both zero, bits `[63:0]` of parameter `x` are replaced with bits `[63:0]` of parameter `y`. It is a |
86 | /// compile-time error for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero. |
87 | #[inline ] |
88 | #[target_feature (enable = "sse4a" )] |
89 | #[cfg_attr (test, assert_instr(insertq, LEN = 5, IDX = 5))] |
90 | #[rustc_legacy_const_generics (2, 3)] |
91 | #[stable (feature = "simd_x86_updates" , since = "1.82.0" )] |
92 | pub fn _mm_inserti_si64<const LEN: i32, const IDX: i32>(x: __m128i, y: __m128i) -> __m128i { |
93 | // LLVM mentions that it is UB if these are not satisfied |
94 | static_assert_uimm_bits!(LEN, 6); |
95 | static_assert_uimm_bits!(IDX, 6); |
96 | static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64)); |
97 | unsafe { transmute(src:insertqi(x.as_i64x2(), y.as_i64x2(), LEN as u8, IDX as u8)) } |
98 | } |
99 | |
100 | /// Non-temporal store of `a.0` into `p`. |
101 | /// |
102 | /// Writes 64-bit data to a memory location without polluting the caches. |
103 | /// |
104 | /// # Safety of non-temporal stores |
105 | /// |
106 | /// After using this intrinsic, but before any other access to the memory that this intrinsic |
107 | /// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In |
108 | /// particular, functions that call this intrinsic should generally call `_mm_sfence` before they |
109 | /// return. |
110 | /// |
111 | /// See [`_mm_sfence`] for details. |
112 | #[inline ] |
113 | #[target_feature (enable = "sse4a" )] |
114 | #[cfg_attr (test, assert_instr(movntsd))] |
115 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
116 | pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { |
117 | movntsd(x:p, y:a); |
118 | } |
119 | |
120 | /// Non-temporal store of `a.0` into `p`. |
121 | /// |
122 | /// Writes 32-bit data to a memory location without polluting the caches. |
123 | /// |
124 | /// # Safety of non-temporal stores |
125 | /// |
126 | /// After using this intrinsic, but before any other access to the memory that this intrinsic |
127 | /// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In |
128 | /// particular, functions that call this intrinsic should generally call `_mm_sfence` before they |
129 | /// return. |
130 | /// |
131 | /// See [`_mm_sfence`] for details. |
132 | #[inline ] |
133 | #[target_feature (enable = "sse4a" )] |
134 | #[cfg_attr (test, assert_instr(movntss))] |
135 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
136 | pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { |
137 | movntss(x:p, y:a); |
138 | } |
139 | |
// Unit tests for the SSE4a intrinsics. Each test is gated on the `sse4a`
// feature through `simd_test`.
#[cfg(test)]
mod tests {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_extract_si64() {
        let b = 0b0110_0000_0000_i64;
        //        ^^^^ bit range extracted
        let x = _mm_setr_epi64x(b, 0);
        let v = 0b001000___00___000100_i64;
        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
        let y = _mm_setr_epi64x(v, 0);
        let e = _mm_setr_epi64x(0b0110_i64, 0);
        let r = _mm_extract_si64(x, y);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_extracti_si64() {
        // Bits [15:8] of the input are 0xcd.
        let a = _mm_setr_epi64x(0x0123456789abcdef, 0);
        let r = _mm_extracti_si64::<8, 8>(a);
        let e = _mm_setr_epi64x(0xcd, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_insert_si64() {
        let i = 0b0110_i64;
        //        ^^^^ bit range inserted
        let z = 0b1010_1010_1010i64;
        //        ^^^^ bit range replaced
        let e = 0b0110_1010_1010i64;
        //        ^^^^ replaced 1010 with 0110
        let x = _mm_setr_epi64x(z, 0);
        let expected = _mm_setr_epi64x(e, 0);
        let v = 0b001000___00___000100_i64;
        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
        // The value to insert goes in the low half of `y`; the length/index
        // control word goes in the high half.
        let y = _mm_setr_epi64x(i, v);
        let r = _mm_insert_si64(x, y);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_inserti_si64() {
        // The low byte of `b` (0x77) replaces bits [15:8] of `a` (0xcd).
        let a = _mm_setr_epi64x(0x0123456789abcdef, 0);
        let b = _mm_setr_epi64x(0x0011223344556677, 0);
        let r = _mm_inserti_si64::<8, 8>(a, b);
        let e = _mm_setr_epi64x(0x0123456789ab77ef, 0);
        assert_eq_m128i(r, e);
    }

    #[repr(align(16))]
    struct MemoryF64 {
        data: [f64; 2],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_sd() {
        let mut mem = MemoryF64 {
            data: [1.0_f64, 2.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = _mm_setr_pd(3.0, 4.0);

            _mm_stream_sd(d, x);
            // `_mm_stream_sd` requires a fence before the stored memory is
            // accessed again, so fence before reading `mem` back.
            _mm_sfence();
        }
        // Only the first element is overwritten.
        assert_eq!(mem.data[0], 3.0);
        assert_eq!(mem.data[1], 2.0);
    }

    #[repr(align(16))]
    struct MemoryF32 {
        data: [f32; 4],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_ss() {
        let mut mem = MemoryF32 {
            data: [1.0_f32, 2.0, 3.0, 4.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);

            _mm_stream_ss(d, x);
            // `_mm_stream_ss` requires a fence before the stored memory is
            // accessed again, so fence before reading `mem` back.
            _mm_sfence();
        }
        // Only the first element is overwritten.
        assert_eq!(mem.data[0], 5.0);
        assert_eq!(mem.data[1], 2.0);
        assert_eq!(mem.data[2], 3.0);
        assert_eq!(mem.data[3], 4.0);
    }
}
244 | |