1//! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`)
2
3use crate::core_arch::{simd::*, x86::*};
4
5#[cfg(test)]
6use stdarch_test::assert_instr;
7
8#[allow(improper_ctypes)]
9unsafe extern "C" {
10 #[link_name = "llvm.x86.sse4a.extrq"]
11 unsafefn extrq(x: i64x2, y: i8x16) -> i64x2;
12 #[link_name = "llvm.x86.sse4a.extrqi"]
13 unsafefn extrqi(x: i64x2, len: u8, idx: u8) -> i64x2;
14 #[link_name = "llvm.x86.sse4a.insertq"]
15 unsafefn insertq(x: i64x2, y: i64x2) -> i64x2;
16 #[link_name = "llvm.x86.sse4a.insertqi"]
17 unsafefn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2;
18 #[link_name = "llvm.x86.sse4a.movnt.sd"]
19 unsafefn movntsd(x: *mut f64, y: __m128d);
20 #[link_name = "llvm.x86.sse4a.movnt.ss"]
21 unsafefn movntss(x: *mut f32, y: __m128);
22}
23
24/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
25///
26/// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The
27/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All
28/// other bits are ignored.
29///
30/// If the length is zero, it is interpreted as `64`. If the length and index
31/// are zero, the lower 64 bits of `x` are extracted.
32///
33/// If `length == 0 && index > 0` or `length + index > 64` the result is
34/// undefined.
35#[inline]
36#[target_feature(enable = "sse4a")]
37#[cfg_attr(test, assert_instr(extrq))]
38#[stable(feature = "simd_x86", since = "1.27.0")]
39pub fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i {
40 unsafe { transmute(src:extrq(x.as_i64x2(), y.as_i8x16())) }
41}
42
43/// Extracts the specified bits from the lower 64 bits of the 128-bit integer vector operand at the
44/// index `idx` and of the length `len`.
45///
46/// `idx` specifies the index of the LSB. `len` specifies the number of bits to extract. If length
47/// and index are both zero, bits `[63:0]` of parameter `x` are extracted. It is a compile-time error
48/// for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero.
49///
50/// Returns a 128-bit integer vector whose lower 64 bits contain the extracted bits.
51#[inline]
52#[target_feature(enable = "sse4a")]
53#[cfg_attr(test, assert_instr(extrq, LEN = 5, IDX = 5))]
54#[rustc_legacy_const_generics(1, 2)]
55#[stable(feature = "simd_x86_updates", since = "1.82.0")]
56pub fn _mm_extracti_si64<const LEN: i32, const IDX: i32>(x: __m128i) -> __m128i {
57 // LLVM mentions that it is UB if these are not satisfied
58 static_assert_uimm_bits!(LEN, 6);
59 static_assert_uimm_bits!(IDX, 6);
60 static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64));
61 unsafe { transmute(src:extrqi(x.as_i64x2(), LEN as u8, IDX as u8)) }
62}
63
64/// Inserts the `[length:0]` bits of `y` into `x` at `index`.
65///
66/// The bits of `y`:
67///
68/// - `[69:64]` specify the `length`,
69/// - `[77:72]` specify the index.
70///
71/// If the `length` is zero it is interpreted as `64`. If `index + length > 64`
72/// or `index > 0 && length == 0` the result is undefined.
73#[inline]
74#[target_feature(enable = "sse4a")]
75#[cfg_attr(test, assert_instr(insertq))]
76#[stable(feature = "simd_x86", since = "1.27.0")]
77pub fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i {
78 unsafe { transmute(src:insertq(x.as_i64x2(), y.as_i64x2())) }
79}
80
81/// Inserts the `len` least-significant bits from the lower 64 bits of the 128-bit integer vector operand `y` into
82/// the lower 64 bits of the 128-bit integer vector operand `x` at the index `idx` and of the length `len`.
83///
84/// `idx` specifies the index of the LSB. `len` specifies the number of bits to insert. If length and index
85/// are both zero, bits `[63:0]` of parameter `x` are replaced with bits `[63:0]` of parameter `y`. It is a
86/// compile-time error for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero.
87#[inline]
88#[target_feature(enable = "sse4a")]
89#[cfg_attr(test, assert_instr(insertq, LEN = 5, IDX = 5))]
90#[rustc_legacy_const_generics(2, 3)]
91#[stable(feature = "simd_x86_updates", since = "1.82.0")]
92pub fn _mm_inserti_si64<const LEN: i32, const IDX: i32>(x: __m128i, y: __m128i) -> __m128i {
93 // LLVM mentions that it is UB if these are not satisfied
94 static_assert_uimm_bits!(LEN, 6);
95 static_assert_uimm_bits!(IDX, 6);
96 static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64));
97 unsafe { transmute(src:insertqi(x.as_i64x2(), y.as_i64x2(), LEN as u8, IDX as u8)) }
98}
99
100/// Non-temporal store of `a.0` into `p`.
101///
102/// Writes 64-bit data to a memory location without polluting the caches.
103///
104/// # Safety of non-temporal stores
105///
106/// After using this intrinsic, but before any other access to the memory that this intrinsic
107/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
108/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
109/// return.
110///
111/// See [`_mm_sfence`] for details.
112#[inline]
113#[target_feature(enable = "sse4a")]
114#[cfg_attr(test, assert_instr(movntsd))]
115#[stable(feature = "simd_x86", since = "1.27.0")]
116pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
117 movntsd(x:p, y:a);
118}
119
120/// Non-temporal store of `a.0` into `p`.
121///
122/// Writes 32-bit data to a memory location without polluting the caches.
123///
124/// # Safety of non-temporal stores
125///
126/// After using this intrinsic, but before any other access to the memory that this intrinsic
127/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
128/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
129/// return.
130///
131/// See [`_mm_sfence`] for details.
132#[inline]
133#[target_feature(enable = "sse4a")]
134#[cfg_attr(test, assert_instr(movntss))]
135#[stable(feature = "simd_x86", since = "1.27.0")]
136pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) {
137 movntss(x:p, y:a);
138}
139
#[cfg(test)]
mod tests {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    /// Extracts a 4-bit field starting at bit 8 using a register-encoded
    /// descriptor.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_extract_si64() {
        // Bits [11:8] of the source hold `0110`; that is the range extracted.
        let b = 0b0110_0000_0000_i64;
        let x = _mm_setr_epi64x(b, 0);
        // Descriptor: bits [13:8] = 0b001000 (index 8),
        //             bits [5:0]  = 0b000100 (length 4).
        let v = 0b001000___00___000100_i64;
        let y = _mm_setr_epi64x(v, 0);
        let e = _mm_setr_epi64x(0b0110_i64, 0);
        let r = _mm_extract_si64(x, y);
        assert_eq_m128i(r, e);
    }

    /// Extracts 8 bits starting at bit 8 using immediate operands.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_extracti_si64() {
        let a = _mm_setr_epi64x(0x0123456789abcdef, 0);
        let r = _mm_extracti_si64::<8, 8>(a);
        // Bits [15:8] of `a` are 0xcd.
        let e = _mm_setr_epi64x(0xcd, 0);
        assert_eq_m128i(r, e);
    }

    /// Inserts a 4-bit field at bit 8 using a register-encoded descriptor.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_insert_si64() {
        // The low four bits (`0110`) are the bit range inserted.
        let i = 0b0110_i64;
        // Bits [11:8] (`1010`) are the bit range replaced.
        let z = 0b1010_1010_1010i64;
        // Same value with bits [11:8] changed from `1010` to `0110`.
        let e = 0b0110_1010_1010i64;
        let x = _mm_setr_epi64x(z, 0);
        let expected = _mm_setr_epi64x(e, 0);
        // Descriptor placed in the upper 64-bit lane of `y`:
        // bits [13:8] = 0b001000 (index 8), bits [5:0] = 0b000100 (length 4).
        let v = 0b001000___00___000100_i64;
        let y = _mm_setr_epi64x(i, v);
        let r = _mm_insert_si64(x, y);
        assert_eq_m128i(r, expected);
    }

    /// Inserts the low 8 bits of `b` at bit 8 of `a` using immediate operands.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_inserti_si64() {
        let a = _mm_setr_epi64x(0x0123456789abcdef, 0);
        let b = _mm_setr_epi64x(0x0011223344556677, 0);
        let r = _mm_inserti_si64::<8, 8>(a, b);
        // Bits [15:8] of `a` (0xcd) are replaced by the low byte of `b` (0x77).
        let e = _mm_setr_epi64x(0x0123456789ab77ef, 0);
        assert_eq_m128i(r, e);
    }

    #[repr(align(16))]
    struct MemoryF64 {
        data: [f64; 2],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_sd() {
        let mut mem = MemoryF64 {
            data: [1.0_f64, 2.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = _mm_setr_pd(3.0, 4.0);

            _mm_stream_sd(d, x);
            // The documented contract of the non-temporal store requires a
            // fence before the written memory is accessed again.
            _mm_sfence();
        }
        // Only the first element is overwritten (with lane 0 of `x`).
        assert_eq!(mem.data[0], 3.0);
        assert_eq!(mem.data[1], 2.0);
    }

    #[repr(align(16))]
    struct MemoryF32 {
        data: [f32; 4],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_ss() {
        let mut mem = MemoryF32 {
            data: [1.0_f32, 2.0, 3.0, 4.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);

            _mm_stream_ss(d, x);
            // The documented contract of the non-temporal store requires a
            // fence before the written memory is accessed again.
            _mm_sfence();
        }
        // Only the first element is overwritten (with lane 0 of `x`).
        assert_eq!(mem.data[0], 5.0);
        assert_eq!(mem.data[1], 2.0);
        assert_eq!(mem.data[2], 3.0);
        assert_eq!(mem.data[3], 4.0);
    }
}
244