1//! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`)
2
3use crate::core_arch::{simd::*, x86::*};
4
5#[cfg(test)]
6use stdarch_test::assert_instr;
7
8#[allow(improper_ctypes)]
9unsafe extern "C" {
10 #[link_name = "llvm.x86.sse4a.extrq"]
11 unsafefn extrq(x: i64x2, y: i8x16) -> i64x2;
12 #[link_name = "llvm.x86.sse4a.extrqi"]
13 unsafefn extrqi(x: i64x2, len: u8, idx: u8) -> i64x2;
14 #[link_name = "llvm.x86.sse4a.insertq"]
15 unsafefn insertq(x: i64x2, y: i64x2) -> i64x2;
16 #[link_name = "llvm.x86.sse4a.insertqi"]
17 unsafefn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2;
18}
19
20/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
21///
22/// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The
23/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All
24/// other bits are ignored.
25///
26/// If the length is zero, it is interpreted as `64`. If the length and index
27/// are zero, the lower 64 bits of `x` are extracted.
28///
29/// If `length == 0 && index > 0` or `length + index > 64` the result is
30/// undefined.
31#[inline]
32#[target_feature(enable = "sse4a")]
33#[cfg_attr(test, assert_instr(extrq))]
34#[stable(feature = "simd_x86", since = "1.27.0")]
35pub fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i {
36 unsafe { transmute(src:extrq(x.as_i64x2(), y.as_i8x16())) }
37}
38
39/// Extracts the specified bits from the lower 64 bits of the 128-bit integer vector operand at the
40/// index `idx` and of the length `len`.
41///
42/// `idx` specifies the index of the LSB. `len` specifies the number of bits to extract. If length
43/// and index are both zero, bits `[63:0]` of parameter `x` are extracted. It is a compile-time error
44/// for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero.
45///
46/// Returns a 128-bit integer vector whose lower 64 bits contain the extracted bits.
47#[inline]
48#[target_feature(enable = "sse4a")]
49#[cfg_attr(test, assert_instr(extrq, LEN = 5, IDX = 5))]
50#[rustc_legacy_const_generics(1, 2)]
51#[stable(feature = "simd_x86_updates", since = "1.82.0")]
52pub fn _mm_extracti_si64<const LEN: i32, const IDX: i32>(x: __m128i) -> __m128i {
53 // LLVM mentions that it is UB if these are not satisfied
54 static_assert_uimm_bits!(LEN, 6);
55 static_assert_uimm_bits!(IDX, 6);
56 static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64));
57 unsafe { transmute(src:extrqi(x.as_i64x2(), LEN as u8, IDX as u8)) }
58}
59
60/// Inserts the `[length:0]` bits of `y` into `x` at `index`.
61///
62/// The bits of `y`:
63///
64/// - `[69:64]` specify the `length`,
65/// - `[77:72]` specify the index.
66///
67/// If the `length` is zero it is interpreted as `64`. If `index + length > 64`
68/// or `index > 0 && length == 0` the result is undefined.
69#[inline]
70#[target_feature(enable = "sse4a")]
71#[cfg_attr(test, assert_instr(insertq))]
72#[stable(feature = "simd_x86", since = "1.27.0")]
73pub fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i {
74 unsafe { transmute(src:insertq(x.as_i64x2(), y.as_i64x2())) }
75}
76
77/// Inserts the `len` least-significant bits from the lower 64 bits of the 128-bit integer vector operand `y` into
78/// the lower 64 bits of the 128-bit integer vector operand `x` at the index `idx` and of the length `len`.
79///
80/// `idx` specifies the index of the LSB. `len` specifies the number of bits to insert. If length and index
81/// are both zero, bits `[63:0]` of parameter `x` are replaced with bits `[63:0]` of parameter `y`. It is a
82/// compile-time error for `len + idx` to be greater than 64 or for `len` to be zero and `idx` to be non-zero.
83#[inline]
84#[target_feature(enable = "sse4a")]
85#[cfg_attr(test, assert_instr(insertq, LEN = 5, IDX = 5))]
86#[rustc_legacy_const_generics(2, 3)]
87#[stable(feature = "simd_x86_updates", since = "1.82.0")]
88pub fn _mm_inserti_si64<const LEN: i32, const IDX: i32>(x: __m128i, y: __m128i) -> __m128i {
89 // LLVM mentions that it is UB if these are not satisfied
90 static_assert_uimm_bits!(LEN, 6);
91 static_assert_uimm_bits!(IDX, 6);
92 static_assert!((LEN == 0 && IDX == 0) || (LEN != 0 && LEN + IDX <= 64));
93 unsafe { transmute(src:insertqi(x.as_i64x2(), y.as_i64x2(), LEN as u8, IDX as u8)) }
94}
95
/// Non-temporal store of `a.0` into `p`.
///
/// Writes 64-bit data to a memory location without polluting the caches.
///
/// The store is issued as a `movntsd` instruction through inline assembly,
/// with `p` passed in a general-purpose register and `a` in an XMM register.
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse4a")]
#[cfg_attr(test, assert_instr(movntsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
 // NOTE(review): `p` is presumably required to be valid for an 8-byte write — confirm.
 // See #1541: inline asm is used so that the non-temporal store instruction is
 // certain to be emitted, because LangRef isn't clear enough.
 crate::arch::asm!(
 // `vps!` builds the instruction template from the mnemonic and the `{a}` operand suffix.
 vps!("movntsd", ",{a}"),
 // Destination address, in a general-purpose register.
 p = in(reg) p,
 // Source vector, in an XMM register.
 a = in(xmm_reg) a,
 // The asm does not use the stack and leaves the flags untouched.
 options(nostack, preserves_flags),
 );
}
121
/// Non-temporal store of `a.0` into `p`.
///
/// Writes 32-bit data to a memory location without polluting the caches.
///
/// The store is issued as a `movntss` instruction through inline assembly,
/// with `p` passed in a general-purpose register and `a` in an XMM register.
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse4a")]
#[cfg_attr(test, assert_instr(movntss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) {
 // NOTE(review): `p` is presumably required to be valid for a 4-byte write — confirm.
 // See #1541: inline asm is used so that the non-temporal store instruction is
 // certain to be emitted, because LangRef isn't clear enough.
 crate::arch::asm!(
 // `vps!` builds the instruction template from the mnemonic and the `{a}` operand suffix.
 vps!("movntss", ",{a}"),
 // Destination address, in a general-purpose register.
 p = in(reg) p,
 // Source vector, in an XMM register.
 a = in(xmm_reg) a,
 // The asm does not use the stack and leaves the flags untouched.
 options(nostack, preserves_flags),
 );
}
147
#[cfg(test)]
mod tests {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    // Register-controlled extract: pull a 4-bit field starting at bit 8.
    #[simd_test(enable = "sse4a")]
    fn test_mm_extract_si64() {
        // Bits [11:8] of the source hold the pattern `0110` that should come out.
        let source = _mm_setr_epi64x(0b0110_0000_0000_i64, 0);
        // Control word: index = 8 in bits [13:8], length = 4 in bits [5:0].
        let control = _mm_setr_epi64x(0b001000___00___000100_i64, 0);
        let want = _mm_setr_epi64x(0b0110_i64, 0);
        let got = _mm_extract_si64(source, control);
        assert_eq_m128i(got, want);
    }

    // Immediate-controlled extract: 8 bits starting at bit 8 select the `cd` byte.
    #[simd_test(enable = "sse4a")]
    fn test_mm_extracti_si64() {
        let source = _mm_setr_epi64x(0x0123456789abcdef, 0);
        let got = _mm_extracti_si64::<8, 8>(source);
        let want = _mm_setr_epi64x(0xcd, 0);
        assert_eq_m128i(got, want);
    }

    // Register-controlled insert: overwrite bits [11:8] of the target with `0110`.
    #[simd_test(enable = "sse4a")]
    fn test_mm_insert_si64() {
        // Field to insert (low 4 bits).
        let field = 0b0110_i64;
        // Target value; its top nibble `1010` is the range being replaced.
        let before = 0b1010_1010_1010i64;
        // Same value with the top nibble replaced by `0110`.
        let after = 0b0110_1010_1010i64;
        // Control word: index = 8 in bits [13:8], length = 4 in bits [5:0];
        // it travels in the upper 64 bits of the second operand.
        let control = 0b001000___00___000100_i64;

        let target = _mm_setr_epi64x(before, 0);
        let want = _mm_setr_epi64x(after, 0);
        let operand = _mm_setr_epi64x(field, control);
        let got = _mm_insert_si64(target, operand);
        assert_eq_m128i(got, want);
    }

    // Immediate-controlled insert: the low byte `77` of `b` lands in bits [15:8] of `a`.
    #[simd_test(enable = "sse4a")]
    fn test_mm_inserti_si64() {
        let target = _mm_setr_epi64x(0x0123456789abcdef, 0);
        let donor = _mm_setr_epi64x(0x0011223344556677, 0);
        let got = _mm_inserti_si64::<8, 8>(target, donor);
        let want = _mm_setr_epi64x(0x0123456789ab77ef, 0);
        assert_eq_m128i(got, want);
    }

    // 16-byte aligned backing storage for the `f64` streaming-store test.
    #[repr(align(16))]
    struct MemoryF64 {
        data: [f64; 2],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    fn test_mm_stream_sd() {
        let mut mem = MemoryF64 {
            data: [1.0_f64, 2.0],
        };
        let dst = mem.data.as_mut_ptr();
        let src = _mm_setr_pd(3.0, 4.0);

        unsafe {
            _mm_stream_sd(dst, src);
        }
        // Fence before reading the streamed memory back.
        _mm_sfence();

        // Only the first element is overwritten, with the low lane of `src`.
        assert_eq!(mem.data[0], 3.0);
        assert_eq!(mem.data[1], 2.0);
    }

    // 16-byte aligned backing storage for the `f32` streaming-store test.
    #[repr(align(16))]
    struct MemoryF32 {
        data: [f32; 4],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    fn test_mm_stream_ss() {
        let mut mem = MemoryF32 {
            data: [1.0_f32, 2.0, 3.0, 4.0],
        };
        let dst = mem.data.as_mut_ptr();
        let src = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);

        unsafe {
            _mm_stream_ss(dst, src);
        }
        // Fence before reading the streamed memory back.
        _mm_sfence();

        // Only the first element is overwritten, with the low lane of `src`.
        assert_eq!(mem.data[0], 5.0);
        assert_eq!(mem.data[1], 2.0);
        assert_eq!(mem.data[2], 3.0);
        assert_eq!(mem.data[3], 4.0);
    }
}
258