1//! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`)
2
3use crate::core_arch::{simd::*, x86::*};
4
5#[cfg(test)]
6use stdarch_test::assert_instr;
7
8#[allow(improper_ctypes)]
9extern "C" {
10 #[link_name = "llvm.x86.sse4a.extrq"]
11 fn extrq(x: i64x2, y: i8x16) -> i64x2;
12 #[link_name = "llvm.x86.sse4a.insertq"]
13 fn insertq(x: i64x2, y: i64x2) -> i64x2;
14 #[link_name = "llvm.x86.sse4a.movnt.sd"]
15 fn movntsd(x: *mut f64, y: __m128d);
16 #[link_name = "llvm.x86.sse4a.movnt.ss"]
17 fn movntss(x: *mut f32, y: __m128);
18}
19
20// FIXME(blocked on #248): _mm_extracti_si64(x, len, idx) // EXTRQ
21// FIXME(blocked on #248): _mm_inserti_si64(x, y, len, idx) // INSERTQ
22
23/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
24///
25/// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The
26/// `[5:0]` bits of `y` specify the length of the bit-range to extract. All
27/// other bits are ignored.
28///
29/// If the length is zero, it is interpreted as `64`. If the length and index
30/// are zero, the lower 64 bits of `x` are extracted.
31///
32/// If `length == 0 && index > 0` or `length + index > 64` the result is
33/// undefined.
34#[inline]
35#[target_feature(enable = "sse4a")]
36#[cfg_attr(test, assert_instr(extrq))]
37#[stable(feature = "simd_x86", since = "1.27.0")]
38pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i {
39 transmute(src:extrq(x:x.as_i64x2(), y:y.as_i8x16()))
40}
41
42/// Inserts the `[length:0]` bits of `y` into `x` at `index`.
43///
44/// The bits of `y`:
45///
46/// - `[69:64]` specify the `length`,
47/// - `[77:72]` specify the index.
48///
49/// If the `length` is zero it is interpreted as `64`. If `index + length > 64`
50/// or `index > 0 && length == 0` the result is undefined.
51#[inline]
52#[target_feature(enable = "sse4a")]
53#[cfg_attr(test, assert_instr(insertq))]
54#[stable(feature = "simd_x86", since = "1.27.0")]
55pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i {
56 transmute(src:insertq(x:x.as_i64x2(), y:y.as_i64x2()))
57}
58
59/// Non-temporal store of `a.0` into `p`.
60///
61/// Writes 64-bit data to a memory location without polluting the caches.
62///
63/// # Safety of non-temporal stores
64///
65/// After using this intrinsic, but before any other access to the memory that this intrinsic
66/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
67/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
68/// return.
69///
70/// See [`_mm_sfence`] for details.
71#[inline]
72#[target_feature(enable = "sse4a")]
73#[cfg_attr(test, assert_instr(movntsd))]
74#[stable(feature = "simd_x86", since = "1.27.0")]
75pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
76 movntsd(x:p, y:a);
77}
78
79/// Non-temporal store of `a.0` into `p`.
80///
81/// Writes 32-bit data to a memory location without polluting the caches.
82///
83/// # Safety of non-temporal stores
84///
85/// After using this intrinsic, but before any other access to the memory that this intrinsic
86/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
87/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
88/// return.
89///
90/// See [`_mm_sfence`] for details.
91#[inline]
92#[target_feature(enable = "sse4a")]
93#[cfg_attr(test, assert_instr(movntss))]
94#[stable(feature = "simd_x86", since = "1.27.0")]
95pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) {
96 movntss(x:p, y:a);
97}
98
#[cfg(test)]
mod tests {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    // Extracts a 4-bit field starting at bit 8 from the low quadword.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_extract_si64() {
        let b = 0b0110_0000_0000_i64;
        //        ^^^^ bit range extracted
        let x = _mm_setr_epi64x(b, 0);
        let v = 0b001000___00___000100_i64;
        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
        let y = _mm_setr_epi64x(v, 0);
        let e = _mm_setr_epi64x(0b0110_i64, 0);
        let r = _mm_extract_si64(x, y);
        assert_eq_m128i(r, e);
    }

    // Inserts a 4-bit field at bit 8 of the low quadword; the control word
    // (index and length) lives in the high quadword of `y`.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_insert_si64() {
        let i = 0b0110_i64;
        //        ^^^^ bit range inserted
        let z = 0b1010_1010_1010i64;
        //        ^^^^ bit range replaced
        let e = 0b0110_1010_1010i64;
        //        ^^^^ replaced 1010 with 0110
        let x = _mm_setr_epi64x(z, 0);
        let expected = _mm_setr_epi64x(e, 0);
        let v = 0b001000___00___000100_i64;
        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
        let y = _mm_setr_epi64x(i, v);
        let r = _mm_insert_si64(x, y);
        assert_eq_m128i(r, expected);
    }

    // 16-byte aligned backing storage for the `_mm_stream_sd` test.
    #[repr(align(16))]
    struct MemoryF64 {
        data: [f64; 2],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_sd() {
        let mut mem = MemoryF64 {
            data: [1.0_f64, 2.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = _mm_setr_pd(3.0, 4.0);

            _mm_stream_sd(d, x);
            // The documented contract of the non-temporal store requires a
            // fence before the written memory is accessed again.
            _mm_sfence();
        }
        // Only the first element may have been overwritten.
        assert_eq!(mem.data[0], 3.0);
        assert_eq!(mem.data[1], 2.0);
    }

    // 16-byte aligned backing storage for the `_mm_stream_ss` test.
    #[repr(align(16))]
    struct MemoryF32 {
        data: [f32; 4],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_ss() {
        let mut mem = MemoryF32 {
            data: [1.0_f32, 2.0, 3.0, 4.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);

            _mm_stream_ss(d, x);
            // The documented contract of the non-temporal store requires a
            // fence before the written memory is accessed again.
            _mm_sfence();
        }
        // Only the first element may have been overwritten.
        assert_eq!(mem.data[0], 5.0);
        assert_eq!(mem.data[1], 2.0);
        assert_eq!(mem.data[2], 3.0);
        assert_eq!(mem.data[3], 4.0);
    }
}
186