1//! Streaming SIMD Extensions 3 (SSE3)
2
3use crate::core_arch::{simd::*, x86::*};
4use crate::intrinsics::simd::*;
5
6#[cfg(test)]
7use stdarch_test::assert_instr;
8
9/// Alternatively add and subtract packed single-precision (32-bit)
10/// floating-point elements in `a` to/from packed elements in `b`.
11///
12/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_ps)
13#[inline]
14#[target_feature(enable = "sse3")]
15#[cfg_attr(test, assert_instr(addsubps))]
16#[stable(feature = "simd_x86", since = "1.27.0")]
17pub fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
18 unsafe {
19 let a: f32x4 = a.as_f32x4();
20 let b: f32x4 = b.as_f32x4();
21 let add: f32x4 = simd_add(x:a, y:b);
22 let sub: f32x4 = simd_sub(lhs:a, rhs:b);
23 simd_shuffle!(add, sub, [4, 1, 6, 3])
24 }
25}
26
27/// Alternatively add and subtract packed double-precision (64-bit)
28/// floating-point elements in `a` to/from packed elements in `b`.
29///
30/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd)
31#[inline]
32#[target_feature(enable = "sse3")]
33#[cfg_attr(test, assert_instr(addsubpd))]
34#[stable(feature = "simd_x86", since = "1.27.0")]
35pub fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d {
36 unsafe {
37 let a: f64x2 = a.as_f64x2();
38 let b: f64x2 = b.as_f64x2();
39 let add: f64x2 = simd_add(x:a, y:b);
40 let sub: f64x2 = simd_sub(lhs:a, rhs:b);
41 simd_shuffle!(add, sub, [2, 1])
42 }
43}
44
45/// Horizontally adds adjacent pairs of double-precision (64-bit)
46/// floating-point elements in `a` and `b`, and pack the results.
47///
48/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd)
49#[inline]
50#[target_feature(enable = "sse3")]
51#[cfg_attr(test, assert_instr(haddpd))]
52#[stable(feature = "simd_x86", since = "1.27.0")]
53pub fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
54 unsafe { haddpd(a, b) }
55}
56
57/// Horizontally adds adjacent pairs of single-precision (32-bit)
58/// floating-point elements in `a` and `b`, and pack the results.
59///
60/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps)
61#[inline]
62#[target_feature(enable = "sse3")]
63#[cfg_attr(test, assert_instr(haddps))]
64#[stable(feature = "simd_x86", since = "1.27.0")]
65pub fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
66 unsafe { haddps(a, b) }
67}
68
69/// Horizontally subtract adjacent pairs of double-precision (64-bit)
70/// floating-point elements in `a` and `b`, and pack the results.
71///
72/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd)
73#[inline]
74#[target_feature(enable = "sse3")]
75#[cfg_attr(test, assert_instr(hsubpd))]
76#[stable(feature = "simd_x86", since = "1.27.0")]
77pub fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
78 unsafe { hsubpd(a, b) }
79}
80
81/// Horizontally adds adjacent pairs of single-precision (32-bit)
82/// floating-point elements in `a` and `b`, and pack the results.
83///
84/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps)
85#[inline]
86#[target_feature(enable = "sse3")]
87#[cfg_attr(test, assert_instr(hsubps))]
88#[stable(feature = "simd_x86", since = "1.27.0")]
89pub fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 {
90 unsafe { hsubps(a, b) }
91}
92
93/// Loads 128-bits of integer data from unaligned memory.
94/// This intrinsic may perform better than `_mm_loadu_si128`
95/// when the data crosses a cache line boundary.
96///
97/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128)
98#[inline]
99#[target_feature(enable = "sse3")]
100#[cfg_attr(test, assert_instr(lddqu))]
101#[stable(feature = "simd_x86", since = "1.27.0")]
102pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i {
103 transmute(src:lddqu(mem_addr as *const _))
104}
105
106/// Duplicate the low double-precision (64-bit) floating-point element
107/// from `a`.
108///
109/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd)
110#[inline]
111#[target_feature(enable = "sse3")]
112#[cfg_attr(test, assert_instr(movddup))]
113#[stable(feature = "simd_x86", since = "1.27.0")]
114pub fn _mm_movedup_pd(a: __m128d) -> __m128d {
115 unsafe { simd_shuffle!(a, a, [0, 0]) }
116}
117
118/// Loads a double-precision (64-bit) floating-point element from memory
119/// into both elements of return vector.
120///
121/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd)
122#[inline]
123#[target_feature(enable = "sse3")]
124#[cfg_attr(test, assert_instr(movddup))]
125#[stable(feature = "simd_x86", since = "1.27.0")]
126pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d {
127 _mm_load1_pd(mem_addr)
128}
129
130/// Duplicate odd-indexed single-precision (32-bit) floating-point elements
131/// from `a`.
132///
133/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps)
134#[inline]
135#[target_feature(enable = "sse3")]
136#[cfg_attr(test, assert_instr(movshdup))]
137#[stable(feature = "simd_x86", since = "1.27.0")]
138pub fn _mm_movehdup_ps(a: __m128) -> __m128 {
139 unsafe { simd_shuffle!(a, a, [1, 1, 3, 3]) }
140}
141
142/// Duplicate even-indexed single-precision (32-bit) floating-point elements
143/// from `a`.
144///
145/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps)
146#[inline]
147#[target_feature(enable = "sse3")]
148#[cfg_attr(test, assert_instr(movsldup))]
149#[stable(feature = "simd_x86", since = "1.27.0")]
150pub fn _mm_moveldup_ps(a: __m128) -> __m128 {
151 unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
152}
153
154#[allow(improper_ctypes)]
155unsafe extern "C" {
156 #[link_name = "llvm.x86.sse3.hadd.pd"]
157 unsafefn haddpd(a: __m128d, b: __m128d) -> __m128d;
158 #[link_name = "llvm.x86.sse3.hadd.ps"]
159 unsafefn haddps(a: __m128, b: __m128) -> __m128;
160 #[link_name = "llvm.x86.sse3.hsub.pd"]
161 unsafefn hsubpd(a: __m128d, b: __m128d) -> __m128d;
162 #[link_name = "llvm.x86.sse3.hsub.ps"]
163 unsafefn hsubps(a: __m128, b: __m128) -> __m128;
164 #[link_name = "llvm.x86.sse3.ldu.dq"]
165 unsafefn lddqu(mem_addr: *const i8) -> i8x16;
166}
167
#[cfg(test)]
mod tests {
    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_addsub_ps() {
        let lhs = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let rhs = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        // Even lanes are `lhs - rhs`, odd lanes are `lhs + rhs`.
        let expected = _mm_setr_ps(99.0, 25.0, 0.0, -15.0);
        assert_eq_m128(_mm_addsub_ps(lhs, rhs), expected);
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_addsub_pd() {
        let lhs = _mm_setr_pd(-1.0, 5.0);
        let rhs = _mm_setr_pd(-100.0, 20.0);
        // Low lane is `lhs - rhs`, high lane is `lhs + rhs`.
        let expected = _mm_setr_pd(99.0, 25.0);
        assert_eq_m128d(_mm_addsub_pd(lhs, rhs), expected);
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hadd_pd() {
        let lhs = _mm_setr_pd(-1.0, 5.0);
        let rhs = _mm_setr_pd(-100.0, 20.0);
        // [lhs[0] + lhs[1], rhs[0] + rhs[1]]
        let expected = _mm_setr_pd(4.0, -80.0);
        assert_eq_m128d(_mm_hadd_pd(lhs, rhs), expected);
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hadd_ps() {
        let lhs = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let rhs = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        // Pairwise sums of `lhs` fill the low half, those of `rhs` the high half.
        let expected = _mm_setr_ps(4.0, -10.0, -80.0, -5.0);
        assert_eq_m128(_mm_hadd_ps(lhs, rhs), expected);
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hsub_pd() {
        let lhs = _mm_setr_pd(-1.0, 5.0);
        let rhs = _mm_setr_pd(-100.0, 20.0);
        // [lhs[0] - lhs[1], rhs[0] - rhs[1]]
        let expected = _mm_setr_pd(-6.0, -120.0);
        assert_eq_m128d(_mm_hsub_pd(lhs, rhs), expected);
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hsub_ps() {
        let lhs = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let rhs = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        // Pairwise differences of `lhs` fill the low half, those of `rhs` the high half.
        let expected = _mm_setr_ps(-6.0, 10.0, -120.0, 5.0);
        assert_eq_m128(_mm_hsub_ps(lhs, rhs), expected);
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_lddqu_si128() {
        #[rustfmt::skip]
        let bytes = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        // Loading the vector back from its own address must round-trip.
        let loaded = _mm_lddqu_si128(&bytes);
        assert_eq_m128i(bytes, loaded);
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_movedup_pd() {
        let input = _mm_setr_pd(-1.0, 5.0);
        let expected = _mm_setr_pd(-1.0, -1.0);
        assert_eq_m128d(_mm_movedup_pd(input), expected);
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_movehdup_ps() {
        let input = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let expected = _mm_setr_ps(5.0, 5.0, -10.0, -10.0);
        assert_eq_m128(_mm_movehdup_ps(input), expected);
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_moveldup_ps() {
        let input = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let expected = _mm_setr_ps(-1.0, -1.0, 0.0, 0.0);
        assert_eq_m128(_mm_moveldup_ps(input), expected);
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_loaddup_pd() {
        let value = -5.0;
        let loaded = _mm_loaddup_pd(&value);
        assert_eq_m128d(loaded, _mm_setr_pd(value, value));
    }
}
263