//! Streaming SIMD Extensions 3 (SSE3)

use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

#[cfg(test)]
use stdarch_test::assert_instr;

/// Alternately adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_ps)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(addsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
    let a = a.as_f32x4();
    let b = b.as_f32x4();
    let add = simd_add(a, b);
    let sub = simd_sub(a, b);
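    // Shuffle indices 0..=3 select from `add`, 4..=7 from `sub`, so the even
    // result lanes hold `a - b` and the odd result lanes hold `a + b`.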
    simd_shuffle!(add, sub, [4, 1, 6, 3])
}

/// Alternately adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(addsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d {
    let a = a.as_f64x2();
    let b = b.as_f64x2();
    let add = simd_add(a, b);
    let sub = simd_sub(a, b);
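    // Index 2 selects `sub[0]` and index 1 selects `add[1]`: the low lane
    // holds `a - b` and the high lane holds `a + b`.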
    simd_shuffle!(add, sub, [2, 1])
}

/// Horizontally adds adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`, and packs the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(haddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
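    // result = [a[0] + a[1], b[0] + b[1]]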
    haddpd(a, b)
}

/// Horizontally adds adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`, and packs the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(haddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
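    // result = [a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]]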
    haddps(a, b)
}

/// Horizontally subtracts adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`, and packs the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(hsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
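    // result = [a[0] - a[1], b[0] - b[1]]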
    hsubpd(a, b)
}

/// Horizontally subtracts adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`, and packs the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(hsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 {
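    // result = [a[0] - a[1], a[2] - a[3], b[0] - b[1], b[2] - b[3]]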
    hsubps(a, b)
}

/// Loads 128 bits of integer data from unaligned memory.
/// This intrinsic may perform better than `_mm_loadu_si128`
/// when the data crosses a cache line boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(lddqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i {
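    // The LLVM intrinsic operates on raw bytes and returns `i8x16`, so the
    // result is transmuted back into an `__m128i`.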
    transmute(lddqu(mem_addr as *const _))
}

/// Duplicates the low double-precision (64-bit) floating-point element
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(movddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d {
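    // Broadcast lane 0 of `a` into both result lanes.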
    simd_shuffle!(a, a, [0, 0])
}

/// Loads a double-precision (64-bit) floating-point element from memory
/// into both elements of the return vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(movddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d {
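    // Semantically identical to `_mm_load1_pd`; with SSE3 enabled the load
    // plus broadcast is expected to lower to a single `movddup`.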
    _mm_load1_pd(mem_addr)
}

/// Duplicates odd-indexed single-precision (32-bit) floating-point elements
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(movshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 {
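    // result = [a[1], a[1], a[3], a[3]]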
    simd_shuffle!(a, a, [1, 1, 3, 3])
}

/// Duplicates even-indexed single-precision (32-bit) floating-point elements
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(movsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 {
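    // result = [a[0], a[0], a[2], a[2]]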
    simd_shuffle!(a, a, [0, 0, 2, 2])
}

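// Bindings to the LLVM intrinsics that back the horizontal add/subtract
// intrinsics and `_mm_lddqu_si128` above.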
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse3.hadd.pd"]
    fn haddpd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse3.hadd.ps"]
    fn haddps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse3.hsub.pd"]
    fn hsubpd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse3.hsub.ps"]
    fn hsubps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse3.ldu.dq"]
    fn lddqu(mem_addr: *const i8) -> i8x16;
}

#[cfg(test)]
mod tests {
    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_addsub_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_addsub_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(99.0, 25.0, 0.0, -15.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_addsub_pd() {
        let a = _mm_setr_pd(-1.0, 5.0);
        let b = _mm_setr_pd(-100.0, 20.0);
        let r = _mm_addsub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(99.0, 25.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hadd_pd() {
        let a = _mm_setr_pd(-1.0, 5.0);
        let b = _mm_setr_pd(-100.0, 20.0);
        let r = _mm_hadd_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(4.0, -80.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hadd_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_hadd_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(4.0, -10.0, -80.0, -5.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hsub_pd() {
        let a = _mm_setr_pd(-1.0, 5.0);
        let b = _mm_setr_pd(-100.0, 20.0);
        let r = _mm_hsub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-6.0, -120.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hsub_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_hsub_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-6.0, 10.0, -120.0, 5.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_lddqu_si128() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        let r = _mm_lddqu_si128(&a);
        assert_eq_m128i(a, r);
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_movedup_pd() {
        let a = _mm_setr_pd(-1.0, 5.0);
        let r = _mm_movedup_pd(a);
        assert_eq_m128d(r, _mm_setr_pd(-1.0, -1.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_movehdup_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let r = _mm_movehdup_ps(a);
        assert_eq_m128(r, _mm_setr_ps(5.0, 5.0, -10.0, -10.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_moveldup_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let r = _mm_moveldup_ps(a);
        assert_eq_m128(r, _mm_setr_ps(-1.0, -1.0, 0.0, 0.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_loaddup_pd() {
        let d = -5.0;
        let r = _mm_loaddup_pd(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }
}