1 | //! Fused Multiply-Add instruction set (FMA) |
2 | //! |
3 | //! The FMA instruction set is an extension to the 128 and 256-bit SSE |
4 | //! instructions in the x86 microprocessor instruction set to perform fused |
5 | //! multiply–add (FMA) operations. |
6 | //! |
7 | //! The references are: |
8 | //! |
9 | //! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: |
10 | //! Instruction Set Reference, A-Z][intel64_ref]. |
11 | //! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and |
12 | //! System Instructions][amd64_ref]. |
13 | //! |
14 | //! Wikipedia's [FMA][wiki_fma] page provides a quick overview of the |
15 | //! instructions available. |
16 | //! |
17 | //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf |
18 | //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf |
19 | //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate |
20 | |
21 | use crate::core_arch::x86::*; |
22 | use crate::intrinsics::simd::{simd_fma, simd_neg}; |
23 | use crate::intrinsics::{fmaf32, fmaf64}; |
24 | |
25 | #[cfg (test)] |
26 | use stdarch_test::assert_instr; |
27 | |
28 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
29 | /// and `b`, and add the intermediate result to packed elements in `c`. |
30 | /// |
31 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_pd) |
32 | #[inline ] |
33 | #[target_feature (enable = "fma" )] |
34 | #[cfg_attr (test, assert_instr(vfmadd))] |
35 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
36 | pub fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
37 | unsafe { simd_fma(x:a, y:b, z:c) } |
38 | } |
39 | |
40 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
41 | /// and `b`, and add the intermediate result to packed elements in `c`. |
42 | /// |
43 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_pd) |
44 | #[inline ] |
45 | #[target_feature (enable = "fma" )] |
46 | #[cfg_attr (test, assert_instr(vfmadd))] |
47 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
48 | pub fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { |
49 | unsafe { simd_fma(x:a, y:b, z:c) } |
50 | } |
51 | |
52 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
53 | /// and `b`, and add the intermediate result to packed elements in `c`. |
54 | /// |
55 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ps) |
56 | #[inline ] |
57 | #[target_feature (enable = "fma" )] |
58 | #[cfg_attr (test, assert_instr(vfmadd))] |
59 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
60 | pub fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { |
61 | unsafe { simd_fma(x:a, y:b, z:c) } |
62 | } |
63 | |
64 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
65 | /// and `b`, and add the intermediate result to packed elements in `c`. |
66 | /// |
67 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_ps) |
68 | #[inline ] |
69 | #[target_feature (enable = "fma" )] |
70 | #[cfg_attr (test, assert_instr(vfmadd))] |
71 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
72 | pub fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { |
73 | unsafe { simd_fma(x:a, y:b, z:c) } |
74 | } |
75 | |
76 | /// Multiplies the lower double-precision (64-bit) floating-point elements in |
77 | /// `a` and `b`, and add the intermediate result to the lower element in `c`. |
78 | /// Stores the result in the lower element of the returned value, and copy the |
79 | /// upper element from `a` to the upper elements of the result. |
80 | /// |
81 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_sd) |
82 | #[inline ] |
83 | #[target_feature (enable = "fma" )] |
84 | #[cfg_attr (test, assert_instr(vfmadd))] |
85 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
86 | pub fn _mm_fmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
87 | unsafe { |
88 | simd_insert!( |
89 | a, |
90 | 0, |
91 | fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), _mm_cvtsd_f64(c)) |
92 | ) |
93 | } |
94 | } |
95 | |
96 | /// Multiplies the lower single-precision (32-bit) floating-point elements in |
97 | /// `a` and `b`, and add the intermediate result to the lower element in `c`. |
98 | /// Stores the result in the lower element of the returned value, and copy the |
99 | /// 3 upper elements from `a` to the upper elements of the result. |
100 | /// |
101 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ss) |
102 | #[inline ] |
103 | #[target_feature (enable = "fma" )] |
104 | #[cfg_attr (test, assert_instr(vfmadd))] |
105 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
106 | pub fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 { |
107 | unsafe { |
108 | simd_insert!( |
109 | a, |
110 | 0, |
111 | fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), _mm_cvtss_f32(c)) |
112 | ) |
113 | } |
114 | } |
115 | |
116 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
117 | /// and `b`, and alternatively add and subtract packed elements in `c` to/from |
118 | /// the intermediate result. |
119 | /// |
120 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_pd) |
121 | #[inline ] |
122 | #[target_feature (enable = "fma" )] |
123 | #[cfg_attr (test, assert_instr(vfmaddsub))] |
124 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
125 | pub fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
126 | unsafe { |
127 | let add: __m128d = simd_fma(x:a, y:b, z:c); |
128 | let sub: __m128d = simd_fma(x:a, y:b, z:simd_neg(c)); |
129 | simd_shuffle!(add, sub, [2, 1]) |
130 | } |
131 | } |
132 | |
133 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
134 | /// and `b`, and alternatively add and subtract packed elements in `c` to/from |
135 | /// the intermediate result. |
136 | /// |
137 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_pd) |
138 | #[inline ] |
139 | #[target_feature (enable = "fma" )] |
140 | #[cfg_attr (test, assert_instr(vfmaddsub))] |
141 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
142 | pub fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { |
143 | unsafe { |
144 | let add: __m256d = simd_fma(x:a, y:b, z:c); |
145 | let sub: __m256d = simd_fma(x:a, y:b, z:simd_neg(c)); |
146 | simd_shuffle!(add, sub, [4, 1, 6, 3]) |
147 | } |
148 | } |
149 | |
150 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
151 | /// and `b`, and alternatively add and subtract packed elements in `c` to/from |
152 | /// the intermediate result. |
153 | /// |
154 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_ps) |
155 | #[inline ] |
156 | #[target_feature (enable = "fma" )] |
157 | #[cfg_attr (test, assert_instr(vfmaddsub))] |
158 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
159 | pub fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { |
160 | unsafe { |
161 | let add: __m128 = simd_fma(x:a, y:b, z:c); |
162 | let sub: __m128 = simd_fma(x:a, y:b, z:simd_neg(c)); |
163 | simd_shuffle!(add, sub, [4, 1, 6, 3]) |
164 | } |
165 | } |
166 | |
167 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
168 | /// and `b`, and alternatively add and subtract packed elements in `c` to/from |
169 | /// the intermediate result. |
170 | /// |
171 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_ps) |
172 | #[inline ] |
173 | #[target_feature (enable = "fma" )] |
174 | #[cfg_attr (test, assert_instr(vfmaddsub))] |
175 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
176 | pub fn _mm256_fmaddsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { |
177 | unsafe { |
178 | let add: __m256 = simd_fma(x:a, y:b, z:c); |
179 | let sub: __m256 = simd_fma(x:a, y:b, z:simd_neg(c)); |
180 | simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) |
181 | } |
182 | } |
183 | |
184 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
185 | /// and `b`, and subtract packed elements in `c` from the intermediate result. |
186 | /// |
187 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_pd) |
188 | #[inline ] |
189 | #[target_feature (enable = "fma" )] |
190 | #[cfg_attr (test, assert_instr(vfmsub))] |
191 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
192 | pub fn _mm_fmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
193 | unsafe { simd_fma(x:a, y:b, z:simd_neg(c)) } |
194 | } |
195 | |
196 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
197 | /// and `b`, and subtract packed elements in `c` from the intermediate result. |
198 | /// |
199 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_pd) |
200 | #[inline ] |
201 | #[target_feature (enable = "fma" )] |
202 | #[cfg_attr (test, assert_instr(vfmsub))] |
203 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
204 | pub fn _mm256_fmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { |
205 | unsafe { simd_fma(x:a, y:b, z:simd_neg(c)) } |
206 | } |
207 | |
208 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
209 | /// and `b`, and subtract packed elements in `c` from the intermediate result. |
210 | /// |
211 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ps) |
212 | #[inline ] |
213 | #[target_feature (enable = "fma" )] |
214 | #[cfg_attr (test, assert_instr(vfmsub213ps))] |
215 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
216 | pub fn _mm_fmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { |
217 | unsafe { simd_fma(x:a, y:b, z:simd_neg(c)) } |
218 | } |
219 | |
220 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
221 | /// and `b`, and subtract packed elements in `c` from the intermediate result. |
222 | /// |
223 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_ps) |
224 | #[inline ] |
225 | #[target_feature (enable = "fma" )] |
226 | #[cfg_attr (test, assert_instr(vfmsub213ps))] |
227 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
228 | pub fn _mm256_fmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { |
229 | unsafe { simd_fma(x:a, y:b, z:simd_neg(c)) } |
230 | } |
231 | |
232 | /// Multiplies the lower double-precision (64-bit) floating-point elements in |
233 | /// `a` and `b`, and subtract the lower element in `c` from the intermediate |
234 | /// result. Store the result in the lower element of the returned value, and |
235 | /// copy the upper element from `a` to the upper elements of the result. |
236 | /// |
237 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_sd) |
238 | #[inline ] |
239 | #[target_feature (enable = "fma" )] |
240 | #[cfg_attr (test, assert_instr(vfmsub))] |
241 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
242 | pub fn _mm_fmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
243 | unsafe { |
244 | simd_insert!( |
245 | a, |
246 | 0, |
247 | fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), -_mm_cvtsd_f64(c)) |
248 | ) |
249 | } |
250 | } |
251 | |
252 | /// Multiplies the lower single-precision (32-bit) floating-point elements in |
253 | /// `a` and `b`, and subtract the lower element in `c` from the intermediate |
254 | /// result. Store the result in the lower element of the returned value, and |
255 | /// copy the 3 upper elements from `a` to the upper elements of the result. |
256 | /// |
257 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ss) |
258 | #[inline ] |
259 | #[target_feature (enable = "fma" )] |
260 | #[cfg_attr (test, assert_instr(vfmsub))] |
261 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
262 | pub fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 { |
263 | unsafe { |
264 | simd_insert!( |
265 | a, |
266 | 0, |
267 | fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), -_mm_cvtss_f32(c)) |
268 | ) |
269 | } |
270 | } |
271 | |
272 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
273 | /// and `b`, and alternatively subtract and add packed elements in `c` from/to |
274 | /// the intermediate result. |
275 | /// |
276 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_pd) |
277 | #[inline ] |
278 | #[target_feature (enable = "fma" )] |
279 | #[cfg_attr (test, assert_instr(vfmsubadd))] |
280 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
281 | pub fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
282 | unsafe { |
283 | let add: __m128d = simd_fma(x:a, y:b, z:c); |
284 | let sub: __m128d = simd_fma(x:a, y:b, z:simd_neg(c)); |
285 | simd_shuffle!(add, sub, [0, 3]) |
286 | } |
287 | } |
288 | |
289 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
290 | /// and `b`, and alternatively subtract and add packed elements in `c` from/to |
291 | /// the intermediate result. |
292 | /// |
293 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_pd) |
294 | #[inline ] |
295 | #[target_feature (enable = "fma" )] |
296 | #[cfg_attr (test, assert_instr(vfmsubadd))] |
297 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
298 | pub fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { |
299 | unsafe { |
300 | let add: __m256d = simd_fma(x:a, y:b, z:c); |
301 | let sub: __m256d = simd_fma(x:a, y:b, z:simd_neg(c)); |
302 | simd_shuffle!(add, sub, [0, 5, 2, 7]) |
303 | } |
304 | } |
305 | |
306 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
307 | /// and `b`, and alternatively subtract and add packed elements in `c` from/to |
308 | /// the intermediate result. |
309 | /// |
310 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_ps) |
311 | #[inline ] |
312 | #[target_feature (enable = "fma" )] |
313 | #[cfg_attr (test, assert_instr(vfmsubadd))] |
314 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
315 | pub fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { |
316 | unsafe { |
317 | let add: __m128 = simd_fma(x:a, y:b, z:c); |
318 | let sub: __m128 = simd_fma(x:a, y:b, z:simd_neg(c)); |
319 | simd_shuffle!(add, sub, [0, 5, 2, 7]) |
320 | } |
321 | } |
322 | |
323 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
324 | /// and `b`, and alternatively subtract and add packed elements in `c` from/to |
325 | /// the intermediate result. |
326 | /// |
327 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_ps) |
328 | #[inline ] |
329 | #[target_feature (enable = "fma" )] |
330 | #[cfg_attr (test, assert_instr(vfmsubadd))] |
331 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
332 | pub fn _mm256_fmsubadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { |
333 | unsafe { |
334 | let add: __m256 = simd_fma(x:a, y:b, z:c); |
335 | let sub: __m256 = simd_fma(x:a, y:b, z:simd_neg(c)); |
336 | simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15]) |
337 | } |
338 | } |
339 | |
340 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
341 | /// and `b`, and add the negated intermediate result to packed elements in `c`. |
342 | /// |
343 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_pd) |
344 | #[inline ] |
345 | #[target_feature (enable = "fma" )] |
346 | #[cfg_attr (test, assert_instr(vfnmadd))] |
347 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
348 | pub fn _mm_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
349 | unsafe { simd_fma(x:simd_neg(a), y:b, z:c) } |
350 | } |
351 | |
352 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
353 | /// and `b`, and add the negated intermediate result to packed elements in `c`. |
354 | /// |
355 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_pd) |
356 | #[inline ] |
357 | #[target_feature (enable = "fma" )] |
358 | #[cfg_attr (test, assert_instr(vfnmadd))] |
359 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
360 | pub fn _mm256_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { |
361 | unsafe { simd_fma(x:simd_neg(a), y:b, z:c) } |
362 | } |
363 | |
364 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
365 | /// and `b`, and add the negated intermediate result to packed elements in `c`. |
366 | /// |
367 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ps) |
368 | #[inline ] |
369 | #[target_feature (enable = "fma" )] |
370 | #[cfg_attr (test, assert_instr(vfnmadd))] |
371 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
372 | pub fn _mm_fnmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { |
373 | unsafe { simd_fma(x:simd_neg(a), y:b, z:c) } |
374 | } |
375 | |
376 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
377 | /// and `b`, and add the negated intermediate result to packed elements in `c`. |
378 | /// |
379 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_ps) |
380 | #[inline ] |
381 | #[target_feature (enable = "fma" )] |
382 | #[cfg_attr (test, assert_instr(vfnmadd))] |
383 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
384 | pub fn _mm256_fnmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { |
385 | unsafe { simd_fma(x:simd_neg(a), y:b, z:c) } |
386 | } |
387 | |
388 | /// Multiplies the lower double-precision (64-bit) floating-point elements in |
389 | /// `a` and `b`, and add the negated intermediate result to the lower element |
390 | /// in `c`. Store the result in the lower element of the returned value, and |
391 | /// copy the upper element from `a` to the upper elements of the result. |
392 | /// |
393 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_sd) |
394 | #[inline ] |
395 | #[target_feature (enable = "fma" )] |
396 | #[cfg_attr (test, assert_instr(vfnmadd))] |
397 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
398 | pub fn _mm_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
399 | unsafe { |
400 | simd_insert!( |
401 | a, |
402 | 0, |
403 | fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), _mm_cvtsd_f64(c)) |
404 | ) |
405 | } |
406 | } |
407 | |
408 | /// Multiplies the lower single-precision (32-bit) floating-point elements in |
409 | /// `a` and `b`, and add the negated intermediate result to the lower element |
410 | /// in `c`. Store the result in the lower element of the returned value, and |
411 | /// copy the 3 upper elements from `a` to the upper elements of the result. |
412 | /// |
413 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ss) |
414 | #[inline ] |
415 | #[target_feature (enable = "fma" )] |
416 | #[cfg_attr (test, assert_instr(vfnmadd))] |
417 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
418 | pub fn _mm_fnmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 { |
419 | unsafe { |
420 | simd_insert!( |
421 | a, |
422 | 0, |
423 | fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), _mm_cvtss_f32(c)) |
424 | ) |
425 | } |
426 | } |
427 | |
428 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
429 | /// and `b`, and subtract packed elements in `c` from the negated intermediate |
430 | /// result. |
431 | /// |
432 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_pd) |
433 | #[inline ] |
434 | #[target_feature (enable = "fma" )] |
435 | #[cfg_attr (test, assert_instr(vfnmsub))] |
436 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
437 | pub fn _mm_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
438 | unsafe { simd_fma(x:simd_neg(a), y:b, z:simd_neg(c)) } |
439 | } |
440 | |
441 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
442 | /// and `b`, and subtract packed elements in `c` from the negated intermediate |
443 | /// result. |
444 | /// |
445 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_pd) |
446 | #[inline ] |
447 | #[target_feature (enable = "fma" )] |
448 | #[cfg_attr (test, assert_instr(vfnmsub))] |
449 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
450 | pub fn _mm256_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { |
451 | unsafe { simd_fma(x:simd_neg(a), y:b, z:simd_neg(c)) } |
452 | } |
453 | |
454 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
455 | /// and `b`, and subtract packed elements in `c` from the negated intermediate |
456 | /// result. |
457 | /// |
458 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ps) |
459 | #[inline ] |
460 | #[target_feature (enable = "fma" )] |
461 | #[cfg_attr (test, assert_instr(vfnmsub))] |
462 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
463 | pub fn _mm_fnmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { |
464 | unsafe { simd_fma(x:simd_neg(a), y:b, z:simd_neg(c)) } |
465 | } |
466 | |
467 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
468 | /// and `b`, and subtract packed elements in `c` from the negated intermediate |
469 | /// result. |
470 | /// |
471 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_ps) |
472 | #[inline ] |
473 | #[target_feature (enable = "fma" )] |
474 | #[cfg_attr (test, assert_instr(vfnmsub))] |
475 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
476 | pub fn _mm256_fnmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { |
477 | unsafe { simd_fma(x:simd_neg(a), y:b, z:simd_neg(c)) } |
478 | } |
479 | |
480 | /// Multiplies the lower double-precision (64-bit) floating-point elements in |
481 | /// `a` and `b`, and subtract packed elements in `c` from the negated |
482 | /// intermediate result. Store the result in the lower element of the returned |
483 | /// value, and copy the upper element from `a` to the upper elements of the |
484 | /// result. |
485 | /// |
486 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_sd) |
487 | #[inline ] |
488 | #[target_feature (enable = "fma" )] |
489 | #[cfg_attr (test, assert_instr(vfnmsub))] |
490 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
491 | pub fn _mm_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
492 | unsafe { |
493 | simd_insert!( |
494 | a, |
495 | 0, |
496 | fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), -_mm_cvtsd_f64(c)) |
497 | ) |
498 | } |
499 | } |
500 | |
501 | /// Multiplies the lower single-precision (32-bit) floating-point elements in |
502 | /// `a` and `b`, and subtract packed elements in `c` from the negated |
503 | /// intermediate result. Store the result in the lower element of the |
504 | /// returned value, and copy the 3 upper elements from `a` to the upper |
505 | /// elements of the result. |
506 | /// |
507 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ss) |
508 | #[inline ] |
509 | #[target_feature (enable = "fma" )] |
510 | #[cfg_attr (test, assert_instr(vfnmsub))] |
511 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
512 | pub fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 { |
513 | unsafe { |
514 | simd_insert!( |
515 | a, |
516 | 0, |
517 | fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), -_mm_cvtss_f32(c)) |
518 | ) |
519 | } |
520 | } |
521 | |
522 | #[cfg (test)] |
523 | mod tests { |
524 | |
525 | use stdarch_test::simd_test; |
526 | |
527 | use crate::core_arch::x86::*; |
528 | |
529 | #[simd_test(enable = "fma" )] |
530 | unsafe fn test_mm_fmadd_pd() { |
531 | let a = _mm_setr_pd(1., 2.); |
532 | let b = _mm_setr_pd(5., 3.); |
533 | let c = _mm_setr_pd(4., 9.); |
534 | let r = _mm_setr_pd(9., 15.); |
535 | assert_eq_m128d(_mm_fmadd_pd(a, b, c), r); |
536 | } |
537 | |
538 | #[simd_test(enable = "fma" )] |
539 | unsafe fn test_mm256_fmadd_pd() { |
540 | let a = _mm256_setr_pd(1., 2., 3., 4.); |
541 | let b = _mm256_setr_pd(5., 3., 7., 2.); |
542 | let c = _mm256_setr_pd(4., 9., 1., 7.); |
543 | let r = _mm256_setr_pd(9., 15., 22., 15.); |
544 | assert_eq_m256d(_mm256_fmadd_pd(a, b, c), r); |
545 | } |
546 | |
547 | #[simd_test(enable = "fma" )] |
548 | unsafe fn test_mm_fmadd_ps() { |
549 | let a = _mm_setr_ps(1., 2., 3., 4.); |
550 | let b = _mm_setr_ps(5., 3., 7., 2.); |
551 | let c = _mm_setr_ps(4., 9., 1., 7.); |
552 | let r = _mm_setr_ps(9., 15., 22., 15.); |
553 | assert_eq_m128(_mm_fmadd_ps(a, b, c), r); |
554 | } |
555 | |
556 | #[simd_test(enable = "fma" )] |
557 | unsafe fn test_mm256_fmadd_ps() { |
558 | let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); |
559 | let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); |
560 | let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); |
561 | let r = _mm256_setr_ps(9., 15., 22., 15., -5., -49., -2., -31.); |
562 | assert_eq_m256(_mm256_fmadd_ps(a, b, c), r); |
563 | } |
564 | |
565 | #[simd_test(enable = "fma" )] |
566 | unsafe fn test_mm_fmadd_sd() { |
567 | let a = _mm_setr_pd(1., 2.); |
568 | let b = _mm_setr_pd(5., 3.); |
569 | let c = _mm_setr_pd(4., 9.); |
570 | let r = _mm_setr_pd(9., 2.); |
571 | assert_eq_m128d(_mm_fmadd_sd(a, b, c), r); |
572 | } |
573 | |
574 | #[simd_test(enable = "fma" )] |
575 | unsafe fn test_mm_fmadd_ss() { |
576 | let a = _mm_setr_ps(1., 2., 3., 4.); |
577 | let b = _mm_setr_ps(5., 3., 7., 2.); |
578 | let c = _mm_setr_ps(4., 9., 1., 7.); |
579 | let r = _mm_setr_ps(9., 2., 3., 4.); |
580 | assert_eq_m128(_mm_fmadd_ss(a, b, c), r); |
581 | } |
582 | |
583 | #[simd_test(enable = "fma" )] |
584 | unsafe fn test_mm_fmaddsub_pd() { |
585 | let a = _mm_setr_pd(1., 2.); |
586 | let b = _mm_setr_pd(5., 3.); |
587 | let c = _mm_setr_pd(4., 9.); |
588 | let r = _mm_setr_pd(1., 15.); |
589 | assert_eq_m128d(_mm_fmaddsub_pd(a, b, c), r); |
590 | } |
591 | |
592 | #[simd_test(enable = "fma" )] |
593 | unsafe fn test_mm256_fmaddsub_pd() { |
594 | let a = _mm256_setr_pd(1., 2., 3., 4.); |
595 | let b = _mm256_setr_pd(5., 3., 7., 2.); |
596 | let c = _mm256_setr_pd(4., 9., 1., 7.); |
597 | let r = _mm256_setr_pd(1., 15., 20., 15.); |
598 | assert_eq_m256d(_mm256_fmaddsub_pd(a, b, c), r); |
599 | } |
600 | |
601 | #[simd_test(enable = "fma" )] |
602 | unsafe fn test_mm_fmaddsub_ps() { |
603 | let a = _mm_setr_ps(1., 2., 3., 4.); |
604 | let b = _mm_setr_ps(5., 3., 7., 2.); |
605 | let c = _mm_setr_ps(4., 9., 1., 7.); |
606 | let r = _mm_setr_ps(1., 15., 20., 15.); |
607 | assert_eq_m128(_mm_fmaddsub_ps(a, b, c), r); |
608 | } |
609 | |
610 | #[simd_test(enable = "fma" )] |
611 | unsafe fn test_mm256_fmaddsub_ps() { |
612 | let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); |
613 | let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); |
614 | let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); |
615 | let r = _mm256_setr_ps(1., 15., 20., 15., 5., -49., 2., -31.); |
616 | assert_eq_m256(_mm256_fmaddsub_ps(a, b, c), r); |
617 | } |
618 | |
619 | #[simd_test(enable = "fma" )] |
620 | unsafe fn test_mm_fmsub_pd() { |
621 | let a = _mm_setr_pd(1., 2.); |
622 | let b = _mm_setr_pd(5., 3.); |
623 | let c = _mm_setr_pd(4., 9.); |
624 | let r = _mm_setr_pd(1., -3.); |
625 | assert_eq_m128d(_mm_fmsub_pd(a, b, c), r); |
626 | } |
627 | |
628 | #[simd_test(enable = "fma" )] |
629 | unsafe fn test_mm256_fmsub_pd() { |
630 | let a = _mm256_setr_pd(1., 2., 3., 4.); |
631 | let b = _mm256_setr_pd(5., 3., 7., 2.); |
632 | let c = _mm256_setr_pd(4., 9., 1., 7.); |
633 | let r = _mm256_setr_pd(1., -3., 20., 1.); |
634 | assert_eq_m256d(_mm256_fmsub_pd(a, b, c), r); |
635 | } |
636 | |
637 | #[simd_test(enable = "fma" )] |
638 | unsafe fn test_mm_fmsub_ps() { |
639 | let a = _mm_setr_ps(1., 2., 3., 4.); |
640 | let b = _mm_setr_ps(5., 3., 7., 2.); |
641 | let c = _mm_setr_ps(4., 9., 1., 7.); |
642 | let r = _mm_setr_ps(1., -3., 20., 1.); |
643 | assert_eq_m128(_mm_fmsub_ps(a, b, c), r); |
644 | } |
645 | |
646 | #[simd_test(enable = "fma" )] |
647 | unsafe fn test_mm256_fmsub_ps() { |
648 | let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); |
649 | let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); |
650 | let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); |
651 | let r = _mm256_setr_ps(1., -3., 20., 1., 5., -71., 2., -25.); |
652 | assert_eq_m256(_mm256_fmsub_ps(a, b, c), r); |
653 | } |
654 | |
655 | #[simd_test(enable = "fma" )] |
656 | unsafe fn test_mm_fmsub_sd() { |
657 | let a = _mm_setr_pd(1., 2.); |
658 | let b = _mm_setr_pd(5., 3.); |
659 | let c = _mm_setr_pd(4., 9.); |
660 | let r = _mm_setr_pd(1., 2.); |
661 | assert_eq_m128d(_mm_fmsub_sd(a, b, c), r); |
662 | } |
663 | |
664 | #[simd_test(enable = "fma" )] |
665 | unsafe fn test_mm_fmsub_ss() { |
666 | let a = _mm_setr_ps(1., 2., 3., 4.); |
667 | let b = _mm_setr_ps(5., 3., 7., 2.); |
668 | let c = _mm_setr_ps(4., 9., 1., 7.); |
669 | let r = _mm_setr_ps(1., 2., 3., 4.); |
670 | assert_eq_m128(_mm_fmsub_ss(a, b, c), r); |
671 | } |
672 | |
673 | #[simd_test(enable = "fma" )] |
674 | unsafe fn test_mm_fmsubadd_pd() { |
675 | let a = _mm_setr_pd(1., 2.); |
676 | let b = _mm_setr_pd(5., 3.); |
677 | let c = _mm_setr_pd(4., 9.); |
678 | let r = _mm_setr_pd(9., -3.); |
679 | assert_eq_m128d(_mm_fmsubadd_pd(a, b, c), r); |
680 | } |
681 | |
682 | #[simd_test(enable = "fma" )] |
683 | unsafe fn test_mm256_fmsubadd_pd() { |
684 | let a = _mm256_setr_pd(1., 2., 3., 4.); |
685 | let b = _mm256_setr_pd(5., 3., 7., 2.); |
686 | let c = _mm256_setr_pd(4., 9., 1., 7.); |
687 | let r = _mm256_setr_pd(9., -3., 22., 1.); |
688 | assert_eq_m256d(_mm256_fmsubadd_pd(a, b, c), r); |
689 | } |
690 | |
691 | #[simd_test(enable = "fma" )] |
692 | unsafe fn test_mm_fmsubadd_ps() { |
693 | let a = _mm_setr_ps(1., 2., 3., 4.); |
694 | let b = _mm_setr_ps(5., 3., 7., 2.); |
695 | let c = _mm_setr_ps(4., 9., 1., 7.); |
696 | let r = _mm_setr_ps(9., -3., 22., 1.); |
697 | assert_eq_m128(_mm_fmsubadd_ps(a, b, c), r); |
698 | } |
699 | |
700 | #[simd_test(enable = "fma" )] |
701 | unsafe fn test_mm256_fmsubadd_ps() { |
702 | let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); |
703 | let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); |
704 | let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); |
705 | let r = _mm256_setr_ps(9., -3., 22., 1., -5., -71., -2., -25.); |
706 | assert_eq_m256(_mm256_fmsubadd_ps(a, b, c), r); |
707 | } |
708 | |
709 | #[simd_test(enable = "fma" )] |
710 | unsafe fn test_mm_fnmadd_pd() { |
711 | let a = _mm_setr_pd(1., 2.); |
712 | let b = _mm_setr_pd(5., 3.); |
713 | let c = _mm_setr_pd(4., 9.); |
714 | let r = _mm_setr_pd(-1., 3.); |
715 | assert_eq_m128d(_mm_fnmadd_pd(a, b, c), r); |
716 | } |
717 | |
718 | #[simd_test(enable = "fma" )] |
719 | unsafe fn test_mm256_fnmadd_pd() { |
720 | let a = _mm256_setr_pd(1., 2., 3., 4.); |
721 | let b = _mm256_setr_pd(5., 3., 7., 2.); |
722 | let c = _mm256_setr_pd(4., 9., 1., 7.); |
723 | let r = _mm256_setr_pd(-1., 3., -20., -1.); |
724 | assert_eq_m256d(_mm256_fnmadd_pd(a, b, c), r); |
725 | } |
726 | |
727 | #[simd_test(enable = "fma" )] |
728 | unsafe fn test_mm_fnmadd_ps() { |
729 | let a = _mm_setr_ps(1., 2., 3., 4.); |
730 | let b = _mm_setr_ps(5., 3., 7., 2.); |
731 | let c = _mm_setr_ps(4., 9., 1., 7.); |
732 | let r = _mm_setr_ps(-1., 3., -20., -1.); |
733 | assert_eq_m128(_mm_fnmadd_ps(a, b, c), r); |
734 | } |
735 | |
736 | #[simd_test(enable = "fma" )] |
737 | unsafe fn test_mm256_fnmadd_ps() { |
738 | let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); |
739 | let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); |
740 | let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); |
741 | let r = _mm256_setr_ps(-1., 3., -20., -1., -5., 71., -2., 25.); |
742 | assert_eq_m256(_mm256_fnmadd_ps(a, b, c), r); |
743 | } |
744 | |
745 | #[simd_test(enable = "fma" )] |
746 | unsafe fn test_mm_fnmadd_sd() { |
747 | let a = _mm_setr_pd(1., 2.); |
748 | let b = _mm_setr_pd(5., 3.); |
749 | let c = _mm_setr_pd(4., 9.); |
750 | let r = _mm_setr_pd(-1., 2.); |
751 | assert_eq_m128d(_mm_fnmadd_sd(a, b, c), r); |
752 | } |
753 | |
754 | #[simd_test(enable = "fma" )] |
755 | unsafe fn test_mm_fnmadd_ss() { |
756 | let a = _mm_setr_ps(1., 2., 3., 4.); |
757 | let b = _mm_setr_ps(5., 3., 7., 2.); |
758 | let c = _mm_setr_ps(4., 9., 1., 7.); |
759 | let r = _mm_setr_ps(-1., 2., 3., 4.); |
760 | assert_eq_m128(_mm_fnmadd_ss(a, b, c), r); |
761 | } |
762 | |
763 | #[simd_test(enable = "fma" )] |
764 | unsafe fn test_mm_fnmsub_pd() { |
765 | let a = _mm_setr_pd(1., 2.); |
766 | let b = _mm_setr_pd(5., 3.); |
767 | let c = _mm_setr_pd(4., 9.); |
768 | let r = _mm_setr_pd(-9., -15.); |
769 | assert_eq_m128d(_mm_fnmsub_pd(a, b, c), r); |
770 | } |
771 | |
772 | #[simd_test(enable = "fma" )] |
773 | unsafe fn test_mm256_fnmsub_pd() { |
774 | let a = _mm256_setr_pd(1., 2., 3., 4.); |
775 | let b = _mm256_setr_pd(5., 3., 7., 2.); |
776 | let c = _mm256_setr_pd(4., 9., 1., 7.); |
777 | let r = _mm256_setr_pd(-9., -15., -22., -15.); |
778 | assert_eq_m256d(_mm256_fnmsub_pd(a, b, c), r); |
779 | } |
780 | |
781 | #[simd_test(enable = "fma" )] |
782 | unsafe fn test_mm_fnmsub_ps() { |
783 | let a = _mm_setr_ps(1., 2., 3., 4.); |
784 | let b = _mm_setr_ps(5., 3., 7., 2.); |
785 | let c = _mm_setr_ps(4., 9., 1., 7.); |
786 | let r = _mm_setr_ps(-9., -15., -22., -15.); |
787 | assert_eq_m128(_mm_fnmsub_ps(a, b, c), r); |
788 | } |
789 | |
790 | #[simd_test(enable = "fma" )] |
791 | unsafe fn test_mm256_fnmsub_ps() { |
792 | let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); |
793 | let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); |
794 | let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); |
795 | let r = _mm256_setr_ps(-9., -15., -22., -15., 5., 49., 2., 31.); |
796 | assert_eq_m256(_mm256_fnmsub_ps(a, b, c), r); |
797 | } |
798 | |
799 | #[simd_test(enable = "fma" )] |
800 | unsafe fn test_mm_fnmsub_sd() { |
801 | let a = _mm_setr_pd(1., 2.); |
802 | let b = _mm_setr_pd(5., 3.); |
803 | let c = _mm_setr_pd(4., 9.); |
804 | let r = _mm_setr_pd(-9., 2.); |
805 | assert_eq_m128d(_mm_fnmsub_sd(a, b, c), r); |
806 | } |
807 | |
808 | #[simd_test(enable = "fma" )] |
809 | unsafe fn test_mm_fnmsub_ss() { |
810 | let a = _mm_setr_ps(1., 2., 3., 4.); |
811 | let b = _mm_setr_ps(5., 3., 7., 2.); |
812 | let c = _mm_setr_ps(4., 9., 1., 7.); |
813 | let r = _mm_setr_ps(-9., 2., 3., 4.); |
814 | assert_eq_m128(_mm_fnmsub_ss(a, b, c), r); |
815 | } |
816 | } |
817 | |