1//! Fused Multiply-Add instruction set (FMA)
2//!
3//! The FMA instruction set is an extension to the 128 and 256-bit SSE
4//! instructions in the x86 microprocessor instruction set to perform fused
5//! multiply–add (FMA) operations.
6//!
7//! The references are:
8//!
9//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
10//! Instruction Set Reference, A-Z][intel64_ref].
11//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
12//! System Instructions][amd64_ref].
13//!
14//! Wikipedia's [FMA][wiki_fma] page provides a quick overview of the
15//! instructions available.
16//!
17//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
18//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
19//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
21use crate::core_arch::x86::*;
22use crate::intrinsics::simd::{simd_fma, simd_neg};
23use crate::intrinsics::{fmaf32, fmaf64};
24
25#[cfg(test)]
26use stdarch_test::assert_instr;
27
28/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
29/// and `b`, and add the intermediate result to packed elements in `c`.
30///
31/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_pd)
32#[inline]
33#[target_feature(enable = "fma")]
34#[cfg_attr(test, assert_instr(vfmadd))]
35#[stable(feature = "simd_x86", since = "1.27.0")]
36pub fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
37 unsafe { simd_fma(x:a, y:b, z:c) }
38}
39
40/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
41/// and `b`, and add the intermediate result to packed elements in `c`.
42///
43/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_pd)
44#[inline]
45#[target_feature(enable = "fma")]
46#[cfg_attr(test, assert_instr(vfmadd))]
47#[stable(feature = "simd_x86", since = "1.27.0")]
48pub fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
49 unsafe { simd_fma(x:a, y:b, z:c) }
50}
51
52/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
53/// and `b`, and add the intermediate result to packed elements in `c`.
54///
55/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ps)
56#[inline]
57#[target_feature(enable = "fma")]
58#[cfg_attr(test, assert_instr(vfmadd))]
59#[stable(feature = "simd_x86", since = "1.27.0")]
60pub fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
61 unsafe { simd_fma(x:a, y:b, z:c) }
62}
63
64/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
65/// and `b`, and add the intermediate result to packed elements in `c`.
66///
67/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_ps)
68#[inline]
69#[target_feature(enable = "fma")]
70#[cfg_attr(test, assert_instr(vfmadd))]
71#[stable(feature = "simd_x86", since = "1.27.0")]
72pub fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
73 unsafe { simd_fma(x:a, y:b, z:c) }
74}
75
76/// Multiplies the lower double-precision (64-bit) floating-point elements in
77/// `a` and `b`, and add the intermediate result to the lower element in `c`.
78/// Stores the result in the lower element of the returned value, and copy the
79/// upper element from `a` to the upper elements of the result.
80///
81/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_sd)
82#[inline]
83#[target_feature(enable = "fma")]
84#[cfg_attr(test, assert_instr(vfmadd))]
85#[stable(feature = "simd_x86", since = "1.27.0")]
86pub fn _mm_fmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
87 unsafe {
88 simd_insert!(
89 a,
90 0,
91 fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), _mm_cvtsd_f64(c))
92 )
93 }
94}
95
96/// Multiplies the lower single-precision (32-bit) floating-point elements in
97/// `a` and `b`, and add the intermediate result to the lower element in `c`.
98/// Stores the result in the lower element of the returned value, and copy the
99/// 3 upper elements from `a` to the upper elements of the result.
100///
101/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ss)
102#[inline]
103#[target_feature(enable = "fma")]
104#[cfg_attr(test, assert_instr(vfmadd))]
105#[stable(feature = "simd_x86", since = "1.27.0")]
106pub fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
107 unsafe {
108 simd_insert!(
109 a,
110 0,
111 fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), _mm_cvtss_f32(c))
112 )
113 }
114}
115
116/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
117/// and `b`, and alternatively add and subtract packed elements in `c` to/from
118/// the intermediate result.
119///
120/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_pd)
121#[inline]
122#[target_feature(enable = "fma")]
123#[cfg_attr(test, assert_instr(vfmaddsub))]
124#[stable(feature = "simd_x86", since = "1.27.0")]
125pub fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
126 unsafe {
127 let add: __m128d = simd_fma(x:a, y:b, z:c);
128 let sub: __m128d = simd_fma(x:a, y:b, z:simd_neg(c));
129 simd_shuffle!(add, sub, [2, 1])
130 }
131}
132
133/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
134/// and `b`, and alternatively add and subtract packed elements in `c` to/from
135/// the intermediate result.
136///
137/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_pd)
138#[inline]
139#[target_feature(enable = "fma")]
140#[cfg_attr(test, assert_instr(vfmaddsub))]
141#[stable(feature = "simd_x86", since = "1.27.0")]
142pub fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
143 unsafe {
144 let add: __m256d = simd_fma(x:a, y:b, z:c);
145 let sub: __m256d = simd_fma(x:a, y:b, z:simd_neg(c));
146 simd_shuffle!(add, sub, [4, 1, 6, 3])
147 }
148}
149
150/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
151/// and `b`, and alternatively add and subtract packed elements in `c` to/from
152/// the intermediate result.
153///
154/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_ps)
155#[inline]
156#[target_feature(enable = "fma")]
157#[cfg_attr(test, assert_instr(vfmaddsub))]
158#[stable(feature = "simd_x86", since = "1.27.0")]
159pub fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
160 unsafe {
161 let add: __m128 = simd_fma(x:a, y:b, z:c);
162 let sub: __m128 = simd_fma(x:a, y:b, z:simd_neg(c));
163 simd_shuffle!(add, sub, [4, 1, 6, 3])
164 }
165}
166
167/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
168/// and `b`, and alternatively add and subtract packed elements in `c` to/from
169/// the intermediate result.
170///
171/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_ps)
172#[inline]
173#[target_feature(enable = "fma")]
174#[cfg_attr(test, assert_instr(vfmaddsub))]
175#[stable(feature = "simd_x86", since = "1.27.0")]
176pub fn _mm256_fmaddsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
177 unsafe {
178 let add: __m256 = simd_fma(x:a, y:b, z:c);
179 let sub: __m256 = simd_fma(x:a, y:b, z:simd_neg(c));
180 simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
181 }
182}
183
184/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
185/// and `b`, and subtract packed elements in `c` from the intermediate result.
186///
187/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_pd)
188#[inline]
189#[target_feature(enable = "fma")]
190#[cfg_attr(test, assert_instr(vfmsub))]
191#[stable(feature = "simd_x86", since = "1.27.0")]
192pub fn _mm_fmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
193 unsafe { simd_fma(x:a, y:b, z:simd_neg(c)) }
194}
195
196/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
197/// and `b`, and subtract packed elements in `c` from the intermediate result.
198///
199/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_pd)
200#[inline]
201#[target_feature(enable = "fma")]
202#[cfg_attr(test, assert_instr(vfmsub))]
203#[stable(feature = "simd_x86", since = "1.27.0")]
204pub fn _mm256_fmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
205 unsafe { simd_fma(x:a, y:b, z:simd_neg(c)) }
206}
207
208/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
209/// and `b`, and subtract packed elements in `c` from the intermediate result.
210///
211/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ps)
212#[inline]
213#[target_feature(enable = "fma")]
214#[cfg_attr(test, assert_instr(vfmsub213ps))]
215#[stable(feature = "simd_x86", since = "1.27.0")]
216pub fn _mm_fmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
217 unsafe { simd_fma(x:a, y:b, z:simd_neg(c)) }
218}
219
220/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
221/// and `b`, and subtract packed elements in `c` from the intermediate result.
222///
223/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_ps)
224#[inline]
225#[target_feature(enable = "fma")]
226#[cfg_attr(test, assert_instr(vfmsub213ps))]
227#[stable(feature = "simd_x86", since = "1.27.0")]
228pub fn _mm256_fmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
229 unsafe { simd_fma(x:a, y:b, z:simd_neg(c)) }
230}
231
232/// Multiplies the lower double-precision (64-bit) floating-point elements in
233/// `a` and `b`, and subtract the lower element in `c` from the intermediate
234/// result. Store the result in the lower element of the returned value, and
235/// copy the upper element from `a` to the upper elements of the result.
236///
237/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_sd)
238#[inline]
239#[target_feature(enable = "fma")]
240#[cfg_attr(test, assert_instr(vfmsub))]
241#[stable(feature = "simd_x86", since = "1.27.0")]
242pub fn _mm_fmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
243 unsafe {
244 simd_insert!(
245 a,
246 0,
247 fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), -_mm_cvtsd_f64(c))
248 )
249 }
250}
251
252/// Multiplies the lower single-precision (32-bit) floating-point elements in
253/// `a` and `b`, and subtract the lower element in `c` from the intermediate
254/// result. Store the result in the lower element of the returned value, and
255/// copy the 3 upper elements from `a` to the upper elements of the result.
256///
257/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ss)
258#[inline]
259#[target_feature(enable = "fma")]
260#[cfg_attr(test, assert_instr(vfmsub))]
261#[stable(feature = "simd_x86", since = "1.27.0")]
262pub fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
263 unsafe {
264 simd_insert!(
265 a,
266 0,
267 fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), -_mm_cvtss_f32(c))
268 )
269 }
270}
271
272/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
273/// and `b`, and alternatively subtract and add packed elements in `c` from/to
274/// the intermediate result.
275///
276/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_pd)
277#[inline]
278#[target_feature(enable = "fma")]
279#[cfg_attr(test, assert_instr(vfmsubadd))]
280#[stable(feature = "simd_x86", since = "1.27.0")]
281pub fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
282 unsafe {
283 let add: __m128d = simd_fma(x:a, y:b, z:c);
284 let sub: __m128d = simd_fma(x:a, y:b, z:simd_neg(c));
285 simd_shuffle!(add, sub, [0, 3])
286 }
287}
288
289/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
290/// and `b`, and alternatively subtract and add packed elements in `c` from/to
291/// the intermediate result.
292///
293/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_pd)
294#[inline]
295#[target_feature(enable = "fma")]
296#[cfg_attr(test, assert_instr(vfmsubadd))]
297#[stable(feature = "simd_x86", since = "1.27.0")]
298pub fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
299 unsafe {
300 let add: __m256d = simd_fma(x:a, y:b, z:c);
301 let sub: __m256d = simd_fma(x:a, y:b, z:simd_neg(c));
302 simd_shuffle!(add, sub, [0, 5, 2, 7])
303 }
304}
305
306/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
307/// and `b`, and alternatively subtract and add packed elements in `c` from/to
308/// the intermediate result.
309///
310/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_ps)
311#[inline]
312#[target_feature(enable = "fma")]
313#[cfg_attr(test, assert_instr(vfmsubadd))]
314#[stable(feature = "simd_x86", since = "1.27.0")]
315pub fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
316 unsafe {
317 let add: __m128 = simd_fma(x:a, y:b, z:c);
318 let sub: __m128 = simd_fma(x:a, y:b, z:simd_neg(c));
319 simd_shuffle!(add, sub, [0, 5, 2, 7])
320 }
321}
322
323/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
324/// and `b`, and alternatively subtract and add packed elements in `c` from/to
325/// the intermediate result.
326///
327/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_ps)
328#[inline]
329#[target_feature(enable = "fma")]
330#[cfg_attr(test, assert_instr(vfmsubadd))]
331#[stable(feature = "simd_x86", since = "1.27.0")]
332pub fn _mm256_fmsubadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
333 unsafe {
334 let add: __m256 = simd_fma(x:a, y:b, z:c);
335 let sub: __m256 = simd_fma(x:a, y:b, z:simd_neg(c));
336 simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15])
337 }
338}
339
340/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
341/// and `b`, and add the negated intermediate result to packed elements in `c`.
342///
343/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_pd)
344#[inline]
345#[target_feature(enable = "fma")]
346#[cfg_attr(test, assert_instr(vfnmadd))]
347#[stable(feature = "simd_x86", since = "1.27.0")]
348pub fn _mm_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
349 unsafe { simd_fma(x:simd_neg(a), y:b, z:c) }
350}
351
352/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
353/// and `b`, and add the negated intermediate result to packed elements in `c`.
354///
355/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_pd)
356#[inline]
357#[target_feature(enable = "fma")]
358#[cfg_attr(test, assert_instr(vfnmadd))]
359#[stable(feature = "simd_x86", since = "1.27.0")]
360pub fn _mm256_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
361 unsafe { simd_fma(x:simd_neg(a), y:b, z:c) }
362}
363
364/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
365/// and `b`, and add the negated intermediate result to packed elements in `c`.
366///
367/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ps)
368#[inline]
369#[target_feature(enable = "fma")]
370#[cfg_attr(test, assert_instr(vfnmadd))]
371#[stable(feature = "simd_x86", since = "1.27.0")]
372pub fn _mm_fnmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
373 unsafe { simd_fma(x:simd_neg(a), y:b, z:c) }
374}
375
376/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
377/// and `b`, and add the negated intermediate result to packed elements in `c`.
378///
379/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_ps)
380#[inline]
381#[target_feature(enable = "fma")]
382#[cfg_attr(test, assert_instr(vfnmadd))]
383#[stable(feature = "simd_x86", since = "1.27.0")]
384pub fn _mm256_fnmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
385 unsafe { simd_fma(x:simd_neg(a), y:b, z:c) }
386}
387
388/// Multiplies the lower double-precision (64-bit) floating-point elements in
389/// `a` and `b`, and add the negated intermediate result to the lower element
390/// in `c`. Store the result in the lower element of the returned value, and
391/// copy the upper element from `a` to the upper elements of the result.
392///
393/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_sd)
394#[inline]
395#[target_feature(enable = "fma")]
396#[cfg_attr(test, assert_instr(vfnmadd))]
397#[stable(feature = "simd_x86", since = "1.27.0")]
398pub fn _mm_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
399 unsafe {
400 simd_insert!(
401 a,
402 0,
403 fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), _mm_cvtsd_f64(c))
404 )
405 }
406}
407
408/// Multiplies the lower single-precision (32-bit) floating-point elements in
409/// `a` and `b`, and add the negated intermediate result to the lower element
410/// in `c`. Store the result in the lower element of the returned value, and
411/// copy the 3 upper elements from `a` to the upper elements of the result.
412///
413/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ss)
414#[inline]
415#[target_feature(enable = "fma")]
416#[cfg_attr(test, assert_instr(vfnmadd))]
417#[stable(feature = "simd_x86", since = "1.27.0")]
418pub fn _mm_fnmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
419 unsafe {
420 simd_insert!(
421 a,
422 0,
423 fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), _mm_cvtss_f32(c))
424 )
425 }
426}
427
428/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
429/// and `b`, and subtract packed elements in `c` from the negated intermediate
430/// result.
431///
432/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_pd)
433#[inline]
434#[target_feature(enable = "fma")]
435#[cfg_attr(test, assert_instr(vfnmsub))]
436#[stable(feature = "simd_x86", since = "1.27.0")]
437pub fn _mm_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
438 unsafe { simd_fma(x:simd_neg(a), y:b, z:simd_neg(c)) }
439}
440
441/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
442/// and `b`, and subtract packed elements in `c` from the negated intermediate
443/// result.
444///
445/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_pd)
446#[inline]
447#[target_feature(enable = "fma")]
448#[cfg_attr(test, assert_instr(vfnmsub))]
449#[stable(feature = "simd_x86", since = "1.27.0")]
450pub fn _mm256_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
451 unsafe { simd_fma(x:simd_neg(a), y:b, z:simd_neg(c)) }
452}
453
454/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
455/// and `b`, and subtract packed elements in `c` from the negated intermediate
456/// result.
457///
458/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ps)
459#[inline]
460#[target_feature(enable = "fma")]
461#[cfg_attr(test, assert_instr(vfnmsub))]
462#[stable(feature = "simd_x86", since = "1.27.0")]
463pub fn _mm_fnmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
464 unsafe { simd_fma(x:simd_neg(a), y:b, z:simd_neg(c)) }
465}
466
467/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
468/// and `b`, and subtract packed elements in `c` from the negated intermediate
469/// result.
470///
471/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_ps)
472#[inline]
473#[target_feature(enable = "fma")]
474#[cfg_attr(test, assert_instr(vfnmsub))]
475#[stable(feature = "simd_x86", since = "1.27.0")]
476pub fn _mm256_fnmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
477 unsafe { simd_fma(x:simd_neg(a), y:b, z:simd_neg(c)) }
478}
479
480/// Multiplies the lower double-precision (64-bit) floating-point elements in
481/// `a` and `b`, and subtract packed elements in `c` from the negated
482/// intermediate result. Store the result in the lower element of the returned
483/// value, and copy the upper element from `a` to the upper elements of the
484/// result.
485///
486/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_sd)
487#[inline]
488#[target_feature(enable = "fma")]
489#[cfg_attr(test, assert_instr(vfnmsub))]
490#[stable(feature = "simd_x86", since = "1.27.0")]
491pub fn _mm_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
492 unsafe {
493 simd_insert!(
494 a,
495 0,
496 fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), -_mm_cvtsd_f64(c))
497 )
498 }
499}
500
501/// Multiplies the lower single-precision (32-bit) floating-point elements in
502/// `a` and `b`, and subtract packed elements in `c` from the negated
503/// intermediate result. Store the result in the lower element of the
504/// returned value, and copy the 3 upper elements from `a` to the upper
505/// elements of the result.
506///
507/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ss)
508#[inline]
509#[target_feature(enable = "fma")]
510#[cfg_attr(test, assert_instr(vfnmsub))]
511#[stable(feature = "simd_x86", since = "1.27.0")]
512pub fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
513 unsafe {
514 simd_insert!(
515 a,
516 0,
517 fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), -_mm_cvtss_f32(c))
518 )
519 }
520}
521
#[cfg(test)]
mod tests {
    // One test per intrinsic. Each feeds fixed inputs `a`, `b`, `c` and
    // compares against hand-computed lanes; the formula being checked is
    // noted above the expected value.

    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmadd_pd() {
        let (a, b, c) = (_mm_setr_pd(1., 2.), _mm_setr_pd(5., 3.), _mm_setr_pd(4., 9.));
        // a * b + c
        let expected = _mm_setr_pd(9., 15.);
        assert_eq_m128d(_mm_fmadd_pd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmadd_pd() {
        let (a, b, c) = (
            _mm256_setr_pd(1., 2., 3., 4.),
            _mm256_setr_pd(5., 3., 7., 2.),
            _mm256_setr_pd(4., 9., 1., 7.),
        );
        // a * b + c
        let expected = _mm256_setr_pd(9., 15., 22., 15.);
        assert_eq_m256d(_mm256_fmadd_pd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmadd_ps() {
        let (a, b, c) = (
            _mm_setr_ps(1., 2., 3., 4.),
            _mm_setr_ps(5., 3., 7., 2.),
            _mm_setr_ps(4., 9., 1., 7.),
        );
        // a * b + c
        let expected = _mm_setr_ps(9., 15., 22., 15.);
        assert_eq_m128(_mm_fmadd_ps(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmadd_ps() {
        let (a, b, c) = (
            _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.),
            _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.),
            _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.),
        );
        // a * b + c
        let expected = _mm256_setr_ps(9., 15., 22., 15., -5., -49., -2., -31.);
        assert_eq_m256(_mm256_fmadd_ps(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmadd_sd() {
        let (a, b, c) = (_mm_setr_pd(1., 2.), _mm_setr_pd(5., 3.), _mm_setr_pd(4., 9.));
        // lane 0: a * b + c; upper lane copied from `a`
        let expected = _mm_setr_pd(9., 2.);
        assert_eq_m128d(_mm_fmadd_sd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmadd_ss() {
        let (a, b, c) = (
            _mm_setr_ps(1., 2., 3., 4.),
            _mm_setr_ps(5., 3., 7., 2.),
            _mm_setr_ps(4., 9., 1., 7.),
        );
        // lane 0: a * b + c; upper lanes copied from `a`
        let expected = _mm_setr_ps(9., 2., 3., 4.);
        assert_eq_m128(_mm_fmadd_ss(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmaddsub_pd() {
        let (a, b, c) = (_mm_setr_pd(1., 2.), _mm_setr_pd(5., 3.), _mm_setr_pd(4., 9.));
        // even lanes: a * b - c; odd lanes: a * b + c
        let expected = _mm_setr_pd(1., 15.);
        assert_eq_m128d(_mm_fmaddsub_pd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmaddsub_pd() {
        let (a, b, c) = (
            _mm256_setr_pd(1., 2., 3., 4.),
            _mm256_setr_pd(5., 3., 7., 2.),
            _mm256_setr_pd(4., 9., 1., 7.),
        );
        // even lanes: a * b - c; odd lanes: a * b + c
        let expected = _mm256_setr_pd(1., 15., 20., 15.);
        assert_eq_m256d(_mm256_fmaddsub_pd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmaddsub_ps() {
        let (a, b, c) = (
            _mm_setr_ps(1., 2., 3., 4.),
            _mm_setr_ps(5., 3., 7., 2.),
            _mm_setr_ps(4., 9., 1., 7.),
        );
        // even lanes: a * b - c; odd lanes: a * b + c
        let expected = _mm_setr_ps(1., 15., 20., 15.);
        assert_eq_m128(_mm_fmaddsub_ps(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmaddsub_ps() {
        let (a, b, c) = (
            _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.),
            _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.),
            _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.),
        );
        // even lanes: a * b - c; odd lanes: a * b + c
        let expected = _mm256_setr_ps(1., 15., 20., 15., 5., -49., 2., -31.);
        assert_eq_m256(_mm256_fmaddsub_ps(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsub_pd() {
        let (a, b, c) = (_mm_setr_pd(1., 2.), _mm_setr_pd(5., 3.), _mm_setr_pd(4., 9.));
        // a * b - c
        let expected = _mm_setr_pd(1., -3.);
        assert_eq_m128d(_mm_fmsub_pd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmsub_pd() {
        let (a, b, c) = (
            _mm256_setr_pd(1., 2., 3., 4.),
            _mm256_setr_pd(5., 3., 7., 2.),
            _mm256_setr_pd(4., 9., 1., 7.),
        );
        // a * b - c
        let expected = _mm256_setr_pd(1., -3., 20., 1.);
        assert_eq_m256d(_mm256_fmsub_pd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsub_ps() {
        let (a, b, c) = (
            _mm_setr_ps(1., 2., 3., 4.),
            _mm_setr_ps(5., 3., 7., 2.),
            _mm_setr_ps(4., 9., 1., 7.),
        );
        // a * b - c
        let expected = _mm_setr_ps(1., -3., 20., 1.);
        assert_eq_m128(_mm_fmsub_ps(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmsub_ps() {
        let (a, b, c) = (
            _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.),
            _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.),
            _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.),
        );
        // a * b - c
        let expected = _mm256_setr_ps(1., -3., 20., 1., 5., -71., 2., -25.);
        assert_eq_m256(_mm256_fmsub_ps(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsub_sd() {
        let (a, b, c) = (_mm_setr_pd(1., 2.), _mm_setr_pd(5., 3.), _mm_setr_pd(4., 9.));
        // lane 0: a * b - c; upper lane copied from `a`
        let expected = _mm_setr_pd(1., 2.);
        assert_eq_m128d(_mm_fmsub_sd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsub_ss() {
        let (a, b, c) = (
            _mm_setr_ps(1., 2., 3., 4.),
            _mm_setr_ps(5., 3., 7., 2.),
            _mm_setr_ps(4., 9., 1., 7.),
        );
        // lane 0: a * b - c; upper lanes copied from `a`
        let expected = _mm_setr_ps(1., 2., 3., 4.);
        assert_eq_m128(_mm_fmsub_ss(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsubadd_pd() {
        let (a, b, c) = (_mm_setr_pd(1., 2.), _mm_setr_pd(5., 3.), _mm_setr_pd(4., 9.));
        // even lanes: a * b + c; odd lanes: a * b - c
        let expected = _mm_setr_pd(9., -3.);
        assert_eq_m128d(_mm_fmsubadd_pd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmsubadd_pd() {
        let (a, b, c) = (
            _mm256_setr_pd(1., 2., 3., 4.),
            _mm256_setr_pd(5., 3., 7., 2.),
            _mm256_setr_pd(4., 9., 1., 7.),
        );
        // even lanes: a * b + c; odd lanes: a * b - c
        let expected = _mm256_setr_pd(9., -3., 22., 1.);
        assert_eq_m256d(_mm256_fmsubadd_pd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsubadd_ps() {
        let (a, b, c) = (
            _mm_setr_ps(1., 2., 3., 4.),
            _mm_setr_ps(5., 3., 7., 2.),
            _mm_setr_ps(4., 9., 1., 7.),
        );
        // even lanes: a * b + c; odd lanes: a * b - c
        let expected = _mm_setr_ps(9., -3., 22., 1.);
        assert_eq_m128(_mm_fmsubadd_ps(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmsubadd_ps() {
        let (a, b, c) = (
            _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.),
            _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.),
            _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.),
        );
        // even lanes: a * b + c; odd lanes: a * b - c
        let expected = _mm256_setr_ps(9., -3., 22., 1., -5., -71., -2., -25.);
        assert_eq_m256(_mm256_fmsubadd_ps(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmadd_pd() {
        let (a, b, c) = (_mm_setr_pd(1., 2.), _mm_setr_pd(5., 3.), _mm_setr_pd(4., 9.));
        // -(a * b) + c
        let expected = _mm_setr_pd(-1., 3.);
        assert_eq_m128d(_mm_fnmadd_pd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fnmadd_pd() {
        let (a, b, c) = (
            _mm256_setr_pd(1., 2., 3., 4.),
            _mm256_setr_pd(5., 3., 7., 2.),
            _mm256_setr_pd(4., 9., 1., 7.),
        );
        // -(a * b) + c
        let expected = _mm256_setr_pd(-1., 3., -20., -1.);
        assert_eq_m256d(_mm256_fnmadd_pd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmadd_ps() {
        let (a, b, c) = (
            _mm_setr_ps(1., 2., 3., 4.),
            _mm_setr_ps(5., 3., 7., 2.),
            _mm_setr_ps(4., 9., 1., 7.),
        );
        // -(a * b) + c
        let expected = _mm_setr_ps(-1., 3., -20., -1.);
        assert_eq_m128(_mm_fnmadd_ps(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fnmadd_ps() {
        let (a, b, c) = (
            _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.),
            _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.),
            _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.),
        );
        // -(a * b) + c
        let expected = _mm256_setr_ps(-1., 3., -20., -1., -5., 71., -2., 25.);
        assert_eq_m256(_mm256_fnmadd_ps(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmadd_sd() {
        let (a, b, c) = (_mm_setr_pd(1., 2.), _mm_setr_pd(5., 3.), _mm_setr_pd(4., 9.));
        // lane 0: -(a * b) + c; upper lane copied from `a`
        let expected = _mm_setr_pd(-1., 2.);
        assert_eq_m128d(_mm_fnmadd_sd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmadd_ss() {
        let (a, b, c) = (
            _mm_setr_ps(1., 2., 3., 4.),
            _mm_setr_ps(5., 3., 7., 2.),
            _mm_setr_ps(4., 9., 1., 7.),
        );
        // lane 0: -(a * b) + c; upper lanes copied from `a`
        let expected = _mm_setr_ps(-1., 2., 3., 4.);
        assert_eq_m128(_mm_fnmadd_ss(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmsub_pd() {
        let (a, b, c) = (_mm_setr_pd(1., 2.), _mm_setr_pd(5., 3.), _mm_setr_pd(4., 9.));
        // -(a * b) - c
        let expected = _mm_setr_pd(-9., -15.);
        assert_eq_m128d(_mm_fnmsub_pd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fnmsub_pd() {
        let (a, b, c) = (
            _mm256_setr_pd(1., 2., 3., 4.),
            _mm256_setr_pd(5., 3., 7., 2.),
            _mm256_setr_pd(4., 9., 1., 7.),
        );
        // -(a * b) - c
        let expected = _mm256_setr_pd(-9., -15., -22., -15.);
        assert_eq_m256d(_mm256_fnmsub_pd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmsub_ps() {
        let (a, b, c) = (
            _mm_setr_ps(1., 2., 3., 4.),
            _mm_setr_ps(5., 3., 7., 2.),
            _mm_setr_ps(4., 9., 1., 7.),
        );
        // -(a * b) - c
        let expected = _mm_setr_ps(-9., -15., -22., -15.);
        assert_eq_m128(_mm_fnmsub_ps(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fnmsub_ps() {
        let (a, b, c) = (
            _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.),
            _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.),
            _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.),
        );
        // -(a * b) - c
        let expected = _mm256_setr_ps(-9., -15., -22., -15., 5., 49., 2., 31.);
        assert_eq_m256(_mm256_fnmsub_ps(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmsub_sd() {
        let (a, b, c) = (_mm_setr_pd(1., 2.), _mm_setr_pd(5., 3.), _mm_setr_pd(4., 9.));
        // lane 0: -(a * b) - c; upper lane copied from `a`
        let expected = _mm_setr_pd(-9., 2.);
        assert_eq_m128d(_mm_fnmsub_sd(a, b, c), expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmsub_ss() {
        let (a, b, c) = (
            _mm_setr_ps(1., 2., 3., 4.),
            _mm_setr_ps(5., 3., 7., 2.),
            _mm_setr_ps(4., 9., 1., 7.),
        );
        // lane 0: -(a * b) - c; upper lanes copied from `a`
        let expected = _mm_setr_ps(-9., 2., 3., 4.);
        assert_eq_m128(_mm_fnmsub_ss(a, b, c), expected);
    }
}
817