1//! Fused Multiply-Add instruction set (FMA)
2//!
3//! The FMA instruction set is an extension to the 128 and 256-bit SSE
4//! instructions in the x86 microprocessor instruction set to perform fused
5//! multiply–add (FMA) operations.
6//!
7//! The references are:
8//!
9//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
10//! Instruction Set Reference, A-Z][intel64_ref].
11//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
12//! System Instructions][amd64_ref].
13//!
14//! Wikipedia's [FMA][wiki_fma] page provides a quick overview of the
15//! instructions available.
16//!
17//! [intel64_ref]: https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
18//! [amd64_ref]: https://docs.amd.com/v/u/en-US/24594_3.37
19//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
21use crate::core_arch::x86::*;
22use crate::intrinsics::simd::{simd_fma, simd_neg};
23use crate::intrinsics::{fmaf32, fmaf64};
24
25#[cfg(test)]
26use stdarch_test::assert_instr;
27
28/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
29/// and `b`, and add the intermediate result to packed elements in `c`.
30///
31/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_pd)
32#[inline]
33#[target_feature(enable = "fma")]
34#[cfg_attr(test, assert_instr(vfmadd))]
35#[stable(feature = "simd_x86", since = "1.27.0")]
36#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
37pub const fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
38 unsafe { simd_fma(x:a, y:b, z:c) }
39}
40
41/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
42/// and `b`, and add the intermediate result to packed elements in `c`.
43///
44/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_pd)
45#[inline]
46#[target_feature(enable = "fma")]
47#[cfg_attr(test, assert_instr(vfmadd))]
48#[stable(feature = "simd_x86", since = "1.27.0")]
49#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
50pub const fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
51 unsafe { simd_fma(x:a, y:b, z:c) }
52}
53
54/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
55/// and `b`, and add the intermediate result to packed elements in `c`.
56///
57/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ps)
58#[inline]
59#[target_feature(enable = "fma")]
60#[cfg_attr(test, assert_instr(vfmadd))]
61#[stable(feature = "simd_x86", since = "1.27.0")]
62#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
63pub const fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
64 unsafe { simd_fma(x:a, y:b, z:c) }
65}
66
67/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
68/// and `b`, and add the intermediate result to packed elements in `c`.
69///
70/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_ps)
71#[inline]
72#[target_feature(enable = "fma")]
73#[cfg_attr(test, assert_instr(vfmadd))]
74#[stable(feature = "simd_x86", since = "1.27.0")]
75#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
76pub const fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
77 unsafe { simd_fma(x:a, y:b, z:c) }
78}
79
80/// Multiplies the lower double-precision (64-bit) floating-point elements in
81/// `a` and `b`, and add the intermediate result to the lower element in `c`.
82/// Stores the result in the lower element of the returned value, and copy the
83/// upper element from `a` to the upper elements of the result.
84///
85/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_sd)
86#[inline]
87#[target_feature(enable = "fma")]
88#[cfg_attr(test, assert_instr(vfmadd))]
89#[stable(feature = "simd_x86", since = "1.27.0")]
90#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
91pub const fn _mm_fmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
92 unsafe {
93 simd_insert!(
94 a,
95 0,
96 fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), _mm_cvtsd_f64(c))
97 )
98 }
99}
100
101/// Multiplies the lower single-precision (32-bit) floating-point elements in
102/// `a` and `b`, and add the intermediate result to the lower element in `c`.
103/// Stores the result in the lower element of the returned value, and copy the
104/// 3 upper elements from `a` to the upper elements of the result.
105///
106/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ss)
107#[inline]
108#[target_feature(enable = "fma")]
109#[cfg_attr(test, assert_instr(vfmadd))]
110#[stable(feature = "simd_x86", since = "1.27.0")]
111#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
112pub const fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
113 unsafe {
114 simd_insert!(
115 a,
116 0,
117 fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), _mm_cvtss_f32(c))
118 )
119 }
120}
121
122/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
123/// and `b`, and alternatively add and subtract packed elements in `c` to/from
124/// the intermediate result.
125///
126/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_pd)
127#[inline]
128#[target_feature(enable = "fma")]
129#[cfg_attr(test, assert_instr(vfmaddsub))]
130#[stable(feature = "simd_x86", since = "1.27.0")]
131#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
132pub const fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
133 unsafe {
134 let add: __m128d = simd_fma(x:a, y:b, z:c);
135 let sub: __m128d = simd_fma(x:a, y:b, z:simd_neg(c));
136 simd_shuffle!(add, sub, [2, 1])
137 }
138}
139
140/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
141/// and `b`, and alternatively add and subtract packed elements in `c` to/from
142/// the intermediate result.
143///
144/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_pd)
145#[inline]
146#[target_feature(enable = "fma")]
147#[cfg_attr(test, assert_instr(vfmaddsub))]
148#[stable(feature = "simd_x86", since = "1.27.0")]
149#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
150pub const fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
151 unsafe {
152 let add: __m256d = simd_fma(x:a, y:b, z:c);
153 let sub: __m256d = simd_fma(x:a, y:b, z:simd_neg(c));
154 simd_shuffle!(add, sub, [4, 1, 6, 3])
155 }
156}
157
158/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
159/// and `b`, and alternatively add and subtract packed elements in `c` to/from
160/// the intermediate result.
161///
162/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_ps)
163#[inline]
164#[target_feature(enable = "fma")]
165#[cfg_attr(test, assert_instr(vfmaddsub))]
166#[stable(feature = "simd_x86", since = "1.27.0")]
167#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
168pub const fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
169 unsafe {
170 let add: __m128 = simd_fma(x:a, y:b, z:c);
171 let sub: __m128 = simd_fma(x:a, y:b, z:simd_neg(c));
172 simd_shuffle!(add, sub, [4, 1, 6, 3])
173 }
174}
175
176/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
177/// and `b`, and alternatively add and subtract packed elements in `c` to/from
178/// the intermediate result.
179///
180/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_ps)
181#[inline]
182#[target_feature(enable = "fma")]
183#[cfg_attr(test, assert_instr(vfmaddsub))]
184#[stable(feature = "simd_x86", since = "1.27.0")]
185#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
186pub const fn _mm256_fmaddsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
187 unsafe {
188 let add: __m256 = simd_fma(x:a, y:b, z:c);
189 let sub: __m256 = simd_fma(x:a, y:b, z:simd_neg(c));
190 simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
191 }
192}
193
194/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
195/// and `b`, and subtract packed elements in `c` from the intermediate result.
196///
197/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_pd)
198#[inline]
199#[target_feature(enable = "fma")]
200#[cfg_attr(test, assert_instr(vfmsub))]
201#[stable(feature = "simd_x86", since = "1.27.0")]
202#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
203pub const fn _mm_fmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
204 unsafe { simd_fma(x:a, y:b, z:simd_neg(c)) }
205}
206
207/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
208/// and `b`, and subtract packed elements in `c` from the intermediate result.
209///
210/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_pd)
211#[inline]
212#[target_feature(enable = "fma")]
213#[cfg_attr(test, assert_instr(vfmsub))]
214#[stable(feature = "simd_x86", since = "1.27.0")]
215#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
216pub const fn _mm256_fmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
217 unsafe { simd_fma(x:a, y:b, z:simd_neg(c)) }
218}
219
220/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
221/// and `b`, and subtract packed elements in `c` from the intermediate result.
222///
223/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ps)
224#[inline]
225#[target_feature(enable = "fma")]
226#[cfg_attr(test, assert_instr(vfmsub213ps))]
227#[stable(feature = "simd_x86", since = "1.27.0")]
228#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
229pub const fn _mm_fmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
230 unsafe { simd_fma(x:a, y:b, z:simd_neg(c)) }
231}
232
233/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
234/// and `b`, and subtract packed elements in `c` from the intermediate result.
235///
236/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_ps)
237#[inline]
238#[target_feature(enable = "fma")]
239#[cfg_attr(test, assert_instr(vfmsub213ps))]
240#[stable(feature = "simd_x86", since = "1.27.0")]
241#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
242pub const fn _mm256_fmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
243 unsafe { simd_fma(x:a, y:b, z:simd_neg(c)) }
244}
245
246/// Multiplies the lower double-precision (64-bit) floating-point elements in
247/// `a` and `b`, and subtract the lower element in `c` from the intermediate
248/// result. Store the result in the lower element of the returned value, and
249/// copy the upper element from `a` to the upper elements of the result.
250///
251/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_sd)
252#[inline]
253#[target_feature(enable = "fma")]
254#[cfg_attr(test, assert_instr(vfmsub))]
255#[stable(feature = "simd_x86", since = "1.27.0")]
256#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
257pub const fn _mm_fmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
258 unsafe {
259 simd_insert!(
260 a,
261 0,
262 fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), -_mm_cvtsd_f64(c))
263 )
264 }
265}
266
267/// Multiplies the lower single-precision (32-bit) floating-point elements in
268/// `a` and `b`, and subtract the lower element in `c` from the intermediate
269/// result. Store the result in the lower element of the returned value, and
270/// copy the 3 upper elements from `a` to the upper elements of the result.
271///
272/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ss)
273#[inline]
274#[target_feature(enable = "fma")]
275#[cfg_attr(test, assert_instr(vfmsub))]
276#[stable(feature = "simd_x86", since = "1.27.0")]
277#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
278pub const fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
279 unsafe {
280 simd_insert!(
281 a,
282 0,
283 fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), -_mm_cvtss_f32(c))
284 )
285 }
286}
287
288/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
289/// and `b`, and alternatively subtract and add packed elements in `c` from/to
290/// the intermediate result.
291///
292/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_pd)
293#[inline]
294#[target_feature(enable = "fma")]
295#[cfg_attr(test, assert_instr(vfmsubadd))]
296#[stable(feature = "simd_x86", since = "1.27.0")]
297#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
298pub const fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
299 unsafe {
300 let add: __m128d = simd_fma(x:a, y:b, z:c);
301 let sub: __m128d = simd_fma(x:a, y:b, z:simd_neg(c));
302 simd_shuffle!(add, sub, [0, 3])
303 }
304}
305
306/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
307/// and `b`, and alternatively subtract and add packed elements in `c` from/to
308/// the intermediate result.
309///
310/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_pd)
311#[inline]
312#[target_feature(enable = "fma")]
313#[cfg_attr(test, assert_instr(vfmsubadd))]
314#[stable(feature = "simd_x86", since = "1.27.0")]
315#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
316pub const fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
317 unsafe {
318 let add: __m256d = simd_fma(x:a, y:b, z:c);
319 let sub: __m256d = simd_fma(x:a, y:b, z:simd_neg(c));
320 simd_shuffle!(add, sub, [0, 5, 2, 7])
321 }
322}
323
324/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
325/// and `b`, and alternatively subtract and add packed elements in `c` from/to
326/// the intermediate result.
327///
328/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_ps)
329#[inline]
330#[target_feature(enable = "fma")]
331#[cfg_attr(test, assert_instr(vfmsubadd))]
332#[stable(feature = "simd_x86", since = "1.27.0")]
333#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
334pub const fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
335 unsafe {
336 let add: __m128 = simd_fma(x:a, y:b, z:c);
337 let sub: __m128 = simd_fma(x:a, y:b, z:simd_neg(c));
338 simd_shuffle!(add, sub, [0, 5, 2, 7])
339 }
340}
341
342/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
343/// and `b`, and alternatively subtract and add packed elements in `c` from/to
344/// the intermediate result.
345///
346/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_ps)
347#[inline]
348#[target_feature(enable = "fma")]
349#[cfg_attr(test, assert_instr(vfmsubadd))]
350#[stable(feature = "simd_x86", since = "1.27.0")]
351#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
352pub const fn _mm256_fmsubadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
353 unsafe {
354 let add: __m256 = simd_fma(x:a, y:b, z:c);
355 let sub: __m256 = simd_fma(x:a, y:b, z:simd_neg(c));
356 simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15])
357 }
358}
359
360/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
361/// and `b`, and add the negated intermediate result to packed elements in `c`.
362///
363/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_pd)
364#[inline]
365#[target_feature(enable = "fma")]
366#[cfg_attr(test, assert_instr(vfnmadd))]
367#[stable(feature = "simd_x86", since = "1.27.0")]
368#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
369pub const fn _mm_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
370 unsafe { simd_fma(x:simd_neg(a), y:b, z:c) }
371}
372
373/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
374/// and `b`, and add the negated intermediate result to packed elements in `c`.
375///
376/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_pd)
377#[inline]
378#[target_feature(enable = "fma")]
379#[cfg_attr(test, assert_instr(vfnmadd))]
380#[stable(feature = "simd_x86", since = "1.27.0")]
381#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
382pub const fn _mm256_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
383 unsafe { simd_fma(x:simd_neg(a), y:b, z:c) }
384}
385
386/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
387/// and `b`, and add the negated intermediate result to packed elements in `c`.
388///
389/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ps)
390#[inline]
391#[target_feature(enable = "fma")]
392#[cfg_attr(test, assert_instr(vfnmadd))]
393#[stable(feature = "simd_x86", since = "1.27.0")]
394#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
395pub const fn _mm_fnmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
396 unsafe { simd_fma(x:simd_neg(a), y:b, z:c) }
397}
398
399/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
400/// and `b`, and add the negated intermediate result to packed elements in `c`.
401///
402/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_ps)
403#[inline]
404#[target_feature(enable = "fma")]
405#[cfg_attr(test, assert_instr(vfnmadd))]
406#[stable(feature = "simd_x86", since = "1.27.0")]
407#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
408pub const fn _mm256_fnmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
409 unsafe { simd_fma(x:simd_neg(a), y:b, z:c) }
410}
411
412/// Multiplies the lower double-precision (64-bit) floating-point elements in
413/// `a` and `b`, and add the negated intermediate result to the lower element
414/// in `c`. Store the result in the lower element of the returned value, and
415/// copy the upper element from `a` to the upper elements of the result.
416///
417/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_sd)
418#[inline]
419#[target_feature(enable = "fma")]
420#[cfg_attr(test, assert_instr(vfnmadd))]
421#[stable(feature = "simd_x86", since = "1.27.0")]
422#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
423pub const fn _mm_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
424 unsafe {
425 simd_insert!(
426 a,
427 0,
428 fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), _mm_cvtsd_f64(c))
429 )
430 }
431}
432
433/// Multiplies the lower single-precision (32-bit) floating-point elements in
434/// `a` and `b`, and add the negated intermediate result to the lower element
435/// in `c`. Store the result in the lower element of the returned value, and
436/// copy the 3 upper elements from `a` to the upper elements of the result.
437///
438/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ss)
439#[inline]
440#[target_feature(enable = "fma")]
441#[cfg_attr(test, assert_instr(vfnmadd))]
442#[stable(feature = "simd_x86", since = "1.27.0")]
443#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
444pub const fn _mm_fnmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
445 unsafe {
446 simd_insert!(
447 a,
448 0,
449 fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), _mm_cvtss_f32(c))
450 )
451 }
452}
453
454/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
455/// and `b`, and subtract packed elements in `c` from the negated intermediate
456/// result.
457///
458/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_pd)
459#[inline]
460#[target_feature(enable = "fma")]
461#[cfg_attr(test, assert_instr(vfnmsub))]
462#[stable(feature = "simd_x86", since = "1.27.0")]
463#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
464pub const fn _mm_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
465 unsafe { simd_fma(x:simd_neg(a), y:b, z:simd_neg(c)) }
466}
467
468/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
469/// and `b`, and subtract packed elements in `c` from the negated intermediate
470/// result.
471///
472/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_pd)
473#[inline]
474#[target_feature(enable = "fma")]
475#[cfg_attr(test, assert_instr(vfnmsub))]
476#[stable(feature = "simd_x86", since = "1.27.0")]
477#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
478pub const fn _mm256_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
479 unsafe { simd_fma(x:simd_neg(a), y:b, z:simd_neg(c)) }
480}
481
482/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
483/// and `b`, and subtract packed elements in `c` from the negated intermediate
484/// result.
485///
486/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ps)
487#[inline]
488#[target_feature(enable = "fma")]
489#[cfg_attr(test, assert_instr(vfnmsub))]
490#[stable(feature = "simd_x86", since = "1.27.0")]
491#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
492pub const fn _mm_fnmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
493 unsafe { simd_fma(x:simd_neg(a), y:b, z:simd_neg(c)) }
494}
495
496/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
497/// and `b`, and subtract packed elements in `c` from the negated intermediate
498/// result.
499///
500/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_ps)
501#[inline]
502#[target_feature(enable = "fma")]
503#[cfg_attr(test, assert_instr(vfnmsub))]
504#[stable(feature = "simd_x86", since = "1.27.0")]
505#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
506pub const fn _mm256_fnmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
507 unsafe { simd_fma(x:simd_neg(a), y:b, z:simd_neg(c)) }
508}
509
510/// Multiplies the lower double-precision (64-bit) floating-point elements in
511/// `a` and `b`, and subtract packed elements in `c` from the negated
512/// intermediate result. Store the result in the lower element of the returned
513/// value, and copy the upper element from `a` to the upper elements of the
514/// result.
515///
516/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_sd)
517#[inline]
518#[target_feature(enable = "fma")]
519#[cfg_attr(test, assert_instr(vfnmsub))]
520#[stable(feature = "simd_x86", since = "1.27.0")]
521#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
522pub const fn _mm_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
523 unsafe {
524 simd_insert!(
525 a,
526 0,
527 fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), -_mm_cvtsd_f64(c))
528 )
529 }
530}
531
532/// Multiplies the lower single-precision (32-bit) floating-point elements in
533/// `a` and `b`, and subtract packed elements in `c` from the negated
534/// intermediate result. Store the result in the lower element of the
535/// returned value, and copy the 3 upper elements from `a` to the upper
536/// elements of the result.
537///
538/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ss)
539#[inline]
540#[target_feature(enable = "fma")]
541#[cfg_attr(test, assert_instr(vfnmsub))]
542#[stable(feature = "simd_x86", since = "1.27.0")]
543#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
544pub const fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
545 unsafe {
546 simd_insert!(
547 a,
548 0,
549 fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), -_mm_cvtss_f32(c))
550 )
551 }
552}
553
#[cfg(test)]
mod tests {
    //! Unit tests for the FMA intrinsics. Each test is a `const fn` run under
    //! `simd_test`, so the expected values are checked both at compile time
    //! (const evaluation) and at runtime on FMA-capable hardware. Inputs are
    //! small exact values so the fused results are exactly representable.

    use crate::core_arch::assert_eq_const as assert_eq;

    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    #[simd_test(enable = "fma")]
    const fn test_mm_fmadd_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(9., 15.);
        assert_eq_m128d(_mm_fmadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmadd_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(9., 15., 22., 15.);
        assert_eq_m256d(_mm256_fmadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmadd_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(9., 15., 22., 15.);
        assert_eq_m128(_mm_fmadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmadd_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(9., 15., 22., 15., -5., -49., -2., -31.);
        assert_eq_m256(_mm256_fmadd_ps(a, b, c), r);
    }

    // Scalar variants: only lane 0 is computed; upper lanes come from `a`.
    #[simd_test(enable = "fma")]
    const fn test_mm_fmadd_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(9., 2.);
        assert_eq_m128d(_mm_fmadd_sd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmadd_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(9., 2., 3., 4.);
        assert_eq_m128(_mm_fmadd_ss(a, b, c), r);
    }

    // fmaddsub: even lanes subtract `c`, odd lanes add it.
    #[simd_test(enable = "fma")]
    const fn test_mm_fmaddsub_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(1., 15.);
        assert_eq_m128d(_mm_fmaddsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmaddsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(1., 15., 20., 15.);
        assert_eq_m256d(_mm256_fmaddsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmaddsub_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(1., 15., 20., 15.);
        assert_eq_m128(_mm_fmaddsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmaddsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(1., 15., 20., 15., 5., -49., 2., -31.);
        assert_eq_m256(_mm256_fmaddsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmsub_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(1., -3.);
        assert_eq_m128d(_mm_fmsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(1., -3., 20., 1.);
        assert_eq_m256d(_mm256_fmsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmsub_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(1., -3., 20., 1.);
        assert_eq_m128(_mm_fmsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(1., -3., 20., 1., 5., -71., 2., -25.);
        assert_eq_m256(_mm256_fmsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmsub_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(1., 2.);
        assert_eq_m128d(_mm_fmsub_sd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmsub_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(1., 2., 3., 4.);
        assert_eq_m128(_mm_fmsub_ss(a, b, c), r);
    }

    // fmsubadd: even lanes add `c`, odd lanes subtract it.
    #[simd_test(enable = "fma")]
    const fn test_mm_fmsubadd_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(9., -3.);
        assert_eq_m128d(_mm_fmsubadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmsubadd_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(9., -3., 22., 1.);
        assert_eq_m256d(_mm256_fmsubadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmsubadd_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(9., -3., 22., 1.);
        assert_eq_m128(_mm_fmsubadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmsubadd_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(9., -3., 22., 1., -5., -71., -2., -25.);
        assert_eq_m256(_mm256_fmsubadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmadd_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(-1., 3.);
        assert_eq_m128d(_mm_fnmadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fnmadd_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(-1., 3., -20., -1.);
        assert_eq_m256d(_mm256_fnmadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmadd_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(-1., 3., -20., -1.);
        assert_eq_m128(_mm_fnmadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fnmadd_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(-1., 3., -20., -1., -5., 71., -2., 25.);
        assert_eq_m256(_mm256_fnmadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmadd_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(-1., 2.);
        assert_eq_m128d(_mm_fnmadd_sd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmadd_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(-1., 2., 3., 4.);
        assert_eq_m128(_mm_fnmadd_ss(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmsub_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(-9., -15.);
        assert_eq_m128d(_mm_fnmsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fnmsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(-9., -15., -22., -15.);
        assert_eq_m256d(_mm256_fnmsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmsub_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(-9., -15., -22., -15.);
        assert_eq_m128(_mm_fnmsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fnmsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(-9., -15., -22., -15., 5., 49., 2., 31.);
        assert_eq_m256(_mm256_fnmsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmsub_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(-9., 2.);
        assert_eq_m128d(_mm_fnmsub_sd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmsub_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(-9., 2., 3., 4.);
        assert_eq_m128(_mm_fnmsub_ss(a, b, c), r);
    }
}
850